Add website downloader tool wrapper for GLM-4.7-Flash
- Create website_downloader_tool.py with OpenAI function calling schema - Add comprehensive tool documentation - Update README with usage examples - Update requirements.txt with optional sdk dependency
This commit is contained in:
parent
1623ee8d2c
commit
aa69b2f496
164
README.md
164
README.md
@ -1,2 +1,164 @@
|
|||||||
# docrag
|
# DocRAG - Custom RAG with Document Loader
|
||||||
|
|
||||||
|
A custom RAG (Retrieval-Augmented Generation) system with a custom document loader that acts as a local OpenAI-compatible server using a remote LLM with custom tools.
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### Website Downloader Tool
|
||||||
|
|
||||||
|
The `website_downloader_tool.py` provides a tool interface for downloading and mirroring websites for offline use or RAG ingestion. It can be used by GLM-4.7-Flash via the z-ai-web-dev-sdk.
|
||||||
|
|
||||||
|
#### Features
|
||||||
|
|
||||||
|
- Downloads HTML pages and all linked assets (CSS, JS, images, fonts, etc.)
|
||||||
|
- Rewrites links for offline viewing
|
||||||
|
- Supports concurrent downloads with configurable thread count
|
||||||
|
- Optional external asset downloading from CDNs
|
||||||
|
- Domain whitelisting for external assets
|
||||||
|
- Comprehensive error handling and statistics
|
||||||
|
|
||||||
|
#### Tool Schema
|
||||||
|
|
||||||
|
The tool follows the OpenAI function calling format:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from website_downloader_tool import get_tool_schema, website_downloader
|
||||||
|
|
||||||
|
# Get the tool schema for registration
|
||||||
|
schema = get_tool_schema()
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Usage with GLM-4.7-Flash
|
||||||
|
|
||||||
|
```python
|
||||||
|
from zai import ZaiClient
|
||||||
|
from website_downloader_tool import get_tool_schema, website_downloader
|
||||||
|
|
||||||
|
client = ZaiClient(api_key="your-api-key")
|
||||||
|
|
||||||
|
# Define the tool
|
||||||
|
tools = [get_tool_schema()]
|
||||||
|
|
||||||
|
# Create a chat completion with tools
|
||||||
|
response = client.chat.completions.create(
|
||||||
|
model="glm-4.7",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "Please download https://example.com for offline use"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
tools=tools,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
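# Handle tool calls in the response.
# NOTE: with stream=True the tool-call arguments may arrive in fragments spread
# over several chunks; accumulate them before calling json.loads in real code.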
|
||||||
|
for chunk in response:
|
||||||
|
if chunk.choices[0].delta.tool_calls:
|
||||||
|
tool_call = chunk.choices[0].delta.tool_calls[0]
|
||||||
|
if tool_call.function.name == "website_downloader":
|
||||||
|
import json
|
||||||
|
args = json.loads(tool_call.function.arguments)
|
||||||
|
result = website_downloader(**args)
|
||||||
|
print(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Direct Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
from website_downloader_tool import website_downloader
|
||||||
|
|
||||||
|
# Download a website
|
||||||
|
result = website_downloader(
|
||||||
|
url="https://example.com",
|
||||||
|
destination="./downloaded_site", # Optional
|
||||||
|
max_pages=50, # Max pages to crawl
|
||||||
|
threads=6, # Concurrent downloads
|
||||||
|
download_external_assets=False, # Include CDN assets
|
||||||
|
external_domains=["cdn.example.com"] # Whitelist external domains
|
||||||
|
)
|
||||||
|
|
||||||
|
if result["success"]:
|
||||||
|
print(f"Downloaded to: {result['output_directory']}")
|
||||||
|
print(f"Pages: {result['stats']['pages_crawled']}")
|
||||||
|
print(f"Assets: {result['stats']['assets_downloaded']}")
|
||||||
|
else:
|
||||||
|
print(f"Error: {result['message']}")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Parameters
|
||||||
|
|
||||||
|
| Parameter | Type | Required | Default | Description |
|
||||||
|
|-----------|------|----------|---------|-------------|
|
||||||
|
| `url` | string | Yes | - | Starting URL to crawl |
|
||||||
|
| `destination` | string | No | Derived from URL | Output folder path |
|
||||||
|
| `max_pages` | integer | No | 50 | Max HTML pages (1-1000) |
|
||||||
|
| `threads` | integer | No | 6 | Concurrent download threads (1-20) |
|
||||||
|
| `download_external_assets` | boolean | No | False | Download CDN assets |
|
||||||
|
| `external_domains` | array | No | None | Whitelist of external domains |
|
||||||
|
|
||||||
|
#### Return Value
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"success": True/False,
|
||||||
|
"message": "Human-readable summary",
|
||||||
|
"stats": {
|
||||||
|
"pages_crawled": int,
|
||||||
|
"assets_downloaded": int,
|
||||||
|
"failed_downloads": int,
|
||||||
|
"elapsed_seconds": float,
|
||||||
|
"output_directory": str,
|
||||||
|
"pages": [...], # List of downloaded pages
|
||||||
|
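"downloaded_items": [...], # List of downloaded assets (at most 100 entries)
"failed_items": [...], # Failed asset downloads with error text (at most 50 entries)
"errors": [...] # Pages that could not be fetched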
|
||||||
|
},
|
||||||
|
"output_directory": "/path/to/downloaded/site"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Website Downloader CLI
|
||||||
|
|
||||||
|
The original `website-downloader.py` can still be used as a standalone CLI tool:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python website-downloader.py --url https://example.com --max-pages 50 --threads 6
|
||||||
|
```
|
||||||
|
|
||||||
|
#### CLI Options
|
||||||
|
|
||||||
|
- `--url`: Starting URL to crawl (required)
|
||||||
|
- `--destination`: Output folder (optional, derived from URL if not provided)
|
||||||
|
- `--max-pages`: Maximum pages to crawl (default: 50)
|
||||||
|
- `--threads`: Number of download threads (default: 6)
|
||||||
|
- `--download-external-assets`: Enable external asset downloading
|
||||||
|
- `--external-domains`: Whitelist of external domains to download from
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
docrag/
|
||||||
|
├── website-downloader.py # Core website downloader (CLI)
|
||||||
|
├── website_downloader_tool.py # Tool wrapper for GLM-4.7-Flash
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
└── README.md # This file
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration with RAG
|
||||||
|
|
||||||
|
The downloaded website content can be processed for RAG systems:
|
||||||
|
|
||||||
|
1. Use the tool to download website content
|
||||||
|
2. Parse the downloaded HTML files
|
||||||
|
3. Extract text content and metadata
|
||||||
|
4. Chunk and embed the content
|
||||||
|
5. Store in your vector database
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Private repository - All rights reserved.
|
||||||
|
|||||||
@ -1,4 +1,7 @@
|
|||||||
requests~=2.32.4
|
requests~=2.32.4
|
||||||
beautifulsoup4~=4.13.4
|
beautifulsoup4~=4.13.4
|
||||||
wget~=3.2
|
wget~=3.2
|
||||||
urllib3~=2.5.0
|
urllib3~=2.5.0
|
||||||
|
|
||||||
|
# Optional: For using z-ai-web-dev-sdk with GLM-4.7-Flash
|
||||||
|
# z-ai-web-dev-sdk>=1.0.0
|
||||||
572
website_downloader_tool.py
Normal file
572
website_downloader_tool.py
Normal file
@ -0,0 +1,572 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Website Downloader Tool for GLM-4.7-Flash
|
||||||
|
|
||||||
|
This module provides a tool interface for the website-downloader functionality,
|
||||||
|
allowing it to be used as a function/tool by the GLM-4.7-Flash model via the
|
||||||
|
z-ai-web-dev-sdk.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
The tool can be invoked by the LLM to download and mirror websites for
|
||||||
|
offline use or for ingesting into a RAG system.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import queue
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
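# Import the core functionality from the website_downloader module.
# NOTE: the README names the CLI file `website-downloader.py`; a hyphenated
# file name cannot be loaded by a plain import statement, so the core module
# must be importable as `website_downloader` for this import to succeed.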
|
||||||
|
from website_downloader import (
|
||||||
|
SESSION,
|
||||||
|
TIMEOUT,
|
||||||
|
ASSET_EXTENSIONS,
|
||||||
|
CSS_URL_RE,
|
||||||
|
_canonical_netloc,
|
||||||
|
_protocol_fix,
|
||||||
|
canonicalize_url,
|
||||||
|
create_dir,
|
||||||
|
extract_css_assets,
|
||||||
|
fetch_binary,
|
||||||
|
is_httpish,
|
||||||
|
is_internal,
|
||||||
|
is_non_fetchable,
|
||||||
|
is_allowed_external,
|
||||||
|
normalize_url,
|
||||||
|
rewrite_links,
|
||||||
|
safe_write_text,
|
||||||
|
to_local_path,
|
||||||
|
to_local_asset_path,
|
||||||
|
cdn_local_path,
|
||||||
|
fetch_html,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configure logging for tool use
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tool Schema Definition
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# OpenAI function-calling description of the ``website_downloader`` tool.
# Only ``url`` is mandatory; every other parameter carries a default that
# mirrors the defaults of the ``website_downloader`` function signature.
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "website_downloader",
        "description": (
            "Download and mirror a website for offline use or RAG ingestion. "
            "This tool crawls a website starting from a given URL, downloads HTML pages "
            "and all linked assets (CSS, JavaScript, images, fonts, etc.), and saves them "
            "locally with rewritten links for offline viewing. "
            "Use this tool when you need to: "
            "1) Archive a website for offline access, "
            "2) Download website content for analysis or RAG systems, "
            "3) Create a local mirror of a website."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                # Entry point of the crawl.
                "url": {
                    "type": "string",
                    "description": (
                        "The starting URL to crawl (e.g., 'https://example.com/'). "
                        "Must be a valid HTTP or HTTPS URL."
                    ),
                },
                # Where the mirror is written.
                "destination": {
                    "type": "string",
                    "description": (
                        "Optional output folder path where the downloaded website "
                        "will be saved. If not provided, a folder name will be derived "
                        "from the URL's domain (e.g., 'example_com')."
                    ),
                    "default": None,
                },
                # Crawl size limit.
                "max_pages": {
                    "type": "integer",
                    "description": (
                        "Maximum number of HTML pages to crawl. "
                        "Use lower values for quick downloads, higher for comprehensive archiving."
                    ),
                    "default": 50,
                    "minimum": 1,
                    "maximum": 1000,
                },
                # Download concurrency.
                "threads": {
                    "type": "integer",
                    "description": (
                        "Number of concurrent download threads. "
                        "Higher values can speed up downloads but may trigger rate limits."
                    ),
                    "default": 6,
                    "minimum": 1,
                    "maximum": 20,
                },
                # Cross-domain asset handling.
                "download_external_assets": {
                    "type": "boolean",
                    "description": (
                        "Whether to download assets from external domains (CDNs, etc.). "
                        "Enable for complete offline functionality, disable for faster downloads "
                        "of only same-domain content."
                    ),
                    "default": False,
                },
                "external_domains": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "Optional list of external domain names to allow downloading from. "
                        "Useful for whitelisting specific CDN domains. "
                        "Example: ['cdn.example.com', 'assets.example.com']"
                    ),
                    "default": None,
                },
            },
            "required": ["url"],
        },
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tool Implementation
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def crawl_site_tool(
    start_url: str,
    root: Path,
    max_pages: int,
    threads: int,
    download_external_assets: bool = False,
    external_domains: Optional[set[str]] = None,
) -> dict[str, Any]:
    """
    Crawl a site starting at ``start_url`` and return structured statistics.

    Pages are taken from a FIFO queue until it is empty or ``max_pages``
    pages have been visited. For every fetched page the referenced assets
    are queued for background download, same-site ``<a href>`` targets are
    queued for crawling, and the page is written below ``root`` after
    ``rewrite_links`` has been applied.

    Args:
        start_url: URL of the first page to crawl.
        root: Output directory that receives pages and assets.
        max_pages: Upper bound on the number of pages visited.
        threads: Number of download worker threads (at least one is started).
        download_external_assets: Whether assets on other hosts are queued.
        external_domains: Optional whitelist of external hosts.

    Returns:
        Dictionary with the counters ``pages_crawled``, ``assets_downloaded``
        and ``failed_downloads``, the lists ``pages``, ``assets``, ``errors``,
        ``downloaded_items`` (first 100) and ``failed_items`` (first 50),
        plus ``elapsed_seconds`` and ``output_directory``.
    """
    start_time = time.time()

    # Statistics tracking
    stats: dict[str, Any] = {
        "pages_crawled": 0,
        "assets_downloaded": 0,
        "failed_downloads": 0,
        "pages": [],
        "assets": [],
        "errors": [],
    }

    # URL-bearing attributes per tag name; built once instead of per tag.
    tag_handlers = {
        "img": ["src", "data-src", "srcset"],
        "script": ["src"],
        "link": ["href"],
        "video": ["src", "poster"],
        "audio": ["src"],
        "source": ["src", "srcset"],
        "iframe": ["src"],
        "embed": ["src"],
        "object": ["data"],
    }
    # <link> elements are only treated as assets for these rel values.
    resource_rels = {
        "stylesheet", "icon", "shortcut", "apple-touch-icon",
        "preload", "modulepreload", "manifest",
    }

    q_pages: queue.Queue[str] = queue.Queue()
    q_pages.put(start_url)

    seen_pages: set[str] = set()
    queued_pages: set[str] = {start_url}
    queued_assets: set[str] = set()
    # Holds (url, destination) tuples; a ``None`` entry tells a worker to exit.
    download_q: queue.Queue = queue.Queue()

    root_netloc = _canonical_netloc(urlparse(start_url))

    # Track successfully downloaded items
    downloaded_items: list[dict[str, str]] = []
    failed_items: list[dict[str, str]] = []

    # The counters are read-modify-write operations performed by several
    # worker threads, so they are guarded by a lock.
    stats_lock = threading.Lock()

    def relative_to_root(path: Path) -> str:
        """Return ``path`` relative to ``root`` when possible, else unchanged."""
        try:
            return str(path.relative_to(root))
        except ValueError:
            return str(path)

    def enqueue_asset(raw_url: str, base_url: str) -> None:
        """Normalise the protocol of ``raw_url`` and hand it to the asset queue."""
        process_asset_url(
            _protocol_fix(raw_url, base_url), base_url, root, root_netloc,
            download_external_assets, external_domains,
            queued_assets, download_q, stats,
        )

    def worker() -> None:
        """Download worker thread; exits when it receives the ``None`` sentinel."""
        while True:
            item = download_q.get()
            try:
                if item is None:
                    return

                url, dest = item
                if is_non_fetchable(url) or not is_httpish(url):
                    log.debug("Skip non-fetchable: %s", url)
                    continue

                if dest.exists():
                    # Already on disk: counted as downloaded, not re-fetched.
                    with stats_lock:
                        stats["assets_downloaded"] += 1
                    continue

                try:
                    fetch_binary(
                        url,
                        dest,
                        download_q,
                        site_root=root,
                        root_netloc=root_netloc,
                        download_external_assets=download_external_assets,
                        external_domains=external_domains,
                    )
                    if dest.exists():
                        with stats_lock:
                            stats["assets_downloaded"] += 1
                            downloaded_items.append({
                                "url": url,
                                "local_path": relative_to_root(dest),
                            })
                except Exception as e:
                    with stats_lock:
                        stats["failed_downloads"] += 1
                        failed_items.append({
                            "url": url,
                            "error": str(e),
                        })
                    log.debug("Failed to download %s: %s", url, e)
            finally:
                download_q.task_done()

    # Spawn worker threads
    worker_threads = []
    for i in range(max(1, threads)):
        t = threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True)
        t.start()
        worker_threads.append(t)

    try:
        # Main crawl loop
        while not q_pages.empty() and len(seen_pages) < max_pages:
            page_url = q_pages.get()

            if page_url in seen_pages:
                continue

            seen_pages.add(page_url)
            stats["pages_crawled"] += 1

            log.info("Crawling page %d/%d: %s", len(seen_pages), max_pages, page_url)

            soup = fetch_html(page_url)
            if soup is None:
                stats["errors"].append(f"Failed to fetch page: {page_url}")
                continue

            # Record page info
            local_path = to_local_path(urlparse(page_url), root)
            stats["pages"].append({
                "url": page_url,
                "local_path": relative_to_root(local_path),
            })

            # Find and queue all assets
            for tag in soup.find_all(True):
                attrs_to_check = tag_handlers.get(tag.name, [])

                # <link> only counts as an asset for resource-like rel types
                if tag.name == "link":
                    rel = tag.get("rel", [])
                    if isinstance(rel, str):
                        rel = [rel]
                    rel_set = {r.lower() for r in rel}
                    if not rel_set & resource_rels:
                        attrs_to_check = []

                for attr in attrs_to_check:
                    if not tag.has_attr(attr):
                        continue

                    if attr == "srcset":
                        # srcset is a comma-separated list of "url descriptor"
                        for entry in str(tag["srcset"]).split(","):
                            parts = entry.strip().split()
                            if not parts:
                                continue
                            enqueue_asset(parts[0], page_url)
                    else:
                        enqueue_asset(str(tag.get(attr, "")), page_url)

                # Handle inline styles
                if tag.has_attr("style"):
                    style = str(tag["style"])
                    for match in CSS_URL_RE.findall(style):
                        enqueue_asset(match.strip().strip("'\""), page_url)

                # Handle <style> blocks
                if tag.name == "style":
                    css_text = tag.string or tag.get_text()
                    if css_text:
                        for asset in extract_css_assets(css_text):
                            enqueue_asset(asset, page_url)

                # Find and queue internal links for further crawling
                if tag.name == "a" and tag.has_attr("href"):
                    href = _protocol_fix(str(tag.get("href", "")), page_url)
                    if href and not href.startswith("#") and is_httpish(href) and not is_non_fetchable(href):
                        abs_url = normalize_url(canonicalize_url(href, page_url))
                        if (
                            is_internal(abs_url, root_netloc)
                            and abs_url not in seen_pages
                            and abs_url not in queued_pages
                        ):
                            queued_pages.add(abs_url)
                            q_pages.put(abs_url)

            # Save the page with rewritten links
            create_dir(local_path.parent)
            rewrite_links(
                soup,
                page_url,
                root,
                local_path.parent,
                download_external_assets,
                external_domains,
            )
            safe_write_text(local_path, str(soup), encoding="utf-8")

        # Wait for all downloads to complete
        download_q.join()
    finally:
        # Always release the workers, even when crawling raised; otherwise
        # each call would leave its threads blocked on the queue forever.
        for _ in worker_threads:
            download_q.put(None)

    # Only reached on the normal path, where the queue is already drained,
    # so the workers pick up their sentinel immediately.
    for t in worker_threads:
        t.join()

    elapsed = time.time() - start_time
    stats["elapsed_seconds"] = round(elapsed, 2)
    stats["output_directory"] = str(root.resolve())
    stats["downloaded_items"] = downloaded_items[:100]  # Limit for response size
    stats["failed_items"] = failed_items[:50]  # Limit for response size

    return stats
|
||||||
|
|
||||||
|
|
||||||
|
def process_asset_url(
    url_part: str,
    page_url: str,
    root: Path,
    root_netloc: str,
    download_external_assets: bool,
    external_domains: Optional[set[str]],
    queued_assets: set[str],
    download_q: queue.Queue[tuple[str, Path]],
    stats: dict,
) -> None:
    """
    Queue a single asset reference for download if it qualifies.

    The reference is dropped when it is empty, a fragment, a ``data:`` /
    ``javascript:`` / ``about:`` URI, rejected by ``is_non_fetchable`` or
    ``is_httpish``, does not end in one of ``ASSET_EXTENSIONS``, or is an
    external URL that is not permitted by the external-asset settings.
    Otherwise its destination directory is created and the
    ``(absolute_url, destination)`` pair is put on ``download_q`` once.

    ``stats`` is accepted for signature compatibility and is not modified.
    """
    # Cheap string checks first, then the helper-based checks.
    if not url_part:
        return
    if url_part.startswith("#") or url_part.startswith(("data:", "javascript:", "about:")):
        return
    if is_non_fetchable(url_part) or not is_httpish(url_part):
        return

    absolute = normalize_url(canonicalize_url(url_part, page_url))
    pieces = urlparse(absolute)

    # Only URLs whose path carries a known asset extension are downloaded.
    if not pieces.path.lower().endswith(ASSET_EXTENSIONS):
        return

    if is_internal(absolute, root_netloc):
        target = to_local_asset_path(pieces, root)
    else:
        if not download_external_assets:
            return
        # A non-empty whitelist restricts which external hosts are accepted.
        if external_domains and not is_allowed_external(absolute, external_domains):
            return
        target = cdn_local_path(pieces, root)

    if absolute in queued_assets:
        return

    queued_assets.add(absolute)
    create_dir(target.parent)
    download_q.put((absolute, target))
|
||||||
|
|
||||||
|
|
||||||
|
def make_root(url: str, custom: Optional[str]) -> Path:
    """
    Choose the output folder for a download.

    A non-empty ``custom`` path wins; otherwise the folder name is the
    URL's network location with every dot replaced by an underscore.
    """
    if custom:
        return Path(custom)

    folder_name = urlparse(url).netloc.replace(".", "_")
    return Path(folder_name)
|
||||||
|
|
||||||
|
|
||||||
|
def _failure_response(message: str) -> dict[str, Any]:
    """Build the failure payload used when no crawl statistics are available."""
    return {
        "success": False,
        "message": message,
        "stats": None,
        "output_directory": None,
    }


def website_downloader(
    url: str,
    destination: Optional[str] = None,
    max_pages: int = 50,
    threads: int = 6,
    download_external_assets: bool = False,
    external_domains: Optional[list[str]] = None,
) -> dict[str, Any]:
    """
    Download and mirror a website for offline use or RAG ingestion.

    This is the main tool function that can be invoked by the GLM-4.7-Flash model.
    It validates the arguments, runs ``crawl_site_tool`` and converts the
    outcome into a response dictionary. It never raises: unexpected errors
    are logged and reported through a failure response.

    Args:
        url: The starting URL to crawl (e.g., 'https://example.com/').
            Must use the http or https scheme and contain a host name.
        destination: Optional output folder path. If not provided, derived from URL domain.
        max_pages: Maximum number of HTML pages to crawl (1-1000, default: 50)
        threads: Number of concurrent download threads (1-20, default: 6)
        download_external_assets: Whether to download assets from external domains (default: False)
        external_domains: Optional list of external domain names (or URLs) to
            allow downloading from. Supplying at least one valid entry
            enables external asset downloading automatically.

    Returns:
        Dictionary containing:
        - success: Boolean indicating if the operation was successful
        - message: Human-readable summary of what was done
        - stats: Detailed statistics about the crawl (None when validation
          failed or an unexpected error occurred)
        - output_directory: Path to the downloaded website (None in the
          same cases as ``stats``)
    """
    try:
        # Validate URL: scheme must be http(s) and a host must be present,
        # otherwise the derived output folder would be empty.
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.hostname:
            return _failure_response(
                f"Invalid URL: '{url}'. Must be a valid HTTP or HTTPS URL."
            )

        # Validate parameters
        if max_pages < 1 or max_pages > 1000:
            return _failure_response(
                f"max_pages must be between 1 and 1000, got {max_pages}"
            )

        if threads < 1 or threads > 20:
            return _failure_response(
                f"threads must be between 1 and 20, got {threads}"
            )

        # Process external domains
        ext_domains_set = None
        if external_domains:
            # Tolerate a single domain passed as a bare string.
            if isinstance(external_domains, str):
                external_domains = [external_domains]

            ext_domains_set = set()
            for entry in external_domains:
                entry = entry.strip()
                # Entries may be full URLs; keep only their host name.
                host = urlparse(entry).hostname if "://" in entry else entry
                if host:
                    ext_domains_set.add(host.lower())

            if not ext_domains_set:
                # An empty whitelist would be treated as "no restriction"
                # further down, which is the opposite of what was asked.
                return _failure_response(
                    "external_domains did not contain any valid domain names"
                )
            download_external_assets = True  # Auto-enable if domains specified

        # Prepare output directory
        root = make_root(url, destination)

        # Log the crawl start
        log.info(
            "Starting website download: url=%s, dest=%s, max_pages=%d, threads=%d, external=%s",
            url, root, max_pages, threads, download_external_assets
        )

        # Run the crawl
        stats = crawl_site_tool(
            start_url=url,
            root=root,
            max_pages=max_pages,
            threads=threads,
            download_external_assets=download_external_assets,
            external_domains=ext_domains_set,
        )

        # ``pages`` only lists pages that were fetched; when it is empty
        # nothing was mirrored and reporting success would mislead the caller.
        if not stats["pages"]:
            message = f"Failed to download any pages from {url}"
            if stats["errors"]:
                message += f"\n- {stats['errors'][0]}"
            return {
                "success": False,
                "message": message,
                "stats": stats,
                "output_directory": stats["output_directory"],
            }

        # Build success response
        message = (
            f"Successfully downloaded website from {url}\n"
            f"- Pages crawled: {stats['pages_crawled']}\n"
            f"- Assets downloaded: {stats['assets_downloaded']}\n"
            f"- Time elapsed: {stats['elapsed_seconds']}s\n"
            f"- Output directory: {stats['output_directory']}"
        )

        if stats["failed_downloads"] > 0:
            message += f"\n- Failed downloads: {stats['failed_downloads']}"

        return {
            "success": True,
            "message": message,
            "stats": stats,
            "output_directory": stats["output_directory"],
        }

    except Exception as e:
        # Tool boundary: report the failure to the model instead of raising.
        log.exception("Website download failed")
        return _failure_response(f"Website download failed: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Tool Registration Helper
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def get_tool_schema() -> dict[str, Any]:
    """
    Get the tool schema for registration with the LLM.

    This schema follows the OpenAI function calling format and can be
    used directly when creating chat completions with tools.

    Returns:
        An independent deep copy of ``TOOL_SCHEMA``, so callers may adjust
        the returned dictionary without altering the module-level schema.
    """
    import copy

    return copy.deepcopy(TOOL_SCHEMA)
|
||||||
|
|
||||||
|
|
||||||
|
def get_tool_function():
    """
    Return the callable that implements the tool.

    Returns:
        The ``website_downloader`` function object itself (not its result).
    """
    tool_callable = website_downloader
    return tool_callable
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Example Usage
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Demo: print the tool schema, then run one small example download and
    # print its result as JSON.
    import json

    separator = "=" * 50

    print("Website Downloader Tool for GLM-4.7-Flash")
    print(separator)
    print("\nTool Schema:")
    print(json.dumps(TOOL_SCHEMA, indent=2))

    print("\n" + separator)
    print("\nExample invocation:")
    demo_arguments = {
        "url": "https://example.com",
        "max_pages": 5,
        "threads": 4,
        "download_external_assets": False,
    }
    result = website_downloader(**demo_arguments)
    print(json.dumps(result, indent=2))
|
||||||
Loading…
Reference in New Issue
Block a user