docrag/website_downloader_tool.py
Z User aa69b2f496 Add website downloader tool wrapper for GLM-4.7-Flash
- Create website_downloader_tool.py with OpenAI function calling schema
- Add comprehensive tool documentation
- Update README with usage examples
- Update requirements.txt with optional sdk dependency
2026-03-29 00:16:54 +00:00

573 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Website Downloader Tool for GLM-4.7-Flash
This module provides a tool interface for the website-downloader functionality,
allowing it to be used as a function/tool by the GLM-4.7-Flash model via the
z-ai-web-dev-sdk.
Usage:
The tool can be invoked by the LLM to download and mirror websites for
offline use or for ingesting into a RAG system.
"""
from __future__ import annotations
import logging
import queue
import threading
import time
from pathlib import Path
from typing import Any, Optional
from urllib.parse import urlparse
# Import the core functionality from website_downloader
from website_downloader import (
SESSION,
TIMEOUT,
ASSET_EXTENSIONS,
CSS_URL_RE,
_canonical_netloc,
_protocol_fix,
canonicalize_url,
create_dir,
extract_css_assets,
fetch_binary,
is_httpish,
is_internal,
is_non_fetchable,
is_allowed_external,
normalize_url,
rewrite_links,
safe_write_text,
to_local_path,
to_local_asset_path,
cdn_local_path,
fetch_html,
)
# Configure logging for tool use
log = logging.getLogger(__name__)
# =============================================================================
# Tool Schema Definition
# =============================================================================
# Natural-language summary shown to the model when it decides whether to call
# the tool.
_TOOL_DESCRIPTION = (
    "Download and mirror a website for offline use or RAG ingestion. "
    "This tool crawls a website starting from a given URL, downloads HTML pages "
    "and all linked assets (CSS, JavaScript, images, fonts, etc.), and saves them "
    "locally with rewritten links for offline viewing. "
    "Use this tool when you need to: "
    "1) Archive a website for offline access, "
    "2) Download website content for analysis or RAG systems, "
    "3) Create a local mirror of a website."
)

# JSON-Schema description of every argument accepted by `website_downloader`.
# Key order is kept stable so that serialized output does not change.
_TOOL_PARAMETER_PROPERTIES = {
    "url": {
        "type": "string",
        "description": (
            "The starting URL to crawl (e.g., 'https://example.com/'). "
            "Must be a valid HTTP or HTTPS URL."
        ),
    },
    "destination": {
        "type": "string",
        "description": (
            "Optional output folder path where the downloaded website "
            "will be saved. If not provided, a folder name will be derived "
            "from the URL's domain (e.g., 'example_com')."
        ),
        "default": None,
    },
    "max_pages": {
        "type": "integer",
        "description": (
            "Maximum number of HTML pages to crawl. "
            "Use lower values for quick downloads, higher for comprehensive archiving."
        ),
        "default": 50,
        "minimum": 1,
        "maximum": 1000,
    },
    "threads": {
        "type": "integer",
        "description": (
            "Number of concurrent download threads. "
            "Higher values can speed up downloads but may trigger rate limits."
        ),
        "default": 6,
        "minimum": 1,
        "maximum": 20,
    },
    "download_external_assets": {
        "type": "boolean",
        "description": (
            "Whether to download assets from external domains (CDNs, etc.). "
            "Enable for complete offline functionality, disable for faster downloads "
            "of only same-domain content."
        ),
        "default": False,
    },
    "external_domains": {
        "type": "array",
        "items": {"type": "string"},
        "description": (
            "Optional list of external domain names to allow downloading from. "
            "Useful for whitelisting specific CDN domains. "
            "Example: ['cdn.example.com', 'assets.example.com']"
        ),
        "default": None,
    },
}

# Function-calling schema (OpenAI "tools" format) advertised to the LLM.
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "website_downloader",
        "description": _TOOL_DESCRIPTION,
        "parameters": {
            "type": "object",
            "properties": _TOOL_PARAMETER_PROPERTIES,
            "required": ["url"],
        },
    },
}
# =============================================================================
# Tool Implementation
# =============================================================================
def crawl_site_tool(
    start_url: str,
    root: Path,
    max_pages: int,
    threads: int,
    download_external_assets: bool = False,
    external_domains: Optional[set[str]] = None,
) -> dict[str, Any]:
    """
    Crawl a site starting at ``start_url`` and return statistics about the mirror.

    Pages are taken from a FIFO queue on the calling thread; assets found on
    each page are handed to ``process_asset_url`` which enqueues them for a
    pool of background download threads. Each fetched page is saved under
    ``root`` after ``rewrite_links`` has been applied to it.

    Args:
        start_url: First page to fetch; its canonical netloc defines which
            URLs count as internal.
        root: Output directory for pages and assets.
        max_pages: Upper bound on the number of pages taken from the queue
            (pages that fail to fetch count towards this limit).
        threads: Number of download worker threads (at least one is started).
        download_external_assets: Forwarded to the asset helpers to decide
            whether off-site assets are fetched.
        external_domains: Optional whitelist forwarded to the asset helpers.

    Returns:
        Dictionary with the keys ``pages_crawled`` (pages attempted),
        ``assets_downloaded``, ``failed_downloads``, ``pages``, ``assets``,
        ``errors``, ``elapsed_seconds``, ``output_directory``,
        ``downloaded_items`` (first 100) and ``failed_items`` (first 50).
    """
    start_time = time.time()

    # Statistics tracking
    stats: dict[str, Any] = {
        "pages_crawled": 0,
        "assets_downloaded": 0,
        "failed_downloads": 0,
        "pages": [],
        "assets": [],
        "errors": [],
    }

    q_pages: queue.Queue[str] = queue.Queue()
    q_pages.put(start_url)
    seen_pages: set[str] = set()
    queued_pages: set[str] = {start_url}
    queued_assets: set[str] = set()
    # ``None`` is the shutdown sentinel for worker threads.
    download_q: queue.Queue[Optional[tuple[str, Path]]] = queue.Queue()
    root_netloc = _canonical_netloc(urlparse(start_url))

    # Track successfully downloaded and failed items
    downloaded_items: list[dict[str, str]] = []
    failed_items: list[dict[str, str]] = []

    # Counters are updated from several worker threads; ``+=`` on a dict entry
    # is a read-modify-write, so it must be serialized to avoid lost updates.
    stats_lock = threading.Lock()

    # Tag name -> attributes that may reference a downloadable resource.
    # Built once instead of once per tag.
    tag_handlers = {
        "img": ["src", "data-src", "srcset"],
        "script": ["src"],
        "link": ["href"],
        "video": ["src", "poster"],
        "audio": ["src"],
        "source": ["src", "srcset"],
        "iframe": ["src"],
        "embed": ["src"],
        "object": ["data"],
    }
    # <link> elements are only treated as assets for these rel values.
    resource_rels = {
        "stylesheet", "icon", "shortcut", "apple-touch-icon",
        "preload", "modulepreload", "manifest",
    }

    def relative_to_root(path: Path) -> str:
        """Return ``path`` relative to ``root`` when possible, else unchanged."""
        try:
            return str(path.relative_to(root))
        except ValueError:
            return str(path)

    def enqueue_asset(raw_url: str, page_url: str) -> None:
        """Normalize the protocol of ``raw_url`` and offer it to the download queue."""
        process_asset_url(
            _protocol_fix(raw_url, page_url), page_url, root, root_netloc,
            download_external_assets, external_domains,
            queued_assets, download_q, stats,
        )

    def worker() -> None:
        """Download worker thread; exits when it receives the ``None`` sentinel."""
        while True:
            item = download_q.get()
            try:
                if item is None:
                    return
                url, dest = item
                if is_non_fetchable(url) or not is_httpish(url):
                    log.debug("Skip non-fetchable: %s", url)
                    continue
                if dest.exists():
                    # Already on disk: counted as downloaded, not re-fetched.
                    with stats_lock:
                        stats["assets_downloaded"] += 1
                    continue
                try:
                    # NOTE(review): the queue is passed along, presumably so
                    # fetch_binary can enqueue nested assets - confirm in
                    # website_downloader.
                    fetch_binary(
                        url,
                        dest,
                        download_q,
                        site_root=root,
                        root_netloc=root_netloc,
                        download_external_assets=download_external_assets,
                        external_domains=external_domains,
                    )
                except Exception as e:
                    with stats_lock:
                        stats["failed_downloads"] += 1
                        failed_items.append({"url": url, "error": str(e)})
                    log.debug("Failed to download %s: %s", url, e)
                    continue
                if dest.exists():
                    with stats_lock:
                        stats["assets_downloaded"] += 1
                        downloaded_items.append({
                            "url": url,
                            "local_path": relative_to_root(dest),
                        })
            finally:
                # Runs for every dequeued item, including the sentinel, so that
                # the queue's unfinished-task count stays balanced.
                download_q.task_done()

    # Spawn worker threads
    worker_threads = []
    for i in range(max(1, threads)):
        t = threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True)
        t.start()
        worker_threads.append(t)

    try:
        # Main crawl loop
        while not q_pages.empty() and len(seen_pages) < max_pages:
            page_url = q_pages.get()
            if page_url in seen_pages:
                continue
            seen_pages.add(page_url)
            stats["pages_crawled"] += 1
            log.info("Crawling page %d/%d: %s", len(seen_pages), max_pages, page_url)

            soup = fetch_html(page_url)
            if soup is None:
                stats["errors"].append(f"Failed to fetch page: {page_url}")
                continue

            # Record page info
            local_path = to_local_path(urlparse(page_url), root)
            stats["pages"].append({
                "url": page_url,
                "local_path": relative_to_root(local_path),
            })

            # Find and queue all assets
            for tag in soup.find_all(True):
                attrs_to_check = tag_handlers.get(tag.name, [])

                # Only follow <link> tags that point at a resource
                if tag.name == "link":
                    rel = tag.get("rel", [])
                    if isinstance(rel, str):
                        rel = [rel]
                    if not {r.lower() for r in rel} & resource_rels:
                        attrs_to_check = []

                for attr in attrs_to_check:
                    if not tag.has_attr(attr):
                        continue
                    if attr == "srcset":
                        # srcset is a comma-separated list of "<url> <descriptor>"
                        for entry in str(tag["srcset"]).split(","):
                            parts = entry.strip().split()
                            if parts:
                                enqueue_asset(parts[0], page_url)
                    else:
                        enqueue_asset(str(tag.get(attr, "")), page_url)

                # Handle inline styles
                if tag.has_attr("style"):
                    for match in CSS_URL_RE.findall(str(tag["style"])):
                        enqueue_asset(match.strip().strip("'\""), page_url)

                # Handle <style> blocks
                if tag.name == "style":
                    css_text = tag.string or tag.get_text()
                    if css_text:
                        for asset in extract_css_assets(css_text):
                            enqueue_asset(asset, page_url)

                # Find and queue internal links for further crawling
                if tag.name == "a" and tag.has_attr("href"):
                    href = _protocol_fix(str(tag.get("href", "")), page_url)
                    if (
                        href
                        and not href.startswith("#")
                        and is_httpish(href)
                        and not is_non_fetchable(href)
                    ):
                        abs_url = normalize_url(canonicalize_url(href, page_url))
                        if (
                            is_internal(abs_url, root_netloc)
                            and abs_url not in seen_pages
                            and abs_url not in queued_pages
                        ):
                            queued_pages.add(abs_url)
                            q_pages.put(abs_url)

            # Save the page with rewritten links
            create_dir(local_path.parent)
            rewrite_links(
                soup,
                page_url,
                root,
                local_path.parent,
                download_external_assets,
                external_domains,
            )
            safe_write_text(local_path, str(soup), encoding="utf-8")

        # Wait for all downloads to complete
        download_q.join()
    finally:
        # Release the workers (one sentinel each) so that repeated tool calls
        # do not accumulate threads blocked forever on an abandoned queue.
        for _ in worker_threads:
            download_q.put(None)

    elapsed = time.time() - start_time
    stats["elapsed_seconds"] = round(elapsed, 2)
    stats["output_directory"] = str(root.resolve())
    stats["downloaded_items"] = downloaded_items[:100]  # Limit for response size
    stats["failed_items"] = failed_items[:50]  # Limit for response size
    return stats
def process_asset_url(
    url_part: str,
    page_url: str,
    root: Path,
    root_netloc: str,
    download_external_assets: bool,
    external_domains: Optional[set[str]],
    queued_assets: set[str],
    download_q: queue.Queue[tuple[str, Path]],
    stats: dict,
) -> None:
    """
    Resolve one asset reference and enqueue it for download if it qualifies.

    A reference is dropped when it is empty, a fragment, a data/javascript/about
    URI, flagged by ``is_non_fetchable``, not HTTP-like, or when its path does
    not end in one of ``ASSET_EXTENSIONS``. External assets are additionally
    dropped unless ``download_external_assets`` is set and, when a non-empty
    ``external_domains`` whitelist is given, ``is_allowed_external`` accepts
    them. Each absolute URL is enqueued at most once (tracked in
    ``queued_assets``); its destination directory is created before queuing.

    ``stats`` is accepted for signature compatibility and is not used here.
    """
    # Cheap string checks first, then the project-level predicates.
    if not url_part or url_part.startswith(("#", "data:", "javascript:", "about:")):
        return
    if is_non_fetchable(url_part) or not is_httpish(url_part):
        return

    absolute = normalize_url(canonicalize_url(url_part, page_url))
    pieces = urlparse(absolute)
    if not pieces.path.lower().endswith(ASSET_EXTENSIONS):
        return

    if is_internal(absolute, root_netloc):
        target = to_local_asset_path(pieces, root)
    elif not download_external_assets:
        return
    elif external_domains and not is_allowed_external(absolute, external_domains):
        return
    else:
        target = cdn_local_path(pieces, root)

    if absolute in queued_assets:
        return
    queued_assets.add(absolute)
    create_dir(target.parent)
    download_q.put((absolute, target))
def make_root(url: str, custom: Optional[str]) -> Path:
    """
    Return the output folder for a crawl.

    Args:
        url: Start URL; used to derive a folder name when ``custom`` is empty.
        custom: Explicit output folder. Used verbatim when non-empty.

    Returns:
        ``Path(custom)`` if given, otherwise a single folder name built from
        the URL's host (dots replaced by underscores, e.g. ``example_com``).
        A port is kept as a ``_<port>`` suffix. Falls back to ``site`` when the
        URL has no host, so the mirror never lands directly in the current
        directory.
    """
    if custom:
        return Path(custom)

    netloc = urlparse(url).netloc
    # Drop any "user:password@" prefix: credentials must not end up in a
    # directory name.
    host_port = netloc.rpartition("@")[2]
    # ":" (port separator) is not a valid filename character on Windows;
    # brackets only appear around IPv6 literals.
    folder = host_port.translate(
        str.maketrans({".": "_", ":": "_", "[": "", "]": ""})
    )
    return Path(folder or "site")
def website_downloader(
    url: str,
    destination: Optional[str] = None,
    max_pages: int = 50,
    threads: int = 6,
    download_external_assets: bool = False,
    external_domains: Optional[list[str]] = None,
) -> dict[str, Any]:
    """
    Download and mirror a website for offline use or RAG ingestion.

    This is the main tool function that can be invoked by the GLM-4.7-Flash
    model. It validates the arguments, runs ``crawl_site_tool`` and converts
    the outcome (including any exception) into a structured response.

    Args:
        url: The starting URL to crawl (e.g., 'https://example.com/').
            Must use http or https and contain a host.
        destination: Optional output folder path. If not provided, derived
            from the URL by ``make_root``.
        max_pages: Maximum number of HTML pages to crawl (1-1000, default: 50).
        threads: Number of concurrent download threads (1-20, default: 6).
        download_external_assets: Whether to download assets from external
            domains (default: False). Forced to True when ``external_domains``
            contains at least one usable entry.
        external_domains: Optional list of external domain names (or URLs,
            from which the host is taken) to allow downloading from. Blank or
            host-less entries are ignored.

    Returns:
        Dictionary containing:
            - success: Boolean indicating if the operation was successful
            - message: Human-readable summary of what was done
            - stats: Detailed statistics about the crawl (None if the crawl
              never started)
            - output_directory: Path to the downloaded website (None if the
              crawl never started)
    """

    def failure(message: str) -> dict[str, Any]:
        """Build the response used when the crawl did not run."""
        return {
            "success": False,
            "message": message,
            "stats": None,
            "output_directory": None,
        }

    try:
        # Validate URL: scheme alone is not enough, "http:///path" has no host.
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.hostname:
            return failure(
                f"Invalid URL: '{url}'. Must be a valid HTTP or HTTPS URL."
            )

        # Validate parameters
        if max_pages < 1 or max_pages > 1000:
            return failure(
                f"max_pages must be between 1 and 1000, got {max_pages}"
            )
        if threads < 1 or threads > 20:
            return failure(f"threads must be between 1 and 20, got {threads}")

        # Process external domains
        ext_domains_set = None
        if external_domains:
            ext_domains_set = set()
            for entry in external_domains:
                entry = str(entry).strip()
                # urlparse().hostname is already lower-cased and is None when
                # the entry has no host part.
                host = urlparse(entry).hostname if "://" in entry else entry.lower()
                if host:
                    ext_domains_set.add(host)
            if not ext_domains_set:
                # An empty whitelist would be treated as "no whitelist" further
                # down and silently allow every external host.
                return failure(
                    "external_domains did not contain any valid domain names: "
                    f"{external_domains!r}"
                )
            download_external_assets = True  # Auto-enable if domains specified

        # Prepare output directory
        root = make_root(url, destination)

        # Log the crawl start
        log.info(
            "Starting website download: url=%s, dest=%s, max_pages=%d, threads=%d, external=%s",
            url, root, max_pages, threads, download_external_assets
        )

        # Run the crawl
        stats = crawl_site_tool(
            start_url=url,
            root=root,
            max_pages=max_pages,
            threads=threads,
            download_external_assets=download_external_assets,
            external_domains=ext_domains_set,
        )

        # Nothing was saved (e.g. the start page could not be fetched):
        # report that instead of claiming success.
        if not stats["pages"]:
            details = "; ".join(stats["errors"]) or "no pages could be fetched"
            return {
                "success": False,
                "message": f"Website download failed: {details}",
                "stats": stats,
                "output_directory": stats["output_directory"],
            }

        # Build success response
        message = (
            f"Successfully downloaded website from {url}\n"
            f"- Pages crawled: {stats['pages_crawled']}\n"
            f"- Assets downloaded: {stats['assets_downloaded']}\n"
            f"- Time elapsed: {stats['elapsed_seconds']}s\n"
            f"- Output directory: {stats['output_directory']}"
        )
        if stats["failed_downloads"] > 0:
            message += f"\n- Failed downloads: {stats['failed_downloads']}"
        return {
            "success": True,
            "message": message,
            "stats": stats,
            "output_directory": stats["output_directory"],
        }
    except Exception as e:
        # Tool boundary: the caller is an LLM runtime that expects a response
        # dict rather than a raised exception.
        log.exception("Website download failed")
        return failure(f"Website download failed: {str(e)}")
# =============================================================================
# Tool Registration Helper
# =============================================================================
def get_tool_schema() -> dict[str, Any]:
    """
    Return the function-calling schema that describes this tool.

    The returned object is the module-level ``TOOL_SCHEMA`` itself (not a
    copy), in the OpenAI function calling format, ready to be placed in the
    ``tools`` list of a chat completion request.

    Returns:
        The tool schema dictionary
    """
    schema = TOOL_SCHEMA
    return schema
def get_tool_function():
    """
    Return the callable that implements the tool.

    Returns:
        The ``website_downloader`` function, to be invoked with the arguments
        produced by the model for a ``website_downloader`` tool call.
    """
    tool = website_downloader
    return tool
# =============================================================================
# Example Usage
# =============================================================================
if __name__ == "__main__":
    # Demo: print the schema, then run a small real crawl of example.com
    # (requires network access and writes into the derived output folder).
    import json

    separator = "=" * 50
    example_arguments = {
        "url": "https://example.com",
        "max_pages": 5,
        "threads": 4,
        "download_external_assets": False,
    }

    print("Website Downloader Tool for GLM-4.7-Flash")
    print(separator)
    print("\nTool Schema:")
    print(json.dumps(TOOL_SCHEMA, indent=2))
    print("\n" + separator)
    print("\nExample invocation:")
    result = website_downloader(**example_arguments)
    print(json.dumps(result, indent=2))