- Pass all registered tools to LLM during chat completion - Handle tool_calls from LLM response - Execute tools and feed results back to LLM - Loop until LLM returns final response - Updated system prompt to encourage tool use - Updated streaming to handle tool calls - Increased MAX_TOOL_ITERATIONS to 5
573 lines
19 KiB
Python
Executable File
573 lines
19 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Website Downloader Tool for GLM-4.7-Flash

This module provides a tool interface for the website-downloader functionality,
allowing it to be used as a function/tool by the GLM-4.7-Flash model via the
z-ai-web-dev-sdk.

Usage:
    The tool can be invoked by the LLM to download and mirror websites for
    offline use or for ingesting into a RAG system.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import queue
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from urllib.parse import urlparse
|
|
|
|
# Import the core functionality from website_downloader
|
|
from website_downloader import (
|
|
SESSION,
|
|
TIMEOUT,
|
|
ASSET_EXTENSIONS,
|
|
CSS_URL_RE,
|
|
_canonical_netloc,
|
|
_protocol_fix,
|
|
canonicalize_url,
|
|
create_dir,
|
|
extract_css_assets,
|
|
fetch_binary,
|
|
is_httpish,
|
|
is_internal,
|
|
is_non_fetchable,
|
|
is_allowed_external,
|
|
normalize_url,
|
|
rewrite_links,
|
|
safe_write_text,
|
|
to_local_path,
|
|
to_local_asset_path,
|
|
cdn_local_path,
|
|
fetch_html,
|
|
)
|
|
|
|
# Configure logging for tool use
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# =============================================================================
|
|
# Tool Schema Definition
|
|
# =============================================================================
|
|
|
|
# Function-calling schema advertised to the LLM. The parameter names, defaults
# and bounds below mirror the signature and validation of website_downloader().
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "website_downloader",
        "description": (
            "Download and mirror a website for offline use or RAG ingestion. "
            "This tool crawls a website starting from a given URL, downloads HTML pages "
            "and all linked assets (CSS, JavaScript, images, fonts, etc.), and saves them "
            "locally with rewritten links for offline viewing. "
            "Use this tool when you need to: "
            "1) Archive a website for offline access, "
            "2) Download website content for analysis or RAG systems, "
            "3) Create a local mirror of a website."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": (
                        "The starting URL to crawl (e.g., 'https://example.com/'). "
                        "Must be a valid HTTP or HTTPS URL."
                    ),
                },
                "destination": {
                    "type": "string",
                    "description": (
                        "Optional output folder path where the downloaded website "
                        "will be saved. If not provided, a folder name will be derived "
                        "from the URL's domain (e.g., 'example_com')."
                    ),
                    "default": None,
                },
                "max_pages": {
                    "type": "integer",
                    "description": (
                        "Maximum number of HTML pages to crawl. "
                        "Use lower values for quick downloads, higher for comprehensive archiving."
                    ),
                    "default": 50,
                    "minimum": 1,
                    "maximum": 1000,
                },
                "threads": {
                    "type": "integer",
                    "description": (
                        "Number of concurrent download threads. "
                        "Higher values can speed up downloads but may trigger rate limits."
                    ),
                    "default": 6,
                    "minimum": 1,
                    "maximum": 20,
                },
                "download_external_assets": {
                    "type": "boolean",
                    "description": (
                        "Whether to download assets from external domains (CDNs, etc.). "
                        "Enable for complete offline functionality, disable for faster downloads "
                        "of only same-domain content."
                    ),
                    "default": False,
                },
                "external_domains": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "Optional list of external domain names to allow downloading from. "
                        "Useful for whitelisting specific CDN domains. "
                        "Example: ['cdn.example.com', 'assets.example.com']"
                    ),
                    "default": None,
                },
            },
            "required": ["url"],
        },
    },
}
|
|
|
|
|
|
# =============================================================================
|
|
# Tool Implementation
|
|
# =============================================================================
|
|
|
|
def crawl_site_tool(
    start_url: str,
    root: Path,
    max_pages: int,
    threads: int,
    download_external_assets: bool = False,
    external_domains: Optional[set[str]] = None,
) -> dict[str, Any]:
    """
    Internal crawl implementation that returns detailed results.

    Pages are fetched sequentially on the calling thread; every asset found on
    a page is handed to a pool of background download workers through a queue.
    Each fetched page is saved under ``root`` after its links are rewritten.

    Args:
        start_url: First page to fetch; also defines what counts as "internal".
        root: Output directory that mirrored pages and assets are written under.
        max_pages: Upper bound on the number of pages taken from the page queue.
        threads: Number of download worker threads (at least one is started).
        download_external_assets: Whether assets on other hosts may be queued.
        external_domains: Optional allow-list forwarded to the asset filter.

    Returns:
        Dictionary containing crawl statistics and results: the counters
        ``pages_crawled`` (pages attempted, including failed fetches),
        ``assets_downloaded`` and ``failed_downloads``; the lists ``pages``,
        ``assets`` and ``errors``; plus ``elapsed_seconds``,
        ``output_directory``, ``downloaded_items`` (first 100) and
        ``failed_items`` (first 50).
    """
    start_time = time.time()

    # Statistics tracking
    stats: dict[str, Any] = {
        "pages_crawled": 0,
        "assets_downloaded": 0,
        "failed_downloads": 0,
        "pages": [],
        "assets": [],
        "errors": [],
    }
    # Worker threads update the counters concurrently; `+=` on a dict entry is
    # a read-modify-write, so it must be serialized.
    stats_lock = threading.Lock()

    q_pages: queue.Queue[str] = queue.Queue()
    q_pages.put(start_url)

    seen_pages: set[str] = set()
    queued_pages: set[str] = {start_url}
    queued_assets: set[str] = set()
    # Items are (url, destination) tuples; ``None`` is the worker stop signal.
    download_q: queue.Queue[Optional[tuple[str, Path]]] = queue.Queue()

    root_netloc = _canonical_netloc(urlparse(start_url))

    # Track successfully downloaded items
    downloaded_items: list[dict[str, str]] = []
    failed_items: list[dict[str, str]] = []

    # Loop invariants: URL-bearing attributes per tag, and the <link rel=...>
    # values that denote a downloadable resource.
    tag_handlers = {
        "img": ["src", "data-src", "srcset"],
        "script": ["src"],
        "link": ["href"],
        "video": ["src", "poster"],
        "audio": ["src"],
        "source": ["src", "srcset"],
        "iframe": ["src"],
        "embed": ["src"],
        "object": ["data"],
    }
    resource_rels = frozenset(
        {"stylesheet", "icon", "shortcut", "apple-touch-icon", "preload", "modulepreload", "manifest"}
    )

    def relative_to_root(path: Path) -> str:
        """Return ``path`` relative to ``root``, or unchanged if it lies outside."""
        try:
            return str(path.relative_to(root))
        except ValueError:
            return str(path)

    def enqueue_asset(raw_url: str, page_url: str) -> None:
        """Resolve a raw attribute/CSS URL and offer it to the download queue."""
        process_asset_url(
            _protocol_fix(raw_url, page_url), page_url, root, root_netloc,
            download_external_assets, external_domains,
            queued_assets, download_q, stats,
        )

    def worker() -> None:
        """Download worker thread; exits when it receives the ``None`` sentinel."""
        while True:
            item = download_q.get()
            try:
                if item is None:
                    return
                url, dest = item

                if is_non_fetchable(url) or not is_httpish(url):
                    log.debug("Skip non-fetchable: %s", url)
                    continue

                # A file already on disk is counted as downloaded without refetching.
                if dest.exists():
                    with stats_lock:
                        stats["assets_downloaded"] += 1
                    continue

                try:
                    fetch_binary(
                        url,
                        dest,
                        download_q,
                        site_root=root,
                        root_netloc=root_netloc,
                        download_external_assets=download_external_assets,
                        external_domains=external_domains,
                    )
                except Exception as e:
                    with stats_lock:
                        stats["failed_downloads"] += 1
                        failed_items.append({"url": url, "error": str(e)})
                    log.debug("Failed to download %s: %s", url, e)
                else:
                    if dest.exists():
                        with stats_lock:
                            stats["assets_downloaded"] += 1
                            downloaded_items.append({
                                "url": url,
                                "local_path": relative_to_root(dest),
                            })
            finally:
                # Runs for every dequeued item (including the sentinel) so that
                # download_q.join() can complete.
                download_q.task_done()

    # Spawn worker threads
    worker_threads = []
    for i in range(max(1, threads)):
        t = threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True)
        t.start()
        worker_threads.append(t)

    try:
        # Main crawl loop
        while not q_pages.empty() and len(seen_pages) < max_pages:
            page_url = q_pages.get()

            if page_url in seen_pages:
                continue

            seen_pages.add(page_url)
            stats["pages_crawled"] += 1

            log.info("Crawling page %d/%d: %s", len(seen_pages), max_pages, page_url)

            soup = fetch_html(page_url)
            if soup is None:
                stats["errors"].append(f"Failed to fetch page: {page_url}")
                continue

            # Record page info
            local_path = to_local_path(urlparse(page_url), root)
            stats["pages"].append({
                "url": page_url,
                "local_path": relative_to_root(local_path),
            })

            # Find and queue all assets
            for tag in soup.find_all(True):
                attrs_to_check = tag_handlers.get(tag.name, [])

                # Only follow <link> tags whose rel marks a fetchable resource
                if tag.name == "link":
                    rel = tag.get("rel", [])
                    if isinstance(rel, str):
                        rel = [rel]
                    if not {r.lower() for r in rel} & resource_rels:
                        attrs_to_check = []

                for attr in attrs_to_check:
                    if not tag.has_attr(attr):
                        continue

                    if attr == "srcset":
                        # srcset is a comma-separated list of "url descriptor" pairs
                        for entry in str(tag["srcset"]).split(","):
                            parts = entry.strip().split()
                            if parts:
                                enqueue_asset(parts[0], page_url)
                    else:
                        enqueue_asset(str(tag.get(attr, "")), page_url)

                # Handle inline styles
                if tag.has_attr("style"):
                    for match in CSS_URL_RE.findall(str(tag["style"])):
                        enqueue_asset(match.strip().strip("'\""), page_url)

                # Handle <style> blocks
                if tag.name == "style":
                    css_text = tag.string or tag.get_text()
                    if css_text:
                        for asset in extract_css_assets(css_text):
                            enqueue_asset(asset, page_url)

                # Find and queue internal links for further crawling
                if tag.name == "a" and tag.has_attr("href"):
                    href = _protocol_fix(str(tag.get("href", "")), page_url)
                    if href and not href.startswith("#") and is_httpish(href) and not is_non_fetchable(href):
                        abs_url = normalize_url(canonicalize_url(href, page_url))
                        if (
                            is_internal(abs_url, root_netloc)
                            and abs_url not in seen_pages
                            and abs_url not in queued_pages
                        ):
                            queued_pages.add(abs_url)
                            q_pages.put(abs_url)

            # Save the page with rewritten links
            create_dir(local_path.parent)
            rewrite_links(
                soup,
                page_url,
                root,
                local_path.parent,
                download_external_assets,
                external_domains,
            )
            safe_write_text(local_path, str(soup), encoding="utf-8")

        # Wait for all downloads to complete
        download_q.join()
    finally:
        # Release the workers even if the crawl raised; otherwise every call
        # would leave its threads blocked on download_q.get() forever.
        for _ in worker_threads:
            download_q.put(None)

    elapsed = time.time() - start_time
    stats["elapsed_seconds"] = round(elapsed, 2)
    stats["output_directory"] = str(root.resolve())
    stats["downloaded_items"] = downloaded_items[:100]  # Limit for response size
    stats["failed_items"] = failed_items[:50]  # Limit for response size

    return stats
|
|
|
|
|
|
def process_asset_url(
    url_part: str,
    page_url: str,
    root: Path,
    root_netloc: str,
    download_external_assets: bool,
    external_domains: Optional[set[str]],
    queued_assets: set[str],
    download_q: queue.Queue[tuple[str, Path]],
    stats: dict,
) -> None:
    """Process and queue an asset URL for download.

    The URL is ignored when it is empty, a fragment, a ``data:`` /
    ``javascript:`` / ``about:`` URI, rejected by ``is_non_fetchable`` /
    ``is_httpish``, does not end in one of ``ASSET_EXTENSIONS``, or is external
    while external downloads are disabled or not allow-listed. Otherwise it is
    put on ``download_q`` once, together with its destination path.

    Args:
        url_part: Raw URL as found in the page (may be relative).
        page_url: URL of the page the asset was found on; base for resolution.
        root: Output directory that destination paths are derived from.
        root_netloc: Host of the crawled site, used to tell internal from external.
        download_external_assets: Whether external assets may be queued at all.
        external_domains: Optional allow-list; when non-empty, external URLs
            must pass ``is_allowed_external``.
        queued_assets: Set of already queued absolute URLs (updated in place).
        download_q: Queue receiving ``(absolute_url, destination_path)`` tuples.
        stats: Accepted for interface compatibility; not used here.
    """
    # Cheap string checks first so obviously unusable values exit immediately.
    if not url_part or url_part.startswith(("#", "data:", "javascript:", "about:")):
        return
    if is_non_fetchable(url_part) or not is_httpish(url_part):
        return

    abs_url = normalize_url(canonicalize_url(url_part, page_url))
    parsed = urlparse(abs_url)

    if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
        return

    is_ext = not is_internal(abs_url, root_netloc)
    if is_ext:
        if not download_external_assets:
            return
        if external_domains and not is_allowed_external(abs_url, external_domains):
            return

    # Deduplicate before deriving the destination, so repeated references to
    # the same asset cost only a set lookup.
    if abs_url in queued_assets:
        return
    queued_assets.add(abs_url)

    dest_path = cdn_local_path(parsed, root) if is_ext else to_local_asset_path(parsed, root)
    create_dir(dest_path.parent)
    download_q.put((abs_url, dest_path))
|
|
|
|
|
|
def make_root(url: str, custom: Optional[str]) -> Path:
    """Derive output folder from URL if custom not supplied.

    Args:
        url: URL whose network location names the default folder.
        custom: Explicit folder path; used as-is when non-empty.

    Returns:
        ``Path(custom)`` when given, otherwise the URL's host (and port, if
        any) with ``.`` and ``:`` replaced by ``_`` -- e.g. ``example_com`` or
        ``example_com_8080``.
    """
    if custom:
        return Path(custom)
    # Drop any "user:password@" prefix so credentials never reach the file
    # system, and replace ':' (port separator) because it is not a valid
    # character in Windows file names.
    host = urlparse(url).netloc.rpartition("@")[2]
    return Path(host.replace(".", "_").replace(":", "_"))
|
|
|
|
|
|
def _failure_response(message: str) -> dict[str, Any]:
    """Build the standard failure payload returned by website_downloader()."""
    return {
        "success": False,
        "message": message,
        "stats": None,
        "output_directory": None,
    }


def _normalize_external_domains(domains: list[str]) -> set[str]:
    """Reduce allow-list entries (bare hosts or full URLs) to lowercase hosts.

    Blank entries are skipped. Raises ValueError for an entry that looks like
    a URL but has no host part.
    """
    hosts: set[str] = set()
    for entry in domains:
        candidate = str(entry).strip()
        if not candidate:
            continue
        if "://" in candidate:
            host = urlparse(candidate).hostname
            if not host:
                raise ValueError(
                    f"external_domains entry '{entry}' does not contain a host name"
                )
            hosts.add(host.lower())
        else:
            hosts.add(candidate.lower())
    return hosts


def website_downloader(
    url: str,
    destination: Optional[str] = None,
    max_pages: int = 50,
    threads: int = 6,
    download_external_assets: bool = False,
    external_domains: Optional[list[str]] = None,
) -> dict[str, Any]:
    """
    Download and mirror a website for offline use or RAG ingestion.

    This is the main tool function that can be invoked by the GLM-4.7-Flash model.
    It wraps the website-downloader functionality in a tool interface. It never
    raises: every problem is reported through the returned dictionary.

    Args:
        url: The starting URL to crawl (e.g., 'https://example.com/')
        destination: Optional output folder path. If not provided, derived from URL domain.
        max_pages: Maximum number of HTML pages to crawl (1-1000, default: 50)
        threads: Number of concurrent download threads (1-20, default: 6)
        download_external_assets: Whether to download assets from external domains (default: False)
        external_domains: Optional list of external domain names to allow downloading from.
            Entries may be bare hosts or full URLs; supplying at least one
            usable entry enables external downloads automatically.

    Returns:
        Dictionary containing:
        - success: Boolean indicating if the operation was successful
        - message: Human-readable summary of what was done
        - stats: Detailed statistics about the crawl
        - output_directory: Path to the downloaded website
    """
    try:
        # Validate URL: http(s) scheme and a host. Without a host the output
        # folder name derived from the URL would be empty.
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.hostname:
            return _failure_response(
                f"Invalid URL: '{url}'. Must be a valid HTTP or HTTPS URL."
            )

        # Validate parameters
        if max_pages < 1 or max_pages > 1000:
            return _failure_response(
                f"max_pages must be between 1 and 1000, got {max_pages}"
            )

        if threads < 1 or threads > 20:
            return _failure_response(
                f"threads must be between 1 and 20, got {threads}"
            )

        # Process external domains
        ext_domains_set = None
        if external_domains:
            try:
                normalized = _normalize_external_domains(external_domains)
            except ValueError as exc:
                return _failure_response(str(exc))
            if normalized:
                ext_domains_set = normalized
                download_external_assets = True  # Auto-enable if domains specified

        # Prepare output directory
        root = make_root(url, destination)

        # Log the crawl start
        log.info(
            "Starting website download: url=%s, dest=%s, max_pages=%d, threads=%d, external=%s",
            url, root, max_pages, threads, download_external_assets
        )

        # Run the crawl
        stats = crawl_site_tool(
            start_url=url,
            root=root,
            max_pages=max_pages,
            threads=threads,
            download_external_assets=download_external_assets,
            external_domains=ext_domains_set,
        )

        # Build success response
        message = (
            f"Successfully downloaded website from {url}\n"
            f"- Pages crawled: {stats['pages_crawled']}\n"
            f"- Assets downloaded: {stats['assets_downloaded']}\n"
            f"- Time elapsed: {stats['elapsed_seconds']}s\n"
            f"- Output directory: {stats['output_directory']}"
        )

        if stats["failed_downloads"] > 0:
            message += f"\n- Failed downloads: {stats['failed_downloads']}"

        return {
            "success": True,
            "message": message,
            "stats": stats,
            "output_directory": stats["output_directory"],
        }

    except Exception as e:
        # Tool boundary: report the failure to the model instead of raising.
        log.exception("Website download failed")
        return _failure_response(f"Website download failed: {str(e)}")
|
|
|
|
|
|
# =============================================================================
|
|
# Tool Registration Helper
|
|
# =============================================================================
|
|
|
|
def get_tool_schema() -> dict[str, Any]:
    """
    Get the tool schema for registration with the LLM.

    This schema follows the OpenAI function calling format and can be
    used directly when creating chat completions with tools.

    Returns:
        The tool schema dictionary. This is the module-level ``TOOL_SCHEMA``
        object itself, not a copy, so mutating it affects every later caller.
    """
    return TOOL_SCHEMA
|
|
|
|
|
|
def get_tool_function():
    """
    Get the tool function for invocation.

    Returns:
        The callable tool function (``website_downloader``), which accepts the
        arguments described by the tool schema as keyword arguments.
    """
    return website_downloader
|
|
|
|
|
|
# =============================================================================
|
|
# Example Usage
|
|
# =============================================================================
|
|
|
|
if __name__ == "__main__":
    # Example: Direct invocation. Prints the schema, then performs a small
    # real crawl of example.com into a folder in the current directory.
    import json

    print("Website Downloader Tool for GLM-4.7-Flash")
    print("=" * 50)
    print("\nTool Schema:")
    print(json.dumps(TOOL_SCHEMA, indent=2))

    print("\n" + "=" * 50)
    print("\nExample invocation:")
    result = website_downloader(
        url="https://example.com",
        max_pages=5,
        threads=4,
        download_external_assets=False,
    )
    print(json.dumps(result, indent=2))
|