#!/usr/bin/env python3
"""
Website Downloader Tool for GLM-4.7-Flash

This module provides a tool interface for the website-downloader functionality,
allowing it to be used as a function/tool by the GLM-4.7-Flash model via the
z-ai-web-dev-sdk.

Usage:
    The tool can be invoked by the LLM to download and mirror websites
    for offline use or for ingesting into a RAG system.
"""
from __future__ import annotations

import logging
import queue
import threading
import time
from pathlib import Path
from typing import Any, Optional
from urllib.parse import urlparse

# Import the core functionality from website_downloader
from website_downloader import (
    SESSION,
    TIMEOUT,
    ASSET_EXTENSIONS,
    CSS_URL_RE,
    _canonical_netloc,
    _protocol_fix,
    canonicalize_url,
    create_dir,
    extract_css_assets,
    fetch_binary,
    is_httpish,
    is_internal,
    is_non_fetchable,
    is_allowed_external,
    normalize_url,
    rewrite_links,
    safe_write_text,
    to_local_path,
    to_local_asset_path,
    cdn_local_path,
    fetch_html,
)

# Configure logging for tool use
log = logging.getLogger(__name__)

# =============================================================================
# Tool Schema Definition
# =============================================================================

# Descriptor for the "website_downloader" tool. The outer dict carries
# "type": "function" and a "function" entry holding the tool name, a
# natural-language description, and a "parameters" object that lists every
# argument with its type, description and (where given) default and bounds.
# Only "url" appears under "required"; all other arguments are optional.
# NOTE(review): the layout looks like the common function-calling format with
# JSON-Schema-style parameters -- confirm it matches what z-ai-web-dev-sdk
# expects.
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "website_downloader",
        # Adjacent string literals are concatenated into a single description
        # string; the trailing space inside each piece separates the sentences.
        "description": (
            "Download and mirror a website for offline use or RAG ingestion. "
            "This tool crawls a website starting from a given URL, downloads HTML pages "
            "and all linked assets (CSS, JavaScript, images, fonts, etc.), and saves them "
            "locally with rewritten links for offline viewing. "
            "Use this tool when you need to: "
            "1) Archive a website for offline access, "
            "2) Download website content for analysis or RAG systems, "
            "3) Create a local mirror of a website."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                # The only required argument: where the crawl starts.
                "url": {
                    "type": "string",
                    "description": (
                        "The starting URL to crawl (e.g., 'https://example.com/'). "
                        "Must be a valid HTTP or HTTPS URL."
                    ),
                },
                # Output folder; the description says a name is derived from
                # the URL's domain when this is omitted.
                "destination": {
                    "type": "string",
                    "description": (
                        "Optional output folder path where the downloaded website "
                        "will be saved. If not provided, a folder name will be derived "
                        "from the URL's domain (e.g., 'example_com')."
                    ),
                    # NOTE(review): the default is None although "type" is
                    # "string"; a strict schema validator may object to a null
                    # default on a non-nullable type -- confirm.
                    "default": None,
                },
                # NOTE(review): crawl_site_tool below declares max_pages and
                # threads without defaults, so the 50 / 6 advertised here are
                # presumably applied by whatever code dispatches the tool
                # call -- confirm.
                "max_pages": {
                    "type": "integer",
                    "description": (
                        "Maximum number of HTML pages to crawl. "
                        "Use lower values for quick downloads, higher for comprehensive archiving."
                    ),
                    "default": 50,
                    "minimum": 1,
                    "maximum": 1000,
                },
                "threads": {
                    "type": "integer",
                    "description": (
                        "Number of concurrent download threads. "
                        "Higher values can speed up downloads but may trigger rate limits."
                    ),
                    "default": 6,
                    "minimum": 1,
                    "maximum": 20,
                },
                # Off by default, matching the False default of the
                # download_external_assets parameter of crawl_site_tool below.
                "download_external_assets": {
                    "type": "boolean",
                    "description": (
                        "Whether to download assets from external domains (CDNs, etc.). "
                        "Enable for complete offline functionality, disable for faster downloads "
                        "of only same-domain content."
                    ),
                    "default": False,
                },
                # Declared as an array of strings here, while crawl_site_tool
                # below annotates the matching parameter as Optional[set[str]].
                # NOTE(review): presumably the dispatching code converts the
                # list to a set -- confirm. The None default has the same
                # null-vs-declared-type caveat as "destination".
                "external_domains": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "Optional list of external domain names to allow downloading from. "
                        "Useful for whitelisting specific CDN domains. "
                        "Example: ['cdn.example.com', 'assets.example.com']"
                    ),
                    "default": None,
                },
            },
            "required": ["url"],
        },
    }
}

# =============================================================================
# Tool Implementation
# =============================================================================

def crawl_site_tool(
    start_url: str,
    root: Path,
    max_pages: int,
    threads: int,
    download_external_assets: bool = False,
    external_domains: Optional[set[str]] = None,
) -> dict[str, Any]:
    """
    Internal crawl implementation that returns detailed results.

    This is a modified version of crawl_site that collects statistics
    and returns them in a structured format for the tool response.
Returns: Dictionary containing crawl statistics and results """ start_time = time.time() # Statistics tracking stats = { "pages_crawled": 0, "assets_downloaded": 0, "failed_downloads": 0, "pages": [], "assets": [], "errors": [], } q_pages: queue.Queue[str] = queue.Queue() q_pages.put(start_url) seen_pages: set[str] = set() queued_pages: set[str] = {start_url} queued_assets: set[str] = set() download_q: queue.Queue[tuple[str, Path]] = queue.Queue() root_netloc = _canonical_netloc(urlparse(start_url)) # Track successfully downloaded items downloaded_items: list[dict[str, str]] = [] failed_items: list[dict[str, str]] = [] def worker() -> None: """Download worker thread.""" while True: url, dest = download_q.get() try: if is_non_fetchable(url) or not is_httpish(url): log.debug("Skip non-fetchable: %s", url) continue if dest.exists(): stats["assets_downloaded"] += 1 continue try: fetch_binary( url, dest, download_q, site_root=root, root_netloc=root_netloc, download_external_assets=download_external_assets, external_domains=external_domains, ) if dest.exists(): stats["assets_downloaded"] += 1 downloaded_items.append({ "url": url, "local_path": str(dest.relative_to(root)) }) except Exception as e: stats["failed_downloads"] += 1 failed_items.append({ "url": url, "error": str(e) }) log.debug("Failed to download %s: %s", url, e) finally: download_q.task_done() # Spawn worker threads worker_threads = [] for i in range(max(1, threads)): t = threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True) t.start() worker_threads.append(t) # Main crawl loop while not q_pages.empty() and len(seen_pages) < max_pages: page_url = q_pages.get() if page_url in seen_pages: continue seen_pages.add(page_url) stats["pages_crawled"] += 1 log.info("Crawling page %d/%d: %s", len(seen_pages), max_pages, page_url) soup = fetch_html(page_url) if soup is None: stats["errors"].append(f"Failed to fetch page: {page_url}") continue # Record page info local_page_path = 
to_local_path(urlparse(page_url), root) stats["pages"].append({ "url": page_url, "local_path": str(local_page_path.relative_to(root)) if local_page_path.is_relative_to(root) else str(local_page_path) }) # Find and queue all assets for tag in soup.find_all(True): # Handle various tag types and their URL attributes tag_handlers = { "img": ["src", "data-src", "srcset"], "script": ["src"], "link": ["href"], "video": ["src", "poster"], "audio": ["src"], "source": ["src", "srcset"], "iframe": ["src"], "embed": ["src"], "object": ["data"], } attrs_to_check = tag_handlers.get(tag.name, []) # Also check for link tags with resource rel types if tag.name == "link": rel = tag.get("rel", []) if isinstance(rel, str): rel = [rel] rel_set = {r.lower() for r in rel} resource_rels = {"stylesheet", "icon", "shortcut", "apple-touch-icon", "preload", "modulepreload", "manifest"} if not rel_set & resource_rels: attrs_to_check = [] for attr in attrs_to_check: if not tag.has_attr(attr): continue if attr == "srcset": # Handle srcset specially for entry in str(tag["srcset"]).split(","): parts = entry.strip().split() if not parts: continue url_part = _protocol_fix(parts[0], page_url) process_asset_url( url_part, page_url, root, root_netloc, download_external_assets, external_domains, queued_assets, download_q, stats ) else: url_part = _protocol_fix(str(tag.get(attr, "")), page_url) process_asset_url( url_part, page_url, root, root_netloc, download_external_assets, external_domains, queued_assets, download_q, stats ) # Handle inline styles if tag.has_attr("style"): style = str(tag["style"]) for match in CSS_URL_RE.findall(style): url_part = _protocol_fix(match.strip().strip("'\""), page_url) process_asset_url( url_part, page_url, root, root_netloc, download_external_assets, external_domains, queued_assets, download_q, stats ) # Handle