# docrag/website_downloader_tool.py

#!/usr/bin/env python3
"""
Website Downloader Tool for GLM-4.7-Flash

This module provides a tool interface for the website-downloader functionality,
allowing it to be used as a function/tool by the GLM-4.7-Flash model via the
z-ai-web-dev-sdk.

Usage:
    The tool can be invoked by the LLM to download and mirror websites for
    offline use or for ingesting into a RAG system.
"""

from __future__ import annotations

import logging
import queue
import threading
import time
from pathlib import Path
from typing import Any, Optional
from urllib.parse import urlparse

# Import the core functionality from website_downloader
from website_downloader import (
    SESSION,
    TIMEOUT,
    ASSET_EXTENSIONS,
    CSS_URL_RE,
    _canonical_netloc,
    _protocol_fix,
    canonicalize_url,
    create_dir,
    extract_css_assets,
    fetch_binary,
    is_httpish,
    is_internal,
    is_non_fetchable,
    is_allowed_external,
    normalize_url,
    rewrite_links,
    safe_write_text,
    to_local_path,
    to_local_asset_path,
    cdn_local_path,
    fetch_html,
)

# Module-level logger named after this module. No handlers or levels are
# configured here.
log = logging.getLogger(__name__)


# =============================================================================
# Tool Schema Definition
# =============================================================================

# Function-calling schema advertised to the model for the `website_downloader`
# tool. The parameter names, defaults, and numeric bounds below mirror the
# signature of and the range checks performed by `website_downloader()`;
# keep the two in sync when changing either.
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "website_downloader",
        "description": (
            "Download and mirror a website for offline use or RAG ingestion. "
            "This tool crawls a website starting from a given URL, downloads HTML pages "
            "and all linked assets (CSS, JavaScript, images, fonts, etc.), and saves them "
            "locally with rewritten links for offline viewing. "
            "Use this tool when you need to: "
            "1) Archive a website for offline access, "
            "2) Download website content for analysis or RAG systems, "
            "3) Create a local mirror of a website."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                # Only required argument; must use the http or https scheme.
                "url": {
                    "type": "string",
                    "description": (
                        "The starting URL to crawl (e.g., 'https://example.com/'). "
                        "Must be a valid HTTP or HTTPS URL."
                    ),
                },
                # When omitted, make_root() derives the folder from the URL.
                "destination": {
                    "type": "string",
                    "description": (
                        "Optional output folder path where the downloaded website "
                        "will be saved. If not provided, a folder name will be derived "
                        "from the URL's domain (e.g., 'example_com')."
                    ),
                    "default": None,
                },
                # Bounds match the 1..1000 check in website_downloader().
                "max_pages": {
                    "type": "integer",
                    "description": (
                        "Maximum number of HTML pages to crawl. "
                        "Use lower values for quick downloads, higher for comprehensive archiving."
                    ),
                    "default": 50,
                    "minimum": 1,
                    "maximum": 1000,
                },
                # Bounds match the 1..20 check in website_downloader().
                "threads": {
                    "type": "integer",
                    "description": (
                        "Number of concurrent download threads. "
                        "Higher values can speed up downloads but may trigger rate limits."
                    ),
                    "default": 6,
                    "minimum": 1,
                    "maximum": 20,
                },
                "download_external_assets": {
                    "type": "boolean",
                    "description": (
                        "Whether to download assets from external domains (CDNs, etc.). "
                        "Enable for complete offline functionality, disable for faster downloads "
                        "of only same-domain content."
                    ),
                    "default": False,
                },
                # Supplying a non-empty list makes website_downloader() turn
                # download_external_assets on automatically.
                "external_domains": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "Optional list of external domain names to allow downloading from. "
                        "Useful for whitelisting specific CDN domains. "
                        "Example: ['cdn.example.com', 'assets.example.com']"
                    ),
                    "default": None,
                },
            },
            "required": ["url"],
        },
    }
}


# =============================================================================
# Tool Implementation
# =============================================================================

# URL-bearing attributes to scan on each asset-carrying tag. Built once at
# import time instead of once per tag visited during a crawl.
_ASSET_ATTRS_BY_TAG = {
    "img": ("src", "data-src", "srcset"),
    "script": ("src",),
    "link": ("href",),
    "video": ("src", "poster"),
    "audio": ("src",),
    "source": ("src", "srcset"),
    "iframe": ("src",),
    "embed": ("src",),
    "object": ("data",),
}

# A <link> is only treated as an asset when its rel names one of these.
_RESOURCE_LINK_RELS = frozenset({
    "stylesheet",
    "icon",
    "shortcut",
    "apple-touch-icon",
    "preload",
    "modulepreload",
    "manifest",
})


def crawl_site_tool(
    start_url: str,
    root: Path,
    max_pages: int,
    threads: int,
    download_external_assets: bool = False,
    external_domains: Optional[set[str]] = None,
) -> dict[str, Any]:
    """
    Crawl a site breadth-first and return structured statistics.

    Pages are fetched sequentially on the calling thread; every asset found
    on a page is handed to a pool of worker threads through a shared queue.
    Each fetched page is saved under ``root`` after its links are rewritten.

    Args:
        start_url: First page to fetch; its network location defines which
            links count as internal.
        root: Directory under which pages and assets are written.
        max_pages: Upper bound on the number of pages attempted (pages whose
            fetch fails still count toward this limit).
        threads: Number of download workers; values below 1 are treated as 1.
        download_external_assets: Whether assets on other hosts are queued.
        external_domains: Optional allow-list forwarded to the asset helpers.

    Returns:
        Dictionary with the keys ``pages_crawled``, ``assets_downloaded``,
        ``failed_downloads``, ``pages``, ``assets``, ``errors``,
        ``elapsed_seconds``, ``output_directory``, ``downloaded_items``
        (first 100) and ``failed_items`` (first 50).
    """
    start_time = time.time()

    # Statistics tracking
    stats: dict[str, Any] = {
        "pages_crawled": 0,
        "assets_downloaded": 0,
        "failed_downloads": 0,
        "pages": [],
        "assets": [],
        "errors": [],
    }

    q_pages: queue.Queue[str] = queue.Queue()
    q_pages.put(start_url)

    seen_pages: set[str] = set()
    queued_pages: set[str] = {start_url}
    queued_assets: set[str] = set()
    # Items are (url, destination) pairs; a single None tells a worker to exit.
    download_q: queue.Queue = queue.Queue()

    root_netloc = _canonical_netloc(urlparse(start_url))

    # Track successfully downloaded items
    downloaded_items: list[dict[str, str]] = []
    failed_items: list[dict[str, str]] = []

    # The counters are read-modify-write updated from several workers, so the
    # updates must be serialised to avoid lost increments.
    stats_lock = threading.Lock()

    def display_path(path: Path) -> str:
        """Return *path* relative to ``root`` when it lies inside it."""
        try:
            return str(path.relative_to(root))
        except ValueError:
            return str(path)

    def download_one(url: str, dest: Path) -> None:
        """Fetch a single asset and record the outcome in the statistics."""
        if is_non_fetchable(url) or not is_httpish(url):
            log.debug("Skip non-fetchable: %s", url)
            return

        already_present = dest.exists()
        if not already_present:
            # fetch_binary receives the queue too, presumably so it can
            # enqueue further assets it discovers -- TODO confirm.
            fetch_binary(
                url,
                dest,
                download_q,
                site_root=root,
                root_netloc=root_netloc,
                download_external_assets=download_external_assets,
                external_domains=external_domains,
            )
            if not dest.exists():
                return

        with stats_lock:
            stats["assets_downloaded"] += 1
            # Files that were already on disk are counted but not listed.
            if not already_present:
                downloaded_items.append({
                    "url": url,
                    "local_path": display_path(dest),
                })

    def worker() -> None:
        """Download worker thread; exits when it receives the None sentinel."""
        while True:
            item = download_q.get()
            try:
                if item is None:
                    return
                url, dest = item
                try:
                    download_one(url, dest)
                except Exception as e:
                    # Worker boundary: one bad asset must not kill the
                    # thread, otherwise download_q.join() could hang.
                    with stats_lock:
                        stats["failed_downloads"] += 1
                        failed_items.append({
                            "url": url,
                            "error": str(e),
                        })
                    log.debug("Failed to download %s: %s", url, e)
            finally:
                download_q.task_done()

    def queue_asset(raw_url: str, page_url: str) -> None:
        """Resolve a raw reference found on *page_url* and enqueue it."""
        process_asset_url(
            _protocol_fix(raw_url, page_url), page_url, root, root_netloc,
            download_external_assets, external_domains,
            queued_assets, download_q, stats,
        )

    # Spawn worker threads
    worker_threads = [
        threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True)
        for i in range(max(1, threads))
    ]
    for t in worker_threads:
        t.start()

    try:
        # Main crawl loop
        while not q_pages.empty() and len(seen_pages) < max_pages:
            page_url = q_pages.get()

            if page_url in seen_pages:
                continue

            seen_pages.add(page_url)
            stats["pages_crawled"] += 1

            log.info("Crawling page %d/%d: %s", len(seen_pages), max_pages, page_url)

            soup = fetch_html(page_url)
            if soup is None:
                stats["errors"].append(f"Failed to fetch page: {page_url}")
                continue

            # Record page info
            local_path = to_local_path(urlparse(page_url), root)
            stats["pages"].append({
                "url": page_url,
                "local_path": display_path(local_path),
            })

            # Find and queue all assets
            for tag in soup.find_all(True):
                attrs_to_check = _ASSET_ATTRS_BY_TAG.get(tag.name, ())

                # Only <link> tags whose rel names a resource carry assets.
                if tag.name == "link":
                    rel = tag.get("rel", [])
                    if isinstance(rel, str):
                        rel = [rel]
                    if _RESOURCE_LINK_RELS.isdisjoint(r.lower() for r in rel):
                        attrs_to_check = ()

                for attr in attrs_to_check:
                    if not tag.has_attr(attr):
                        continue

                    if attr == "srcset":
                        # Each comma-separated candidate is "URL [descriptor]".
                        for entry in str(tag["srcset"]).split(","):
                            parts = entry.strip().split()
                            if parts:
                                queue_asset(parts[0], page_url)
                    else:
                        queue_asset(str(tag.get(attr, "")), page_url)

                # Handle inline styles
                if tag.has_attr("style"):
                    for match in CSS_URL_RE.findall(str(tag["style"])):
                        queue_asset(match.strip().strip("'\""), page_url)

                # Handle <style> blocks
                if tag.name == "style":
                    css_text = tag.string or tag.get_text()
                    if css_text:
                        for asset in extract_css_assets(css_text):
                            queue_asset(asset, page_url)

                # Find and queue internal links for further crawling
                if tag.name == "a" and tag.has_attr("href"):
                    href = _protocol_fix(str(tag.get("href", "")), page_url)
                    if href and not href.startswith("#") and is_httpish(href) and not is_non_fetchable(href):
                        abs_url = normalize_url(canonicalize_url(href, page_url))
                        if is_internal(abs_url, root_netloc) and abs_url not in seen_pages and abs_url not in queued_pages:
                            queued_pages.add(abs_url)
                            q_pages.put(abs_url)

            # Save the page with rewritten links
            create_dir(local_path.parent)
            rewrite_links(
                soup,
                page_url,
                root,
                local_path.parent,
                download_external_assets,
                external_domains,
            )
            safe_write_text(local_path, str(soup), encoding="utf-8")

        # Wait for all downloads to complete
        download_q.join()
    finally:
        # Always release the workers, even if the crawl raised; otherwise
        # every call would leave its threads blocked on get() forever.
        for _ in worker_threads:
            download_q.put(None)

    # The queue is drained at this point, so the workers exit promptly.
    for t in worker_threads:
        t.join()

    elapsed = time.time() - start_time
    stats["elapsed_seconds"] = round(elapsed, 2)
    stats["output_directory"] = str(root.resolve())
    stats["downloaded_items"] = downloaded_items[:100]  # Limit for response size
    stats["failed_items"] = failed_items[:50]  # Limit for response size

    return stats


def process_asset_url(
    url_part: str,
    page_url: str,
    root: Path,
    root_netloc: str,
    download_external_assets: bool,
    external_domains: Optional[set[str]],
    queued_assets: set[str],
    download_q: queue.Queue[tuple[str, Path]],
    stats: dict,
) -> None:
    """
    Resolve one candidate asset reference and enqueue it for download.

    The reference is dropped when it is empty, a fragment, a non-HTTP
    pseudo-URL, does not end in a known asset extension, or points at an
    external host that is not permitted. Otherwise its absolute URL is
    remembered in ``queued_assets`` and a ``(url, destination)`` pair is put
    on ``download_q`` unless that URL was queued before.

    ``stats`` is accepted for signature compatibility; it is neither read
    nor modified here.
    """
    # Rejections that need no helper call come first.
    if not url_part or url_part.startswith("#"):
        return
    if url_part.startswith(("data:", "javascript:", "about:")):
        return
    if is_non_fetchable(url_part) or not is_httpish(url_part):
        return

    resolved = normalize_url(canonicalize_url(url_part, page_url))
    pieces = urlparse(resolved)

    # Only paths ending in a known asset extension are downloaded.
    if not pieces.path.lower().endswith(ASSET_EXTENSIONS):
        return

    if is_internal(resolved, root_netloc):
        target = to_local_asset_path(pieces, root)
    elif not download_external_assets:
        return
    elif external_domains and not is_allowed_external(resolved, external_domains):
        return
    else:
        target = cdn_local_path(pieces, root)

    # Each absolute URL is queued at most once.
    if resolved in queued_assets:
        return

    queued_assets.add(resolved)
    create_dir(target.parent)
    download_q.put((resolved, target))


def make_root(url: str, custom: Optional[str]) -> Path:
    """
    Return the output folder for a crawl.

    Args:
        url: The start URL; used only when ``custom`` is empty or ``None``.
        custom: Caller-supplied folder path, returned as-is when given.

    Returns:
        ``Path(custom)`` if supplied, otherwise a folder name derived from
        the URL's host (and port, if any) with ``.`` and ``:`` replaced by
        ``_`` -- e.g. ``example_com`` or ``example_com_8080``.
    """
    if custom:
        return Path(custom)

    netloc = urlparse(url).netloc
    # Drop any "user:password@" prefix so credentials never leak into a
    # directory name.
    host_port = netloc.rpartition("@")[2]
    # A colon (from ":port") is not valid in Windows file names.
    return Path(host_port.replace(".", "_").replace(":", "_"))


def _failure_response(message: str, stats: Optional[dict[str, Any]] = None) -> dict[str, Any]:
    """Build the tool's standard failure payload."""
    return {
        "success": False,
        "message": message,
        "stats": stats,
        "output_directory": None,
    }


def _normalize_external_domains(domains: list[str]) -> set[str]:
    """Reduce domain names or full URLs to a set of lower-case host names.

    Raises:
        ValueError: If an entry is not a string or yields no host name.
    """
    hosts: set[str] = set()
    for entry in domains:
        if not isinstance(entry, str):
            raise ValueError(f"Invalid external domain: {entry!r} (expected a string)")
        text = entry.strip()
        # urlparse().hostname is already lower-cased and is None when the
        # entry has no host part.
        host = urlparse(text).hostname if "://" in text else text.lower()
        if not host:
            raise ValueError(f"Invalid external domain: {entry!r}")
        hosts.add(host)
    return hosts


def website_downloader(
    url: str,
    destination: Optional[str] = None,
    max_pages: int = 50,
    threads: int = 6,
    download_external_assets: bool = False,
    external_domains: Optional[list[str]] = None,
) -> dict[str, Any]:
    """
    Download and mirror a website for offline use or RAG ingestion.

    This is the main tool function that can be invoked by the GLM-4.7-Flash model.
    It validates the arguments, runs the crawl, and converts every outcome
    (including unexpected exceptions) into a response dictionary.

    Args:
        url: The starting URL to crawl (e.g., 'https://example.com/'). Must be
            HTTP or HTTPS and contain a host.
        destination: Optional output folder path. If not provided, derived from URL domain.
        max_pages: Maximum number of HTML pages to crawl (1-1000, default: 50)
        threads: Number of concurrent download threads (1-20, default: 6)
        download_external_assets: Whether to download assets from external domains (default: False)
        external_domains: Optional list of external domain names (or URLs) to
            allow downloading from. A non-empty list automatically enables
            ``download_external_assets``.

    Returns:
        Dictionary containing:
        - success: Boolean indicating if the operation was successful
        - message: Human-readable summary of what was done
        - stats: Detailed statistics about the crawl (None when the arguments
          were rejected or the crawl raised)
        - output_directory: Path to the downloaded website, or None on failure
    """
    try:
        # Validate URL: require an http(s) scheme and an actual host.
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.hostname:
            return _failure_response(
                f"Invalid URL: '{url}'. Must be a valid HTTP or HTTPS URL."
            )

        # Validate parameters
        if not 1 <= max_pages <= 1000:
            return _failure_response(
                f"max_pages must be between 1 and 1000, got {max_pages}"
            )

        if not 1 <= threads <= 20:
            return _failure_response(
                f"threads must be between 1 and 20, got {threads}"
            )

        # Process external domains
        ext_domains_set = None
        if external_domains:
            try:
                ext_domains_set = _normalize_external_domains(external_domains)
            except ValueError as exc:
                # Report the bad entry instead of silently widening the crawl.
                return _failure_response(str(exc))
            download_external_assets = True  # Auto-enable if domains specified

        # Prepare output directory
        root = make_root(url, destination)

        # Log the crawl start
        log.info(
            "Starting website download: url=%s, dest=%s, max_pages=%d, threads=%d, external=%s",
            url, root, max_pages, threads, download_external_assets
        )

        # Run the crawl
        stats = crawl_site_tool(
            start_url=url,
            root=root,
            max_pages=max_pages,
            threads=threads,
            download_external_assets=download_external_assets,
            external_domains=ext_domains_set,
        )

        # A crawl that saved no page at all is not a success, even though
        # nothing raised; keep the stats so the caller can see the errors.
        if not stats["pages"]:
            return _failure_response(
                f"Failed to download any pages from {url}", stats
            )

        # Build success response
        message = (
            f"Successfully downloaded website from {url}\n"
            f"- Pages crawled: {stats['pages_crawled']}\n"
            f"- Assets downloaded: {stats['assets_downloaded']}\n"
            f"- Time elapsed: {stats['elapsed_seconds']}s\n"
            f"- Output directory: {stats['output_directory']}"
        )

        if stats["failed_downloads"] > 0:
            message += f"\n- Failed downloads: {stats['failed_downloads']}"

        return {
            "success": True,
            "message": message,
            "stats": stats,
            "output_directory": stats["output_directory"],
        }

    except Exception as e:
        # Tool boundary: the model must always receive a structured reply.
        log.exception("Website download failed")
        return _failure_response(f"Website download failed: {str(e)}")


# =============================================================================
# Tool Registration Helper
# =============================================================================

def get_tool_schema() -> dict[str, Any]:
    """
    Return the schema used to register this tool with the LLM.

    The dictionary follows the OpenAI function calling format, so it can be
    passed as-is in the ``tools`` list of a chat completion request. The
    module-level ``TOOL_SCHEMA`` object itself is returned, not a copy.

    Returns:
        The tool schema dictionary
    """
    schema = TOOL_SCHEMA
    return schema


def get_tool_function():
    """
    Return the callable that implements the tool.

    Returns:
        The ``website_downloader`` function object
    """
    tool_callable = website_downloader
    return tool_callable


# =============================================================================
# Example Usage
# =============================================================================

if __name__ == "__main__":
    # Demo: print the schema, then run a small crawl and print its result.
    import json

    divider = "=" * 50

    print("Website Downloader Tool for GLM-4.7-Flash")
    print(divider)
    print("\nTool Schema:")
    print(json.dumps(TOOL_SCHEMA, indent=2))

    print("\n" + divider)
    print("\nExample invocation:")
    demo_arguments = {
        "url": "https://example.com",
        "max_pages": 5,
        "threads": 4,
        "download_external_assets": False,
    }
    result = website_downloader(**demo_arguments)
    print(json.dumps(result, indent=2))