From aa69b2f49663a7b24377ea89b04b1f49502c7811 Mon Sep 17 00:00:00 2001 From: Z User Date: Sun, 29 Mar 2026 00:16:54 +0000 Subject: [PATCH] Add website downloader tool wrapper for GLM-4.7-Flash - Create website_downloader_tool.py with OpenAI function calling schema - Add comprehensive tool documentation - Update README with usage examples - Update requirements.txt with optional sdk dependency --- README.md | 164 ++++++++++- requirements.txt | 5 +- website_downloader_tool.py | 572 +++++++++++++++++++++++++++++++++++++ 3 files changed, 739 insertions(+), 2 deletions(-) create mode 100644 website_downloader_tool.py diff --git a/README.md b/README.md index 21a1c9e..549998b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,164 @@ -# docrag +# DocRAG - Custom RAG with Document Loader +A custom RAG (Retrieval-Augmented Generation) system with a custom document loader that acts as a local OpenAI-compatible server using a remote LLM with custom tools. + +## Components + +### Website Downloader Tool + +The `website_downloader_tool.py` provides a tool interface for downloading and mirroring websites for offline use or RAG ingestion. It can be used by GLM-4.7-Flash via the z-ai-web-dev-sdk. + +#### Features + +- Downloads HTML pages and all linked assets (CSS, JS, images, fonts, etc.) 
+- Rewrites links for offline viewing +- Supports concurrent downloads with configurable thread count +- Optional external asset downloading from CDNs +- Domain whitelisting for external assets +- Comprehensive error handling and statistics + +#### Tool Schema + +The tool follows the OpenAI function calling format: + +```python +from website_downloader_tool import get_tool_schema, website_downloader + +# Get the tool schema for registration +schema = get_tool_schema() +``` + +#### Usage with GLM-4.7-Flash + +```python +from zai import ZaiClient +from website_downloader_tool import get_tool_schema, website_downloader + +client = ZaiClient(api_key="your-api-key") + +# Define the tool +tools = [get_tool_schema()] + +# Create a chat completion with tools +response = client.chat.completions.create( + model="glm-4.7", + messages=[ + { + "role": "user", + "content": "Please download https://example.com for offline use" + } + ], + tools=tools, + stream=True, +) + +# Handle tool calls in the response +for chunk in response: + if chunk.choices[0].delta.tool_calls: + tool_call = chunk.choices[0].delta.tool_calls[0] + if tool_call.function.name == "website_downloader": + import json + args = json.loads(tool_call.function.arguments) + result = website_downloader(**args) + print(result) +``` + +#### Direct Usage + +```python +from website_downloader_tool import website_downloader + +# Download a website +result = website_downloader( + url="https://example.com", + destination="./downloaded_site", # Optional + max_pages=50, # Max pages to crawl + threads=6, # Concurrent downloads + download_external_assets=False, # Include CDN assets + external_domains=["cdn.example.com"] # Whitelist external domains +) + +if result["success"]: + print(f"Downloaded to: {result['output_directory']}") + print(f"Pages: {result['stats']['pages_crawled']}") + print(f"Assets: {result['stats']['assets_downloaded']}") +else: + print(f"Error: {result['message']}") +``` + +#### Parameters + +| Parameter | 
Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `url` | string | Yes | - | Starting URL to crawl | +| `destination` | string | No | Derived from URL | Output folder path | +| `max_pages` | integer | No | 50 | Max HTML pages (1-1000) | +| `threads` | integer | No | 6 | Concurrent download threads (1-20) | +| `download_external_assets` | boolean | No | False | Download CDN assets | +| `external_domains` | array | No | None | Whitelist of external domains | + +#### Return Value + +```python +{ + "success": True/False, + "message": "Human-readable summary", + "stats": { + "pages_crawled": int, + "assets_downloaded": int, + "failed_downloads": int, + "elapsed_seconds": float, + "output_directory": str, + "pages": [...], # List of downloaded pages + "downloaded_items": [...] # List of downloaded assets + }, + "output_directory": "/path/to/downloaded/site" +} +``` + +### Website Downloader CLI + +The original `website-downloader.py` can still be used as a standalone CLI tool: + +```bash +python website-downloader.py --url https://example.com --max-pages 50 --threads 6 +``` + +#### CLI Options + +- `--url`: Starting URL to crawl (required) +- `--destination`: Output folder (optional, derived from URL if not provided) +- `--max-pages`: Maximum pages to crawl (default: 50) +- `--threads`: Number of download threads (default: 6) +- `--download-external-assets`: Enable external asset downloading +- `--external-domains`: Whitelist of external domains to download from + +## Installation + +```bash +pip install -r requirements.txt +``` + +## Project Structure + +``` +docrag/ +├── website-downloader.py # Core website downloader (CLI) +├── website_downloader_tool.py # Tool wrapper for GLM-4.7-Flash +├── requirements.txt # Python dependencies +└── README.md # This file +``` + +## Integration with RAG + +The downloaded website content can be processed for RAG systems: + +1. Use the tool to download website content +2. 
Parse the downloaded HTML files +3. Extract text content and metadata +4. Chunk and embed the content +5. Store in your vector database + +## License + +Private repository - All rights reserved. diff --git a/requirements.txt b/requirements.txt index 72da2f0..e833513 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ requests~=2.32.4 beautifulsoup4~=4.13.4 wget~=3.2 -urllib3~=2.5.0 \ No newline at end of file +urllib3~=2.5.0 + +# Optional: For using z-ai-web-dev-sdk with GLM-4.7-Flash +# z-ai-web-dev-sdk>=1.0.0 \ No newline at end of file diff --git a/website_downloader_tool.py b/website_downloader_tool.py new file mode 100644 index 0000000..8d288d2 --- /dev/null +++ b/website_downloader_tool.py @@ -0,0 +1,572 @@ +#!/usr/bin/env python3 +""" +Website Downloader Tool for GLM-4.7-Flash + +This module provides a tool interface for the website-downloader functionality, +allowing it to be used as a function/tool by the GLM-4.7-Flash model via the +z-ai-web-dev-sdk. + +Usage: + The tool can be invoked by the LLM to download and mirror websites for + offline use or for ingesting into a RAG system. 
# Function-calling tool definition for the website downloader, laid out in the
# OpenAI "function" tool format: a name, a natural-language description the
# model uses to decide when to call the tool, and a JSON Schema object that
# describes the accepted arguments.
#
# Only "url" is required. "destination" and "external_domains" are optional
# simply by being absent from "required"; they deliberately declare no
# "default", because the only candidate value (JSON null) would not validate
# against their declared "string" / "array" types.
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "website_downloader",
        "description": (
            "Download and mirror a website for offline use or RAG ingestion. "
            "This tool crawls a website starting from a given URL, downloads HTML pages "
            "and all linked assets (CSS, JavaScript, images, fonts, etc.), and saves them "
            "locally with rewritten links for offline viewing. "
            "Use this tool when you need to: "
            "1) Archive a website for offline access, "
            "2) Download website content for analysis or RAG systems, "
            "3) Create a local mirror of a website."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                # Crawl entry point.
                "url": {
                    "type": "string",
                    "description": (
                        "The starting URL to crawl (e.g., 'https://example.com/'). "
                        "Must be a valid HTTP or HTTPS URL."
                    ),
                },
                # Output folder; omitted means "derive it from the URL".
                "destination": {
                    "type": "string",
                    "description": (
                        "Optional output folder path where the downloaded website "
                        "will be saved. If not provided, a folder name will be derived "
                        "from the URL's domain (e.g., 'example_com')."
                    ),
                },
                # Upper bound on the number of HTML pages visited.
                "max_pages": {
                    "type": "integer",
                    "description": (
                        "Maximum number of HTML pages to crawl. "
                        "Use lower values for quick downloads, higher for comprehensive archiving."
                    ),
                    "default": 50,
                    "minimum": 1,
                    "maximum": 1000,
                },
                # Size of the download worker pool.
                "threads": {
                    "type": "integer",
                    "description": (
                        "Number of concurrent download threads. "
                        "Higher values can speed up downloads but may trigger rate limits."
                    ),
                    "default": 6,
                    "minimum": 1,
                    "maximum": 20,
                },
                # Opt-in switch for fetching assets hosted on other domains.
                "download_external_assets": {
                    "type": "boolean",
                    "description": (
                        "Whether to download assets from external domains (CDNs, etc.). "
                        "Enable for complete offline functionality, disable for faster downloads "
                        "of only same-domain content."
                    ),
                    "default": False,
                },
                # Whitelist of external hosts; omitted means no whitelist given.
                "external_domains": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "Optional list of external domain names to allow downloading from. "
                        "Useful for whitelisting specific CDN domains. "
                        "Example: ['cdn.example.com', 'assets.example.com']"
                    ),
                },
            },
            "required": ["url"],
        },
    },
}
+ + Returns: + Dictionary containing crawl statistics and results + """ + start_time = time.time() + + # Statistics tracking + stats = { + "pages_crawled": 0, + "assets_downloaded": 0, + "failed_downloads": 0, + "pages": [], + "assets": [], + "errors": [], + } + + q_pages: queue.Queue[str] = queue.Queue() + q_pages.put(start_url) + + seen_pages: set[str] = set() + queued_pages: set[str] = {start_url} + queued_assets: set[str] = set() + download_q: queue.Queue[tuple[str, Path]] = queue.Queue() + + root_netloc = _canonical_netloc(urlparse(start_url)) + + # Track successfully downloaded items + downloaded_items: list[dict[str, str]] = [] + failed_items: list[dict[str, str]] = [] + + def worker() -> None: + """Download worker thread.""" + while True: + url, dest = download_q.get() + try: + if is_non_fetchable(url) or not is_httpish(url): + log.debug("Skip non-fetchable: %s", url) + continue + + if dest.exists(): + stats["assets_downloaded"] += 1 + continue + + try: + fetch_binary( + url, + dest, + download_q, + site_root=root, + root_netloc=root_netloc, + download_external_assets=download_external_assets, + external_domains=external_domains, + ) + if dest.exists(): + stats["assets_downloaded"] += 1 + downloaded_items.append({ + "url": url, + "local_path": str(dest.relative_to(root)) + }) + except Exception as e: + stats["failed_downloads"] += 1 + failed_items.append({ + "url": url, + "error": str(e) + }) + log.debug("Failed to download %s: %s", url, e) + finally: + download_q.task_done() + + # Spawn worker threads + worker_threads = [] + for i in range(max(1, threads)): + t = threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True) + t.start() + worker_threads.append(t) + + # Main crawl loop + while not q_pages.empty() and len(seen_pages) < max_pages: + page_url = q_pages.get() + + if page_url in seen_pages: + continue + + seen_pages.add(page_url) + stats["pages_crawled"] += 1 + + log.info("Crawling page %d/%d: %s", len(seen_pages), max_pages, page_url) + + 
soup = fetch_html(page_url) + if soup is None: + stats["errors"].append(f"Failed to fetch page: {page_url}") + continue + + # Record page info + local_page_path = to_local_path(urlparse(page_url), root) + stats["pages"].append({ + "url": page_url, + "local_path": str(local_page_path.relative_to(root)) if local_page_path.is_relative_to(root) else str(local_page_path) + }) + + # Find and queue all assets + for tag in soup.find_all(True): + # Handle various tag types and their URL attributes + tag_handlers = { + "img": ["src", "data-src", "srcset"], + "script": ["src"], + "link": ["href"], + "video": ["src", "poster"], + "audio": ["src"], + "source": ["src", "srcset"], + "iframe": ["src"], + "embed": ["src"], + "object": ["data"], + } + + attrs_to_check = tag_handlers.get(tag.name, []) + + # Also check for link tags with resource rel types + if tag.name == "link": + rel = tag.get("rel", []) + if isinstance(rel, str): + rel = [rel] + rel_set = {r.lower() for r in rel} + resource_rels = {"stylesheet", "icon", "shortcut", "apple-touch-icon", "preload", "modulepreload", "manifest"} + if not rel_set & resource_rels: + attrs_to_check = [] + + for attr in attrs_to_check: + if not tag.has_attr(attr): + continue + + if attr == "srcset": + # Handle srcset specially + for entry in str(tag["srcset"]).split(","): + parts = entry.strip().split() + if not parts: + continue + url_part = _protocol_fix(parts[0], page_url) + process_asset_url( + url_part, page_url, root, root_netloc, + download_external_assets, external_domains, + queued_assets, download_q, stats + ) + else: + url_part = _protocol_fix(str(tag.get(attr, "")), page_url) + process_asset_url( + url_part, page_url, root, root_netloc, + download_external_assets, external_domains, + queued_assets, download_q, stats + ) + + # Handle inline styles + if tag.has_attr("style"): + style = str(tag["style"]) + for match in CSS_URL_RE.findall(style): + url_part = _protocol_fix(match.strip().strip("'\""), page_url) + process_asset_url( 
+ url_part, page_url, root, root_netloc, + download_external_assets, external_domains, + queued_assets, download_q, stats + ) + + # Handle