Add website downloader tool wrapper for GLM-4.7-Flash
- Create website_downloader_tool.py with OpenAI function calling schema - Add comprehensive tool documentation - Update README with usage examples - Update requirements.txt with optional sdk dependency
This commit is contained in:
parent
1623ee8d2c
commit
aa69b2f496
164
README.md
164
README.md
@ -1,2 +1,164 @@
|
||||
# docrag
|
||||
# DocRAG - Custom RAG with Document Loader
|
||||
|
||||
A custom Retrieval-Augmented Generation (RAG) system with its own document loader. It acts as a local OpenAI-compatible server backed by a remote LLM with custom tools.
|
||||
|
||||
## Components
|
||||
|
||||
### Website Downloader Tool
|
||||
|
||||
The `website_downloader_tool.py` provides a tool interface for downloading and mirroring websites for offline use or RAG ingestion. It can be used by GLM-4.7-Flash via the z-ai-web-dev-sdk.
|
||||
|
||||
#### Features
|
||||
|
||||
- Downloads HTML pages and all linked assets (CSS, JS, images, fonts, etc.)
|
||||
- Rewrites links for offline viewing
|
||||
- Supports concurrent downloads with configurable thread count
|
||||
- Optional external asset downloading from CDNs
|
||||
- Domain whitelisting for external assets
|
||||
- Comprehensive error handling and statistics
|
||||
|
||||
#### Tool Schema
|
||||
|
||||
The tool follows the OpenAI function calling format:
|
||||
|
||||
```python
|
||||
from website_downloader_tool import get_tool_schema, website_downloader
|
||||
|
||||
# Get the tool schema for registration
|
||||
schema = get_tool_schema()
|
||||
```
|
||||
|
||||
#### Usage with GLM-4.7-Flash
|
||||
|
||||
```python
|
||||
from zai import ZaiClient
|
||||
from website_downloader_tool import get_tool_schema, website_downloader
|
||||
|
||||
client = ZaiClient(api_key="your-api-key")
|
||||
|
||||
# Define the tool
|
||||
tools = [get_tool_schema()]
|
||||
|
||||
# Create a chat completion with tools
|
||||
response = client.chat.completions.create(
|
||||
model="glm-4.7-flash",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Please download https://example.com for offline use"
|
||||
}
|
||||
],
|
||||
tools=tools,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
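# Handle tool calls in the response.
# NOTE: with stream=True the tool-call arguments may arrive split across several
# chunks; accumulate the fragments and call json.loads() once they are complete.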
|
||||
for chunk in response:
|
||||
if chunk.choices[0].delta.tool_calls:
|
||||
tool_call = chunk.choices[0].delta.tool_calls[0]
|
||||
if tool_call.function.name == "website_downloader":
|
||||
import json
|
||||
args = json.loads(tool_call.function.arguments)
|
||||
result = website_downloader(**args)
|
||||
print(result)
|
||||
```
|
||||
|
||||
#### Direct Usage
|
||||
|
||||
```python
|
||||
from website_downloader_tool import website_downloader
|
||||
|
||||
# Download a website
|
||||
result = website_downloader(
|
||||
url="https://example.com",
|
||||
destination="./downloaded_site", # Optional
|
||||
max_pages=50, # Max pages to crawl
|
||||
threads=6, # Concurrent downloads
|
||||
download_external_assets=False, # Include CDN assets
|
||||
external_domains=["cdn.example.com"] # Whitelist external domains
|
||||
)
|
||||
|
||||
if result["success"]:
|
||||
print(f"Downloaded to: {result['output_directory']}")
|
||||
print(f"Pages: {result['stats']['pages_crawled']}")
|
||||
print(f"Assets: {result['stats']['assets_downloaded']}")
|
||||
else:
|
||||
print(f"Error: {result['message']}")
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
| Parameter | Type | Required | Default | Description |
|
||||
|-----------|------|----------|---------|-------------|
|
||||
| `url` | string | Yes | - | Starting URL to crawl |
|
||||
| `destination` | string | No | Derived from URL | Output folder path |
|
||||
| `max_pages` | integer | No | 50 | Max HTML pages (1-1000) |
|
||||
| `threads` | integer | No | 6 | Concurrent download threads (1-20) |
|
||||
| `download_external_assets` | boolean | No | False | Download CDN assets |
|
||||
| `external_domains` | array | No | None | Whitelist of external domains |
|
||||
|
||||
#### Return Value
|
||||
|
||||
```python
|
||||
{
|
||||
"success": True/False,
|
||||
"message": "Human-readable summary",
|
||||
"stats": {
|
||||
"pages_crawled": int,
|
||||
"assets_downloaded": int,
|
||||
"failed_downloads": int,
|
||||
"elapsed_seconds": float,
|
||||
"output_directory": str,
|
||||
"pages": [...], # List of downloaded pages
|
||||
"downloaded_items": [...] # List of downloaded assets
|
||||
},
|
||||
"output_directory": "/path/to/downloaded/site"
|
||||
}
|
||||
```
|
||||
|
||||
### Website Downloader CLI
|
||||
|
||||
The original `website-downloader.py` can still be used as a standalone CLI tool:
|
||||
|
||||
```bash
|
||||
python website-downloader.py --url https://example.com --max-pages 50 --threads 6
|
||||
```
|
||||
|
||||
#### CLI Options
|
||||
|
||||
- `--url`: Starting URL to crawl (required)
|
||||
- `--destination`: Output folder (optional, derived from URL if not provided)
|
||||
- `--max-pages`: Maximum pages to crawl (default: 50)
|
||||
- `--threads`: Number of download threads (default: 6)
|
||||
- `--download-external-assets`: Enable external asset downloading
|
||||
- `--external-domains`: Whitelist of external domains to download from
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
docrag/
|
||||
├── website-downloader.py # Core website downloader (CLI)
|
||||
├── website_downloader_tool.py # Tool wrapper for GLM-4.7-Flash
|
||||
├── requirements.txt # Python dependencies
|
||||
└── README.md # This file
|
||||
```
|
||||
|
||||
## Integration with RAG
|
||||
|
||||
The downloaded website content can be processed for RAG systems:
|
||||
|
||||
1. Use the tool to download website content
|
||||
2. Parse the downloaded HTML files
|
||||
3. Extract text content and metadata
|
||||
4. Chunk and embed the content
|
||||
5. Store in your vector database
|
||||
|
||||
## License
|
||||
|
||||
Private repository - All rights reserved.
|
||||
|
||||
@ -1,4 +1,7 @@
|
||||
requests~=2.32.4
|
||||
beautifulsoup4~=4.13.4
|
||||
wget~=3.2
|
||||
urllib3~=2.5.0
|
||||
urllib3~=2.5.0
|
||||
|
||||
# Optional: For using z-ai-web-dev-sdk with GLM-4.7-Flash
|
||||
# z-ai-web-dev-sdk>=1.0.0
|
||||
572
website_downloader_tool.py
Normal file
572
website_downloader_tool.py
Normal file
@ -0,0 +1,572 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Website Downloader Tool for GLM-4.7-Flash
|
||||
|
||||
This module provides a tool interface for the website-downloader functionality,
|
||||
allowing it to be used as a function/tool by the GLM-4.7-Flash model via the
|
||||
z-ai-web-dev-sdk.
|
||||
|
||||
Usage:
|
||||
The tool can be invoked by the LLM to download and mirror websites for
|
||||
offline use or for ingesting into a RAG system.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
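# Import the core functionality from the website_downloader module.
# NOTE(review): the README names the CLI script `website-downloader.py`; a
# hyphenated file cannot be imported by this statement, so confirm that an
# importable `website_downloader.py` exists alongside this module.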
|
||||
from website_downloader import (
|
||||
SESSION,
|
||||
TIMEOUT,
|
||||
ASSET_EXTENSIONS,
|
||||
CSS_URL_RE,
|
||||
_canonical_netloc,
|
||||
_protocol_fix,
|
||||
canonicalize_url,
|
||||
create_dir,
|
||||
extract_css_assets,
|
||||
fetch_binary,
|
||||
is_httpish,
|
||||
is_internal,
|
||||
is_non_fetchable,
|
||||
is_allowed_external,
|
||||
normalize_url,
|
||||
rewrite_links,
|
||||
safe_write_text,
|
||||
to_local_path,
|
||||
to_local_asset_path,
|
||||
cdn_local_path,
|
||||
fetch_html,
|
||||
)
|
||||
|
||||
# Configure logging for tool use
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tool Schema Definition
|
||||
# =============================================================================
|
||||
|
||||
# Model-facing summary of what the tool does and when to pick it.
_TOOL_DESCRIPTION = (
    "Download and mirror a website for offline use or RAG ingestion. "
    "This tool crawls a website starting from a given URL, downloads HTML pages "
    "and all linked assets (CSS, JavaScript, images, fonts, etc.), and saves them "
    "locally with rewritten links for offline viewing. "
    "Use this tool when you need to: "
    "1) Archive a website for offline access, "
    "2) Download website content for analysis or RAG systems, "
    "3) Create a local mirror of a website."
)

# JSON-schema description of each argument accepted by website_downloader().
# Key order is kept stable because the schema is serialised with json.dumps.
_TOOL_PROPERTIES = {
    "url": {
        "type": "string",
        "description": (
            "The starting URL to crawl (e.g., 'https://example.com/'). "
            "Must be a valid HTTP or HTTPS URL."
        ),
    },
    "destination": {
        "type": "string",
        "description": (
            "Optional output folder path where the downloaded website "
            "will be saved. If not provided, a folder name will be derived "
            "from the URL's domain (e.g., 'example_com')."
        ),
        "default": None,
    },
    "max_pages": {
        "type": "integer",
        "description": (
            "Maximum number of HTML pages to crawl. "
            "Use lower values for quick downloads, higher for comprehensive archiving."
        ),
        "default": 50,
        "minimum": 1,
        "maximum": 1000,
    },
    "threads": {
        "type": "integer",
        "description": (
            "Number of concurrent download threads. "
            "Higher values can speed up downloads but may trigger rate limits."
        ),
        "default": 6,
        "minimum": 1,
        "maximum": 20,
    },
    "download_external_assets": {
        "type": "boolean",
        "description": (
            "Whether to download assets from external domains (CDNs, etc.). "
            "Enable for complete offline functionality, disable for faster downloads "
            "of only same-domain content."
        ),
        "default": False,
    },
    "external_domains": {
        "type": "array",
        "items": {"type": "string"},
        "description": (
            "Optional list of external domain names to allow downloading from. "
            "Useful for whitelisting specific CDN domains. "
            "Example: ['cdn.example.com', 'assets.example.com']"
        ),
        "default": None,
    },
}

# Tool definition in the OpenAI function-calling format; only `url` is required.
TOOL_SCHEMA = {
    "type": "function",
    "function": {
        "name": "website_downloader",
        "description": _TOOL_DESCRIPTION,
        "parameters": {
            "type": "object",
            "properties": _TOOL_PROPERTIES,
            "required": ["url"],
        },
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tool Implementation
|
||||
# =============================================================================
|
||||
|
||||
def crawl_site_tool(
    start_url: str,
    root: Path,
    max_pages: int,
    threads: int,
    download_external_assets: bool = False,
    external_domains: Optional[set[str]] = None,
) -> dict[str, Any]:
    """
    Crawl a site from ``start_url`` and return structured statistics.

    Pages are taken from a FIFO queue and fetched one at a time on the calling
    thread; assets referenced by each page are handed to a pool of download
    worker threads. Each fetched page is written under ``root`` after its
    links have been passed through ``rewrite_links``.

    Args:
        start_url: URL of the first page to fetch.
        root: Output directory that pages and assets are written under.
        max_pages: Upper bound on the number of page URLs visited (including
            pages whose fetch failed).
        threads: Number of download worker threads (at least one is started).
        download_external_assets: Whether assets on other hosts may be queued.
        external_domains: Optional whitelist forwarded to the asset filter.

    Returns:
        Dictionary with the keys ``pages_crawled``, ``assets_downloaded``,
        ``failed_downloads``, ``pages``, ``assets``, ``errors``,
        ``elapsed_seconds``, ``output_directory``, ``downloaded_items``
        (first 100) and ``failed_items`` (first 50). ``pages_crawled`` counts
        only pages that were actually fetched; failed fetches are listed in
        ``errors``.
    """
    start_time = time.time()

    # Statistics tracking ("assets" is kept for response-shape compatibility).
    stats: dict[str, Any] = {
        "pages_crawled": 0,
        "assets_downloaded": 0,
        "failed_downloads": 0,
        "pages": [],
        "assets": [],
        "errors": [],
    }

    q_pages: queue.Queue[str] = queue.Queue()
    q_pages.put(start_url)

    seen_pages: set[str] = set()
    queued_pages: set[str] = {start_url}
    queued_assets: set[str] = set()
    # ``None`` is the shutdown sentinel for the workers.
    download_q: queue.Queue[Optional[tuple[str, Path]]] = queue.Queue()

    root_netloc = _canonical_netloc(urlparse(start_url))

    downloaded_items: list[dict[str, str]] = []
    failed_items: list[dict[str, str]] = []
    # Counters and result lists are updated from several worker threads.
    stats_lock = threading.Lock()

    # Loop-invariant lookup tables: which attributes carry URLs on which tag,
    # and which <link rel=...> values denote a downloadable resource.
    asset_attrs = {
        "img": ["src", "data-src", "srcset"],
        "script": ["src"],
        "link": ["href"],
        "video": ["src", "poster"],
        "audio": ["src"],
        "source": ["src", "srcset"],
        "iframe": ["src"],
        "embed": ["src"],
        "object": ["data"],
    }
    resource_rels = {
        "stylesheet", "icon", "shortcut", "apple-touch-icon",
        "preload", "modulepreload", "manifest",
    }

    def relative_to_root(path: Path) -> str:
        """Return ``path`` relative to ``root`` when possible, else unchanged."""
        return str(path.relative_to(root)) if path.is_relative_to(root) else str(path)

    def enqueue_asset(raw_url: str, page_url: str) -> None:
        """Normalise the scheme of ``raw_url`` and offer it to the asset filter."""
        process_asset_url(
            _protocol_fix(raw_url, page_url), page_url, root, root_netloc,
            download_external_assets, external_domains,
            queued_assets, download_q, stats,
        )

    def worker() -> None:
        """Download worker thread; exits when it receives the ``None`` sentinel."""
        while True:
            item = download_q.get()
            try:
                if item is None:
                    return
                url, dest = item

                if is_non_fetchable(url) or not is_httpish(url):
                    log.debug("Skip non-fetchable: %s", url)
                    continue

                if dest.exists():
                    # Already on disk (e.g. from a previous run): count, don't refetch.
                    with stats_lock:
                        stats["assets_downloaded"] += 1
                    continue

                try:
                    # The queue is passed along so fetch_binary can enqueue
                    # follow-up downloads (presumably assets found inside CSS).
                    fetch_binary(
                        url,
                        dest,
                        download_q,
                        site_root=root,
                        root_netloc=root_netloc,
                        download_external_assets=download_external_assets,
                        external_domains=external_domains,
                    )
                except Exception as e:
                    with stats_lock:
                        stats["failed_downloads"] += 1
                        failed_items.append({"url": url, "error": str(e)})
                    log.debug("Failed to download %s: %s", url, e)
                    continue

                if dest.exists():
                    with stats_lock:
                        stats["assets_downloaded"] += 1
                        downloaded_items.append({
                            "url": url,
                            "local_path": relative_to_root(dest),
                        })
            finally:
                # Runs for real items and for the sentinel alike, so
                # download_q.join() stays balanced.
                download_q.task_done()

    # Spawn worker threads
    worker_threads = []
    for i in range(max(1, threads)):
        t = threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True)
        t.start()
        worker_threads.append(t)

    try:
        # Main crawl loop
        while not q_pages.empty() and len(seen_pages) < max_pages:
            page_url = q_pages.get()

            if page_url in seen_pages:
                continue

            seen_pages.add(page_url)

            log.info("Crawling page %d/%d: %s", len(seen_pages), max_pages, page_url)

            soup = fetch_html(page_url)
            if soup is None:
                stats["errors"].append(f"Failed to fetch page: {page_url}")
                continue

            # Record page info only once the page has really been fetched.
            local_path = to_local_path(urlparse(page_url), root)
            stats["pages_crawled"] += 1
            stats["pages"].append({
                "url": page_url,
                "local_path": relative_to_root(local_path),
            })

            # Find and queue all assets
            for tag in soup.find_all(True):
                attrs_to_check = asset_attrs.get(tag.name, [])

                # <link> only references a downloadable resource for some rel types.
                if tag.name == "link":
                    rel = tag.get("rel", [])
                    if isinstance(rel, str):
                        rel = [rel]
                    if not {r.lower() for r in rel} & resource_rels:
                        attrs_to_check = []

                for attr in attrs_to_check:
                    if not tag.has_attr(attr):
                        continue

                    if attr == "srcset":
                        # srcset is a comma-separated list of "<url> <descriptor>".
                        for entry in str(tag["srcset"]).split(","):
                            parts = entry.strip().split()
                            if parts:
                                enqueue_asset(parts[0], page_url)
                    else:
                        enqueue_asset(str(tag.get(attr, "")), page_url)

                # Handle inline styles
                if tag.has_attr("style"):
                    for match in CSS_URL_RE.findall(str(tag["style"])):
                        enqueue_asset(match.strip().strip("'\""), page_url)

                # Handle <style> blocks
                if tag.name == "style":
                    css_text = tag.string or tag.get_text()
                    if css_text:
                        for asset in extract_css_assets(css_text):
                            enqueue_asset(asset, page_url)

                # Find and queue internal links for further crawling
                if tag.name == "a" and tag.has_attr("href"):
                    href = _protocol_fix(str(tag.get("href", "")), page_url)
                    if href and not href.startswith("#") and is_httpish(href) and not is_non_fetchable(href):
                        abs_url = normalize_url(canonicalize_url(href, page_url))
                        if (
                            is_internal(abs_url, root_netloc)
                            and abs_url not in seen_pages
                            and abs_url not in queued_pages
                        ):
                            queued_pages.add(abs_url)
                            q_pages.put(abs_url)

            # Save the page with rewritten links
            create_dir(local_path.parent)
            rewrite_links(
                soup,
                page_url,
                root,
                local_path.parent,
                download_external_assets,
                external_domains,
            )
            safe_write_text(local_path, str(soup), encoding="utf-8")

        # Wait for all downloads to complete
        download_q.join()
    finally:
        # Release the workers; without this every call would leave
        # `threads` daemon threads blocked on download_q.get() forever.
        for _ in worker_threads:
            download_q.put(None)

    elapsed = time.time() - start_time
    stats["elapsed_seconds"] = round(elapsed, 2)
    stats["output_directory"] = str(root.resolve())
    stats["downloaded_items"] = downloaded_items[:100]  # Limit for response size
    stats["failed_items"] = failed_items[:50]  # Limit for response size

    return stats
|
||||
|
||||
|
||||
def process_asset_url(
    url_part: str,
    page_url: str,
    root: Path,
    root_netloc: str,
    download_external_assets: bool,
    external_domains: Optional[set[str]],
    queued_assets: set[str],
    download_q: queue.Queue[tuple[str, Path]],
    stats: dict,
) -> None:
    """
    Filter one candidate asset URL and queue it for download if it qualifies.

    The URL is dropped when it is empty, a fragment, a ``data:`` /
    ``javascript:`` / ``about:`` URI, rejected by ``is_non_fetchable`` /
    ``is_httpish``, or when its path does not end in one of
    ``ASSET_EXTENSIONS``. Assets on other hosts are queued only when external
    downloads are enabled and, if a whitelist is given, the whitelist allows
    them. A URL is queued at most once; ``queued_assets`` is the de-dup set.

    Args:
        url_part: Raw URL as found in the page (may be relative).
        page_url: URL of the page the reference was found on.
        root: Output directory of the mirror.
        root_netloc: Canonical host of the crawl, used for the internal check.
        download_external_assets: Whether off-site assets may be queued.
        external_domains: Optional whitelist of external hosts.
        queued_assets: Set of absolute URLs already queued (mutated here).
        download_q: Queue receiving ``(absolute_url, destination_path)``.
        stats: Accepted for signature compatibility; not used here.
    """
    # Cheap textual rejections first.
    if not url_part:
        return
    if url_part.startswith(("#", "data:", "javascript:", "about:")):
        return
    if is_non_fetchable(url_part) or not is_httpish(url_part):
        return

    absolute = normalize_url(canonicalize_url(url_part, page_url))
    parts = urlparse(absolute)

    # Only paths with a known asset extension are downloaded.
    if not parts.path.lower().endswith(ASSET_EXTENSIONS):
        return

    if is_internal(absolute, root_netloc):
        target = to_local_asset_path(parts, root)
    else:
        if not download_external_assets:
            return
        if external_domains and not is_allowed_external(absolute, external_domains):
            return
        target = cdn_local_path(parts, root)

    if absolute in queued_assets:
        return

    queued_assets.add(absolute)
    create_dir(target.parent)
    download_q.put((absolute, target))
|
||||
|
||||
|
||||
def make_root(url: str, custom: Optional[str]) -> Path:
    """
    Return the output folder: ``custom`` if given, else one derived from ``url``.

    The derived name is the URL's network location with any ``user:pass@``
    prefix removed and with ``.`` and ``:`` replaced by ``_`` (so
    ``https://example.com`` becomes ``example_com`` and ``localhost:8000``
    becomes ``localhost_8000``). Credentials are dropped so they never end up
    in a directory name; the colon is replaced because it is not a valid
    character in directory names on every platform.

    Args:
        url: URL the crawl starts from.
        custom: Explicit output folder; used as-is when non-empty.

    Returns:
        Path of the output folder.

    Raises:
        ValueError: If ``custom`` is empty and ``url`` has no host part, in
            which case no folder name can be derived.
    """
    if custom:
        return Path(custom)

    # Keep only what follows the last "@" so userinfo is discarded.
    host = urlparse(url).netloc.rpartition("@")[2]
    if not host:
        # An empty name would resolve to the current working directory.
        raise ValueError(f"Cannot derive an output folder from URL without a host: {url!r}")

    return Path(host.replace(".", "_").replace(":", "_"))
|
||||
|
||||
|
||||
def website_downloader(
    url: str,
    destination: Optional[str] = None,
    max_pages: int = 50,
    threads: int = 6,
    download_external_assets: bool = False,
    external_domains: Optional[list[str]] = None,
) -> dict[str, Any]:
    """
    Download and mirror a website for offline use or RAG ingestion.

    This is the main tool function that can be invoked by the GLM-4.7-Flash model.
    All arguments are validated before any work starts, and every failure is
    reported through the returned dictionary rather than by raising.

    Args:
        url: The starting URL to crawl (e.g., 'https://example.com/'). Must be
            HTTP or HTTPS and contain a host.
        destination: Optional output folder path. If not provided, derived from URL domain.
        max_pages: Maximum number of HTML pages to crawl (1-1000, default: 50)
        threads: Number of concurrent download threads (1-20, default: 6)
        download_external_assets: Whether to download assets from external domains (default: False)
        external_domains: Optional list of external domain names to allow
            downloading from. Entries may be host names or full URLs; a single
            string is treated as a one-element list. Supplying at least one
            domain enables external asset downloading.

    Returns:
        Dictionary containing:
        - success: Boolean indicating if the operation was successful
        - message: Human-readable summary of what was done
        - stats: Detailed statistics about the crawl (None when the arguments
          were rejected or the crawl raised)
        - output_directory: Path to the downloaded website (None in the same cases)
    """

    def failure(message: str) -> dict[str, Any]:
        """Build the standard response for a rejected or failed call."""
        return {
            "success": False,
            "message": message,
            "stats": None,
            "output_directory": None,
        }

    try:
        # Validate URL: scheme must be http(s) and a host must be present,
        # otherwise no output folder can be derived and nothing can be fetched.
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.hostname:
            return failure(f"Invalid URL: '{url}'. Must be a valid HTTP or HTTPS URL.")

        # Validate parameters
        if max_pages < 1 or max_pages > 1000:
            return failure(f"max_pages must be between 1 and 1000, got {max_pages}")

        if threads < 1 or threads > 20:
            return failure(f"threads must be between 1 and 20, got {threads}")

        # Process external domains
        ext_domains_set: Optional[set[str]] = None
        if external_domains:
            # A bare string would otherwise be iterated character by character.
            if isinstance(external_domains, str):
                external_domains = [external_domains]

            ext_domains_set = set()
            for entry in external_domains:
                text = str(entry).strip()
                host = urlparse(text).hostname if "://" in text else text
                if not host:
                    return failure(
                        f"Invalid external domain: {entry!r}. "
                        "Expected a host name such as 'cdn.example.com'."
                    )
                ext_domains_set.add(host.lower())
            download_external_assets = True  # Auto-enable if domains specified

        # Prepare output directory
        root = make_root(url, destination)

        # Log the crawl start
        log.info(
            "Starting website download: url=%s, dest=%s, max_pages=%d, threads=%d, external=%s",
            url, root, max_pages, threads, download_external_assets
        )

        # Run the crawl
        stats = crawl_site_tool(
            start_url=url,
            root=root,
            max_pages=max_pages,
            threads=threads,
            download_external_assets=download_external_assets,
            external_domains=ext_domains_set,
        )

        # Nothing was saved (e.g. the start page could not be fetched):
        # report that as a failure instead of a misleading success.
        if not stats.get("pages"):
            message = f"Failed to download any pages from {url}"
            errors = stats.get("errors") or []
            if errors:
                message += "\n- " + "\n- ".join(str(err) for err in errors[:5])
            return {
                "success": False,
                "message": message,
                "stats": stats,
                "output_directory": stats["output_directory"],
            }

        # Build success response
        message = (
            f"Successfully downloaded website from {url}\n"
            f"- Pages crawled: {stats['pages_crawled']}\n"
            f"- Assets downloaded: {stats['assets_downloaded']}\n"
            f"- Time elapsed: {stats['elapsed_seconds']}s\n"
            f"- Output directory: {stats['output_directory']}"
        )

        if stats["failed_downloads"] > 0:
            message += f"\n- Failed downloads: {stats['failed_downloads']}"

        return {
            "success": True,
            "message": message,
            "stats": stats,
            "output_directory": stats["output_directory"],
        }

    except Exception as e:
        # Tool boundary: the model must always receive a structured result.
        log.exception("Website download failed")
        return failure(f"Website download failed: {str(e)}")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tool Registration Helper
|
||||
# =============================================================================
|
||||
|
||||
def get_tool_schema() -> dict[str, Any]:
    """
    Get the tool schema for registration with the LLM.

    This schema follows the OpenAI function calling format and can be
    used directly when creating chat completions with tools.

    A deep copy is returned so that a caller (or an SDK) that modifies the
    schema it was handed cannot alter the module-level ``TOOL_SCHEMA`` used
    by later registrations.

    Returns:
        An independent copy of the tool schema dictionary
    """
    import copy

    return copy.deepcopy(TOOL_SCHEMA)
|
||||
|
||||
|
||||
def get_tool_function():
    """
    Return the callable that implements the tool.

    Pair this with ``get_tool_schema()``: the schema tells the model how to
    call the tool, and the function returned here executes such a call.

    Returns:
        The ``website_downloader`` function
    """
    return website_downloader
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Example Usage
|
||||
# =============================================================================
|
||||
|
||||
if __name__ == "__main__":
    # Demo: show the schema, then run a small download directly (no LLM involved).
    import json

    divider = "=" * 50
    demo_arguments = {
        "url": "https://example.com",
        "max_pages": 5,
        "threads": 4,
        "download_external_assets": False,
    }

    print("Website Downloader Tool for GLM-4.7-Flash")
    print(divider)
    print("\nTool Schema:")
    print(json.dumps(TOOL_SCHEMA, indent=2))

    print("\n" + divider)
    print("\nExample invocation:")
    outcome = website_downloader(**demo_arguments)
    print(json.dumps(outcome, indent=2))
|
||||
Loading…
Reference in New Issue
Block a user