import os
import logging
import queue
import threading
import time
import re
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

# Reuse some constants and logic from the original downloader
ASSET_EXTENSIONS = (
    ".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico",
    ".woff", ".woff2", ".ttf", ".eot",
)

# Upper bound on the length of any file name this module writes.
MAX_FILENAME_LENGTH = 100

logger = logging.getLogger(__name__)


class Crawler:
    """Download single web pages, plus a limited number of assets, to disk.

    Pages are stored under ``root_dir/<sanitized host>/<page>.html`` and their
    assets under ``root_dir/<sanitized host>/assets/``.
    """

    def __init__(self, root_dir):
        """Create a crawler that writes below *root_dir*.

        Args:
            root_dir: Directory (str or path-like) under which one
                sub-directory per host is created.
        """
        self.root_dir = Path(root_dir)
        # One shared session so connections and headers are reused.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            )
        })

    def _sanitize_filename(self, name: str) -> str:
        """Remove invalid characters for Windows/Linux filenames.

        Every character other than ASCII letters, digits, ``_``, ``-`` and
        ``.`` is replaced by ``_``; the result is cut to 100 characters.
        """
        res = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)
        return res[:MAX_FILENAME_LENGTH]

    def _asset_filename(self, url_path: str) -> str:
        """Return a safe local file name for the asset at *url_path*.

        The extension is kept intact even when the stem has to be truncated,
        so the saved file stays recognisable by type. Returns ``""`` when the
        path has no final component.
        """
        base = os.path.basename(url_path)
        if not base:
            return ""
        stem, ext = os.path.splitext(base)
        safe_ext = self._sanitize_filename(ext)
        # Leave room for the extension inside the overall length limit.
        room = max(MAX_FILENAME_LENGTH - len(safe_ext), 0)
        return self._sanitize_filename(stem)[:room] + safe_ext

    @staticmethod
    def _unique_name(name: str, taken: set) -> str:
        """Return *name*, or a ``stem_N.ext`` variant not present in *taken*.

        *taken* holds lower-cased names, so names that differ only by case
        are treated as colliding (they would on case-insensitive filesystems).
        """
        if name.lower() not in taken:
            return name
        stem, ext = os.path.splitext(name)
        counter = 1
        while f"{stem}_{counter}{ext}".lower() in taken:
            counter += 1
        return f"{stem}_{counter}{ext}"

    def download_page(self, url: str, max_assets: int = 20) -> Optional[Path]:
        """Download a single page and its essential assets.

        ``<img>``, ``<script>`` and ``<link>`` references whose URL path ends
        in one of ``ASSET_EXTENSIONS`` are fetched into an ``assets``
        directory next to the page, and the tag is rewritten to point at the
        local copy. Asset failures are logged at debug level and skipped.

        Args:
            url: Absolute URL of the page to fetch.
            max_assets: Maximum number of distinct assets to download.

        Returns:
            Path of the saved HTML file, or ``None`` if the page itself could
            not be fetched or parsed.
        """
        try:
            resp = self.session.get(url, timeout=10)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception as e:
            logger.error("Failed to fetch %s: %s", url, e)
            return None

        # Determine local path
        parsed = urlparse(url)
        host_dir = self.root_dir / self._sanitize_filename(parsed.netloc)
        host_dir.mkdir(parents=True, exist_ok=True)

        # Sanitize the filename from the path
        raw_name = parsed.path.strip("/").replace("/", "_") or "index"
        page_name = self._sanitize_filename(raw_name)
        if not page_name.lower().endswith(".html"):
            page_name += ".html"
        local_path = host_dir / page_name

        # Simple asset downloading (minimal version of website_downloader.py)
        assets_dir = host_dir / "assets"
        assets_dir.mkdir(exist_ok=True)

        saved = {}     # asset URL -> local file name, so each URL is fetched once
        taken = set()  # lower-cased local names already used for this page

        for tag in soup.find_all(["img", "link", "script"]):
            attr = "src" if tag.name in ("img", "script") else "href"
            link = tag.get(attr)
            if not link or link.startswith("data:"):
                continue

            try:
                asset_url = urljoin(url, link)
                asset_path = urlparse(asset_url).path
            except ValueError as e:
                # A malformed link must not abort the whole page.
                logger.debug("Skipping malformed asset link %r: %s", link, e)
                continue

            if not asset_path.lower().endswith(ASSET_EXTENSIONS):
                continue

            # Already downloaded for this page: just rewrite the reference.
            if asset_url in saved:
                tag[attr] = f"assets/{saved[asset_url]}"
                continue

            # Budget exhausted: keep scanning only to rewrite repeats.
            if len(saved) >= max_assets:
                continue

            asset_name = self._asset_filename(asset_path)
            if not asset_name:
                continue
            asset_name = self._unique_name(asset_name, taken)

            try:
                asset_resp = self.session.get(asset_url, timeout=5)
                asset_resp.raise_for_status()
                (assets_dir / asset_name).write_bytes(asset_resp.content)
            except Exception as e:
                logger.debug("Failed to download asset %s: %s", asset_url, e)
                continue

            # Rewrite link in soup
            tag[attr] = f"assets/{asset_name}"
            saved[asset_url] = asset_name
            taken.add(asset_name.lower())

        with open(local_path, "w", encoding="utf-8") as f:
            f.write(soup.prettify())

        return local_path


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    c = Crawler("rag_data/repo/web")
    path = c.download_page("https://example.com")
    print(f"Downloaded to: {path}")