projects/pyrag3/crawler.py
2026-04-05 17:30:07 -07:00

112 lines
4.0 KiB
Python

import os
import logging
import queue
import threading
import time
import re
from pathlib import Path
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
# Constants and logic below are adapted from the original downloader.
# URL paths ending in one of these suffixes are treated as page assets.
ASSET_EXTENSIONS = (
    # Stylesheets and scripts
    ".css",
    ".js",
    # Images and icons
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".svg",
    ".ico",
    # Web fonts
    ".woff",
    ".woff2",
    ".ttf",
    ".eot",
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class Crawler:
    """Download web pages and a bounded number of their assets to disk.

    Each page is written to ``root_dir/<sanitized host>/<page>.html`` and its
    assets to an ``assets`` directory next to it. Links to downloaded assets
    are rewritten in the saved HTML so they point at the local copies.
    """

    def __init__(self, root_dir):
        """Create a crawler that stores everything below *root_dir*.

        Args:
            root_dir: Directory (string or path-like) that receives one
                sub-directory per crawled host.
        """
        self.root_dir = Path(root_dir)
        self.session = requests.Session()
        # Send a desktop-browser User-Agent with every request of the session.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        })

    def _sanitize_filename(self, name):
        """Turn *name* into a single safe path component.

        Every character other than ASCII letters, digits, ``_``, ``-`` and
        ``.`` is replaced by ``_`` and the result is capped at 100 characters.
        A result that would be empty or made only of dots (``.``, ``..``) is
        rewritten with underscores, so the returned name is never empty and
        can never refer to the current or parent directory.

        Args:
            name: Arbitrary text, e.g. a host name or a URL path fragment.

        Returns:
            A non-empty string of at most 100 characters.
        """
        cleaned = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)[:100]
        if not cleaned.strip("."):
            # "", "." and ".." would resolve to an existing directory
            # instead of naming a new entry.
            return "_" * max(len(cleaned), 1)
        return cleaned

    def _asset_filename(self, url_path):
        """Derive a safe local file name from the path part of an asset URL.

        The stem and the extension are sanitized separately so the 100
        character cap applied by ``_sanitize_filename`` cannot cut off the
        extension.

        Args:
            url_path: Path component of the asset URL (``/``-separated).

        Returns:
            The sanitized file name, or ``None`` when the path has no final
            segment (for example when it ends in ``/``).
        """
        base = url_path.rsplit("/", 1)[-1]
        if not base:
            return None
        stem, dot, ext = base.rpartition(".")
        if not (dot and stem and ext):
            # No usable extension (or a leading-dot name): sanitize as a whole.
            return self._sanitize_filename(base)
        return f"{self._sanitize_filename(stem)}.{self._sanitize_filename(ext)}"

    def _download_assets(self, soup, page_url, assets_dir, max_assets):
        """Save linked assets into *assets_dir* and relink their tags.

        Only ``img``/``script`` ``src`` and ``link`` ``href`` targets whose
        URL path ends in one of ``ASSET_EXTENSIONS`` are considered. Failed
        downloads are logged at debug level and skipped; they do not count
        towards *max_assets*.

        Args:
            soup: Parsed page; modified in place.
            page_url: URL of the page, used to resolve relative links.
            assets_dir: Existing directory that receives the asset files.
            max_assets: Stop after this many assets have been saved.
        """
        saved = 0
        for tag in soup.find_all(["img", "link", "script"]):
            if saved >= max_assets:
                break
            attr = "src" if tag.name in ("img", "script") else "href"
            link = tag.get(attr)
            if not link or link.startswith("data:"):
                continue

            asset_url = urljoin(page_url, link)
            asset_path = urlparse(asset_url).path
            if not asset_path.lower().endswith(ASSET_EXTENSIONS):
                continue
            # Sanitizing keeps the on-disk name and the rewritten link in
            # agreement even for percent-encoded or otherwise unusual names.
            asset_name = self._asset_filename(asset_path)
            if not asset_name:
                continue

            try:
                asset_resp = self.session.get(asset_url, timeout=5)
                asset_resp.raise_for_status()
                (assets_dir / asset_name).write_bytes(asset_resp.content)
            except Exception as e:
                # Best effort: a missing asset must not abort the page.
                logger.debug("Failed to download asset %s: %s", asset_url, e)
                continue

            # Point the tag at the local copy (relative to the saved page).
            tag[attr] = f"assets/{asset_name}"
            saved += 1

    def download_page(self, url, max_assets=20):
        """Download a single page and its essential assets.

        Args:
            url: URL of the page to fetch.
            max_assets: Maximum number of assets to save alongside the page.

        Returns:
            Path of the saved HTML file, or ``None`` when the page could not
            be fetched or parsed (the failure is logged at error level).
        """
        try:
            resp = self.session.get(url, timeout=10)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception as e:
            logger.error("Failed to fetch %s: %s", url, e)
            return None

        # One directory per host below the crawl root.
        parsed = urlparse(url)
        host_dir = self.root_dir / self._sanitize_filename(parsed.netloc)
        host_dir.mkdir(parents=True, exist_ok=True)

        # Flatten the URL path into a single file name; the site root
        # becomes "index".
        raw_name = parsed.path.strip("/").replace("/", "_") or "index"
        page_name = self._sanitize_filename(raw_name)
        if not page_name.lower().endswith(".html"):
            page_name += ".html"
        local_path = host_dir / page_name

        # Simple asset downloading (minimal version of website_downloader.py)
        assets_dir = host_dir / "assets"
        assets_dir.mkdir(exist_ok=True)
        self._download_assets(soup, url, assets_dir, max_assets)

        # Written after asset handling so the saved HTML has the local links.
        local_path.write_text(soup.prettify(), encoding="utf-8")
        return local_path
if __name__ == "__main__":
    # Manual smoke test: fetch one page into the local data directory and
    # report where it was saved.
    logging.basicConfig(level=logging.INFO)
    crawler = Crawler("rag_data/repo/web")
    saved_to = crawler.download_page("https://example.com")
    print(f"Downloaded to: {saved_to}")