# projects/pyrag3/crawler.py

import os
import logging
import queue
import threading
import time
import re
from pathlib import Path
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

# Carried over from the original downloader: URL path suffixes that identify
# a linked resource as a page asset worth saving locally.
ASSET_EXTENSIONS = (
    # Stylesheets and scripts
    ".css",
    ".js",
    # Images and icons
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".svg",
    ".ico",
    # Web fonts
    ".woff",
    ".woff2",
    ".ttf",
    ".eot",
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class Crawler:
    """Fetch individual web pages and store them, with a few assets, on disk.

    Each page is written to ``<root_dir>/<host>/<page>.html`` and its assets
    to ``<root_dir>/<host>/assets/``. Host, page and asset names all pass
    through ``_sanitize_filename`` so they are usable as file names.
    """

    def __init__(self, root_dir):
        """Create a crawler that stores downloads beneath *root_dir*.

        Args:
            root_dir: Directory (string or path-like) under which one
                sub-directory per host is created on demand.
        """
        self.root_dir = Path(root_dir)
        # A single session is shared by every request this crawler makes.
        self.session = requests.Session()
        # Browser-like User-Agent; presumably some sites reject the default
        # client string -- confirm before changing.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        })

    def _sanitize_filename(self, name):
        """Remove invalid characters for Windows/Linux filenames.

        Every character other than ASCII letters, digits, ``_``, ``-`` and
        ``.`` is replaced by ``_``; the result is cut to 100 characters.
        """
        res = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)
        # Limit length
        return res[:100]

    def _asset_filename(self, url_path):
        """Return a sanitized file name for the asset at *url_path*.

        The extension is kept intact even when the name has to be truncated
        to the 100-character limit, so the saved file stays recognisable.

        Args:
            url_path: Path component of the asset URL (no query/fragment).

        Returns:
            The sanitized base name, or ``""`` when the path has no base
            name (for example when it ends with ``/``).
        """
        base = os.path.basename(url_path)
        if not base:
            return ""
        stem, ext = os.path.splitext(base)
        safe_ext = self._sanitize_filename(ext)
        safe_stem = self._sanitize_filename(stem)[:max(1, 100 - len(safe_ext))]
        return safe_stem + safe_ext

    def download_page(self, url, max_assets=20):
        """Download a single page and its essential assets.

        Tags whose asset was saved are rewritten to point at the local
        ``assets/<name>`` copy before the page itself is written.

        Args:
            url: URL of the page to fetch.
            max_assets: Maximum number of assets to download successfully;
                remaining tags are left untouched.

        Returns:
            ``Path`` of the saved HTML file, or ``None`` when the page could
            not be fetched or parsed (the failure is logged).
        """
        try:
            resp = self.session.get(url, timeout=10)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception as e:
            # Best effort: any fetch/parse failure means "no page".
            logger.error("Failed to fetch %s: %s", url, e)
            return None

        # Determine local paths: one directory per host, assets beneath it.
        parsed = urlparse(url)
        host_dir = self.root_dir / self._sanitize_filename(parsed.netloc)
        assets_dir = host_dir / "assets"
        assets_dir.mkdir(parents=True, exist_ok=True)

        # Flatten the URL path into a single file name; the site root
        # (empty path) becomes "index".
        raw_name = parsed.path.strip("/").replace("/", "_") or "index"
        page_name = self._sanitize_filename(raw_name)
        if not page_name.lower().endswith(".html"):
            page_name += ".html"
        local_path = host_dir / page_name

        # Simple asset downloading (minimal version of website_downloader.py).
        # NOTE: assets are stored by base name only, so two different URLs
        # with the same file name share one local file (the later one wins).
        asset_suffixes = tuple(ASSET_EXTENSIONS)
        asset_count = 0
        for tag in soup.find_all(["img", "link", "script"]):
            if asset_count >= max_assets:
                break

            attr = "href" if tag.name == "link" else "src"
            link = tag.get(attr)
            if not link or link.startswith("data:"):
                continue

            try:
                asset_url = urljoin(url, link)
                asset_path = urlparse(asset_url).path
            except ValueError as e:
                # A malformed link must not abort the whole page.
                logger.debug("Skipping malformed asset link %r: %s", link, e)
                continue

            if not asset_path.lower().endswith(asset_suffixes):
                continue

            asset_name = self._asset_filename(asset_path)
            if not asset_name:
                continue

            try:
                asset_resp = self.session.get(asset_url, timeout=5)
                asset_resp.raise_for_status()
                (assets_dir / asset_name).write_bytes(asset_resp.content)
            except Exception as e:
                # Assets are optional: keep the original reference and go on.
                logger.debug("Failed to download asset %s: %s", asset_url, e)
                continue

            # Rewrite the reference only once the local copy exists.
            tag[attr] = f"assets/{asset_name}"
            asset_count += 1

        local_path.write_text(soup.prettify(), encoding="utf-8")
        return local_path

if __name__ == "__main__":
    # Manual smoke run: fetch one page into the default data directory and
    # report where it was stored (prints "None" if the fetch failed).
    logging.basicConfig(level=logging.INFO)
    path = Crawler("rag_data/repo/web").download_page("https://example.com")
    print(f"Downloaded to: {path}")