112 lines
4.0 KiB
Python
112 lines
4.0 KiB
Python
import os
|
|
import logging
|
|
import queue
|
|
import threading
|
|
import time
|
|
import re
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse, urljoin
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Constants carried over from the original downloader.
# A URL counts as a downloadable asset when its path ends (case-insensitively)
# with one of these suffixes.
ASSET_EXTENSIONS = (
    # Stylesheets and scripts
    ".css",
    ".js",
    # Images and icons
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".svg",
    ".ico",
    # Fonts
    ".woff",
    ".woff2",
    ".ttf",
    ".eot",
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
class Crawler:
    """Download single web pages together with a capped number of assets.

    A page is written to ``root_dir/<sanitized host>/<sanitized path>.html``
    and its assets to ``root_dir/<sanitized host>/assets/``.  Only ``<img>``,
    ``<link>`` and ``<script>`` references whose URL path ends with one of
    ``ASSET_EXTENSIONS`` are treated as assets.
    """

    def __init__(self, root_dir):
        """Create a crawler that stores its downloads beneath *root_dir*.

        Args:
            root_dir: Directory (string or path-like) that receives one
                sub-directory per host.
        """
        self.root_dir = Path(root_dir)
        # One shared session so every request carries the same headers.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
        })

    def _sanitize_filename(self, name):
        """Remove invalid characters for Windows/Linux filenames.

        Every character outside ``a-z A-Z 0-9 _ - .`` is replaced by ``_`` and
        the result is limited to 100 characters.  A name consisting only of
        dots (``.`` or ``..``) is turned into underscores so that a sanitized
        path component can never point at the current or parent directory.

        Args:
            name: Raw text taken from a URL (host name or path fragment).

        Returns:
            The sanitized name; an empty input stays empty.
        """
        cleaned = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', name)
        if cleaned and not cleaned.strip("."):
            cleaned = "_" * len(cleaned)
        return cleaned[:100]

    def _asset_filename(self, url_path):
        """Build a safe local filename from the path part of an asset URL.

        The stem and the extension are sanitized separately so the extension
        survives when an over-long stem has to be shortened.

        Args:
            url_path: Path component of the asset URL.

        Returns:
            A filename of at most 100 characters, or ``""`` when the path has
            no final component (for example when it ends with ``/``).
        """
        raw_name = os.path.basename(url_path)
        if not raw_name:
            return ""
        stem, ext = os.path.splitext(raw_name)
        safe_ext = self._sanitize_filename(ext)
        safe_stem = self._sanitize_filename(stem)[:100 - len(safe_ext)]
        return safe_stem + safe_ext

    def _download_assets(self, soup, page_url, assets_dir, max_assets):
        """Save assets referenced by *soup* and point the tags at the copies.

        Args:
            soup: Parsed page; matching tags are rewritten in place.
            page_url: URL of the page, used to resolve relative links.
            assets_dir: Existing directory that receives the asset files.
            max_assets: Maximum number of distinct asset URLs to download.
        """
        # Asset URL -> local filename for everything saved during this call.
        # A repeated reference is relinked without a second download and
        # without consuming more of the max_assets budget.
        saved = {}
        for tag in soup.find_all(["img", "link", "script"]):
            attr = "src" if tag.name in ("img", "script") else "href"
            link = tag.get(attr)
            if not link or link.startswith("data:"):
                continue

            asset_url = urljoin(page_url, link)
            if asset_url in saved:
                tag[attr] = f"assets/{saved[asset_url]}"
                continue
            if len(saved) >= max_assets:
                # Budget exhausted: keep scanning only to relink repeats.
                continue

            asset_path = urlparse(asset_url).path
            if not asset_path.lower().endswith(ASSET_EXTENSIONS):
                continue
            asset_name = self._asset_filename(asset_path)
            if not asset_name:
                continue

            # Best effort: a failing asset is logged and skipped, and its tag
            # keeps the original link.
            try:
                asset_resp = self.session.get(asset_url, timeout=5)
                asset_resp.raise_for_status()
                (assets_dir / asset_name).write_bytes(asset_resp.content)
            except Exception as e:
                logger.debug("Failed to download asset %s: %s", asset_url, e)
                continue

            # Rewrite link in soup
            tag[attr] = f"assets/{asset_name}"
            saved[asset_url] = asset_name

    def download_page(self, url, max_assets=20):
        """Download a single page and its essential assets.

        Args:
            url: URL of the page to fetch.
            max_assets: Maximum number of assets to download for the page.

        Returns:
            Path of the written HTML file, or ``None`` when fetching or
            parsing the page failed (the error is logged).
        """
        # Best effort: any failure while fetching or parsing is logged and
        # reported to the caller as None instead of being raised.
        try:
            resp = self.session.get(url, timeout=10)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
        except Exception as e:
            logger.error("Failed to fetch %s: %s", url, e)
            return None

        # Determine local path
        parsed = urlparse(url)
        host_dir = self.root_dir / self._sanitize_filename(parsed.netloc)
        host_dir.mkdir(parents=True, exist_ok=True)

        # Flatten the URL path into one filename; the site root is "index".
        raw_name = parsed.path.strip("/").replace("/", "_") or "index"
        page_name = self._sanitize_filename(raw_name)
        if not page_name.lower().endswith(".html"):
            page_name += ".html"
        local_path = host_dir / page_name

        # Simple asset downloading (minimal version of website_downloader.py)
        assets_dir = host_dir / "assets"
        assets_dir.mkdir(exist_ok=True)
        self._download_assets(soup, url, assets_dir, max_assets)

        # Written after the asset pass so the file holds the rewritten links.
        local_path.write_text(soup.prettify(), encoding="utf-8")
        return local_path
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: fetch one page into the local data directory.
    logging.basicConfig(level=logging.INFO)
    crawler = Crawler("rag_data/repo/web")
    saved_path = crawler.download_page("https://example.com")
    if saved_path is None:
        # download_page logs the cause and returns None on failure; exit with
        # a non-zero status instead of printing a success line for "None".
        raise SystemExit("Download failed")
    print(f"Downloaded to: {saved_path}")
|