#!/usr/bin/env python3 from __future__ import annotations import argparse import logging import os import queue import re import sys import threading import time from hashlib import sha256 from importlib.util import find_spec from pathlib import Path from typing import Optional from urllib.parse import ParseResult, unquote, urljoin, urlparse import requests from bs4 import BeautifulSoup from requests.adapters import HTTPAdapter from urllib3.util import Retry HAS_BROTLI = find_spec("brotli") is not None or find_spec("brotlicffi") is not None # --------------------------------------------------------------------------- # Config / constants # --------------------------------------------------------------------------- # Consistent log format across file + console. Thread name is helpful because # asset downloads happen in worker threads. LOG_FMT = "%(asctime)s | %(levelname)-8s | %(threadName)s | %(message)s" # Extensions we treat as “static assets” worth downloading and rewriting. # Used in multiple places: HTML attribute rewriting, CSS url(...) rewriting, # JS string rewriting, and crawl-time asset detection. ASSET_EXTENSIONS = ( ".css", ".js", ".mjs", ".map", ".json", ".wasm", ".webmanifest", ".png", ".jpg", ".jpeg", ".gif", ".webp", ".avif", ".svg", ".ico", ".woff", ".woff2", ".ttf", ".eot", ".mp4", ".webm", ".mp3", ) # Conservative JS string rewriting: # - JS_URL_RE: matches root-relative strings like "/assets/app.js" # - JS_ABS_URL_RE: matches absolute or protocol-relative strings like # "https://cdn.example.com/app.js" or "//cdn.example.com/app.js" # # This is intentionally limited to common static file extensions to avoid # rewriting API endpoints or dynamic URLs that could break functionality. 
# Root-relative asset references inside quoted JS strings; group 1 is the URL
# (including an optional query string).
JS_URL_RE = re.compile(
    r"""["'](/[^"']+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|css|js|mjs|map|woff|woff2|ttf|eot|json|wasm|webmanifest)(?:\?[^"']*)?)["']""",
    re.IGNORECASE,
)

# Absolute (http/https) or protocol-relative asset references inside quoted
# JS strings; group 1 is the URL (including an optional query string).
JS_ABS_URL_RE = re.compile(
    r"""["']((?:https?:)?//[^"']+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|css|js|mjs|map|woff|woff2|ttf|eot|json|wasm|webmanifest)(?:\?[^"']*)?)["']""",
    re.IGNORECASE,
)

# Default headers can help with sites that block "non-browser" clients.
# "br" is only advertised when a Brotli decoder is installed.
_ACCEPT_ENCODING = "gzip, deflate"
if HAS_BROTLI:
    _ACCEPT_ENCODING += ", br"

DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": _ACCEPT_ENCODING,
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# Network timeouts + streaming chunk size for binary downloads.
TIMEOUT = 15  # seconds
CHUNK_SIZE = 8192  # bytes

# Conservative margins under common OS limits (~255–260 bytes).
# These protect you from “File name too long” and odd Windows path rules.
MAX_PATH_LEN = 240
MAX_SEG_LEN = 120

# Collapse runs of 3 or more dots ("...", "....") down to a single dot to
# avoid weird filenames.
_MULTI_DOTS_RE = re.compile(r"\.{3,}")

# CSS url(...) extractor. Note: this is simple (not a full CSS parser),
# but good enough for most sites.
CSS_URL_RE = re.compile(r"url\(([^)]+)\)")

# CSS @import extractor. Also simple-but-effective.
CSS_IMPORT_RE = re.compile(
    r"""@import\s+(?:url\()?['"]?([^'"\);]+)['"]?\)?\s*;""",
    re.IGNORECASE,
)

# Characters that commonly cause filesystem issues, especially on Windows.
_BAD_SEG_CHARS_RE = re.compile(r'[<>:"/\\|?*\x00-\x1F]')

# Windows reserved filenames; writing these can fail or behave badly.
# CON/PRN/AUX/NUL plus COM1..COM9 and LPT1..LPT9 (upper-case).
_WINDOWS_RESERVED_NAMES = {"CON", "PRN", "AUX", "NUL"} | {
    f"{device}{n}" for device in ("COM", "LPT") for n in range(1, 10)
}

# <link rel="..."> tokens that are collected in this set as resource links.
RESOURCE_LINK_RELS = {
    "stylesheet",
    "icon",
    "shortcut",
    "apple-touch-icon",
    "preload",
    "modulepreload",
    "manifest",
}

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

# File logging is DEBUG to help you trace rewrites and queue behavior.
# force=True replaces any handlers already attached to the root logger.
logging.basicConfig(
    filename="web_scraper.log",
    level=logging.DEBUG,
    format=LOG_FMT,
    datefmt="%H:%M:%S",
    force=True,
)

# Console logging is INFO to keep output readable while running.
_console = logging.StreamHandler(sys.stdout)
_console.setLevel(logging.INFO)
_console.setFormatter(logging.Formatter(LOG_FMT, datefmt="%H:%M:%S"))
logging.getLogger().addHandler(_console)

log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# HTTP session (retry, timeouts, custom UA)
# ---------------------------------------------------------------------------

# Shared session improves performance and keeps connection pooling.
SESSION = requests.Session()

# Retry strategy for transient issues (rate limits, 5xx). Helps stability.
RETRY_STRAT = Retry( total=5, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=["GET", "HEAD"], ) SESSION.mount("http://", HTTPAdapter(max_retries=RETRY_STRAT)) SESSION.mount("https://", HTTPAdapter(max_retries=RETRY_STRAT)) SESSION.headers.update(DEFAULT_HEADERS) log.debug("Accept-Encoding configured as: %s", SESSION.headers.get("Accept-Encoding")) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def create_dir(path: Path) -> None: """Create path (and parents) if it does not already exist.""" if not path.exists(): path.mkdir(parents=True, exist_ok=True) log.debug("Created directory %s", path) # Schemes that are valid URLs in HTML but are not HTTP fetch targets. # If we try to request these, requests will throw InvalidSchema. NON_FETCHABLE_SCHEMES = { "mailto", "tel", "sms", "javascript", "data", "geo", "blob", "about", } def is_httpish(u: str) -> bool: """ True iff the URL is http(s) or relative (no scheme). Why: - We only fetch http(s) resources. - Relative URLs should still be handled because we can join them to base URLs. """ p = urlparse(u) return (p.scheme in ("http", "https")) or (p.scheme == "") def is_non_fetchable(u: str) -> bool: """ True iff the URL clearly shouldn't be fetched (mailto:, tel:, data:, ...). """ p = urlparse(u) return p.scheme in NON_FETCHABLE_SCHEMES def is_internal(link: str, root_netloc: str) -> bool: """ Decide whether `link` belongs to the same site as `root_netloc`. Notes: - Relative URLs are internal. - We normalize "www." so example.com and www.example.com count as same. 
""" parsed = urlparse(link) netloc = _canonical_netloc(parsed) if not netloc: return True if netloc == root_netloc: return True # normalize www if netloc.startswith("www."): netloc = netloc[4:] root = root_netloc[4:] if root_netloc.startswith("www.") else root_netloc return netloc == root def _sanitize_segment(segment: str) -> str: """ Sanitize a single path segment for safe writing to disk. - URL decode (turn %20 into space, etc.) - Strip whitespace / trailing dot-space combos (Windows issues) - Collapse accidental multi-dots - Replace illegal filesystem chars with '_' - Neutralize '.' and '..' to prevent traversal-like paths - Avoid Windows reserved names (CON, PRN, COM1, ...) """ segment = unquote(segment).strip() segment = segment.strip(" .") segment = _MULTI_DOTS_RE.sub(".", segment) segment = _BAD_SEG_CHARS_RE.sub("_", segment) if segment in ("", ".", ".."): segment = "_" if segment.upper() in _WINDOWS_RESERVED_NAMES: segment = f"_{segment}_" return segment def _shorten_segment(segment: str, limit: int = MAX_SEG_LEN) -> str: """ Shorten a path segment if it exceeds a length limit. Strategy: - Keep the original extension - Truncate the stem - Append a short hash so different long names don't collide """ if len(segment) <= limit: return segment p = Path(segment) stem, suffix = p.stem, p.suffix h = sha256(segment.encode("utf-8")).hexdigest()[:12] keep = max(0, limit - len(suffix) - 13) # '-' + hash is 13 chars total return f"{stem[:keep]}-{h}{suffix}" def _rel_url(target: Path, base_dir: Path) -> str: """ Compute a URL-style relative path (forward slashes), not an OS-specific path. """ try: rel = os.path.relpath(target, base_dir) except ValueError: # Happens if paths are on different drives on Windows. return target.as_posix() return Path(rel).as_posix() def to_local_path(parsed: ParseResult, site_root: Path) -> Path: """ Map an internal *page* URL to a local HTML file under site_root. 
Rules: - "/" -> index.html - "/foo/" -> /foo/index.html - "/foo" (no extension) -> /foo.html - query strings get a short hash to prevent collisions: /page?id=1 and /page?id=2 should not overwrite each other - filesystem hardening: sanitize segments, limit segment length and overall path """ rel = parsed.path.lstrip("/") if not rel: rel = "index.html" elif rel.endswith("/"): rel += "index.html" elif not Path(rel).suffix: rel += ".html" if parsed.query: qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10] p = Path(rel) rel = str(p.with_name(f"{p.stem}-q{qh}{p.suffix}")) parts = Path(rel).parts parts = tuple(_sanitize_segment(seg) for seg in parts) parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts) local_path = site_root / Path(*parts) if len(str(local_path)) > MAX_PATH_LEN: p = local_path h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16] leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN) local_path = p.with_name(leaf) return local_path def to_local_asset_path(parsed: ParseResult, site_root: Path) -> Path: """ Map an internal *asset* URL to a local file path under site_root. Difference vs to_local_path(): - We do NOT force .html for extensionless paths. (Some sites serve extensionless assets, though less common.) 
""" rel = parsed.path.lstrip("/") if not rel: rel = "index" elif rel.endswith("/"): rel += "index" if parsed.query: qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10] p = Path(rel) name = f"{p.stem}-q{qh}{p.suffix}" if p.suffix else f"{p.name}-q{qh}" rel = str(p.with_name(name)) parts = Path(rel).parts parts = tuple(_sanitize_segment(seg) for seg in parts) parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts) local_path = site_root / Path(*parts) if len(str(local_path)) > MAX_PATH_LEN: p = local_path h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16] leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN) local_path = p.with_name(leaf) return local_path def cdn_local_path(parsed: ParseResult, site_root: Path) -> Path: """ Map an external (CDN) URL to a local path under: site_root/cdn//... Why: - Keeps external host assets separated from internal assets. - Avoids collisions where internal and external paths look similar. """ rel = parsed.path.lstrip("/") if not rel: rel = "index" elif rel.endswith("/"): rel += "index" if parsed.query: qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10] p = Path(rel) name = f"{p.stem}-q{qh}{p.suffix}" if p.suffix else f"{p.name}-q{qh}" rel = str(p.with_name(name)) parts = Path(rel).parts parts = tuple(_sanitize_segment(seg) for seg in parts) parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts) netloc = _canonical_netloc(parsed) local_path = site_root / "cdn" / _sanitize_segment(netloc) / Path(*parts) if len(str(local_path)) > MAX_PATH_LEN: p = local_path h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16] leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN) local_path = p.with_name(leaf) return local_path def safe_write_text(path: Path, text: str, encoding: str = "utf-8") -> Path: """ Write text to path safely. 
If the OS rejects the filename/path (often: path too long), we: - hash the leaf name - write to a fallback name - return the final path used """ try: path.write_text(text, encoding=encoding) return path except OSError as exc: log.warning("Write failed for %s: %s. Falling back to hashed leaf.", path, exc) p = path h = sha256(str(p).encode("utf-8")).hexdigest()[:16] fallback = p.with_name(_shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN)) create_dir(fallback.parent) fallback.write_text(text, encoding=encoding) return fallback def normalize_url(url: str) -> str: """ Normalize URLs to avoid duplicates caused by fragments. Example: - https://site/page#section1 and https://site/page#section2 are the same document for our crawler. """ parsed = urlparse(url) clean = parsed._replace(fragment="") return clean.geturl() def _protocol_fix(url: str, base_url: str) -> str: """ Normalize protocol-relative URLs (//host/path) to absolute ones. Browsers interpret //example.com/a.css as "use the current page scheme". We do the same using base_url's scheme. """ if url.startswith("//"): base = urlparse(base_url) scheme = base.scheme or "https" return f"{scheme}:{url}" return url def rewrite_css_text( css_text: str, base_url: str, *, site_root: Path, root_netloc: str, base_dir: Path, download_external_assets: bool, external_domains: Optional[set[str]] = None, download_q: Optional[queue.Queue[tuple[str, Path]]] = None, ) -> str: """ Rewrite CSS url(...) and @import references to local relative paths. base_url: - the remote URL of the CSS *context* - external stylesheet URL for downloaded .css - page URL for inline