From 1623ee8d2cb91f4162a7bb4529dde1bdef852b38 Mon Sep 17 00:00:00 2001
From: turtle89431 <turtle89431@gmail.com>
Date: Sat, 28 Mar 2026 16:04:27 -0700
Subject: [PATCH] tool1 and init req file

---
 requirements.txt      |    4 +
 website-downloader.py | 1610 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1614 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 website-downloader.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..72da2f0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+requests~=2.32.4
+beautifulsoup4~=4.13.4
+wget~=3.2
+urllib3~=2.5.0
\ No newline at end of file
diff --git a/website-downloader.py b/website-downloader.py
new file mode 100644
index 0000000..e7907a5
--- /dev/null
+++ b/website-downloader.py
@@ -0,0 +1,1610 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import logging
+import os
+import queue
+import re
+import sys
+import threading
+import time
+from hashlib import sha256
+from importlib.util import find_spec
+from pathlib import Path
+from typing import Optional
+from urllib.parse import ParseResult, unquote, urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
+# True when a Brotli module is importable; decides whether "br" is advertised below.
+HAS_BROTLI = find_spec("brotli") is not None or find_spec("brotlicffi") is not None
+
+# ---------------------------------------------------------------------------
+# Config / constants
+# ---------------------------------------------------------------------------
+
+# Consistent log format across file + console. Thread name is helpful because
+# asset downloads happen in worker threads.
+LOG_FMT = "%(asctime)s | %(levelname)-8s | %(threadName)s | %(message)s"
+
+# Extensions we treat as “static assets” worth downloading and rewriting.
+# Used in multiple places: HTML attribute rewriting, CSS url(...) rewriting,
+# JS string rewriting, and crawl-time asset detection.
+ASSET_EXTENSIONS = (
+    ".css",
+    ".js",
+    ".mjs",
+    ".map",
+    ".json",
+    ".wasm",
+    ".webmanifest",
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".webp",
+    ".avif",
+    ".svg",
+    ".ico",
+    ".woff",
+    ".woff2",
+    ".ttf",
+    ".eot",
+    ".mp4",
+    ".webm",
+    ".mp3",
+)
+
+# Conservative JS string rewriting:
+# - JS_URL_RE: matches root-relative strings like "/assets/app.js"
+# - JS_ABS_URL_RE: matches absolute or protocol-relative strings like
+#   "https://cdn.example.com/app.js" or "//cdn.example.com/app.js"
+#
+# This is intentionally limited to common static file extensions to avoid
+# rewriting API endpoints or dynamic URLs that could break functionality.
+JS_URL_RE = re.compile(
+    r"""["'](/[^"']+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|css|js|mjs|map|woff|woff2|ttf|eot|json|wasm|webmanifest)(?:\?[^"']*)?)["']""",
+    re.IGNORECASE,
+)
+
+JS_ABS_URL_RE = re.compile(
+    r"""["']((?:https?:)?//[^"']+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|css|js|mjs|map|woff|woff2|ttf|eot|json|wasm|webmanifest)(?:\?[^"']*)?)["']""",
+    re.IGNORECASE,
+)
+
+# Default headers can help with sites that block "non-browser" clients.
+_ACCEPT_ENCODING = "gzip, deflate, br" if HAS_BROTLI else "gzip, deflate"
+
+DEFAULT_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/122.0.0.0 Safari/537.36"
+    ),
+    "Accept": (
+        "text/html,application/xhtml+xml,application/xml;"
+        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
+    ),
+    "Accept-Language": "en-US,en;q=0.9",
+    "Accept-Encoding": _ACCEPT_ENCODING,
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+}
+
+# Network timeout (passed as `timeout=` to SESSION.get) and streaming chunk size.
+TIMEOUT = 15  # seconds
+CHUNK_SIZE = 8192  # bytes per iter_content() chunk
+
+# Conservative margins under common OS limits (~255–260 bytes).
+# These protect you from “File name too long” and odd Windows path rules.
+MAX_PATH_LEN = 240
+MAX_SEG_LEN = 120
+
+# Collapse runs of 3+ dots ("...", "....") into a single dot; ".." is left untouched.
+_MULTI_DOTS_RE = re.compile(r"\.{3,}")
+
+# CSS url(...) extractor. Note: this is simple (not a full CSS parser),
+# but good enough for most sites.
+CSS_URL_RE = re.compile(r"url\(([^)]+)\)")
+
+# CSS @import extractor. Also simple-but-effective.
+CSS_IMPORT_RE = re.compile(
+    r"""@import\s+(?:url\()?['"]?([^'"\);]+)['"]?\)?\s*;""",
+    re.IGNORECASE,
+)
+
+# Characters that commonly cause filesystem issues, especially on Windows.
+_BAD_SEG_CHARS_RE = re.compile(r'[<>:"/\\|?*\x00-\x1F]')
+
+# Windows reserved filenames; writing these can fail or behave badly.
+_WINDOWS_RESERVED_NAMES = {
+    "CON",
+    "PRN",
+    "AUX",
+    "NUL",
+    *(f"COM{i}" for i in range(1, 10)),
+    *(f"LPT{i}" for i in range(1, 10)),
+}
+
+RESOURCE_LINK_RELS = {  # <link rel=...> tokens whose href rewrite_links() rewrites
+    "stylesheet",
+    "icon",
+    "shortcut",  # first token of the legacy rel="shortcut icon"
+    "apple-touch-icon",
+    "preload",
+    "modulepreload",
+    "manifest",
+}
+
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+
+# File logging is DEBUG to help you trace rewrites and queue behavior.
+logging.basicConfig(
+    filename="web_scraper.log",
+    level=logging.DEBUG,
+    format=LOG_FMT,
+    datefmt="%H:%M:%S",
+    force=True,
+)
+
+# Console logging is INFO to keep output readable while running.
+_console = logging.StreamHandler(sys.stdout)
+_console.setLevel(logging.INFO)
+_console.setFormatter(logging.Formatter(LOG_FMT, datefmt="%H:%M:%S"))
+logging.getLogger().addHandler(_console)
+log = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# HTTP session (retry, timeouts, custom UA)
+# ---------------------------------------------------------------------------
+
+# One module-level session, used by fetch_html() and fetch_binary(), for connection pooling.
+SESSION = requests.Session()
+
+# Retry transient failures (rate limits, 5xx) up to 5 times, for GET/HEAD only.
+RETRY_STRAT = Retry(
+    total=5,
+    backoff_factor=0.5,
+    status_forcelist=[429, 500, 502, 503, 504],
+    allowed_methods=["GET", "HEAD"],
+)
+# Install the retrying adapter for both schemes, then apply the browser-like headers.
+SESSION.mount("http://", HTTPAdapter(max_retries=RETRY_STRAT))
+SESSION.mount("https://", HTTPAdapter(max_retries=RETRY_STRAT))
+SESSION.headers.update(DEFAULT_HEADERS)
+log.debug("Accept-Encoding configured as: %s", SESSION.headers.get("Accept-Encoding"))
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def create_dir(path: Path) -> None:
+    """Create `path` and any missing parents; does nothing (and logs nothing) if it exists."""
+    if not path.exists():
+        path.mkdir(parents=True, exist_ok=True)
+        log.debug("Created directory %s", path)
+
+
+# Schemes that are valid URLs in HTML but are not HTTP fetch targets.
+# If we try to request these, requests will throw InvalidSchema.
+NON_FETCHABLE_SCHEMES = {
+    "mailto",
+    "tel",
+    "sms",
+    "javascript",
+    "data",
+    "geo",
+    "blob",
+    "about",
+}
+
+
+def is_httpish(u: str) -> bool:
+    """
+    True iff `u` is http(s) or relative (no scheme); relative URLs count
+    because they can still be joined to a base URL. A malformed URL that
+    urlparse() rejects with ValueError (e.g. "http://[bad") yields False.
+    """
+    try:
+        return urlparse(u).scheme in ("http", "https", "")
+    except ValueError:
+        return False
+
+
+def is_non_fetchable(u: str) -> bool:
+    """True iff `u` must not be fetched: mailto:, tel:, data:, ... or unparseable."""
+    try:
+        return urlparse(u).scheme in NON_FETCHABLE_SCHEMES
+    except ValueError:
+        return True
+
+
+def is_internal(link: str, root_netloc: str) -> bool:
+    """
+    Report whether `link` points at the site identified by `root_netloc`.
+
+    Rules:
+    - A link without a host (relative URL) is always internal.
+    - Otherwise its canonical host[:port] must equal `root_netloc`.
+    - A leading "www." on either side is ignored, so example.com and
+      www.example.com count as the same site.
+    """
+
+    def drop_www(value: str) -> str:
+        # Strip one leading "www." label, if present.
+        return value[4:] if value.startswith("www.") else value
+
+    host = _canonical_netloc(urlparse(link))
+
+    # Host-less links can only refer to the current site.
+    if not host:
+        return True
+
+    # Exact match first, then the comparison with "www." removed on both sides.
+    return host == root_netloc or drop_www(host) == drop_www(root_netloc)
+
+
+def _sanitize_segment(segment: str) -> str:
+    """
+    Sanitize a single path segment for safe writing to disk.
+
+    - URL decode (turn %20 into space, etc.)
+    - Strip whitespace / leading and trailing dot-space runs (Windows issues)
+    - Collapse accidental multi-dots
+    - Replace illegal filesystem chars with '_'
+    - Neutralize '.' and '..' to prevent traversal-like paths
+    - Defuse Windows reserved device names, with or without an extension
+      (CON -> _CON_, NUL.txt -> _NUL_.txt, COM1.tar.gz -> _COM1_.tar.gz)
+    """
+    segment = unquote(segment).strip().strip(" .")
+    segment = _MULTI_DOTS_RE.sub(".", segment)
+    segment = _BAD_SEG_CHARS_RE.sub("_", segment)
+    if segment in ("", ".", ".."):
+        segment = "_"
+    # Windows reserves the device name no matter what follows the first dot.
+    stem, dot, rest = segment.partition(".")
+    if stem.upper() in _WINDOWS_RESERVED_NAMES:
+        segment = f"_{stem}_{dot}{rest}"
+
+    return segment
+
+
+def _shorten_segment(segment: str, limit: int = MAX_SEG_LEN) -> str:
+    """
+    Shorten a path segment that exceeds `limit` characters.
+
+    Keeps the extension, truncates the stem and appends '-' plus a 12-char
+    SHA-256 prefix of the whole segment so distinct long names don't collide.
+    An "extension" too long to fit is folded into the stem and truncated too.
+    """
+    if len(segment) <= limit:
+        return segment
+    stem, suffix = Path(segment).stem, Path(segment).suffix
+    if len(suffix) + 13 > limit:  # '-' + hash is 13 chars total
+        stem, suffix = segment, ""
+    h = sha256(segment.encode("utf-8")).hexdigest()[:12]
+    keep = max(0, limit - len(suffix) - 13)
+    return f"{stem[:keep]}-{h}{suffix}"
+
+
+def _rel_url(target: Path, base_dir: Path) -> str:
+    """
+    Relative URL from base_dir to target: forward slashes, percent-encoded.
+    """
+    from urllib.parse import quote  # local import: module header stays untouched
+
+    try:
+        rel = os.path.relpath(target, base_dir)
+    except ValueError:  # paths on different Windows drives: no relative form
+        return target.as_posix()
+    return quote(Path(rel).as_posix())
+
+
+def to_local_path(parsed: ParseResult, site_root: Path) -> Path:
+    """
+    Map an internal *page* URL to a local HTML file under site_root.
+
+    Rules:
+    - "/" -> index.html
+    - "/foo/" -> /foo/index.html
+    - "/foo" (no extension) -> /foo.html
+    - query strings get a short hash to prevent collisions:
+      /page?id=1 and /page?id=2 should not overwrite each other
+    - filesystem hardening: sanitize segments, cap segment length, shrink the leaf if too long
+    """
+    rel = parsed.path.lstrip("/")
+    if not rel:
+        rel = "index.html"
+    elif rel.endswith("/"):
+        rel += "index.html"
+    elif not Path(rel).suffix:
+        rel += ".html"
+
+    if parsed.query:
+        qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10]
+        p = Path(rel)
+        rel = str(p.with_name(f"{p.stem}-q{qh}{p.suffix}"))
+
+    parts = Path(rel).parts
+    parts = tuple(_sanitize_segment(seg) for seg in parts)
+    parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts)
+    local_path = site_root / Path(*parts)
+
+    if len(str(local_path)) > MAX_PATH_LEN:
+        # Shrink the leaf to the room its parents leave (floor 32); best effort.
+        room = MAX_PATH_LEN - len(str(local_path.parent)) - 1
+        leaf = _shorten_segment(local_path.name, max(room, 32))
+        local_path = local_path.with_name(leaf)
+
+    return local_path
+
+
+def to_local_asset_path(parsed: ParseResult, site_root: Path) -> Path:
+    """
+    Map an internal *asset* URL to a local file path under site_root.
+
+    Difference vs to_local_path():
+    - We do NOT force .html for extensionless paths.
+      (Some sites serve extensionless assets, though less common.)
+    """
+    rel = parsed.path.lstrip("/")
+    if not rel:
+        rel = "index"
+    elif rel.endswith("/"):
+        rel += "index"
+
+    if parsed.query:
+        qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10]
+        p = Path(rel)
+        name = f"{p.stem}-q{qh}{p.suffix}" if p.suffix else f"{p.name}-q{qh}"
+        rel = str(p.with_name(name))
+
+    parts = Path(rel).parts
+    parts = tuple(_sanitize_segment(seg) for seg in parts)
+    parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts)
+    local_path = site_root / Path(*parts)
+
+    if len(str(local_path)) > MAX_PATH_LEN:
+        # Shrink the leaf to the room its parents leave (floor 32); best effort.
+        room = MAX_PATH_LEN - len(str(local_path.parent)) - 1
+        leaf = _shorten_segment(local_path.name, max(room, 32))
+        local_path = local_path.with_name(leaf)
+
+    return local_path
+
+
+def cdn_local_path(parsed: ParseResult, site_root: Path) -> Path:
+    """
+    Map an external (CDN) URL to a local path under:
+        site_root/cdn/<netloc>/...
+
+    Why:
+    - Keeps external host assets separated from internal assets.
+    - Avoids collisions where internal and external paths look similar.
+    """
+    rel = parsed.path.lstrip("/")
+    if not rel:
+        rel = "index"
+    elif rel.endswith("/"):
+        rel += "index"
+
+    if parsed.query:
+        qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10]
+        p = Path(rel)
+        name = f"{p.stem}-q{qh}{p.suffix}" if p.suffix else f"{p.name}-q{qh}"
+        rel = str(p.with_name(name))
+
+    parts = Path(rel).parts
+    parts = tuple(_sanitize_segment(seg) for seg in parts)
+    parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts)
+
+    netloc = _canonical_netloc(parsed)
+    local_path = site_root / "cdn" / _sanitize_segment(netloc) / Path(*parts)
+
+    if len(str(local_path)) > MAX_PATH_LEN:
+        # Shrink the leaf to the room its parents leave (floor 32); best effort.
+        room = MAX_PATH_LEN - len(str(local_path.parent)) - 1
+        leaf = _shorten_segment(local_path.name, max(room, 32))
+        local_path = local_path.with_name(leaf)
+
+    return local_path
+
+
+def safe_write_text(path: Path, text: str, encoding: str = "utf-8") -> Path:
+    """
+    Write text to path safely, creating missing parent directories first.
+
+    If the OS rejects the filename/path (often: path too long), we:
+    - replace the leaf with a 16-char hash of the full path plus the suffix
+    - write to that compact fallback name (a second OSError propagates)
+    - return the final path used
+    """
+    try:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(text, encoding=encoding)
+        return path
+    except OSError as exc:
+        log.warning("Write failed for %s: %s. Falling back to hashed leaf.", path, exc)
+        # Hash-only leaf: a "<stem>-<hash>" name would be longer than the rejected one.
+        h = sha256(str(path).encode("utf-8")).hexdigest()[:16]
+        fallback = path.with_name(f"{h}{path.suffix}")
+        fallback.write_text(text, encoding=encoding)
+        return fallback
+
+
+def normalize_url(url: str) -> str:
+    """
+    Return `url` with any #fragment removed.
+
+    A fragment only selects a position inside a document, so
+    https://site/page#section1 and https://site/page#section2 must share
+    one crawl key.
+    """
+    parts = urlparse(url)
+    without_fragment = parts._replace(fragment="")
+    return without_fragment.geturl()
+
+
+def _protocol_fix(url: str, base_url: str) -> str:
+    """
+    Turn a protocol-relative URL (//host/path) into an absolute one.
+
+    The scheme is borrowed from base_url, mirroring how a browser applies
+    the current page's scheme; "https" is used when base_url has none.
+    """
+    # Anything not starting with "//" is returned untouched.
+    if not url.startswith("//"):
+        return url
+    scheme = urlparse(base_url).scheme or "https"
+    return f"{scheme}:{url}"
+
+
+def rewrite_css_text(
+    css_text: str,
+    base_url: str,
+    *,
+    site_root: Path,
+    root_netloc: str,
+    base_dir: Path,
+    download_external_assets: bool,
+    external_domains: Optional[set[str]] = None,
+    download_q: Optional[queue.Queue[tuple[str, Path]]] = None,
+) -> str:
+    """
+    Rewrite CSS url(...) and @import references to local relative paths.
+
+    base_url:
+      - the remote URL of the CSS *context*
+      - external stylesheet URL for downloaded .css
+      - page URL for inline <style> blocks or style="..."
+
+    base_dir:
+      - local directory where this CSS lives (controls the relative path output)
+
+    Also:
+    - If download_q is provided, enqueue newly discovered assets referenced by CSS.
+    """
+
+    def map_one(url_part: str) -> Optional[str]:
+        url_part = url_part.strip()
+
+        # Skip empties / anchors / non-fetchable schemes.
+        if not url_part or url_part.startswith("#"):
+            return None
+        if url_part.startswith(("data:", "javascript:", "about:")):
+            return None
+
+        url_part2 = _protocol_fix(url_part, base_url)
+        if is_non_fetchable(url_part2) or not is_httpish(url_part2):
+            return None
+
+        # Canonicalize to a stable absolute URL
+        abs_url = canonicalize_url(url_part2, base_url)
+        parsed = urlparse(abs_url)
+        if not parsed.path:
+            return None
+
+        # Only rewrite things that look like static assets.
+        # (Avoid rewriting API URLs accidentally.)
+        if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
+            return None
+
+        is_ext = not is_internal(abs_url, root_netloc)
+        if is_ext and not is_allowed_external(abs_url, external_domains):
+            return None
+
+        if is_ext and not download_external_assets:
+            return None
+
+        # Decide where to store it locally
+        local_path = (
+            cdn_local_path(parsed, site_root)
+            if is_ext
+            else to_local_asset_path(parsed, site_root)
+        )
+
+        # Queue it for downloading if not already present
+        if download_q is not None and not local_path.exists():
+            log.debug("Queue asset (rewrite): %s -> %s", abs_url, local_path)
+            download_q.put((abs_url, local_path))
+
+        # canonicalize_url() strips the #fragment, so recover it from the raw reference.
+        rel = _rel_url(local_path, base_dir)
+        fragment = urlparse(url_part2).fragment
+        return f"{rel}#{fragment}" if fragment else rel
+
+    # Replace url(...) references
+    def repl_url(m: re.Match) -> str:
+        raw = m.group(1).strip()
+        quote = ""
+        url_part = raw
+
+        # Preserve quoting style if present
+        if len(raw) >= 2 and raw[0] in ("'", '"') and raw[-1] == raw[0]:
+            quote = raw[0]
+            url_part = raw[1:-1].strip()
+
+        mapped = map_one(url_part)
+        if mapped is None:
+            return m.group(0)
+
+        if quote:
+            return f"url({quote}{mapped}{quote})"
+        return f"url({mapped})"
+
+    # Replace @import references
+    def repl_import(m: re.Match) -> str:
+        # "@import url(...)" was handled by the url(...) pass; remapping queues a bogus URL.
+        if "url(" in m.group(0):
+            return m.group(0)
+        url_part = m.group(1).strip().strip("'\"")
+        mapped = map_one(url_part)
+        if mapped is None:
+            return m.group(0)
+        return f'@import "{mapped}";'
+
+    css_text = CSS_URL_RE.sub(repl_url, css_text)
+    css_text = CSS_IMPORT_RE.sub(repl_import, css_text)
+    return css_text
+
+
+def rewrite_js_text(
+    js_text: str,
+    base_url: str,
+    *,
+    site_root: Path,
+    root_netloc: str,
+    base_dir: Path,
+    download_external_assets: bool,
+    external_domains: Optional[set[str]] = None,
+    download_q: Optional[queue.Queue[tuple[str, Path]]] = None,
+) -> str:
+    """
+    Rewrite obvious static asset URL strings inside JS.
+
+    Important:
+    - This does NOT parse JS AST; it does simple regex matching on string literals.
+    - It ONLY rewrites strings that look like static assets by extension.
+    - This prevents accidentally rewriting API endpoints or app routes.
+
+    Parameters worth knowing:
+    - base_url: URL that relative and protocol-relative strings are resolved against.
+    - base_dir: local directory the rewritten paths are made relative to.
+    - download_q: when given, every mapped asset not yet on disk is enqueued.
+    - NOTE(review): pages in other directories may resolve these paths differently - confirm.
+    """
+
+    def map_one(url_part: str) -> Optional[str]:
+        url_part = url_part.strip()
+
+        if not url_part:
+            return None
+        if url_part.startswith("#"):
+            return None
+        if url_part.startswith(("data:", "javascript:", "about:")):
+            return None
+
+        url_part2 = _protocol_fix(url_part, base_url)
+        if is_non_fetchable(url_part2) or not is_httpish(url_part2):
+            return None
+
+        abs_url = canonicalize_url(url_part2, base_url)
+        parsed = urlparse(abs_url)
+
+        if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
+            return None
+
+        is_ext = not is_internal(abs_url, root_netloc)
+        if is_ext and not is_allowed_external(abs_url, external_domains):
+            return None
+
+        if is_ext and not download_external_assets:
+            return None
+
+        local_path = (
+            cdn_local_path(parsed, site_root)
+            if is_ext
+            else to_local_asset_path(parsed, site_root)
+        )
+        if download_q is not None and not local_path.exists():
+            log.debug("Queue asset (rewrite): %s -> %s", abs_url, local_path)
+            download_q.put((abs_url, local_path))
+
+        # canonicalize_url() strips the #fragment, so recover it from the raw string.
+        rel = _rel_url(local_path, base_dir)
+        fragment = urlparse(url_part2).fragment
+        return f"{rel}#{fragment}" if fragment else rel
+
+    def repl_quoted(m: re.Match) -> str:
+        # Shared by both patterns: group(1) is the URL, group(0)[0] the opening quote.
+        mapped = map_one(m.group(1))
+        if mapped is None:
+            return m.group(0)
+        quote = m.group(0)[0]
+        return f"{quote}{mapped}{quote}"
+
+    # Strings starting with "/" (root- or protocol-relative) first, then the
+    # absolute http(s) ones; already-rewritten relative paths match neither.
+    js_text = JS_URL_RE.sub(repl_quoted, js_text)
+    js_text = JS_ABS_URL_RE.sub(repl_quoted, js_text)
+    return js_text
+
+
+def _canonical_netloc(parsed: ParseResult) -> str:
+    """
+    Lowercase the hostname and drop default ports so the same host always maps
+    to one local folder, e.g. https://EXAMPLE.com:443/a.css -> example.com.
+    IPv6 literals keep their brackets ([::1]:8080); an invalid port falls back
+    to the lowercased raw netloc instead of raising ValueError.
+    """
+    host = (parsed.hostname or "").lower()
+    try:
+        port = parsed.port
+    except ValueError:
+        return parsed.netloc.lower()
+    if not host:
+        return parsed.netloc.lower()
+    if ":" in host:
+        host = f"[{host}]"
+    if (parsed.scheme, port) in (("https", 443), ("http", 80)):
+        port = None
+    return f"{host}:{port}" if port else host
+
+
+def canonicalize_url(url: str, base_url: str = "") -> str:
+    """
+    Produce a stable URL key for de-duping + mapping.
+
+    Steps:
+    - Fix protocol-relative URLs (scheme from base_url, else "https")
+    - Join relative URLs against base_url; without a base_url they stay relative
+    - Drop fragments (#...); callers that need the fragment must keep their own copy
+    - Normalize the netloc via _canonical_netloc() (host casing + default ports)
+    """
+    if base_url:
+        url = urljoin(base_url, _protocol_fix(url, base_url))
+    else:
+        url = _protocol_fix(url, url)  # no base: a "//host/x" input gets "https:"
+
+    p = urlparse(url)
+
+    # Still relative: no-op without base_url. NOTE(review): a relative base_url is joined twice.
+    if not p.scheme and not p.netloc:
+        p = urlparse(urljoin(base_url, url)) if base_url else p
+
+    netloc = _canonical_netloc(p) if p.netloc else ""
+    p = p._replace(fragment="", netloc=netloc)
+    return p.geturl()
+
+
+def is_allowed_external(url: str, allowed_domains: Optional[set[str]]) -> bool:
+    """True if url's host equals or is a subdomain of an allowed domain (None allows all)."""
+    if allowed_domains is None:
+        return True
+    host = (urlparse(url).hostname or "").lower()
+    wanted = {d.lower().strip(".") for d in allowed_domains if d.strip(".")}
+    return any(host == d or host.endswith("." + d) for d in wanted)
+
+
+# ---------------------------------------------------------------------------
+# Fetchers
+# ---------------------------------------------------------------------------
+
+
+def fetch_html(url: str) -> Optional[BeautifulSoup]:
+    """Fetch url and return its BeautifulSoup tree, or None on error so the crawl continues."""
+    try:
+        resp = SESSION.get(url, timeout=TIMEOUT)
+        resp.raise_for_status()
+        # Without an explicit header charset requests assumes ISO-8859-1 for
+        # text/*; hand bs4 the raw bytes so it can honour <meta charset>.
+        ctype = resp.headers.get("Content-Type", "").lower()
+        charset = resp.encoding if "charset" in ctype else None
+        return BeautifulSoup(resp.content, "html.parser", from_encoding=charset)
+    except Exception as exc:  # noqa: BLE001
+        log.warning("HTTP error for %s – %s", url, exc)
+        return None
+
+
+def fetch_binary(
+    url: str,
+    dest: Path,
+    download_q: Optional[queue.Queue[tuple[str, Path]]] = None,
+    *,
+    site_root: Optional[Path] = None,
+    root_netloc: str = "",
+    download_external_assets: bool = False,
+    external_domains: Optional[set[str]] = None,
+) -> None:
+    """
+    Stream a binary/static resource to disk.
+
+    Notes:
+    - If already exists, skip. External URLs are skipped unless enabled and allowed.
+    - Streams in CHUNK_SIZE pieces; the response is closed even when a step fails.
+    - If the file is CSS or JS, rewrite embedded asset URLs and enqueue them.
+    """
+    is_ext = not is_internal(url, root_netloc)
+
+    if is_ext:
+        if not download_external_assets:
+            log.debug("Blocked external (fetch disabled): %s", url)
+            return
+
+        if not is_allowed_external(url, external_domains):
+            log.info("[BLOCKED EXT] %s", url)
+            return
+
+    if dest.exists():
+        return
+
+    try:
+        with SESSION.get(url, timeout=TIMEOUT, stream=True) as resp:
+            resp.raise_for_status()
+
+            create_dir(dest.parent)
+
+            # Try normal write
+            try:
+                with dest.open("wb") as fh:
+                    for chunk in resp.iter_content(CHUNK_SIZE):
+                        if chunk:
+                            fh.write(chunk)
+                log.debug("Saved resource -> %s", dest)
+
+            # If filesystem rejects it (path too long, invalid name), fallback
+            except OSError as exc:
+                log.warning("Binary write failed for %s: %s. Using fallback.", dest, exc)
+
+                h = sha256(str(dest).encode("utf-8")).hexdigest()[:16]
+                fallback = dest.with_name(
+                    _shorten_segment(f"{dest.stem}-{h}{dest.suffix}", MAX_SEG_LEN)
+                )
+                create_dir(fallback.parent)
+
+                with fallback.open("wb") as fh:
+                    for chunk in resp.iter_content(CHUNK_SIZE):
+                        if chunk:
+                            fh.write(chunk)
+
+                log.debug("Saved resource (fallback) -> %s", fallback)
+                dest = fallback
+
+        # If we downloaded CSS, rewrite its url(...) and @import references,
+        # and enqueue referenced assets (images/fonts/etc).
+        if (
+            dest.suffix.lower() == ".css"
+            and download_q is not None
+            and site_root is not None
+            and root_netloc
+        ):
+            try:
+                css_text = dest.read_text(encoding="utf-8", errors="ignore")
+                rewritten = rewrite_css_text(
+                    css_text,
+                    url,
+                    site_root=site_root,
+                    root_netloc=root_netloc,
+                    base_dir=dest.parent,
+                    download_external_assets=download_external_assets,
+                    external_domains=external_domains,
+                    download_q=download_q,
+                )
+                if rewritten != css_text:
+                    dest.write_text(rewritten, encoding="utf-8")
+            except Exception as exc:  # noqa: BLE001
+                log.debug("CSS rewrite failed for %s – %s", dest, exc)
+
+        # If we downloaded JS, rewrite obvious static URL strings,
+        # and enqueue referenced assets (only those matching ASSET_EXTENSIONS).
+        if (
+            dest.suffix.lower() in {".js", ".mjs"}
+            and download_q is not None
+            and site_root is not None
+            and root_netloc
+        ):
+            try:
+                js_text = dest.read_text(encoding="utf-8", errors="ignore")
+                rewritten = rewrite_js_text(
+                    js_text,
+                    url,
+                    site_root=site_root,
+                    root_netloc=root_netloc,
+                    base_dir=dest.parent,
+                    download_external_assets=download_external_assets,
+                    external_domains=external_domains,
+                    download_q=download_q,
+                )
+                if rewritten != js_text:
+                    dest.write_text(rewritten, encoding="utf-8")
+            except Exception as exc:  # noqa: BLE001
+                log.debug("JS rewrite failed for %s – %s", dest, exc)
+
+    except Exception as exc:  # noqa: BLE001
+        log.error("Failed to save %s – %s", url, exc)
+
+
+# ---------------------------------------------------------------------------
+# Link rewriting
+# ---------------------------------------------------------------------------
+
+
+def rewrite_links(
+    soup: BeautifulSoup,
+    page_url: str,
+    site_root: Path,
+    page_dir: Path,
+    download_external_assets: bool = False,
+    external_domains: Optional[set[str]] = None,
+) -> None:
+    """
+    Rewrite HTML so it can be opened offline.
+
+    Rules:
+    - Internal page links (<a href>) become local HTML file paths.
+    - Internal asset links (img/src, script/src, link/href, etc) become local asset paths.
+    - External asset links are rewritten to local cdn/... paths when
+        external downloading is enabled and the URL is allowed.
+    - External page links (for example <a href="https://...">) are kept unchanged.
+    - Remove <base href="..."> because it changes browser URL resolution offline.
+    """
+    root_netloc = _canonical_netloc(urlparse(page_url))
+
+    # <base href> breaks relative paths when opening offline.
+    base_tag = soup.find("base")
+    if base_tag is not None and base_tag.has_attr("href"):
+        base_tag.decompose()
+
+    # Common attributes that contain URL-like values.
+    url_attrs = {"src", "href", "data-src", "poster"}
+
+    def strip_sri_and_cors(tag) -> None:
+        for attr in ("integrity", "crossorigin"):
+            if tag.has_attr(attr):
+                del tag[attr]
+
+    for tag in soup.find_all(True):
+
+        # For <link>, only rewrite rel-types that are actually fetched by browsers.
+        # This avoids rewriting <link rel="canonical"> or <link rel="alternate"> etc.
+        if tag.name == "link":
+            rel = tag.get("rel", [])
+            if isinstance(rel, str):
+                rel = [rel]
+            rel = [r.lower() for r in rel]
+
+            rel_set = set(rel)
+            if not rel_set & RESOURCE_LINK_RELS:
+                continue
+
+        # ------------------------------------------------------------------
+        # META IMAGE REWRITE (make og/twitter images local)
+        # ------------------------------------------------------------------
+        if tag.name == "meta":
+            content = str(tag.get("content", "")).strip()
+            prop = (tag.get("property") or tag.get("name") or "").lower()
+
+            if content and ("og:image" in prop or "twitter:image" in prop):
+
+                url_part = _protocol_fix(content, page_url)
+
+                if (
+                    not url_part
+                    or url_part.startswith("#")
+                    or url_part.startswith(("data:", "javascript:", "about:"))
+                    or is_non_fetchable(url_part)
+                    or not is_httpish(url_part)
+                ):
+                    continue
+
+                abs_url = canonicalize_url(url_part, page_url)
+                parsed = urlparse(abs_url)
+
+                is_ext = not is_internal(abs_url, root_netloc)
+
+                if is_ext:
+                    if not download_external_assets:
+                        continue
+                    if not is_allowed_external(abs_url, external_domains):
+                        continue
+
+                # map to local path
+                local_path = (
+                    cdn_local_path(parsed, site_root)
+                    if is_ext
+                    else to_local_asset_path(parsed, site_root)
+                )
+
+                # rewrite to relative path
+                rel = _rel_url(local_path, page_dir)
+                tag["content"] = rel
+
+        # Rewrite each URL attribute we care about
+        for attr in url_attrs:
+            if not tag.has_attr(attr):
+                continue
+
+            original_raw = str(tag.get(attr, "")).strip()
+            if not original_raw:
+                continue
+
+            original = _protocol_fix(original_raw, page_url)
+
+            # Skip anchors, non-fetchable schemes, and things that are not http(s)/relative.
+            if (
+                original.startswith("#")
+                or is_non_fetchable(original)
+                or not is_httpish(original)
+            ):
+                continue
+
+            abs_url = canonicalize_url(original, page_url)
+            parsed = urlparse(abs_url)
+
+            is_ext = not is_internal(abs_url, root_netloc)
+            if is_ext:
+                if not download_external_assets:
+                    continue
+                if not is_allowed_external(abs_url, external_domains):
+                    continue
+
+            # Treat <a href> as a "page". Everything else is treated as an asset.
+            treat_as_page = tag.name == "a" and attr == "href"
+
+            rewritten_external_asset = False
+
+            if is_ext and treat_as_page:
+                continue
+
+            if is_ext:
+                if not download_external_assets:
+                    continue
+                if not is_allowed_external(abs_url, external_domains):
+                    continue
+                local_path = cdn_local_path(parsed, site_root)
+                rewritten_external_asset = True
+            else:
+                local_path = (
+                    to_local_path(parsed, site_root)
+                    if treat_as_page
+                    else to_local_asset_path(parsed, site_root)
+                )
+
+            rel = _rel_url(local_path, page_dir)
+            if parsed.fragment:
+                rel = f"{rel}#{parsed.fragment}"
+            tag[attr] = rel
+
+            if rewritten_external_asset and tag.name in {"script", "link"}:
+                strip_sri_and_cors(tag)
+
+        # srcset="url1 1x, url2 2x" needs special parsing
+        if tag.has_attr("srcset"):
+            new_entries = []
+            for entry in str(tag["srcset"]).split(","):
+                entry = entry.strip()
+                if not entry:
+                    continue
+
+                parts = entry.split()
+                url_part = _protocol_fix(parts[0], page_url)
+
+                if (
+                    url_part.startswith("#")
+                    or is_non_fetchable(url_part)
+                    or not is_httpish(url_part)
+                ):
+                    new_entries.append(entry)
+                    continue
+
+                abs_url = normalize_url(canonicalize_url(url_part, page_url))
+                parsed = urlparse(abs_url)
+
+                is_ext = not is_internal(abs_url, root_netloc)
+                if is_ext:
+                    if not download_external_assets:
+                        new_entries.append(entry)
+                        continue
+
+                    if not is_allowed_external(abs_url, external_domains):
+                        new_entries.append(entry)
+                        continue
+
+                    local_path = cdn_local_path(parsed, site_root)
+                else:
+                    local_path = to_local_asset_path(parsed, site_root)
+
+                rel = _rel_url(local_path, page_dir)
+                if parsed.fragment:
+                    rel = f"{rel}#{parsed.fragment}"
+
+                parts[0] = rel
+                new_entries.append(" ".join(parts))
+
+            tag["srcset"] = ", ".join(new_entries)
+
+        # Inline style="background:url(...)" rewriting
+        if tag.has_attr("style"):
+            style = str(tag["style"])
+
+            def repl_style(m: re.Match) -> str:
+                raw = m.group(1).strip()
+                quote = ""
+                url_part = raw
+
+                if len(raw) >= 2 and raw[0] in ("'", '"') and raw[-1] == raw[0]:
+                    quote = raw[0]
+                    url_part = raw[1:-1].strip()
+
+                if (
+                    not url_part
+                    or url_part.startswith("#")
+                    or url_part.startswith(("data:", "javascript:", "about:"))
+                ):
+                    return m.group(0)
+
+                url_part2 = _protocol_fix(url_part, page_url)
+                if is_non_fetchable(url_part2) or not is_httpish(url_part2):
+                    return m.group(0)
+
+                abs_url = canonicalize_url(url_part2, page_url)
+                parsed = urlparse(abs_url)
+
+                # Only rewrite things that look like assets.
+                if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
+                    return m.group(0)
+
+                is_ext = not is_internal(abs_url, root_netloc)
+                if is_ext:
+                    if not download_external_assets:
+                        return m.group(0)
+
+                    if not is_allowed_external(abs_url, external_domains):
+                        return m.group(0)
+
+                    local_path = cdn_local_path(parsed, site_root)
+                else:
+                    local_path = to_local_asset_path(parsed, site_root)
+
+                rel = _rel_url(local_path, page_dir)
+                if parsed.fragment:
+                    rel = f"{rel}#{parsed.fragment}"
+
+                if quote:
+                    return f"url({quote}{rel}{quote})"
+                return f"url({rel})"
+
+            style = CSS_URL_RE.sub(repl_style, style)
+            tag["style"] = style
+
+    # Rewrite <style> blocks too (internal assets only; CDN kept unchanged here)
+    for style_tag in soup.find_all("style"):
+        try:
+            css_text = style_tag.string or style_tag.get_text()
+            if not css_text:
+                continue
+            rewritten = rewrite_css_text(
+                css_text,
+                page_url,
+                site_root=site_root,
+                root_netloc=root_netloc,
+                base_dir=page_dir,
+                download_external_assets=download_external_assets,
+                external_domains=external_domains,
+                download_q=None,
+            )
+            if rewritten != css_text:
+                style_tag.string = rewritten
+        except Exception as exc:  # noqa: BLE001
+            log.debug("Inline <style> rewrite failed on %s – %s", page_url, exc)
+
+
+# ---------------------------------------------------------------------------
+# Crawl coordinator
+# ---------------------------------------------------------------------------
+
+
+def extract_css_assets(css_text: str) -> list[str]:
+    """
+    Collect the raw URLs referenced by url(...) and @import in a CSS string.
+
+    Called while scanning <style> blocks at HTML parse time, i.e. before any
+    CSS has been written to disk. All url(...) hits come first, followed by
+    all @import hits; duplicates are kept.
+
+    Surrounding whitespace and quote characters are stripped from each hit.
+    Empty values, fragment-only references and data:/javascript:/about:
+    URIs are left out.
+    """
+    ignored_prefixes = ("data:", "javascript:", "about:", "#")
+    found: list[str] = []
+
+    # Same cleaning and filtering for both patterns; order of patterns matters
+    # because it fixes the order of the returned list.
+    for pattern in (CSS_URL_RE, CSS_IMPORT_RE):
+        for raw in pattern.findall(css_text):
+            candidate = raw.strip().strip("'\"")
+            if candidate and not candidate.startswith(ignored_prefixes):
+                found.append(candidate)
+
+    return found
+
+
+def crawl_site(
+    start_url: str,
+    root: Path,
+    max_pages: int,
+    threads: int,
+    download_external_assets: bool = False,
+    external_domains: Optional[set[str]] = None,
+) -> None:
+    """
+    Breadth-first crawl of one site, limited to max_pages pages.
+
+    Every fetched page is scanned once. Internal <a> links that look like
+    HTML pages are queued for crawling; references found in
+    src/href/data-src/poster attributes, og:image / twitter:image <meta>
+    tags, srcset, inline style="..." and <style> blocks are queued for
+    download. The page is then passed through rewrite_links() and written
+    below ``root``.
+
+    Args:
+        start_url: First page to fetch; its host defines what is "internal".
+        root: Output folder for pages and assets.
+        max_pages: Upper bound on visited pages (a page whose fetch fails
+            still counts as visited).
+        threads: Number of download workers; at least one is always started.
+        download_external_assets: When False, references to other hosts are
+            never queued.
+        external_domains: Forwarded to is_allowed_external() to decide which
+            external hosts may be downloaded from.
+
+    Internals:
+    - q_pages: pages to crawl (HTML only, internal-only)
+    - download_q: (abs_url, destination_path) pairs for the workers
+    - worker threads: daemon threads that drain download_q
+    """
+    q_pages: queue.Queue[str] = queue.Queue()
+    q_pages.put(start_url)
+
+    seen_pages: set[str] = set()
+    queued_pages: set[str] = {start_url}
+
+    # queued_assets ensures we don't enqueue the same asset URL many times.
+    queued_assets: set[str] = set()
+
+    # download_q holds (abs_url, destination_path) pairs.
+    download_q: queue.Queue[tuple[str, Path]] = queue.Queue()
+
+    root_netloc = _canonical_netloc(urlparse(start_url))
+
+    def worker() -> None:
+        """Download worker thread: pulls tasks from download_q and writes them."""
+        while True:
+            url, dest = download_q.get()
+            try:
+                if is_non_fetchable(url) or not is_httpish(url):
+                    log.debug("Skip non-fetchable: %s", url)
+                    continue
+                fetch_binary(
+                    url,
+                    dest,
+                    download_q,
+                    site_root=root,
+                    root_netloc=root_netloc,
+                    download_external_assets=download_external_assets,
+                    external_domains=external_domains,
+                )
+            except Exception:  # noqa: BLE001
+                # An uncaught error would end this thread for good. With every
+                # worker gone, queued tasks are never marked done and the
+                # download_q.join() at the end of the crawl would block forever.
+                log.exception("Asset download failed: %s", url)
+            finally:
+                download_q.task_done()
+
+    def resolve(
+        raw: str, page_url: str, *, normalize: bool = True
+    ) -> Optional[tuple[str, ParseResult, bool]]:
+        """Return (absolute URL, parsed URL, is_external) or None to skip."""
+        ref = _protocol_fix(raw, page_url)
+        # Empty values, fragments and data:/javascript:/about: URIs can never
+        # be downloaded, so they are rejected for every kind of reference.
+        if (
+            not ref
+            or ref.startswith(("#", "data:", "javascript:", "about:"))
+            or is_non_fetchable(ref)
+            or not is_httpish(ref)
+        ):
+            return None
+        abs_url = canonicalize_url(ref, page_url)
+        if normalize:
+            abs_url = normalize_url(abs_url)
+        return abs_url, urlparse(abs_url), not is_internal(abs_url, root_netloc)
+
+    def queue_asset(
+        abs_url: str,
+        parsed: ParseResult,
+        is_ext: bool,
+        where: str,
+        *,
+        need_suffix: bool,
+        need_suffix_external: bool = True,
+        label: str = "asset",
+    ) -> None:
+        """Apply the extension / external-host rules, then enqueue one download.
+
+        need_suffix: the path must end in one of ASSET_EXTENSIONS regardless
+            of host.
+        need_suffix_external: same requirement, but only for external hosts
+            and only checked after the allow-list.
+        where / label: only used in debug log messages.
+        """
+        has_suffix = parsed.path.lower().endswith(ASSET_EXTENSIONS)
+        if need_suffix and not has_suffix:
+            return
+
+        if is_ext:
+            if not download_external_assets:
+                return
+            if not is_allowed_external(abs_url, external_domains):
+                log.debug("Blocked external (%s): %s", where, abs_url)
+                return
+            if need_suffix_external and not has_suffix:
+                return
+            dest_path = cdn_local_path(parsed, root)
+        else:
+            dest_path = to_local_asset_path(parsed, root)
+
+        if abs_url in queued_assets:
+            return
+        queued_assets.add(abs_url)
+        create_dir(dest_path.parent)
+        log.debug("Queue %s: %s -> %s", label, abs_url, dest_path)
+        download_q.put((abs_url, dest_path))
+
+    def queue_ref(
+        raw: str,
+        page_url: str,
+        where: str,
+        *,
+        need_suffix: bool,
+        normalize: bool = True,
+        label: str = "asset",
+    ) -> None:
+        """Resolve a raw reference and enqueue it if it passes the filters."""
+        resolved = resolve(raw, page_url, normalize=normalize)
+        if resolved is not None:
+            abs_url, parsed, is_ext = resolved
+            queue_asset(
+                abs_url, parsed, is_ext, where, need_suffix=need_suffix, label=label
+            )
+
+    # Spawn the asset download workers.
+    for i in range(max(1, threads)):
+        t = threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True)
+        t.start()
+
+    # Monotonic clock: the elapsed time must not jump if the wall clock changes.
+    start_time = time.monotonic()
+    page_suffixes = {"", ".html", ".htm"}
+
+    while not q_pages.empty() and len(seen_pages) < max_pages:
+        page_url = canonicalize_url(q_pages.get())
+        if page_url in seen_pages:
+            continue
+
+        seen_pages.add(page_url)
+        log.info("[%s/%s] %s", len(seen_pages), max_pages, page_url)
+
+        soup = fetch_html(page_url)
+        if soup is None:
+            continue
+
+        # Walk the DOM once and:
+        # 1) enqueue internal pages from <a ...> links
+        # 2) enqueue assets referenced via src/href/data-src/poster/meta/
+        #    srcset/style/<style>
+        for tag in soup.find_all(True):
+
+            # Common URL-bearing attributes
+            for attr in ("src", "href", "data-src", "poster"):
+                if not tag.has_attr(attr):
+                    continue
+
+                link_raw = str(tag.get(attr, "")).strip()
+                if not link_raw:
+                    continue
+
+                resolved = resolve(link_raw, page_url)
+                if resolved is None:
+                    continue
+                abs_url, parsed, is_ext = resolved
+
+                # Internal <a> targets that end in "/" or have a page-like
+                # suffix are crawled as pages rather than downloaded as assets.
+                suffix = Path(parsed.path).suffix.lower()
+                is_page = (
+                    tag.name == "a"
+                    and not is_ext
+                    and (parsed.path.endswith("/") or suffix in page_suffixes)
+                )
+
+                if is_page:
+                    if abs_url not in seen_pages and abs_url not in queued_pages:
+                        q_pages.put(abs_url)
+                        queued_pages.add(abs_url)
+                    continue
+
+                if is_ext:
+                    log.debug("[EXT-ASSET] %s", (parsed.hostname or "").lower())
+
+                # External assets without extensions are only allowed for
+                # <script> and <link> because CDNs sometimes serve JS/CSS
+                # without filename extensions.
+                queue_asset(
+                    abs_url,
+                    parsed,
+                    is_ext,
+                    "not whitelisted",
+                    need_suffix=False,
+                    need_suffix_external=tag.name not in ("script", "link"),
+                )
+
+            # META IMAGE SUPPORT (og:image, twitter:image)
+            if tag.name == "meta":
+                content = str(tag.get("content", "")).strip()
+                prop = (tag.get("property") or tag.get("name") or "").lower()
+
+                if content and ("og:image" in prop or "twitter:image" in prop):
+                    queue_ref(
+                        content,
+                        page_url,
+                        "meta",
+                        need_suffix=True,
+                        label="meta asset",
+                    )
+
+            # srcset handling (images at multiple resolutions): each entry is
+            # "<url> [descriptor]", entries are comma separated.
+            if tag.has_attr("srcset"):
+                for entry in str(tag["srcset"]).split(","):
+                    entry = entry.strip()
+                    if entry:
+                        queue_ref(
+                            entry.split()[0], page_url, "srcset", need_suffix=False
+                        )
+
+            # inline style="...url(...)..." assets
+            if tag.has_attr("style"):
+                for match in CSS_URL_RE.findall(str(tag["style"])):
+                    queue_ref(
+                        match.strip().strip("'\""),
+                        page_url,
+                        "inline style",
+                        need_suffix=True,
+                    )
+
+            # <style> blocks: extract CSS asset references and enqueue them
+            if tag.name == "style":
+                css_text = tag.string or tag.get_text()
+                if css_text:
+                    for asset in extract_css_assets(css_text):
+                        # NOTE(review): this path has never applied
+                        # normalize_url(), unlike the others; kept that way
+                        # so queued URLs and paths stay unchanged - confirm.
+                        queue_ref(
+                            asset,
+                            page_url,
+                            "<style>",
+                            need_suffix=True,
+                            normalize=False,
+                        )
+
+        # Save current page:
+        # - determine local filename
+        # - rewrite links inside the HTML
+        # - write out the HTML
+        local_path = to_local_path(urlparse(page_url), root)
+        create_dir(local_path.parent)
+        rewrite_links(
+            soup,
+            page_url,
+            root,
+            local_path.parent,
+            download_external_assets,
+            external_domains,
+        )
+        safe_write_text(local_path, str(soup), encoding="utf-8")
+
+    # Wait for all queued asset downloads to finish
+    download_q.join()
+
+    elapsed = time.monotonic() - start_time
+    if seen_pages:
+        log.info(
+            "Crawl finished: %s pages in %.2fs (%.2fs avg)",
+            len(seen_pages),
+            elapsed,
+            elapsed / len(seen_pages),
+        )
+    else:
+        log.warning("Nothing downloaded – check URL or connectivity")
+
+
+# ---------------------------------------------------------------------------
+# Helper function for output folder
+# ---------------------------------------------------------------------------
+
+
+def make_root(url: str, custom: Optional[str]) -> Path:
+    """
+    Derive output folder from URL if custom not supplied.
+
+    Args:
+        url: Start URL of the crawl.
+        custom: Explicit output folder; returned unchanged (as a Path) when
+            it is non-empty.
+
+    Returns:
+        ``Path(custom)`` if given, otherwise a single folder name built from
+        the URL's host (and port, if any).
+
+    Examples:
+      https://example.com            -> example_com
+      https://example.com:8080/docs  -> example_com_8080
+      https://user:pw@example.com/   -> example_com
+    """
+    if custom:
+        return Path(custom)
+
+    # Drop a "user:password@" prefix so credentials never end up in a folder name.
+    host_port = urlparse(url).netloc.rpartition("@")[2]
+
+    # Keep word characters and "-"; everything else ("." as before, plus ":"
+    # from ports and "[" / "]" from IPv6 literals) becomes "_" so the name is
+    # valid on every common filesystem.
+    folder = re.sub(r"[^\w-]", "_", host_port)
+
+    # A URL without a scheme has an empty netloc; Path("") would silently
+    # mean the current directory, so fall back to a fixed name instead.
+    return Path(folder or "site")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
+    """
+    Parse command-line arguments.
+
+    Args:
+        argv: Argument list to parse. ``None`` (the default) makes argparse
+            read ``sys.argv[1:]``; passing a list allows programmatic use.
+
+    Returns:
+        Namespace with ``url``, ``destination``, ``max_pages``, ``threads``,
+        ``download_external_assets`` and ``external_domains``.
+
+    --download-external-assets:
+      When enabled, assets from other hosts (CDNs) are downloaded as well
+      and references to them are rewritten to the local copies.
+
+    --external-domains:
+      Restricts external downloads to the listed hosts. The caller treats a
+      non-empty list as also enabling --download-external-assets.
+    """
+    p = argparse.ArgumentParser(
+        description="Recursively mirror a website for offline use.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    p.add_argument(
+        "--url",
+        required=True,
+        help="Starting URL to crawl (e.g., https://example.com/).",
+    )
+    p.add_argument(
+        "--destination",
+        default=None,
+        help="Output folder (defaults to a folder derived from the URL).",
+    )
+    p.add_argument(
+        "--max-pages",
+        type=int,
+        default=50,
+        help="Maximum number of HTML pages to crawl.",
+    )
+    p.add_argument(
+        "--threads",
+        type=int,
+        default=6,
+        help="Number of concurrent download workers.",
+    )
+    p.add_argument(
+        "--download-external-assets",
+        action="store_true",
+        help="Download external CDN/static assets and rewrite links for offline use.",
+    )
+    p.add_argument(
+        "--external-domains",
+        nargs="+",
+        default=None,
+        help="Whitelist of external domains to download from (implies external download).",
+    )
+    return p.parse_args(argv)
+
+
+if __name__ == "__main__":
+    # Basic argument validation
+    args = parse_args()
+    if args.max_pages < 1:
+        log.error("--max-pages must be >= 1")
+        sys.exit(2)
+    if args.threads < 1:
+        log.error("--threads must be >= 1")
+        sys.exit(2)
+
+    # start URL + output root folder
+    host = args.url
+    root = make_root(args.url, args.destination)
+
+    # Reduce the whitelist to lowercase host names. An entry may be a bare
+    # host ("cdn.example.com") or a full URL ("https://cdn.example.com/x").
+    external_domains = None
+    if args.external_domains:
+        external_domains = set()
+        for entry in args.external_domains:
+            domain = urlparse(entry).hostname if "://" in entry else entry
+            if not domain:
+                # urlparse(...).hostname is None for input such as "https://";
+                # report that instead of failing with AttributeError.
+                log.error("Invalid --external-domains entry: %r", entry)
+                sys.exit(2)
+            external_domains.add(domain.lower())
+
+    # Supplying a whitelist implies that external assets should be downloaded.
+    download_external_assets = (
+        args.download_external_assets or args.external_domains is not None
+    )
+
+    # Kick off crawl
+    crawl_site(
+        host,
+        root,
+        args.max_pages,
+        args.threads,
+        download_external_assets,
+        external_domains,
+    )