1611 lines
53 KiB
Python
1611 lines
53 KiB
Python
#!/usr/bin/env python3
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import logging
|
||
import os
|
||
import queue
|
||
import re
|
||
import sys
|
||
import threading
|
||
import time
|
||
from hashlib import sha256
|
||
from importlib.util import find_spec
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
from urllib.parse import ParseResult, unquote, urljoin, urlparse
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
from requests.adapters import HTTPAdapter
|
||
from urllib3.util import Retry
|
||
|
||
HAS_BROTLI = find_spec("brotli") is not None or find_spec("brotlicffi") is not None
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Config / constants
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Consistent log format across file + console. Thread name is helpful because
|
||
# asset downloads happen in worker threads.
|
||
LOG_FMT = "%(asctime)s | %(levelname)-8s | %(threadName)s | %(message)s"
|
||
|
||
# Extensions we treat as “static assets” worth downloading and rewriting.
|
||
# Used in multiple places: HTML attribute rewriting, CSS url(...) rewriting,
|
||
# JS string rewriting, and crawl-time asset detection.
|
||
ASSET_EXTENSIONS = (
|
||
".css",
|
||
".js",
|
||
".mjs",
|
||
".map",
|
||
".json",
|
||
".wasm",
|
||
".webmanifest",
|
||
".png",
|
||
".jpg",
|
||
".jpeg",
|
||
".gif",
|
||
".webp",
|
||
".avif",
|
||
".svg",
|
||
".ico",
|
||
".woff",
|
||
".woff2",
|
||
".ttf",
|
||
".eot",
|
||
".mp4",
|
||
".webm",
|
||
".mp3",
|
||
)
|
||
|
||
# Conservative JS string rewriting:
|
||
# - JS_URL_RE: matches root-relative strings like "/assets/app.js"
|
||
# - JS_ABS_URL_RE: matches absolute or protocol-relative strings like
|
||
# "https://cdn.example.com/app.js" or "//cdn.example.com/app.js"
|
||
#
|
||
# This is intentionally limited to common static file extensions to avoid
|
||
# rewriting API endpoints or dynamic URLs that could break functionality.
|
||
JS_URL_RE = re.compile(
|
||
r"""["'](/[^"']+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|css|js|mjs|map|woff|woff2|ttf|eot|json|wasm|webmanifest)(?:\?[^"']*)?)["']""",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
JS_ABS_URL_RE = re.compile(
|
||
r"""["']((?:https?:)?//[^"']+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|css|js|mjs|map|woff|woff2|ttf|eot|json|wasm|webmanifest)(?:\?[^"']*)?)["']""",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
# Default headers can help with sites that block "non-browser" clients.
|
||
_ACCEPT_ENCODING = "gzip, deflate, br" if HAS_BROTLI else "gzip, deflate"
|
||
|
||
DEFAULT_HEADERS = {
|
||
"User-Agent": (
|
||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||
"Chrome/122.0.0.0 Safari/537.36"
|
||
),
|
||
"Accept": (
|
||
"text/html,application/xhtml+xml,application/xml;"
|
||
"q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
|
||
),
|
||
"Accept-Language": "en-US,en;q=0.9",
|
||
"Accept-Encoding": _ACCEPT_ENCODING,
|
||
"Connection": "keep-alive",
|
||
"Upgrade-Insecure-Requests": "1",
|
||
}
|
||
|
||
# Network timeouts + streaming chunk size for binary downloads.
|
||
TIMEOUT = 15 # seconds
|
||
CHUNK_SIZE = 8192 # bytes
|
||
|
||
# Conservative margins under common OS limits (~255–260 bytes).
|
||
# These protect you from “File name too long” and odd Windows path rules.
|
||
MAX_PATH_LEN = 240
|
||
MAX_SEG_LEN = 120
|
||
|
||
# Collapse 3+ dots ("....") down to a single dot to avoid weird filenames.
|
||
_MULTI_DOTS_RE = re.compile(r"\.{3,}")
|
||
|
||
# CSS url(...) extractor. Note: this is simple (not a full CSS parser),
|
||
# but good enough for most sites.
|
||
CSS_URL_RE = re.compile(r"url\(([^)]+)\)")
|
||
|
||
# CSS @import extractor. Also simple-but-effective.
|
||
CSS_IMPORT_RE = re.compile(
|
||
r"""@import\s+(?:url\()?['"]?([^'"\);]+)['"]?\)?\s*;""",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
# Characters that commonly cause filesystem issues, especially on Windows.
|
||
_BAD_SEG_CHARS_RE = re.compile(r'[<>:"/\\|?*\x00-\x1F]')
|
||
|
||
# Windows reserved filenames; writing these can fail or behave badly.
|
||
_WINDOWS_RESERVED_NAMES = {
|
||
"CON",
|
||
"PRN",
|
||
"AUX",
|
||
"NUL",
|
||
*(f"COM{i}" for i in range(1, 10)),
|
||
*(f"LPT{i}" for i in range(1, 10)),
|
||
}
|
||
|
||
RESOURCE_LINK_RELS = {
|
||
"stylesheet",
|
||
"icon",
|
||
"shortcut",
|
||
"apple-touch-icon",
|
||
"preload",
|
||
"modulepreload",
|
||
"manifest",
|
||
}
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Logging
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# File logging is DEBUG to help you trace rewrites and queue behavior.
|
||
logging.basicConfig(
|
||
filename="web_scraper.log",
|
||
level=logging.DEBUG,
|
||
format=LOG_FMT,
|
||
datefmt="%H:%M:%S",
|
||
force=True,
|
||
)
|
||
|
||
# Console logging is INFO to keep output readable while running.
|
||
_console = logging.StreamHandler(sys.stdout)
|
||
_console.setLevel(logging.INFO)
|
||
_console.setFormatter(logging.Formatter(LOG_FMT, datefmt="%H:%M:%S"))
|
||
logging.getLogger().addHandler(_console)
|
||
log = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# HTTP session (retry, timeouts, custom UA)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Shared session improves performance and keeps connection pooling.
|
||
SESSION = requests.Session()
|
||
|
||
# Retry strategy for transient issues (rate limits, 5xx). Helps stability.
|
||
RETRY_STRAT = Retry(
|
||
total=5,
|
||
backoff_factor=0.5,
|
||
status_forcelist=[429, 500, 502, 503, 504],
|
||
allowed_methods=["GET", "HEAD"],
|
||
)
|
||
|
||
SESSION.mount("http://", HTTPAdapter(max_retries=RETRY_STRAT))
|
||
SESSION.mount("https://", HTTPAdapter(max_retries=RETRY_STRAT))
|
||
SESSION.headers.update(DEFAULT_HEADERS)
|
||
log.debug("Accept-Encoding configured as: %s", SESSION.headers.get("Accept-Encoding"))
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def create_dir(path: Path) -> None:
|
||
"""Create path (and parents) if it does not already exist."""
|
||
if not path.exists():
|
||
path.mkdir(parents=True, exist_ok=True)
|
||
log.debug("Created directory %s", path)
|
||
|
||
|
||
# Schemes that are valid URLs in HTML but are not HTTP fetch targets.
|
||
# If we try to request these, requests will throw InvalidSchema.
|
||
NON_FETCHABLE_SCHEMES = {
|
||
"mailto",
|
||
"tel",
|
||
"sms",
|
||
"javascript",
|
||
"data",
|
||
"geo",
|
||
"blob",
|
||
"about",
|
||
}
|
||
|
||
|
||
def is_httpish(u: str) -> bool:
|
||
"""
|
||
True iff the URL is http(s) or relative (no scheme).
|
||
|
||
Why:
|
||
- We only fetch http(s) resources.
|
||
- Relative URLs should still be handled because we can join them to base URLs.
|
||
"""
|
||
p = urlparse(u)
|
||
return (p.scheme in ("http", "https")) or (p.scheme == "")
|
||
|
||
|
||
def is_non_fetchable(u: str) -> bool:
|
||
"""
|
||
True iff the URL clearly shouldn't be fetched (mailto:, tel:, data:, ...).
|
||
"""
|
||
p = urlparse(u)
|
||
return p.scheme in NON_FETCHABLE_SCHEMES
|
||
|
||
|
||
def is_internal(link: str, root_netloc: str) -> bool:
|
||
"""
|
||
Decide whether `link` belongs to the same site as `root_netloc`.
|
||
|
||
Notes:
|
||
- Relative URLs are internal.
|
||
- We normalize "www." so example.com and www.example.com count as same.
|
||
"""
|
||
parsed = urlparse(link)
|
||
netloc = _canonical_netloc(parsed)
|
||
|
||
if not netloc:
|
||
return True
|
||
|
||
if netloc == root_netloc:
|
||
return True
|
||
|
||
# normalize www
|
||
if netloc.startswith("www."):
|
||
netloc = netloc[4:]
|
||
root = root_netloc[4:] if root_netloc.startswith("www.") else root_netloc
|
||
|
||
return netloc == root
|
||
|
||
|
||
def _sanitize_segment(segment: str) -> str:
|
||
"""
|
||
Sanitize a single path segment for safe writing to disk.
|
||
|
||
- URL decode (turn %20 into space, etc.)
|
||
- Strip whitespace / trailing dot-space combos (Windows issues)
|
||
- Collapse accidental multi-dots
|
||
- Replace illegal filesystem chars with '_'
|
||
- Neutralize '.' and '..' to prevent traversal-like paths
|
||
- Avoid Windows reserved names (CON, PRN, COM1, ...)
|
||
"""
|
||
segment = unquote(segment).strip()
|
||
segment = segment.strip(" .")
|
||
segment = _MULTI_DOTS_RE.sub(".", segment)
|
||
segment = _BAD_SEG_CHARS_RE.sub("_", segment)
|
||
|
||
if segment in ("", ".", ".."):
|
||
segment = "_"
|
||
|
||
if segment.upper() in _WINDOWS_RESERVED_NAMES:
|
||
segment = f"_{segment}_"
|
||
|
||
return segment
|
||
|
||
|
||
def _shorten_segment(segment: str, limit: int = MAX_SEG_LEN) -> str:
|
||
"""
|
||
Shorten a path segment if it exceeds a length limit.
|
||
|
||
Strategy:
|
||
- Keep the original extension
|
||
- Truncate the stem
|
||
- Append a short hash so different long names don't collide
|
||
"""
|
||
if len(segment) <= limit:
|
||
return segment
|
||
p = Path(segment)
|
||
stem, suffix = p.stem, p.suffix
|
||
h = sha256(segment.encode("utf-8")).hexdigest()[:12]
|
||
keep = max(0, limit - len(suffix) - 13) # '-' + hash is 13 chars total
|
||
return f"{stem[:keep]}-{h}{suffix}"
|
||
|
||
|
||
def _rel_url(target: Path, base_dir: Path) -> str:
|
||
"""
|
||
Compute a URL-style relative path (forward slashes),
|
||
not an OS-specific path.
|
||
"""
|
||
try:
|
||
rel = os.path.relpath(target, base_dir)
|
||
except ValueError:
|
||
# Happens if paths are on different drives on Windows.
|
||
return target.as_posix()
|
||
return Path(rel).as_posix()
|
||
|
||
|
||
def to_local_path(parsed: ParseResult, site_root: Path) -> Path:
|
||
"""
|
||
Map an internal *page* URL to a local HTML file under site_root.
|
||
|
||
Rules:
|
||
- "/" -> index.html
|
||
- "/foo/" -> /foo/index.html
|
||
- "/foo" (no extension) -> /foo.html
|
||
- query strings get a short hash to prevent collisions:
|
||
/page?id=1 and /page?id=2 should not overwrite each other
|
||
- filesystem hardening: sanitize segments, limit segment length and overall path
|
||
"""
|
||
rel = parsed.path.lstrip("/")
|
||
if not rel:
|
||
rel = "index.html"
|
||
elif rel.endswith("/"):
|
||
rel += "index.html"
|
||
elif not Path(rel).suffix:
|
||
rel += ".html"
|
||
|
||
if parsed.query:
|
||
qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10]
|
||
p = Path(rel)
|
||
rel = str(p.with_name(f"{p.stem}-q{qh}{p.suffix}"))
|
||
|
||
parts = Path(rel).parts
|
||
parts = tuple(_sanitize_segment(seg) for seg in parts)
|
||
parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts)
|
||
local_path = site_root / Path(*parts)
|
||
|
||
if len(str(local_path)) > MAX_PATH_LEN:
|
||
p = local_path
|
||
h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16]
|
||
leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN)
|
||
local_path = p.with_name(leaf)
|
||
|
||
return local_path
|
||
|
||
|
||
def to_local_asset_path(parsed: ParseResult, site_root: Path) -> Path:
|
||
"""
|
||
Map an internal *asset* URL to a local file path under site_root.
|
||
|
||
Difference vs to_local_path():
|
||
- We do NOT force .html for extensionless paths.
|
||
(Some sites serve extensionless assets, though less common.)
|
||
"""
|
||
rel = parsed.path.lstrip("/")
|
||
if not rel:
|
||
rel = "index"
|
||
elif rel.endswith("/"):
|
||
rel += "index"
|
||
|
||
if parsed.query:
|
||
qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10]
|
||
p = Path(rel)
|
||
name = f"{p.stem}-q{qh}{p.suffix}" if p.suffix else f"{p.name}-q{qh}"
|
||
rel = str(p.with_name(name))
|
||
|
||
parts = Path(rel).parts
|
||
parts = tuple(_sanitize_segment(seg) for seg in parts)
|
||
parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts)
|
||
local_path = site_root / Path(*parts)
|
||
|
||
if len(str(local_path)) > MAX_PATH_LEN:
|
||
p = local_path
|
||
h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16]
|
||
leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN)
|
||
local_path = p.with_name(leaf)
|
||
|
||
return local_path
|
||
|
||
|
||
def cdn_local_path(parsed: ParseResult, site_root: Path) -> Path:
|
||
"""
|
||
Map an external (CDN) URL to a local path under:
|
||
site_root/cdn/<netloc>/...
|
||
|
||
Why:
|
||
- Keeps external host assets separated from internal assets.
|
||
- Avoids collisions where internal and external paths look similar.
|
||
"""
|
||
rel = parsed.path.lstrip("/")
|
||
if not rel:
|
||
rel = "index"
|
||
elif rel.endswith("/"):
|
||
rel += "index"
|
||
|
||
if parsed.query:
|
||
qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10]
|
||
p = Path(rel)
|
||
name = f"{p.stem}-q{qh}{p.suffix}" if p.suffix else f"{p.name}-q{qh}"
|
||
rel = str(p.with_name(name))
|
||
|
||
parts = Path(rel).parts
|
||
parts = tuple(_sanitize_segment(seg) for seg in parts)
|
||
parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts)
|
||
|
||
netloc = _canonical_netloc(parsed)
|
||
local_path = site_root / "cdn" / _sanitize_segment(netloc) / Path(*parts)
|
||
|
||
if len(str(local_path)) > MAX_PATH_LEN:
|
||
p = local_path
|
||
h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16]
|
||
leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN)
|
||
local_path = p.with_name(leaf)
|
||
|
||
return local_path
|
||
|
||
|
||
def safe_write_text(path: Path, text: str, encoding: str = "utf-8") -> Path:
|
||
"""
|
||
Write text to path safely.
|
||
|
||
If the OS rejects the filename/path (often: path too long), we:
|
||
- hash the leaf name
|
||
- write to a fallback name
|
||
- return the final path used
|
||
"""
|
||
try:
|
||
path.write_text(text, encoding=encoding)
|
||
return path
|
||
except OSError as exc:
|
||
log.warning("Write failed for %s: %s. Falling back to hashed leaf.", path, exc)
|
||
p = path
|
||
h = sha256(str(p).encode("utf-8")).hexdigest()[:16]
|
||
fallback = p.with_name(_shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN))
|
||
create_dir(fallback.parent)
|
||
fallback.write_text(text, encoding=encoding)
|
||
return fallback
|
||
|
||
|
||
def normalize_url(url: str) -> str:
|
||
"""
|
||
Normalize URLs to avoid duplicates caused by fragments.
|
||
|
||
Example:
|
||
- https://site/page#section1 and https://site/page#section2
|
||
are the same document for our crawler.
|
||
"""
|
||
parsed = urlparse(url)
|
||
clean = parsed._replace(fragment="")
|
||
return clean.geturl()
|
||
|
||
|
||
def _protocol_fix(url: str, base_url: str) -> str:
|
||
"""
|
||
Normalize protocol-relative URLs (//host/path) to absolute ones.
|
||
|
||
Browsers interpret //example.com/a.css as "use the current page scheme".
|
||
We do the same using base_url's scheme.
|
||
"""
|
||
if url.startswith("//"):
|
||
base = urlparse(base_url)
|
||
scheme = base.scheme or "https"
|
||
return f"{scheme}:{url}"
|
||
return url
|
||
|
||
|
||
def rewrite_css_text(
|
||
css_text: str,
|
||
base_url: str,
|
||
*,
|
||
site_root: Path,
|
||
root_netloc: str,
|
||
base_dir: Path,
|
||
download_external_assets: bool,
|
||
external_domains: Optional[set[str]] = None,
|
||
download_q: Optional[queue.Queue[tuple[str, Path]]] = None,
|
||
) -> str:
|
||
"""
|
||
Rewrite CSS url(...) and @import references to local relative paths.
|
||
|
||
base_url:
|
||
- the remote URL of the CSS *context*
|
||
- external stylesheet URL for downloaded .css
|
||
- page URL for inline <style> blocks or style="..."
|
||
|
||
base_dir:
|
||
- local directory where this CSS lives (controls the relative path output)
|
||
|
||
Also:
|
||
- If download_q is provided, enqueue newly discovered assets referenced by CSS.
|
||
"""
|
||
|
||
def map_one(url_part: str) -> Optional[str]:
|
||
url_part = url_part.strip()
|
||
|
||
# Skip empties / anchors / non-fetchable schemes.
|
||
if not url_part:
|
||
return None
|
||
if url_part.startswith("#"):
|
||
return None
|
||
if url_part.startswith(("data:", "javascript:", "about:")):
|
||
return None
|
||
|
||
url_part2 = _protocol_fix(url_part, base_url)
|
||
if is_non_fetchable(url_part2) or not is_httpish(url_part2):
|
||
return None
|
||
|
||
# Canonicalize to a stable absolute URL
|
||
abs_url = canonicalize_url(url_part2, base_url)
|
||
parsed = urlparse(abs_url)
|
||
if not parsed.path:
|
||
return None
|
||
|
||
# Only rewrite things that look like static assets.
|
||
# (Avoid rewriting API URLs accidentally.)
|
||
if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
|
||
return None
|
||
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
if is_ext and not is_allowed_external(abs_url, external_domains):
|
||
return None
|
||
|
||
if is_ext and not download_external_assets:
|
||
return None
|
||
|
||
# Decide where to store it locally
|
||
local_path = (
|
||
cdn_local_path(parsed, site_root)
|
||
if is_ext
|
||
else to_local_asset_path(parsed, site_root)
|
||
)
|
||
|
||
# Queue it for downloading if not already present
|
||
if download_q is not None and not local_path.exists():
|
||
log.debug("Queue asset (rewrite): %s -> %s", abs_url, local_path)
|
||
download_q.put((abs_url, local_path))
|
||
|
||
# Output a relative URL for the rewritten CSS
|
||
rel = _rel_url(local_path, base_dir)
|
||
if parsed.fragment:
|
||
rel = f"{rel}#{parsed.fragment}"
|
||
return rel
|
||
|
||
# Replace url(...) references
|
||
def repl_url(m: re.Match) -> str:
|
||
raw = m.group(1).strip()
|
||
quote = ""
|
||
url_part = raw
|
||
|
||
# Preserve quoting style if present
|
||
if len(raw) >= 2 and raw[0] in ("'", '"') and raw[-1] == raw[0]:
|
||
quote = raw[0]
|
||
url_part = raw[1:-1].strip()
|
||
|
||
mapped = map_one(url_part)
|
||
if mapped is None:
|
||
return m.group(0)
|
||
|
||
if quote:
|
||
return f"url({quote}{mapped}{quote})"
|
||
return f"url({mapped})"
|
||
|
||
# Replace @import references
|
||
def repl_import(m: re.Match) -> str:
|
||
url_part = m.group(1).strip().strip("'\"")
|
||
mapped = map_one(url_part)
|
||
if mapped is None:
|
||
return m.group(0)
|
||
return f'@import "{mapped}";'
|
||
|
||
css_text = CSS_URL_RE.sub(repl_url, css_text)
|
||
css_text = CSS_IMPORT_RE.sub(repl_import, css_text)
|
||
return css_text
|
||
|
||
|
||
def rewrite_js_text(
|
||
js_text: str,
|
||
base_url: str,
|
||
*,
|
||
site_root: Path,
|
||
root_netloc: str,
|
||
base_dir: Path,
|
||
download_external_assets: bool,
|
||
external_domains: Optional[set[str]] = None,
|
||
download_q: Optional[queue.Queue[tuple[str, Path]]] = None,
|
||
) -> str:
|
||
"""
|
||
Rewrite obvious static asset URL strings inside JS.
|
||
|
||
Important:
|
||
- This does NOT parse JS AST; it does simple regex matching on string literals.
|
||
- It ONLY rewrites strings that look like static assets by extension.
|
||
- This prevents accidentally rewriting API endpoints or app routes.
|
||
"""
|
||
|
||
def map_one(url_part: str) -> Optional[str]:
|
||
url_part = url_part.strip()
|
||
|
||
if not url_part:
|
||
return None
|
||
if url_part.startswith("#"):
|
||
return None
|
||
if url_part.startswith(("data:", "javascript:", "about:")):
|
||
return None
|
||
|
||
url_part2 = _protocol_fix(url_part, base_url)
|
||
if is_non_fetchable(url_part2) or not is_httpish(url_part2):
|
||
return None
|
||
|
||
abs_url = canonicalize_url(url_part2, base_url)
|
||
parsed = urlparse(abs_url)
|
||
|
||
if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
|
||
return None
|
||
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
if is_ext and not is_allowed_external(abs_url, external_domains):
|
||
return None
|
||
|
||
if is_ext and not download_external_assets:
|
||
return None
|
||
|
||
local_path = (
|
||
cdn_local_path(parsed, site_root)
|
||
if is_ext
|
||
else to_local_asset_path(parsed, site_root)
|
||
)
|
||
if download_q is not None and not local_path.exists():
|
||
log.debug("Queue asset (rewrite): %s -> %s", abs_url, local_path)
|
||
download_q.put((abs_url, local_path))
|
||
|
||
rel = _rel_url(local_path, base_dir)
|
||
if parsed.fragment:
|
||
rel = f"{rel}#{parsed.fragment}"
|
||
return rel
|
||
|
||
def repl_root_rel(m: re.Match) -> str:
|
||
url_part = m.group(1)
|
||
mapped = map_one(url_part)
|
||
if mapped is None:
|
||
return m.group(0)
|
||
quote = m.group(0)[0]
|
||
return f"{quote}{mapped}{quote}"
|
||
|
||
def repl_abs(m: re.Match) -> str:
|
||
url_part = m.group(1)
|
||
mapped = map_one(url_part)
|
||
if mapped is None:
|
||
return m.group(0)
|
||
quote = m.group(0)[0]
|
||
return f"{quote}{mapped}{quote}"
|
||
|
||
js_text = JS_URL_RE.sub(repl_root_rel, js_text)
|
||
js_text = JS_ABS_URL_RE.sub(repl_abs, js_text)
|
||
return js_text
|
||
|
||
|
||
def _canonical_netloc(parsed: ParseResult) -> str:
|
||
"""
|
||
Lowercase hostname and drop default ports so we don't create different
|
||
local folders for the same host.
|
||
|
||
Example:
|
||
https://EXAMPLE.com:443/a.css -> example.com
|
||
"""
|
||
host = (parsed.hostname or "").lower()
|
||
port = parsed.port
|
||
if not host:
|
||
return parsed.netloc.lower()
|
||
|
||
if (parsed.scheme == "https" and port == 443) or (
|
||
parsed.scheme == "http" and port == 80
|
||
):
|
||
port = None
|
||
|
||
return f"{host}:{port}" if port else host
|
||
|
||
|
||
def canonicalize_url(url: str, base_url: str = "") -> str:
|
||
"""
|
||
Produce a stable absolute URL key for de-duping + mapping.
|
||
|
||
Steps:
|
||
- Fix protocol-relative URLs
|
||
- Join relative URLs against base_url
|
||
- Drop fragments (#...)
|
||
- Normalize host casing + default ports
|
||
"""
|
||
if base_url:
|
||
url = urljoin(base_url, _protocol_fix(url, base_url))
|
||
else:
|
||
url = _protocol_fix(url, url)
|
||
|
||
p = urlparse(url)
|
||
|
||
# If still relative, join using base_url (when available).
|
||
if not p.scheme and not p.netloc:
|
||
p = urlparse(urljoin(base_url, url)) if base_url else p
|
||
|
||
netloc = _canonical_netloc(p) if p.netloc else ""
|
||
p = p._replace(fragment="", netloc=netloc)
|
||
return p.geturl()
|
||
|
||
|
||
def is_allowed_external(url: str, allowed_domains: Optional[set[str]]) -> bool:
|
||
if allowed_domains is None:
|
||
return True
|
||
|
||
host = (urlparse(url).hostname or "").lower()
|
||
|
||
return any(host == d or host.endswith("." + d) for d in allowed_domains)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Fetchers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def fetch_html(url: str) -> Optional[BeautifulSoup]:
|
||
"""
|
||
Download an HTML page and return a BeautifulSoup tree.
|
||
|
||
We return None on error so the crawler can continue on failures.
|
||
"""
|
||
try:
|
||
resp = SESSION.get(url, timeout=TIMEOUT)
|
||
resp.raise_for_status()
|
||
return BeautifulSoup(resp.text, "html.parser")
|
||
except Exception as exc: # noqa: BLE001
|
||
log.warning("HTTP error for %s – %s", url, exc)
|
||
return None
|
||
|
||
|
||
def fetch_binary(
|
||
url: str,
|
||
dest: Path,
|
||
download_q: Optional[queue.Queue[tuple[str, Path]]] = None,
|
||
*,
|
||
site_root: Optional[Path] = None,
|
||
root_netloc: str = "",
|
||
download_external_assets: bool = False,
|
||
external_domains: Optional[set[str]] = None,
|
||
) -> None:
|
||
"""
|
||
Stream a binary/static resource to disk.
|
||
|
||
Notes:
|
||
- If already exists, skip.
|
||
- Writes using streaming so we don't keep big files in memory.
|
||
- If the file is CSS or JS, rewrite embedded asset URLs and enqueue them.
|
||
"""
|
||
is_ext = not is_internal(url, root_netloc)
|
||
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
log.debug("Blocked external (fetch disabled): %s", url)
|
||
return
|
||
|
||
if not is_allowed_external(url, external_domains):
|
||
log.info("[BLOCKED EXT] %s", url)
|
||
return
|
||
|
||
if dest.exists():
|
||
return
|
||
|
||
try:
|
||
resp = SESSION.get(url, timeout=TIMEOUT, stream=True)
|
||
resp.raise_for_status()
|
||
|
||
create_dir(dest.parent)
|
||
|
||
# Try normal write
|
||
try:
|
||
with dest.open("wb") as fh:
|
||
for chunk in resp.iter_content(CHUNK_SIZE):
|
||
if chunk:
|
||
fh.write(chunk)
|
||
log.debug("Saved resource -> %s", dest)
|
||
|
||
# If filesystem rejects it (path too long, invalid name), fallback
|
||
except OSError as exc:
|
||
log.warning("Binary write failed for %s: %s. Using fallback.", dest, exc)
|
||
|
||
h = sha256(str(dest).encode("utf-8")).hexdigest()[:16]
|
||
fallback = dest.with_name(
|
||
_shorten_segment(f"{dest.stem}-{h}{dest.suffix}", MAX_SEG_LEN)
|
||
)
|
||
create_dir(fallback.parent)
|
||
|
||
with fallback.open("wb") as fh:
|
||
for chunk in resp.iter_content(CHUNK_SIZE):
|
||
if chunk:
|
||
fh.write(chunk)
|
||
|
||
log.debug("Saved resource (fallback) -> %s", fallback)
|
||
dest = fallback
|
||
|
||
# If we downloaded CSS, rewrite its url(...) and @import references,
|
||
# and enqueue referenced assets (images/fonts/etc).
|
||
if (
|
||
dest.suffix.lower() == ".css"
|
||
and download_q is not None
|
||
and site_root is not None
|
||
and root_netloc
|
||
):
|
||
try:
|
||
css_text = dest.read_text(encoding="utf-8", errors="ignore")
|
||
rewritten = rewrite_css_text(
|
||
css_text,
|
||
url,
|
||
site_root=site_root,
|
||
root_netloc=root_netloc,
|
||
base_dir=dest.parent,
|
||
download_external_assets=download_external_assets,
|
||
external_domains=external_domains,
|
||
download_q=download_q,
|
||
)
|
||
if rewritten != css_text:
|
||
dest.write_text(rewritten, encoding="utf-8")
|
||
except Exception as exc: # noqa: BLE001
|
||
log.debug("CSS rewrite failed for %s – %s", dest, exc)
|
||
|
||
# If we downloaded JS, rewrite obvious static URL strings,
|
||
# and enqueue referenced assets (only those matching ASSET_EXTENSIONS).
|
||
if (
|
||
dest.suffix.lower() in {".js", ".mjs"}
|
||
and download_q is not None
|
||
and site_root is not None
|
||
and root_netloc
|
||
):
|
||
try:
|
||
js_text = dest.read_text(encoding="utf-8", errors="ignore")
|
||
rewritten = rewrite_js_text(
|
||
js_text,
|
||
url,
|
||
site_root=site_root,
|
||
root_netloc=root_netloc,
|
||
base_dir=dest.parent,
|
||
download_external_assets=download_external_assets,
|
||
external_domains=external_domains,
|
||
download_q=download_q,
|
||
)
|
||
if rewritten != js_text:
|
||
dest.write_text(rewritten, encoding="utf-8")
|
||
except Exception as exc: # noqa: BLE001
|
||
log.debug("JS rewrite failed for %s – %s", dest, exc)
|
||
|
||
except Exception as exc: # noqa: BLE001
|
||
log.error("Failed to save %s – %s", url, exc)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Link rewriting
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def rewrite_links(
|
||
soup: BeautifulSoup,
|
||
page_url: str,
|
||
site_root: Path,
|
||
page_dir: Path,
|
||
download_external_assets: bool = False,
|
||
external_domains: Optional[set[str]] = None,
|
||
) -> None:
|
||
"""
|
||
Rewrite HTML so it can be opened offline.
|
||
|
||
Rules:
|
||
- Internal page links (<a href>) become local HTML file paths.
|
||
- Internal asset links (img/src, script/src, link/href, etc) become local asset paths.
|
||
- External asset links are rewritten to local cdn/... paths when
|
||
external downloading is enabled and the URL is allowed.
|
||
- External page links (for example <a href="https://...">) are kept unchanged.
|
||
- Remove <base href="..."> because it changes browser URL resolution offline.
|
||
"""
|
||
root_netloc = _canonical_netloc(urlparse(page_url))
|
||
|
||
# <base href> breaks relative paths when opening offline.
|
||
base_tag = soup.find("base")
|
||
if base_tag is not None and base_tag.has_attr("href"):
|
||
base_tag.decompose()
|
||
|
||
# Common attributes that contain URL-like values.
|
||
url_attrs = {"src", "href", "data-src", "poster"}
|
||
|
||
def strip_sri_and_cors(tag) -> None:
|
||
for attr in ("integrity", "crossorigin"):
|
||
if tag.has_attr(attr):
|
||
del tag[attr]
|
||
|
||
for tag in soup.find_all(True):
|
||
|
||
# For <link>, only rewrite rel-types that are actually fetched by browsers.
|
||
# This avoids rewriting <link rel="canonical"> or <link rel="alternate"> etc.
|
||
if tag.name == "link":
|
||
rel = tag.get("rel", [])
|
||
if isinstance(rel, str):
|
||
rel = [rel]
|
||
rel = [r.lower() for r in rel]
|
||
|
||
rel_set = set(rel)
|
||
if not rel_set & RESOURCE_LINK_RELS:
|
||
continue
|
||
|
||
# ------------------------------------------------------------------
|
||
# META IMAGE REWRITE (make og/twitter images local)
|
||
# ------------------------------------------------------------------
|
||
if tag.name == "meta":
|
||
content = str(tag.get("content", "")).strip()
|
||
prop = (tag.get("property") or tag.get("name") or "").lower()
|
||
|
||
if content and ("og:image" in prop or "twitter:image" in prop):
|
||
|
||
url_part = _protocol_fix(content, page_url)
|
||
|
||
if (
|
||
not url_part
|
||
or url_part.startswith("#")
|
||
or url_part.startswith(("data:", "javascript:", "about:"))
|
||
or is_non_fetchable(url_part)
|
||
or not is_httpish(url_part)
|
||
):
|
||
continue
|
||
|
||
abs_url = canonicalize_url(url_part, page_url)
|
||
parsed = urlparse(abs_url)
|
||
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
continue
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
continue
|
||
|
||
# map to local path
|
||
local_path = (
|
||
cdn_local_path(parsed, site_root)
|
||
if is_ext
|
||
else to_local_asset_path(parsed, site_root)
|
||
)
|
||
|
||
# rewrite to relative path
|
||
rel = _rel_url(local_path, page_dir)
|
||
tag["content"] = rel
|
||
|
||
# Rewrite each URL attribute we care about
|
||
for attr in url_attrs:
|
||
if not tag.has_attr(attr):
|
||
continue
|
||
|
||
original_raw = str(tag.get(attr, "")).strip()
|
||
if not original_raw:
|
||
continue
|
||
|
||
original = _protocol_fix(original_raw, page_url)
|
||
|
||
# Skip anchors, non-fetchable schemes, and things that are not http(s)/relative.
|
||
if (
|
||
original.startswith("#")
|
||
or is_non_fetchable(original)
|
||
or not is_httpish(original)
|
||
):
|
||
continue
|
||
|
||
abs_url = canonicalize_url(original, page_url)
|
||
parsed = urlparse(abs_url)
|
||
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
continue
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
continue
|
||
|
||
# Treat <a href> as a "page". Everything else is treated as an asset.
|
||
treat_as_page = tag.name == "a" and attr == "href"
|
||
|
||
rewritten_external_asset = False
|
||
|
||
if is_ext and treat_as_page:
|
||
continue
|
||
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
continue
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
continue
|
||
local_path = cdn_local_path(parsed, site_root)
|
||
rewritten_external_asset = True
|
||
else:
|
||
local_path = (
|
||
to_local_path(parsed, site_root)
|
||
if treat_as_page
|
||
else to_local_asset_path(parsed, site_root)
|
||
)
|
||
|
||
rel = _rel_url(local_path, page_dir)
|
||
if parsed.fragment:
|
||
rel = f"{rel}#{parsed.fragment}"
|
||
tag[attr] = rel
|
||
|
||
if rewritten_external_asset and tag.name in {"script", "link"}:
|
||
strip_sri_and_cors(tag)
|
||
|
||
# srcset="url1 1x, url2 2x" needs special parsing
|
||
if tag.has_attr("srcset"):
|
||
new_entries = []
|
||
for entry in str(tag["srcset"]).split(","):
|
||
entry = entry.strip()
|
||
if not entry:
|
||
continue
|
||
|
||
parts = entry.split()
|
||
url_part = _protocol_fix(parts[0], page_url)
|
||
|
||
if (
|
||
url_part.startswith("#")
|
||
or is_non_fetchable(url_part)
|
||
or not is_httpish(url_part)
|
||
):
|
||
new_entries.append(entry)
|
||
continue
|
||
|
||
abs_url = normalize_url(canonicalize_url(url_part, page_url))
|
||
parsed = urlparse(abs_url)
|
||
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
new_entries.append(entry)
|
||
continue
|
||
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
new_entries.append(entry)
|
||
continue
|
||
|
||
local_path = cdn_local_path(parsed, site_root)
|
||
else:
|
||
local_path = to_local_asset_path(parsed, site_root)
|
||
|
||
rel = _rel_url(local_path, page_dir)
|
||
if parsed.fragment:
|
||
rel = f"{rel}#{parsed.fragment}"
|
||
|
||
parts[0] = rel
|
||
new_entries.append(" ".join(parts))
|
||
|
||
tag["srcset"] = ", ".join(new_entries)
|
||
|
||
# Inline style="background:url(...)" rewriting
|
||
if tag.has_attr("style"):
|
||
style = str(tag["style"])
|
||
|
||
def repl_style(m: re.Match) -> str:
|
||
raw = m.group(1).strip()
|
||
quote = ""
|
||
url_part = raw
|
||
|
||
if len(raw) >= 2 and raw[0] in ("'", '"') and raw[-1] == raw[0]:
|
||
quote = raw[0]
|
||
url_part = raw[1:-1].strip()
|
||
|
||
if (
|
||
not url_part
|
||
or url_part.startswith("#")
|
||
or url_part.startswith(("data:", "javascript:", "about:"))
|
||
):
|
||
return m.group(0)
|
||
|
||
url_part2 = _protocol_fix(url_part, page_url)
|
||
if is_non_fetchable(url_part2) or not is_httpish(url_part2):
|
||
return m.group(0)
|
||
|
||
abs_url = canonicalize_url(url_part2, page_url)
|
||
parsed = urlparse(abs_url)
|
||
|
||
# Only rewrite things that look like assets.
|
||
if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
|
||
return m.group(0)
|
||
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
return m.group(0)
|
||
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
return m.group(0)
|
||
|
||
local_path = cdn_local_path(parsed, site_root)
|
||
else:
|
||
local_path = to_local_asset_path(parsed, site_root)
|
||
|
||
rel = _rel_url(local_path, page_dir)
|
||
if parsed.fragment:
|
||
rel = f"{rel}#{parsed.fragment}"
|
||
|
||
if quote:
|
||
return f"url({quote}{rel}{quote})"
|
||
return f"url({rel})"
|
||
|
||
style = CSS_URL_RE.sub(repl_style, style)
|
||
tag["style"] = style
|
||
|
||
# Rewrite <style> blocks too (internal assets only; CDN kept unchanged here)
|
||
for style_tag in soup.find_all("style"):
|
||
try:
|
||
css_text = style_tag.string or style_tag.get_text()
|
||
if not css_text:
|
||
continue
|
||
rewritten = rewrite_css_text(
|
||
css_text,
|
||
page_url,
|
||
site_root=site_root,
|
||
root_netloc=root_netloc,
|
||
base_dir=page_dir,
|
||
download_external_assets=download_external_assets,
|
||
external_domains=external_domains,
|
||
download_q=None,
|
||
)
|
||
if rewritten != css_text:
|
||
style_tag.string = rewritten
|
||
except Exception as exc: # noqa: BLE001
|
||
log.debug("Inline <style> rewrite failed on %s – %s", page_url, exc)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Crawl coordinator
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def extract_css_assets(css_text: str) -> list[str]:
|
||
"""
|
||
Extract asset URLs from CSS url(...) and @import patterns.
|
||
|
||
This is used when scanning <style> blocks during HTML parse time
|
||
(before the CSS is written to disk).
|
||
"""
|
||
results: list[str] = []
|
||
|
||
for match in CSS_URL_RE.findall(css_text):
|
||
url = match.strip().strip("'\"")
|
||
if not url or url.startswith(("data:", "javascript:", "about:", "#")):
|
||
continue
|
||
results.append(url)
|
||
|
||
for match in CSS_IMPORT_RE.findall(css_text):
|
||
url = match.strip().strip("'\"")
|
||
if not url or url.startswith(("data:", "javascript:", "about:", "#")):
|
||
continue
|
||
results.append(url)
|
||
|
||
return results
|
||
|
||
|
||
def crawl_site(
|
||
start_url: str,
|
||
root: Path,
|
||
max_pages: int,
|
||
threads: int,
|
||
download_external_assets: bool = False,
|
||
external_domains: Optional[set[str]] = None,
|
||
) -> None:
|
||
"""
|
||
Breadth-first crawl limited to max_pages.
|
||
|
||
- q_pages: pages to crawl (HTML only, internal-only)
|
||
- download_q: assets to download (internal, and optionally external)
|
||
- worker threads: process download_q and write to disk
|
||
"""
|
||
q_pages: queue.Queue[str] = queue.Queue()
|
||
q_pages.put(start_url)
|
||
|
||
seen_pages: set[str] = set()
|
||
queued_pages: set[str] = {start_url}
|
||
|
||
# queued_assets ensures we don't enqueue the same asset URL many times.
|
||
queued_assets: set[str] = set()
|
||
|
||
# download_q holds (abs_url, destination_path) pairs.
|
||
download_q: queue.Queue[tuple[str, Path]] = queue.Queue()
|
||
|
||
root_netloc = _canonical_netloc(urlparse(start_url))
|
||
|
||
def worker() -> None:
|
||
"""Download worker thread: pulls tasks from download_q and writes them."""
|
||
while True:
|
||
url, dest = download_q.get()
|
||
try:
|
||
if is_non_fetchable(url) or not is_httpish(url):
|
||
log.debug("Skip non-fetchable: %s", url)
|
||
continue
|
||
fetch_binary(
|
||
url,
|
||
dest,
|
||
download_q,
|
||
site_root=root,
|
||
root_netloc=root_netloc,
|
||
download_external_assets=download_external_assets,
|
||
external_domains=external_domains,
|
||
)
|
||
finally:
|
||
download_q.task_done()
|
||
|
||
# Spawn the asset download workers.
|
||
for i in range(max(1, threads)):
|
||
t = threading.Thread(target=worker, name=f"DL-{i + 1}", daemon=True)
|
||
t.start()
|
||
|
||
start_time = time.time()
|
||
PAGE_SUFFIXES = {"", ".html", ".htm"}
|
||
|
||
while not q_pages.empty() and len(seen_pages) < max_pages:
|
||
page_url = canonicalize_url(q_pages.get())
|
||
if page_url in seen_pages:
|
||
continue
|
||
|
||
seen_pages.add(page_url)
|
||
log.info("[%s/%s] %s", len(seen_pages), max_pages, page_url)
|
||
|
||
soup = fetch_html(page_url)
|
||
if soup is None:
|
||
continue
|
||
|
||
# Walk the DOM once and:
|
||
# 1) enqueue internal pages from <a href=...>
|
||
# 2) enqueue assets referenced via src/href/data-src/poster/srcset/style/<style>
|
||
for tag in soup.find_all(True):
|
||
|
||
# Common URL-bearing attributes
|
||
for attr in ("src", "href", "data-src", "poster"):
|
||
if not tag.has_attr(attr):
|
||
continue
|
||
|
||
link_raw = str(tag.get(attr, "")).strip()
|
||
if not link_raw:
|
||
continue
|
||
|
||
link = _protocol_fix(link_raw, page_url)
|
||
if (
|
||
link.startswith("#")
|
||
or is_non_fetchable(link)
|
||
or not is_httpish(link)
|
||
):
|
||
continue
|
||
|
||
abs_url = normalize_url(canonicalize_url(link, page_url))
|
||
parsed = urlparse(abs_url)
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
|
||
# Only crawl internal HTML pages from <a href=...>
|
||
suffix = Path(parsed.path).suffix.lower()
|
||
is_page = (
|
||
tag.name == "a"
|
||
and not is_ext
|
||
and (parsed.path.endswith("/") or suffix in PAGE_SUFFIXES)
|
||
)
|
||
|
||
if is_page:
|
||
if abs_url not in seen_pages and abs_url not in queued_pages:
|
||
q_pages.put(abs_url)
|
||
queued_pages.add(abs_url)
|
||
continue
|
||
|
||
# Otherwise treat it as an asset candidate.
|
||
if is_ext:
|
||
parsed_host = (urlparse(abs_url).hostname or "").lower()
|
||
log.debug("[EXT-ASSET] %s", parsed_host)
|
||
|
||
if not download_external_assets:
|
||
continue
|
||
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
log.debug("Blocked external (not whitelisted): %s", abs_url)
|
||
continue
|
||
|
||
# External assets without extensions are only allowed for <script> and <link>
|
||
# because CDNs sometimes serve JS/CSS without filename extensions.
|
||
if tag.name not in (
|
||
"script",
|
||
"link",
|
||
) and not parsed.path.lower().endswith(ASSET_EXTENSIONS):
|
||
continue
|
||
|
||
dest_path = cdn_local_path(parsed, root)
|
||
else:
|
||
dest_path = to_local_asset_path(parsed, root)
|
||
|
||
if abs_url not in queued_assets:
|
||
queued_assets.add(abs_url)
|
||
create_dir(dest_path.parent)
|
||
log.debug("Queue asset: %s -> %s", abs_url, dest_path)
|
||
download_q.put((abs_url, dest_path))
|
||
|
||
# ------------------------------------------------------------------
|
||
# META IMAGE SUPPORT (og:image, twitter:image)
|
||
# ------------------------------------------------------------------
|
||
if tag.name == "meta":
|
||
content = str(tag.get("content", "")).strip()
|
||
prop = (tag.get("property") or tag.get("name") or "").lower()
|
||
|
||
if content and ("og:image" in prop or "twitter:image" in prop):
|
||
url_part = _protocol_fix(content, page_url)
|
||
|
||
if (
|
||
not url_part
|
||
or url_part.startswith("#")
|
||
or url_part.startswith(("data:", "javascript:", "about:"))
|
||
or is_non_fetchable(url_part)
|
||
or not is_httpish(url_part)
|
||
):
|
||
continue
|
||
else:
|
||
abs_url = normalize_url(canonicalize_url(url_part, page_url))
|
||
parsed = urlparse(abs_url)
|
||
|
||
if parsed.path.lower().endswith(ASSET_EXTENSIONS):
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
continue
|
||
elif not is_allowed_external(abs_url, external_domains):
|
||
log.debug("Blocked external (meta): %s", abs_url)
|
||
continue
|
||
else:
|
||
dest_path = cdn_local_path(parsed, root)
|
||
|
||
if abs_url not in queued_assets:
|
||
queued_assets.add(abs_url)
|
||
create_dir(dest_path.parent)
|
||
log.debug(
|
||
"Queue meta asset: %s -> %s",
|
||
abs_url,
|
||
dest_path,
|
||
)
|
||
download_q.put((abs_url, dest_path))
|
||
else:
|
||
dest_path = to_local_asset_path(parsed, root)
|
||
|
||
if abs_url not in queued_assets:
|
||
queued_assets.add(abs_url)
|
||
create_dir(dest_path.parent)
|
||
log.debug(
|
||
"Queue meta asset: %s -> %s", abs_url, dest_path
|
||
)
|
||
download_q.put((abs_url, dest_path))
|
||
|
||
# srcset handling (images at multiple resolutions)
|
||
if tag.has_attr("srcset"):
|
||
for entry in str(tag["srcset"]).split(","):
|
||
entry = entry.strip()
|
||
if not entry:
|
||
continue
|
||
|
||
url_part = _protocol_fix(entry.split()[0], page_url)
|
||
if (
|
||
url_part.startswith("#")
|
||
or is_non_fetchable(url_part)
|
||
or not is_httpish(url_part)
|
||
):
|
||
continue
|
||
|
||
abs_url = normalize_url(canonicalize_url(url_part, page_url))
|
||
parsed = urlparse(abs_url)
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
continue
|
||
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
log.debug("Blocked external (srcset): %s", abs_url)
|
||
continue
|
||
|
||
if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
|
||
continue
|
||
|
||
dest_path = cdn_local_path(parsed, root)
|
||
else:
|
||
dest_path = to_local_asset_path(parsed, root)
|
||
|
||
if abs_url not in queued_assets:
|
||
queued_assets.add(abs_url)
|
||
create_dir(dest_path.parent)
|
||
log.debug("Queue asset: %s -> %s", abs_url, dest_path)
|
||
download_q.put((abs_url, dest_path))
|
||
|
||
# inline style="...url(...)..." assets
|
||
if tag.has_attr("style"):
|
||
style = str(tag["style"])
|
||
for match in CSS_URL_RE.findall(style):
|
||
url_part = _protocol_fix(match.strip().strip("'\""), page_url)
|
||
if (
|
||
not url_part
|
||
or url_part.startswith("#")
|
||
or url_part.startswith(("data:", "javascript:", "about:"))
|
||
or is_non_fetchable(url_part)
|
||
or not is_httpish(url_part)
|
||
):
|
||
continue
|
||
|
||
abs_url = normalize_url(canonicalize_url(url_part, page_url))
|
||
parsed = urlparse(abs_url)
|
||
|
||
if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
|
||
continue
|
||
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
continue
|
||
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
log.debug("Blocked external (inline style): %s", abs_url)
|
||
continue
|
||
|
||
dest_path = (
|
||
cdn_local_path(parsed, root)
|
||
if is_ext
|
||
else to_local_asset_path(parsed, root)
|
||
)
|
||
if abs_url not in queued_assets:
|
||
queued_assets.add(abs_url)
|
||
create_dir(dest_path.parent)
|
||
log.debug("Queue asset: %s -> %s", abs_url, dest_path)
|
||
download_q.put((abs_url, dest_path))
|
||
|
||
# <style> blocks: extract CSS asset references and enqueue them
|
||
if tag.name == "style":
|
||
css_text = tag.string or tag.get_text()
|
||
if not css_text:
|
||
continue
|
||
|
||
for asset in extract_css_assets(css_text):
|
||
asset = _protocol_fix(asset, page_url)
|
||
if (
|
||
not asset
|
||
or asset.startswith("#")
|
||
or asset.startswith(("data:", "javascript:", "about:"))
|
||
or is_non_fetchable(asset)
|
||
or not is_httpish(asset)
|
||
):
|
||
continue
|
||
|
||
abs_url = canonicalize_url(asset, page_url)
|
||
parsed = urlparse(abs_url)
|
||
|
||
if not parsed.path.lower().endswith(ASSET_EXTENSIONS):
|
||
continue
|
||
|
||
is_ext = not is_internal(abs_url, root_netloc)
|
||
|
||
if is_ext:
|
||
if not download_external_assets:
|
||
continue
|
||
|
||
if not is_allowed_external(abs_url, external_domains):
|
||
log.debug("Blocked external (<style>): %s", abs_url)
|
||
continue
|
||
|
||
dest_path = (
|
||
cdn_local_path(parsed, root)
|
||
if is_ext
|
||
else to_local_asset_path(parsed, root)
|
||
)
|
||
if abs_url not in queued_assets:
|
||
queued_assets.add(abs_url)
|
||
create_dir(dest_path.parent)
|
||
log.debug("Queue asset: %s -> %s", abs_url, dest_path)
|
||
download_q.put((abs_url, dest_path))
|
||
|
||
# Save current page:
|
||
# - determine local filename
|
||
# - rewrite links inside the HTML
|
||
# - write out the HTML
|
||
local_path = to_local_path(urlparse(page_url), root)
|
||
create_dir(local_path.parent)
|
||
rewrite_links(
|
||
soup,
|
||
page_url,
|
||
root,
|
||
local_path.parent,
|
||
download_external_assets,
|
||
external_domains,
|
||
)
|
||
safe_write_text(local_path, str(soup), encoding="utf-8")
|
||
|
||
# Wait for all queued asset downloads to finish
|
||
download_q.join()
|
||
|
||
elapsed = time.time() - start_time
|
||
if seen_pages:
|
||
log.info(
|
||
"Crawl finished: %s pages in %.2fs (%.2fs avg)",
|
||
len(seen_pages),
|
||
elapsed,
|
||
elapsed / len(seen_pages),
|
||
)
|
||
else:
|
||
log.warning("Nothing downloaded – check URL or connectivity")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helper function for output folder
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def make_root(url: str, custom: Optional[str]) -> Path:
|
||
"""
|
||
Derive output folder from URL if custom not supplied.
|
||
|
||
Example:
|
||
https://example.com -> example_com
|
||
"""
|
||
return Path(custom) if custom else Path(urlparse(url).netloc.replace(".", "_"))
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def parse_args() -> argparse.Namespace:
|
||
"""
|
||
Parse command-line arguments.
|
||
|
||
--download-external-assets:
|
||
When enabled, we ALSO download assets from other hosts (CDNs).
|
||
Your HTML rewriting currently keeps CDN URLs unchanged in HTML,
|
||
but CSS/JS rewriting can still localize them if those files are downloaded.
|
||
"""
|
||
p = argparse.ArgumentParser(
|
||
description="Recursively mirror a website for offline use.",
|
||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||
)
|
||
p.add_argument(
|
||
"--url",
|
||
required=True,
|
||
help="Starting URL to crawl (e.g., https://example.com/).",
|
||
)
|
||
p.add_argument(
|
||
"--destination",
|
||
default=None,
|
||
help="Output folder (defaults to a folder derived from the URL).",
|
||
)
|
||
p.add_argument(
|
||
"--max-pages",
|
||
type=int,
|
||
default=50,
|
||
help="Maximum number of HTML pages to crawl.",
|
||
)
|
||
p.add_argument(
|
||
"--threads",
|
||
type=int,
|
||
default=6,
|
||
help="Number of concurrent download workers.",
|
||
)
|
||
p.add_argument(
|
||
"--download-external-assets",
|
||
action="store_true",
|
||
help="Download external CDN/static assets and rewrite links for offline use.",
|
||
)
|
||
p.add_argument(
|
||
"--external-domains",
|
||
nargs="+",
|
||
default=None,
|
||
help="Whitelist of external domains to download from (implies external download).",
|
||
)
|
||
return p.parse_args()
|
||
|
||
|
||
if __name__ == "__main__":
|
||
# Basic argument validation
|
||
args = parse_args()
|
||
if args.max_pages < 1:
|
||
log.error("--max-pages must be >= 1")
|
||
sys.exit(2)
|
||
if args.threads < 1:
|
||
log.error("--threads must be >= 1")
|
||
sys.exit(2)
|
||
|
||
# start URL + output root folder
|
||
host = args.url
|
||
root = make_root(args.url, args.destination)
|
||
|
||
external_domains = (
|
||
{
|
||
urlparse(d).hostname.lower() if "://" in d else d.lower()
|
||
for d in args.external_domains
|
||
}
|
||
if args.external_domains
|
||
else None
|
||
)
|
||
|
||
download_external_assets = (
|
||
args.download_external_assets or args.external_domains is not None
|
||
)
|
||
|
||
# Kick off crawl
|
||
crawl_site(
|
||
host,
|
||
root,
|
||
args.max_pages,
|
||
args.threads,
|
||
download_external_assets,
|
||
external_domains,
|
||
)
|