From 1623ee8d2cb91f4162a7bb4529dde1bdef852b38 Mon Sep 17 00:00:00 2001 From: turtle89431 Date: Sat, 28 Mar 2026 16:04:27 -0700 Subject: [PATCH] tool1 and init req file --- requirements.txt | 4 + website-downloader.py | 1610 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1614 insertions(+) create mode 100644 requirements.txt create mode 100644 website-downloader.py diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..72da2f0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +requests~=2.32.4 +beautifulsoup4~=4.13.4 +wget~=3.2 +urllib3~=2.5.0 \ No newline at end of file diff --git a/website-downloader.py b/website-downloader.py new file mode 100644 index 0000000..e7907a5 --- /dev/null +++ b/website-downloader.py @@ -0,0 +1,1610 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import logging +import os +import queue +import re +import sys +import threading +import time +from hashlib import sha256 +from importlib.util import find_spec +from pathlib import Path +from typing import Optional +from urllib.parse import ParseResult, unquote, urljoin, urlparse + +import requests +from bs4 import BeautifulSoup +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +HAS_BROTLI = find_spec("brotli") is not None or find_spec("brotlicffi") is not None + +# --------------------------------------------------------------------------- +# Config / constants +# --------------------------------------------------------------------------- + +# Consistent log format across file + console. Thread name is helpful because +# asset downloads happen in worker threads. +LOG_FMT = "%(asctime)s | %(levelname)-8s | %(threadName)s | %(message)s" + +# Extensions we treat as “static assets” worth downloading and rewriting. +# Used in multiple places: HTML attribute rewriting, CSS url(...) rewriting, +# JS string rewriting, and crawl-time asset detection. 
ASSET_EXTENSIONS = (
    # code / data
    ".css", ".js", ".mjs", ".map", ".json", ".wasm", ".webmanifest",
    # images
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".avif", ".svg", ".ico",
    # fonts
    ".woff", ".woff2", ".ttf", ".eot",
    # audio / video
    ".mp4", ".webm", ".mp3",
)

# Conservative JS string rewriting. Both patterns capture the URL (without the
# surrounding quotes) in group 1 and only fire on a fixed list of static-file
# extensions, so API endpoints and other dynamic URLs are left alone.
#
# JS_URL_RE: quoted strings that start with "/" such as "/assets/app.js",
# optionally followed by a query string.
# NOTE(review): because the first "/" satisfies the leading slash, this pattern
# also matches protocol-relative strings ("//cdn.example.com/app.js") - confirm
# that callers account for that.
JS_URL_RE = re.compile(
    r"""["'](/[^"']+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|css|js|mjs|map|woff|woff2|ttf|eot|json|wasm|webmanifest)(?:\?[^"']*)?)["']""",
    re.IGNORECASE,
)

# JS_ABS_URL_RE: quoted absolute or protocol-relative strings such as
# "https://cdn.example.com/app.js" or "//cdn.example.com/app.js".
JS_ABS_URL_RE = re.compile(
    r"""["']((?:https?:)?//[^"']+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|css|js|mjs|map|woff|woff2|ttf|eot|json|wasm|webmanifest)(?:\?[^"']*)?)["']""",
    re.IGNORECASE,
)

# Only advertise Brotli when a decoder is importable; otherwise a server could
# answer with a body we cannot decompress.
if HAS_BROTLI:
    _ACCEPT_ENCODING = "gzip, deflate, br"
else:
    _ACCEPT_ENCODING = "gzip, deflate"

# Browser-like default headers; these can help with sites that block
# "non-browser" clients.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": _ACCEPT_ENCODING,
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# Network timeout and streaming chunk size for binary downloads.
TIMEOUT = 15  # seconds
CHUNK_SIZE = 8192  # bytes

# Conservative margins under common OS limits (~255-260 bytes).
# These protect you from "File name too long" and odd Windows path rules.
MAX_PATH_LEN = 240  # upper bound for a complete local path, in characters
MAX_SEG_LEN = 120  # upper bound for a single path segment, in characters

# Collapse 3+ dots ("....") down to a single dot to avoid weird filenames.
_MULTI_DOTS_RE = re.compile(r"\.{3,}")

# CSS url(...) extractor. Note: this is simple (not a full CSS parser),
# but good enough for most sites. Group 1 is the raw argument, including any
# quotes and surrounding whitespace.
# NOTE(review): unlike CSS_IMPORT_RE this pattern is case-sensitive, so
# "URL(...)" is not matched - confirm whether that is intended.
CSS_URL_RE = re.compile(r"url\(([^)]+)\)")

# CSS @import extractor. Also simple-but-effective. Group 1 is the imported
# URL without quotes.
CSS_IMPORT_RE = re.compile(
    r"""@import\s+(?:url\()?['"]?([^'"\);]+)['"]?\)?\s*;""",
    re.IGNORECASE,
)

# Characters that commonly cause filesystem issues, especially on Windows.
_BAD_SEG_CHARS_RE = re.compile(r'[<>:"/\\|?*\x00-\x1F]')

# Windows reserved filenames; writing these can fail or behave badly.
_WINDOWS_RESERVED_NAMES = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}

# <link rel="..."> values that point at downloadable resources.
RESOURCE_LINK_RELS = {
    "stylesheet",
    "icon",
    "shortcut",
    "apple-touch-icon",
    "preload",
    "modulepreload",
    "manifest",
}

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

# File logging is DEBUG to help you trace rewrites and queue behavior.
# The log file is opened as UTF-8 explicitly: URLs and decoded file names may
# contain non-ASCII characters, and the platform default encoding (e.g. cp1252
# on Windows) cannot represent all of them, which makes those records fail to
# be written.
logging.basicConfig(
    filename="web_scraper.log",
    level=logging.DEBUG,
    format=LOG_FMT,
    datefmt="%H:%M:%S",
    force=True,  # replace any handlers installed before this module loaded
    encoding="utf-8",
)

# Console logging is INFO to keep output readable while running.
_console = logging.StreamHandler(sys.stdout)
_console.setLevel(logging.INFO)
_console.setFormatter(logging.Formatter(LOG_FMT, datefmt="%H:%M:%S"))
logging.getLogger().addHandler(_console)
log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# HTTP session (retry, timeouts, custom UA)
# ---------------------------------------------------------------------------

# Shared session improves performance and keeps connection pooling.
+SESSION = requests.Session() + +# Retry strategy for transient issues (rate limits, 5xx). Helps stability. +RETRY_STRAT = Retry( + total=5, + backoff_factor=0.5, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["GET", "HEAD"], +) + +SESSION.mount("http://", HTTPAdapter(max_retries=RETRY_STRAT)) +SESSION.mount("https://", HTTPAdapter(max_retries=RETRY_STRAT)) +SESSION.headers.update(DEFAULT_HEADERS) +log.debug("Accept-Encoding configured as: %s", SESSION.headers.get("Accept-Encoding")) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def create_dir(path: Path) -> None: + """Create path (and parents) if it does not already exist.""" + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + log.debug("Created directory %s", path) + + +# Schemes that are valid URLs in HTML but are not HTTP fetch targets. +# If we try to request these, requests will throw InvalidSchema. +NON_FETCHABLE_SCHEMES = { + "mailto", + "tel", + "sms", + "javascript", + "data", + "geo", + "blob", + "about", +} + + +def is_httpish(u: str) -> bool: + """ + True iff the URL is http(s) or relative (no scheme). + + Why: + - We only fetch http(s) resources. + - Relative URLs should still be handled because we can join them to base URLs. + """ + p = urlparse(u) + return (p.scheme in ("http", "https")) or (p.scheme == "") + + +def is_non_fetchable(u: str) -> bool: + """ + True iff the URL clearly shouldn't be fetched (mailto:, tel:, data:, ...). + """ + p = urlparse(u) + return p.scheme in NON_FETCHABLE_SCHEMES + + +def is_internal(link: str, root_netloc: str) -> bool: + """ + Decide whether `link` belongs to the same site as `root_netloc`. + + Notes: + - Relative URLs are internal. + - We normalize "www." so example.com and www.example.com count as same. 
+ """ + parsed = urlparse(link) + netloc = _canonical_netloc(parsed) + + if not netloc: + return True + + if netloc == root_netloc: + return True + + # normalize www + if netloc.startswith("www."): + netloc = netloc[4:] + root = root_netloc[4:] if root_netloc.startswith("www.") else root_netloc + + return netloc == root + + +def _sanitize_segment(segment: str) -> str: + """ + Sanitize a single path segment for safe writing to disk. + + - URL decode (turn %20 into space, etc.) + - Strip whitespace / trailing dot-space combos (Windows issues) + - Collapse accidental multi-dots + - Replace illegal filesystem chars with '_' + - Neutralize '.' and '..' to prevent traversal-like paths + - Avoid Windows reserved names (CON, PRN, COM1, ...) + """ + segment = unquote(segment).strip() + segment = segment.strip(" .") + segment = _MULTI_DOTS_RE.sub(".", segment) + segment = _BAD_SEG_CHARS_RE.sub("_", segment) + + if segment in ("", ".", ".."): + segment = "_" + + if segment.upper() in _WINDOWS_RESERVED_NAMES: + segment = f"_{segment}_" + + return segment + + +def _shorten_segment(segment: str, limit: int = MAX_SEG_LEN) -> str: + """ + Shorten a path segment if it exceeds a length limit. + + Strategy: + - Keep the original extension + - Truncate the stem + - Append a short hash so different long names don't collide + """ + if len(segment) <= limit: + return segment + p = Path(segment) + stem, suffix = p.stem, p.suffix + h = sha256(segment.encode("utf-8")).hexdigest()[:12] + keep = max(0, limit - len(suffix) - 13) # '-' + hash is 13 chars total + return f"{stem[:keep]}-{h}{suffix}" + + +def _rel_url(target: Path, base_dir: Path) -> str: + """ + Compute a URL-style relative path (forward slashes), + not an OS-specific path. + """ + try: + rel = os.path.relpath(target, base_dir) + except ValueError: + # Happens if paths are on different drives on Windows. 
+ return target.as_posix() + return Path(rel).as_posix() + + +def to_local_path(parsed: ParseResult, site_root: Path) -> Path: + """ + Map an internal *page* URL to a local HTML file under site_root. + + Rules: + - "/" -> index.html + - "/foo/" -> /foo/index.html + - "/foo" (no extension) -> /foo.html + - query strings get a short hash to prevent collisions: + /page?id=1 and /page?id=2 should not overwrite each other + - filesystem hardening: sanitize segments, limit segment length and overall path + """ + rel = parsed.path.lstrip("/") + if not rel: + rel = "index.html" + elif rel.endswith("/"): + rel += "index.html" + elif not Path(rel).suffix: + rel += ".html" + + if parsed.query: + qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10] + p = Path(rel) + rel = str(p.with_name(f"{p.stem}-q{qh}{p.suffix}")) + + parts = Path(rel).parts + parts = tuple(_sanitize_segment(seg) for seg in parts) + parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts) + local_path = site_root / Path(*parts) + + if len(str(local_path)) > MAX_PATH_LEN: + p = local_path + h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16] + leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN) + local_path = p.with_name(leaf) + + return local_path + + +def to_local_asset_path(parsed: ParseResult, site_root: Path) -> Path: + """ + Map an internal *asset* URL to a local file path under site_root. + + Difference vs to_local_path(): + - We do NOT force .html for extensionless paths. + (Some sites serve extensionless assets, though less common.) 
+ """ + rel = parsed.path.lstrip("/") + if not rel: + rel = "index" + elif rel.endswith("/"): + rel += "index" + + if parsed.query: + qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10] + p = Path(rel) + name = f"{p.stem}-q{qh}{p.suffix}" if p.suffix else f"{p.name}-q{qh}" + rel = str(p.with_name(name)) + + parts = Path(rel).parts + parts = tuple(_sanitize_segment(seg) for seg in parts) + parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts) + local_path = site_root / Path(*parts) + + if len(str(local_path)) > MAX_PATH_LEN: + p = local_path + h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16] + leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN) + local_path = p.with_name(leaf) + + return local_path + + +def cdn_local_path(parsed: ParseResult, site_root: Path) -> Path: + """ + Map an external (CDN) URL to a local path under: + site_root/cdn//... + + Why: + - Keeps external host assets separated from internal assets. + - Avoids collisions where internal and external paths look similar. + """ + rel = parsed.path.lstrip("/") + if not rel: + rel = "index" + elif rel.endswith("/"): + rel += "index" + + if parsed.query: + qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10] + p = Path(rel) + name = f"{p.stem}-q{qh}{p.suffix}" if p.suffix else f"{p.name}-q{qh}" + rel = str(p.with_name(name)) + + parts = Path(rel).parts + parts = tuple(_sanitize_segment(seg) for seg in parts) + parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts) + + netloc = _canonical_netloc(parsed) + local_path = site_root / "cdn" / _sanitize_segment(netloc) / Path(*parts) + + if len(str(local_path)) > MAX_PATH_LEN: + p = local_path + h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16] + leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN) + local_path = p.with_name(leaf) + + return local_path + + +def safe_write_text(path: Path, text: str, encoding: str = "utf-8") -> Path: + """ + Write text to path safely. 
+ + If the OS rejects the filename/path (often: path too long), we: + - hash the leaf name + - write to a fallback name + - return the final path used + """ + try: + path.write_text(text, encoding=encoding) + return path + except OSError as exc: + log.warning("Write failed for %s: %s. Falling back to hashed leaf.", path, exc) + p = path + h = sha256(str(p).encode("utf-8")).hexdigest()[:16] + fallback = p.with_name(_shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN)) + create_dir(fallback.parent) + fallback.write_text(text, encoding=encoding) + return fallback + + +def normalize_url(url: str) -> str: + """ + Normalize URLs to avoid duplicates caused by fragments. + + Example: + - https://site/page#section1 and https://site/page#section2 + are the same document for our crawler. + """ + parsed = urlparse(url) + clean = parsed._replace(fragment="") + return clean.geturl() + + +def _protocol_fix(url: str, base_url: str) -> str: + """ + Normalize protocol-relative URLs (//host/path) to absolute ones. + + Browsers interpret //example.com/a.css as "use the current page scheme". + We do the same using base_url's scheme. + """ + if url.startswith("//"): + base = urlparse(base_url) + scheme = base.scheme or "https" + return f"{scheme}:{url}" + return url + + +def rewrite_css_text( + css_text: str, + base_url: str, + *, + site_root: Path, + root_netloc: str, + base_dir: Path, + download_external_assets: bool, + external_domains: Optional[set[str]] = None, + download_q: Optional[queue.Queue[tuple[str, Path]]] = None, +) -> str: + """ + Rewrite CSS url(...) and @import references to local relative paths. + + base_url: + - the remote URL of the CSS *context* + - external stylesheet URL for downloaded .css + - page URL for inline