#!/usr/bin/env python3
from __future__ import annotations

import argparse
import logging
import os
import queue
import sys
import threading
import time
from hashlib import sha256
from pathlib import Path
from typing import Optional
from urllib.parse import ParseResult, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

# ---------------------------------------------------------------------------
# Config / constants
# ---------------------------------------------------------------------------
LOG_FMT = "%(asctime)s | %(levelname)-8s | %(threadName)s | %(message)s"
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) "
    "Gecko/20100101 Firefox/128.0"
}
TIMEOUT = 15  # seconds
CHUNK_SIZE = 8192  # bytes

# Conservative margins under common OS limits (~255–260 bytes)
MAX_PATH_LEN = 240
MAX_SEG_LEN = 120

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logging.basicConfig(
    filename="web_scraper.log",
    level=logging.DEBUG,
    format=LOG_FMT,
    datefmt="%H:%M:%S",
    force=True,
)
_console = logging.StreamHandler(sys.stdout)
_console.setLevel(logging.INFO)
_console.setFormatter(logging.Formatter(LOG_FMT, datefmt="%H:%M:%S"))
logging.getLogger().addHandler(_console)
log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# HTTP session (retry, timeouts, custom UA)
# ---------------------------------------------------------------------------
SESSION = requests.Session()
RETRY_STRAT = Retry(
    total=5,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET", "HEAD"],
)
SESSION.mount("http://", HTTPAdapter(max_retries=RETRY_STRAT))
SESSION.mount("https://", HTTPAdapter(max_retries=RETRY_STRAT))
SESSION.headers.update(DEFAULT_HEADERS)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def create_dir(path: Path) -> None:
    """Create path (and parents) if it does not already exist."""
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)
        log.debug("Created directory %s", path)


def sanitize(url_fragment: str) -> str:
    """Normalize backslashes to forward slashes and drop '..' back-references."""
    return url_fragment.replace("\\", "/").replace("..", "").strip()


NON_FETCHABLE_SCHEMES = {"mailto", "tel", "sms", "javascript", "data", "geo", "blob"}


def is_httpish(u: str) -> bool:
    """True iff the URL is http(s) or relative (no scheme)."""
    p = urlparse(u)
    return (p.scheme in ("http", "https")) or (p.scheme == "")


def is_non_fetchable(u: str) -> bool:
    """True iff the URL clearly shouldn't be fetched (mailto:, tel:, data:, ...)."""
    p = urlparse(u)
    return p.scheme in NON_FETCHABLE_SCHEMES


def is_internal(link: str, root_netloc: str) -> bool:
    """Return True if link belongs to root_netloc; links with no netloc (relative) count as internal."""
    parsed = urlparse(link)
    return not parsed.netloc or parsed.netloc == root_netloc


def _shorten_segment(segment: str, limit: int = MAX_SEG_LEN) -> str:
    """
    Shorten a single path segment if it exceeds limit.

    Preserve the extension and append a short hash to keep the name unique.
    """
    if len(segment) <= limit:
        return segment
    p = Path(segment)
    stem, suffix = p.stem, p.suffix
    h = sha256(segment.encode("utf-8")).hexdigest()[:12]
    # Leave room for '-' + 12-char hash + suffix
    keep = max(0, limit - len(suffix) - 13)
    return f"{stem[:keep]}-{h}{suffix}"

def to_local_path(parsed: ParseResult, site_root: Path) -> Path:
    """
    Map an internal URL to a local file path under site_root.

    - Adds 'index.html' where appropriate.
    - Converts extensionless paths to '.html'.
    - Appends a short query-hash when ?query is present to avoid collisions.
    - Enforces per-segment and overall path length limits; if still too long,
      hashes the leaf name.
    """
    rel = parsed.path.lstrip("/")
    if not rel:
        rel = "index.html"
    elif rel.endswith("/"):
        rel += "index.html"
    elif not Path(rel).suffix:
        rel += ".html"

    if parsed.query:
        qh = sha256(parsed.query.encode("utf-8")).hexdigest()[:10]
        p = Path(rel)
        rel = str(p.with_name(f"{p.stem}-q{qh}{p.suffix}"))

    # Shorten individual segments
    parts = Path(rel).parts
    parts = tuple(_shorten_segment(seg, MAX_SEG_LEN) for seg in parts)
    local_path = site_root / Path(*parts)

    # If the full path is still too long, hash the leaf
    if len(str(local_path)) > MAX_PATH_LEN:
        p = local_path
        h = sha256(parsed.geturl().encode("utf-8")).hexdigest()[:16]
        leaf = _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN)
        local_path = p.with_name(leaf)

    return local_path
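
# Sketch of the resulting on-disk layout, assuming a hypothetical site at
# https://example.com/ mirrored into <root> (placeholders for documentation
# only; hash lengths follow the code above):
#
#   https://example.com/              -> <root>/index.html
#   https://example.com/docs/         -> <root>/docs/index.html
#   https://example.com/about         -> <root>/about.html
#   https://example.com/list?page=2   -> <root>/list-q<10-hex-digits>.html
#   https://example.com/css/site.css  -> <root>/css/site.css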

def safe_write_text(path: Path, text: str, encoding: str = "utf-8") -> Path:
    """
    Write text to path, falling back to a hashed filename if the OS rejects it
    (e.g., filename too long). Returns the final path used.
    """
    try:
        path.write_text(text, encoding=encoding)
        return path
    except OSError as exc:
        log.warning("Write failed for %s: %s. Falling back to hashed leaf.", path, exc)
        p = path
        h = sha256(str(p).encode("utf-8")).hexdigest()[:16]
        fallback = p.with_name(
            _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN)
        )
        create_dir(fallback.parent)
        fallback.write_text(text, encoding=encoding)
        return fallback


# ---------------------------------------------------------------------------
# Fetchers
# ---------------------------------------------------------------------------
def fetch_html(url: str) -> Optional[BeautifulSoup]:
    """Download url and return a BeautifulSoup tree (or None on error)."""
    try:
        resp = SESSION.get(url, timeout=TIMEOUT)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")
    except Exception as exc:  # noqa: BLE001
        log.warning("HTTP error for %s – %s", url, exc)
        return None


def fetch_binary(url: str, dest: Path) -> None:
    """Stream url to dest unless it already exists. Safe against long paths."""
    if dest.exists():
        return
    try:
        resp = SESSION.get(url, timeout=TIMEOUT, stream=True)
        resp.raise_for_status()
        create_dir(dest.parent)
        try:
            with dest.open("wb") as fh:
                for chunk in resp.iter_content(CHUNK_SIZE):
                    fh.write(chunk)
            log.debug("Saved resource -> %s", dest)
        except OSError as exc:
            # Fall back to a hashed leaf if the OS rejects the path
            log.warning("Binary write failed for %s: %s. Using fallback.", dest, exc)
            p = dest
            h = sha256(str(p).encode("utf-8")).hexdigest()[:16]
            fallback = p.with_name(
                _shorten_segment(f"{p.stem}-{h}{p.suffix}", MAX_SEG_LEN)
            )
            create_dir(fallback.parent)
            with fallback.open("wb") as fh:
                for chunk in resp.iter_content(CHUNK_SIZE):
                    fh.write(chunk)
            log.debug("Saved resource (fallback) -> %s", fallback)
    except Exception as exc:  # noqa: BLE001
        log.error("Failed to save %s – %s", url, exc)


# ---------------------------------------------------------------------------
# Link rewriting
# ---------------------------------------------------------------------------
def rewrite_links(
    soup: BeautifulSoup, page_url: str, site_root: Path, page_dir: Path
) -> None:
    """Rewrite internal links to local relative paths under site_root."""
    root_netloc = urlparse(page_url).netloc
    for tag in soup.find_all(["a", "img", "script", "link"]):
        attr = "href" if tag.name in {"a", "link"} else "src"
        if not tag.has_attr(attr):
            continue
        original = sanitize(tag[attr])
        if (
            original.startswith("#")
            or is_non_fetchable(original)
            or not is_httpish(original)
        ):
            continue
        abs_url = urljoin(page_url, original)
        if not is_internal(abs_url, root_netloc):
            continue  # external – leave untouched
        local_path = to_local_path(urlparse(abs_url), site_root)
        try:
            tag[attr] = os.path.relpath(local_path, page_dir)
        except ValueError:
            # Different drives on Windows, etc.
            tag[attr] = str(local_path)


# ---------------------------------------------------------------------------
# Crawl coordinator
# ---------------------------------------------------------------------------
def crawl_site(start_url: str, root: Path, max_pages: int, threads: int) -> None:
    """Breadth-first crawl limited to max_pages. Downloads assets via workers."""
    q_pages: queue.Queue[str] = queue.Queue()
    q_pages.put(start_url)
    seen_pages: set[str] = set()
    download_q: queue.Queue[Optional[tuple[str, Path]]] = queue.Queue()

    def worker() -> None:
        """Drain download_q until a None sentinel signals shutdown."""
        while True:
            item = download_q.get()
            if item is None:  # shutdown sentinel from crawl_site
                download_q.task_done()
                return
            url, dest = item
            if is_non_fetchable(url) or not is_httpish(url):
                log.debug("Skip non-fetchable: %s", url)
            else:
                fetch_binary(url, dest)
            download_q.task_done()

    workers: list[threading.Thread] = []
    for i in range(max(1, threads)):
        t = threading.Thread(target=worker, name=f"DL-{i+1}", daemon=True)
        t.start()
        workers.append(t)

    start_time = time.time()
    root_netloc = urlparse(start_url).netloc

    while not q_pages.empty() and len(seen_pages) < max_pages:
        page_url = q_pages.get()
        if page_url in seen_pages:
            continue
        seen_pages.add(page_url)
        log.info("[%s/%s] %s", len(seen_pages), max_pages, page_url)

        soup = fetch_html(page_url)
        if soup is None:
            continue

        # Gather links & assets
        for tag in soup.find_all(["img", "script", "link", "a"]):
            link = tag.get("src") or tag.get("href")
            if not link:
                continue
            link = sanitize(link)
            if link.startswith("#") or is_non_fetchable(link) or not is_httpish(link):
                continue
            abs_url = urljoin(page_url, link)
            parsed = urlparse(abs_url)
            if not is_internal(abs_url, root_netloc):
                continue
            dest_path = to_local_path(parsed, root)

            # Trailing-slash or extensionless paths are treated as HTML pages;
            # everything else goes to the asset download queue.
            if parsed.path.endswith("/") or not Path(parsed.path).suffix:
                if abs_url not in seen_pages and abs_url not in list(q_pages.queue):
                    q_pages.put(abs_url)
            else:
                download_q.put((abs_url, dest_path))

        # Save current page
        local_path = to_local_path(urlparse(page_url), root)
        create_dir(local_path.parent)
        rewrite_links(soup, page_url, root, local_path.parent)
        html = soup.prettify()
        final_path = safe_write_text(local_path, html, encoding="utf-8")
        log.debug("Saved page %s", final_path)

    # Wait for queued asset downloads, then shut the workers down cleanly.
    download_q.join()
    for _ in workers:
        download_q.put(None)
    for t in workers:
        t.join()

    elapsed = time.time() - start_time
    if seen_pages:
        log.info(
            "Crawl finished: %s pages in %.2fs (%.2fs avg)",
            len(seen_pages),
            elapsed,
            elapsed / len(seen_pages),
        )
    else:
        log.warning("Nothing downloaded – check URL or connectivity")


# ---------------------------------------------------------------------------
# Helper function for output folder
# ---------------------------------------------------------------------------
def make_root(url: str, custom: Optional[str]) -> Path:
    """Derive output folder from URL if custom not supplied."""
    return Path(custom) if custom else Path(urlparse(url).netloc.replace(".", "_"))


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description="Recursively mirror a website for offline use.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument(
        "--url",
        required=True,
        help="Starting URL to crawl (e.g., https://example.com/).",
    )
    p.add_argument(
        "--destination",
        default=None,
        help="Output folder (defaults to a folder derived from the URL).",
    )
    p.add_argument(
        "--max-pages",
        type=int,
        default=50,
        help="Maximum number of HTML pages to crawl.",
    )
    p.add_argument(
        "--threads",
        type=int,
        default=6,
        help="Number of concurrent download workers.",
    )
    return p.parse_args()


if __name__ == "__main__":
    args = parse_args()
    if args.max_pages < 1:
        log.error("--max-pages must be >= 1")
        sys.exit(2)
    if args.threads < 1:
        log.error("--threads must be >= 1")
        sys.exit(2)
    start_url = args.url
    root = make_root(start_url, args.destination)
    crawl_site(start_url, root, args.max_pages, args.threads)
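
# Example invocation (illustrative only; assumes this file is saved as
# web_scraper.py and uses example.com as a placeholder host):
#
#   python web_scraper.py --url https://example.com/ --max-pages 25 --threads 4
#
# Without --destination, make_root() derives the output folder from the netloc,
# so the mirror lands in ./example_com/ and debug output goes to web_scraper.log.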