#!/usr/bin/env python3
"""
Benchmark script comparing fetch methods:
1. Direct urllib (lightweight HTTP)
2. Scrapling (StealthyFetcher -> Chromium via patchright)
3. CloakBrowser (patched Chromium with stealth)

Usage:
    .venv-scrapling/bin/python scripts/benchmark_fetch.py [--url URL] [--iterations N]
        [--search-url URL] [--methods direct,scrapling,cloakbrowser,all]
"""

import argparse
import gc
import os
import resource
import ssl
import sys
import time
import urllib.request
from pathlib import Path

# Add venv site-packages to path if needed. The interpreter version directory
# is discovered instead of hard-coded so the script keeps working when the
# venv is rebuilt with a different Python 3 minor version.
venv = Path(__file__).parent.parent / ".venv-scrapling"
if venv.exists():
    import site

    for _site_packages in sorted(venv.glob("lib/python3*/site-packages")):
        site.addsitedir(str(_site_packages))

BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)

# Broad list of Cloudflare markers. Not used by the detection below, which
# relies on the stricter subsets; kept as a module-level name.
CF_SIGNS = [
    b"just a moment...",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]

# Challenge-page phrases that are treated as a block on their own.
_CF_HARD_SIGNALS = (
    b"just a moment...",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
)

# Tokens that, together with the challenge-platform script path, indicate a
# challenge page rather than a page that merely references the CDN.
_CF_CHALLENGE_PATH = b"/cdn-cgi/challenge-platform/"
_CF_CHALLENGE_TOKENS = (b"window._cf_chl_opt", b"__cf_chl_rt_tk")


def looks_like_cloudflare_block(body: bytes) -> bool:
    """Return True when *body* looks like a Cloudflare challenge page.

    The match is case-insensitive. A body is considered blocked when it
    contains one of the hard challenge phrases, or when it contains the
    challenge-platform script path together with a challenge token. An empty
    body is never reported as a block.
    """
    if not body:
        return False
    low = body.lower()
    # Must contain an actual challenge title, not just CDN references.
    if any(signal in low for signal in _CF_HARD_SIGNALS):
        return True
    # Secondary: challenge platform JS + challenge token.
    return _CF_CHALLENGE_PATH in low and any(
        token in low for token in _CF_CHALLENGE_TOKENS
    )


def get_memory_mb() -> float:
    """Return the peak resident set size of this process in MB.

    ``ru_maxrss`` is a high-water mark, not the current RSS, so the value
    never decreases between calls. It is reported in kilobytes on Linux and
    in bytes on macOS. ``RUSAGE_SELF`` covers only the calling process.
    NOTE(review): browser-based fetchers presumably run Chromium in child
    processes, whose memory would not be included here - confirm.
    """
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform == "darwin":
        return peak / (1024.0 * 1024.0)  # bytes -> MB on macOS
    return peak / 1024.0  # KB -> MB on Linux


def direct_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Fetch *url* with urllib using browser-like headers and return the body.

    Args:
        url: Address to request.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout: Timeout in seconds passed to ``urlopen``.

    Raises:
        RuntimeError: If the response body looks like a Cloudflare challenge.
    """
    headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        headers["Referer"] = referer
    req = urllib.request.Request(url, headers=headers)

    # SECURITY NOTE: certificate and hostname verification are disabled, so
    # the connection is not authenticated. Do not reuse this outside of
    # benchmarking.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
        body = resp.read()
    if looks_like_cloudflare_block(body):
        raise RuntimeError("Cloudflare block detected")
    return body


def _scrapling_body(response) -> bytes:
    """Extract the response payload as bytes, trying body, text, then str()."""
    body = getattr(response, "body", None)
    if isinstance(body, (bytes, bytearray)):
        return bytes(body)
    if isinstance(body, str):
        return body.encode("utf-8")
    text = getattr(response, "text", None)
    if isinstance(text, str):
        return text.encode("utf-8")
    return str(response).encode("utf-8")


def scrapling_fetch(
    url: str, referer: str = "", timeout_ms: int = 90000, wait_ms: int = 500
) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher`` and return the body.

    Args:
        url: Address to request.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout_ms: Passed to the fetcher as ``timeout``.
        wait_ms: Passed to the fetcher as ``wait``.

    Raises:
        RuntimeError: If the response has an integer status >= 400, or if the
            body still looks like a Cloudflare challenge (same success
            criterion as the other fetch methods).
    """
    # Imported lazily so the other methods work without Scrapling installed.
    from scrapling.fetchers import StealthyFetcher

    fetch_kwargs = {
        "headless": True,
        "network_idle": False,
        "google_search": False,
        "solve_cloudflare": True,
        "timeout": timeout_ms,
        "wait": wait_ms,
    }
    if referer:
        fetch_kwargs["extra_headers"] = {"Referer": referer}

    response = StealthyFetcher.fetch(url, **fetch_kwargs)
    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"HTTP {status}")

    body = _scrapling_body(response)
    if looks_like_cloudflare_block(body):
        raise RuntimeError("Cloudflare block detected")
    return body


def cloakbrowser_fetch(url: str, referer: str = "", timeout_ms: int = 90000) -> bytes:
    """Fetch *url* with CloakBrowser and return the rendered HTML as UTF-8.

    Args:
        url: Address to navigate to.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout_ms: Navigation timeout passed to ``page.goto``.

    Raises:
        RuntimeError: If the page content looks like a Cloudflare challenge.
    """
    # Imported lazily so the other methods work without CloakBrowser installed.
    from cloakbrowser import launch_context

    ctx = launch_context(headless=True)
    try:
        # Page creation sits inside the try so the context is closed even if
        # opening the page fails.
        page = ctx.new_page()
        if referer:
            page.set_extra_http_headers({"Referer": referer})
        page.goto(url, timeout=timeout_ms, wait_until="networkidle")
        body = page.content().encode("utf-8")
        if looks_like_cloudflare_block(body):
            raise RuntimeError("Cloudflare block detected")
        return body
    finally:
        ctx.close()


def benchmark_method(name: str, fn, url: str, referer: str, iterations: int = 1):
    """Run a fetch method and return timing + metadata.

    Args:
        name: Label used in progress output.
        fn: Callable invoked as ``fn(url, referer)`` that returns the body.
        url: Address handed to *fn*.
        referer: Referer handed to *fn*.
        iterations: Number of times to call *fn*.

    Returns:
        A list with one dict per iteration containing ``iteration``,
        ``elapsed_sec``, ``success``, ``error``, ``body_size``,
        ``mem_before_mb``, ``mem_after_mb`` and ``mem_delta_mb``. Exceptions
        raised by *fn* are captured in ``error`` rather than propagated.
    """
    gc.collect()
    results = []
    for i in range(iterations):
        print(f"  [{name}] iteration {i + 1}/{iterations}...", flush=True)
        mem_before = get_memory_mb()
        start = time.monotonic()
        error = None
        body = b""
        try:
            body = fn(url, referer)
            if not body:
                error = "empty body"
        except Exception as exc:  # benchmark boundary: record and keep going
            # Include the exception type: str(exc) alone is empty for
            # exceptions raised without a message, which previously made a
            # failed run print as "OK".
            error = f"{type(exc).__name__}: {exc}"
        elapsed = time.monotonic() - start
        mem_after = get_memory_mb()
        results.append({
            "iteration": i + 1,
            "elapsed_sec": elapsed,
            "success": error is None,
            "error": error,
            "body_size": len(body),
            "mem_before_mb": mem_before,
            "mem_after_mb": mem_after,
            "mem_delta_mb": mem_after - mem_before,
        })
        if error is not None:
            print(f"    FAILED: {error}", flush=True)
        else:
            print(
                f"    OK in {elapsed:.2f}s, {len(body)} bytes, "
                f"mem +{mem_after - mem_before:.1f}MB",
                flush=True,
            )
    return results


def main() -> int:
    """Parse command-line options, run the selected benchmarks, return 0."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--url",
        default="https://www.fotbal.cz/souteze/club/club/7eacd9f0-bfa0-4928-a9b6-936140168f58",
    )
    parser.add_argument(
        "--search-url",
        default="https://www.fotbal.cz/club/hledej?q=fotbalovy+klub+krnov",
    )
    parser.add_argument("--iterations", type=int, default=1)
    parser.add_argument(
        "--methods",
        default="all",
        help="Comma-separated: direct,scrapling,cloakbrowser,all",
    )
    args = parser.parse_args()

    # Execution order matches the order of this table.
    available = [
        ("direct", "Direct HTTP (urllib)", direct_fetch),
        ("cloakbrowser", "CloakBrowser", cloakbrowser_fetch),
        ("scrapling", "Scrapling", scrapling_fetch),
    ]

    methods = [m.strip().lower() for m in args.methods.split(",")]
    test_all = "all" in methods

    # A typo in --methods would otherwise silently benchmark nothing.
    known = {key for key, _, _ in available} | {"all", ""}
    unknown = sorted(set(methods) - known)
    if unknown:
        print(
            f"Warning: ignoring unknown method(s): {', '.join(unknown)}",
            file=sys.stderr,
        )

    print("=" * 70)
    print("FACR Scraper Fetch Benchmark")
    print("=" * 70)
    print(f"Python: {sys.version}")
    print(f"Iterations per method: {args.iterations}")
    print()

    urls = [
        ("Club page", args.url),
        ("Search page", args.search_url),
    ]

    for label, url in urls:
        print(f"\n{'=' * 70}")
        print(f"Testing: {label}")
        print(f"URL: {url}")
        print("=" * 70)

        for key, title, fetch in available:
            if test_all or key in methods:
                print(f"\n--- {title} ---")
                benchmark_method(key, fetch, url, "", args.iterations)

    print("\n" + "=" * 70)
    print("Benchmark complete.")
    print("=" * 70)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())