FacrScraper/scripts/benchmark_fetch.py

#!/usr/bin/env python3
"""
Benchmark script comparing fetch methods:
1. Direct urllib (lightweight HTTP)
2. Scrapling (StealthyFetcher -> Chromium via patchright)
3. CloakBrowser (patched Chromium with stealth)

Usage:
    .venv-scrapling/bin/python scripts/benchmark_fetch.py [--url URL] [--iterations N]
"""

import argparse
import gc
import os
import resource
import sys
import time
import urllib.request
import ssl
from pathlib import Path

# Make the project's virtualenv packages importable when this script is run
# with an interpreter other than the venv's own.  The site-packages directory
# is discovered by glob rather than by hard-coding one Python minor version,
# so a venv built with a different Python release is still picked up.
venv = Path(__file__).parent.parent / ".venv-scrapling"
if venv.exists():
    import site

    for site_packages in sorted(venv.glob("lib/python*/site-packages")):
        site.addsitedir(str(site_packages))

# User-Agent header value sent by direct_fetch(); it identifies the request
# as desktop Chrome 126 on 64-bit Windows 10.
BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)

# Lower-case byte markers that presumably indicate a Cloudflare
# challenge/block page.
# NOTE(review): nothing in the visible code references this list;
# looks_like_cloudflare_block() defines its own, stricter set of signals.
# Confirm whether this constant is still needed.
CF_SIGNS = [
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]


def looks_like_cloudflare_block(body: bytes) -> bool:
    """Report whether *body* looks like a Cloudflare challenge page.

    Matching is case-insensitive.  An empty body is never treated as a block.
    A page counts as a challenge when it contains one of the challenge
    phrases, or when it references the challenge-platform script *and* also
    carries one of the challenge tokens.
    """
    if not body:
        return False

    lowered = body.lower()

    # Phrases taken from the challenge page itself; any one of them is enough.
    challenge_phrases = (
        b"<title>just a moment...</title>",
        b"attention required!",
        b"enable javascript and cookies to continue",
        b"checking if the site connection is secure",
    )
    if any(phrase in lowered for phrase in challenge_phrases):
        return True

    # A bare reference to the challenge-platform script is not sufficient on
    # its own; it must be accompanied by a challenge token.
    if b"/cdn-cgi/challenge-platform/" not in lowered:
        return False
    return b"window._cf_chl_opt" in lowered or b"__cf_chl_rt_tk" in lowered


def get_memory_mb() -> float:
    """Return the peak resident set size of this process in MB.

    ``ru_maxrss`` is a high-water mark rather than the current RSS, so the
    returned value never decreases during the life of the process.  Only this
    process is measured (``RUSAGE_SELF``); memory used by child processes is
    not included.

    ``ru_maxrss`` is reported in kilobytes on Linux but in bytes on macOS, so
    the divisor is chosen per platform.
    """
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform == "darwin":
        return peak / (1024.0 * 1024.0)  # bytes -> MB on macOS
    return peak / 1024.0  # KB -> MB on Linux


def direct_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Fetch *url* with a plain urllib GET request and return the raw body.

    Args:
        url: Address to fetch.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as bytes.

    Raises:
        RuntimeError: If the response (including the body of an HTTP error
            response) looks like a Cloudflare challenge page.
        urllib.error.HTTPError: For other 4xx/5xx responses.
        urllib.error.URLError: For connection-level failures.
    """
    # Imported by name so that the module-level ``urllib`` binding is not
    # shadowed inside this function.
    from urllib.error import HTTPError

    headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        # Ask for an uncompressed body; urllib does not decompress responses.
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        headers["Referer"] = referer
    req = urllib.request.Request(url, headers=headers)

    # NOTE(review): certificate and hostname verification are disabled here,
    # which makes the request vulnerable to interception.  Kept as in the
    # original; confirm this is intentional for the benchmark.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    try:
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            body = resp.read()
    except HTTPError as exc:
        # urlopen raises for 4xx/5xx before the body can be inspected, and
        # challenge pages are typically served with such a status.  Read the
        # error body so a block is reported as a block, not a generic error.
        try:
            error_body = exc.read()
        except OSError:
            error_body = b""
        if looks_like_cloudflare_block(error_body):
            raise RuntimeError("Cloudflare block detected") from exc
        raise

    if looks_like_cloudflare_block(body):
        raise RuntimeError("Cloudflare block detected")
    return body


def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 90000, wait_ms: int = 500) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher`` and return the body.

    Args:
        url: Address to fetch.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout_ms: Forwarded to ``StealthyFetcher.fetch`` as ``timeout``.
        wait_ms: Forwarded to ``StealthyFetcher.fetch`` as ``wait``.

    Returns:
        The response body encoded as bytes.

    Raises:
        RuntimeError: If the response status is an integer >= 400, or if the
            returned body still looks like a Cloudflare challenge page.
    """
    # Imported lazily so the script still runs when scrapling is not installed
    # and this method is not selected.
    from scrapling.fetchers import StealthyFetcher

    fetch_kwargs = {
        "headless": True,
        "network_idle": False,
        "google_search": False,
        "solve_cloudflare": True,
        "timeout": timeout_ms,
        "wait": wait_ms,
    }
    if referer:
        fetch_kwargs["extra_headers"] = {"Referer": referer}

    response = StealthyFetcher.fetch(url, **fetch_kwargs)

    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"HTTP {status}")

    # The response object's shape is probed defensively: prefer ``body``
    # (bytes or str), then ``text``, and finally the string form of the
    # response itself.
    raw = getattr(response, "body", None)
    if isinstance(raw, (bytes, bytearray)):
        body = bytes(raw)
    elif isinstance(raw, str):
        body = raw.encode("utf-8")
    else:
        text = getattr(response, "text", None)
        if isinstance(text, str):
            body = text.encode("utf-8")
        else:
            body = str(response).encode("utf-8")

    # Apply the same block detection as the other fetchers so an unsolved
    # challenge page is not counted as a successful fetch.
    if looks_like_cloudflare_block(body):
        raise RuntimeError("Cloudflare block detected")
    return body


def cloakbrowser_fetch(url: str, referer: str = "", timeout_ms: int = 90000) -> bytes:
    """Fetch *url* with CloakBrowser and return the rendered HTML as bytes.

    Args:
        url: Address to navigate to.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout_ms: Navigation timeout passed to ``page.goto``.

    Returns:
        The page content encoded as UTF-8.

    Raises:
        RuntimeError: If the rendered page looks like a Cloudflare challenge.
    """
    # Imported lazily so the script still runs when cloakbrowser is not
    # installed and this method is not selected.
    from cloakbrowser import launch_context

    ctx = launch_context(headless=True)
    try:
        # Page creation happens inside the try block so the context is closed
        # even if opening the page fails.
        page = ctx.new_page()

        if referer:
            page.set_extra_http_headers({"Referer": referer})

        page.goto(url, timeout=timeout_ms, wait_until="networkidle")
        body = page.content().encode("utf-8")

        if looks_like_cloudflare_block(body):
            raise RuntimeError("Cloudflare block detected")
        return body
    finally:
        ctx.close()


def benchmark_method(name: str, fn, url: str, referer: str, iterations: int = 1) -> list[dict]:
    """Run a fetch method repeatedly and collect timing and memory metadata.

    Args:
        name: Label used in progress output.
        fn: Callable invoked as ``fn(url, referer)``; expected to return the
            fetched body.
        url: Address handed to *fn*.
        referer: Referer value handed to *fn*.
        iterations: Number of times to call *fn*.

    Returns:
        One dict per iteration with the keys ``iteration``, ``elapsed_sec``,
        ``success``, ``error``, ``body_size``, ``mem_before_mb``,
        ``mem_after_mb`` and ``mem_delta_mb``.
    """
    gc.collect()
    results = []

    for iteration in range(1, iterations + 1):
        print(f"  [{name}] iteration {iteration}/{iterations}...", flush=True)
        mem_before = get_memory_mb()
        start = time.monotonic()
        error = None
        body = b""
        try:
            # Normalise a None/empty return so len(body) below cannot fail.
            body = fn(url, referer) or b""
            if not body:
                error = "empty body"
        except Exception as exc:
            # Deliberately broad: a failing fetch is a benchmark result, not a
            # reason to abort.  Fall back to the exception type name so an
            # exception with an empty message is still reported as a failure.
            error = str(exc) or type(exc).__name__
        elapsed = time.monotonic() - start
        mem_after = get_memory_mb()
        mem_delta = mem_after - mem_before

        results.append({
            "iteration": iteration,
            "elapsed_sec": elapsed,
            "success": error is None,
            "error": error,
            "body_size": len(body),
            "mem_before_mb": mem_before,
            "mem_after_mb": mem_after,
            "mem_delta_mb": mem_delta,
        })

        # Use the same "is None" test as the recorded "success" flag so the
        # printed status can never disagree with the stored result.
        if error is not None:
            print(f"    FAILED: {error}", flush=True)
        else:
            print(f"    OK in {elapsed:.2f}s, {len(body)} bytes, mem +{mem_delta:.1f}MB", flush=True)

    return results


def main() -> int:
    """Parse command-line options and benchmark the selected fetch methods.

    Each selected method is run against the club page URL and the search page
    URL.  Invalid ``--methods`` names or a non-positive ``--iterations`` value
    are rejected through ``parser.error`` instead of silently running nothing.

    Returns:
        ``0`` once all benchmarks have run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", default="https://www.fotbal.cz/souteze/club/club/7eacd9f0-bfa0-4928-a9b6-936140168f58")
    parser.add_argument("--search-url", default="https://www.fotbal.cz/club/hledej?q=fotbalovy+klub+krnov")
    parser.add_argument("--iterations", type=int, default=1)
    parser.add_argument("--methods", default="all", help="Comma-separated: direct,scrapling,cloakbrowser,all")
    args = parser.parse_args()

    if args.iterations < 1:
        parser.error("--iterations must be at least 1")

    # (CLI name, section heading, fetch function) in execution order.
    available = [
        ("direct", "Direct HTTP (urllib)", direct_fetch),
        ("cloakbrowser", "CloakBrowser", cloakbrowser_fetch),
        ("scrapling", "Scrapling", scrapling_fetch),
    ]
    known_names = {name for name, _, _ in available}

    # Drop empty entries so a stray comma ("direct,") is tolerated.
    methods = [m.strip().lower() for m in args.methods.split(",") if m.strip()]
    if not methods:
        parser.error("--methods must name at least one method")
    unknown = sorted(set(methods) - known_names - {"all"})
    if unknown:
        parser.error(f"unknown method(s): {', '.join(unknown)}")
    test_all = "all" in methods

    print("=" * 70)
    print("FACR Scraper Fetch Benchmark")
    print("=" * 70)
    print(f"Python: {sys.version}")
    print(f"Iterations per method: {args.iterations}")
    print()

    urls = [
        ("Club page", args.url),
        ("Search page", args.search_url),
    ]

    for label, url in urls:
        print(f"\n{'=' * 70}")
        print(f"Testing: {label}")
        print(f"URL: {url}")
        print("=" * 70)

        for name, heading, fetch in available:
            if test_all or name in methods:
                print(f"\n--- {heading} ---")
                benchmark_method(name, fetch, url, "", args.iterations)

    print("\n" + "=" * 70)
    print("Benchmark complete.")
    print("=" * 70)
    return 0


# Run the benchmark only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())