Files
FacrScraper/scripts/benchmark_fetch.py
T
Tomas Dvorak ed61d8ab8e feat(scraper): implement CloakBrowser support and enhance request stealth
Integrate CloakBrowser to improve success rates against Cloudflare
challenges and implement more robust request handling in the Go backend.

- Add CloakBrowser integration to Dockerfile and requirements
- Implement domain-specific request semaphores in Go to prevent rate-limiting
- Add shared HTTP client with cookie jar and header preservation for
  better session management
- Enhance request headers in Go to include modern client hints (Sec-Ch-Ua)
- Add benchmarking scripts to compare fetch methods (urllib vs Scrapling
  vs CloakBrowser)
- Update docker-compose to support CloakBrowser environment variables
- Optimize Docker image by pre-downloading patched Chromium binaries
2026-05-17 17:52:52 +02:00

241 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Benchmark script comparing fetch methods:
1. Direct urllib (lightweight HTTP)
2. Scrapling (StealthyFetcher -> Chromium via patchright)
3. CloakBrowser (patched Chromium with stealth)
Usage:
.venv-scrapling/bin/python scripts/benchmark_fetch.py [--url URL] [--iterations N]
"""
import argparse
import gc
import os
import resource
import sys
import time
import urllib.request
import ssl
from pathlib import Path
# Make the packages installed in the project's .venv-scrapling importable when
# the script is started with an interpreter other than the venv's own.
venv = Path(__file__).parent.parent / ".venv-scrapling"
if venv.exists():
    import site

    # Prefer the site-packages directory that matches the running interpreter;
    # fall back to the historical python3.13 location when no such directory
    # exists, so the previous behaviour is preserved.
    _py_dir = f"python{sys.version_info.major}.{sys.version_info.minor}"
    _site_packages = venv / "lib" / _py_dir / "site-packages"
    if not _site_packages.is_dir():
        _site_packages = venv / "lib" / "python3.13" / "site-packages"
    site.addsitedir(str(_site_packages))
# User-Agent string sent by direct_fetch (a desktop Chrome on Windows).
BROWSER_UA = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/126.0",
        "Safari/537.36",
    )
)
# Lower-case byte markers associated with Cloudflare challenge pages.
# NOTE(review): nothing in the visible part of this file reads CF_SIGNS;
# looks_like_cloudflare_block keeps its own marker list - confirm whether
# this constant is still needed.
CF_SIGNS = [
    marker.encode("ascii")
    for marker in (
        "<title>just a moment...</title>",
        "attention required!",
        "enable javascript and cookies to continue",
        "checking if the site connection is secure",
        "cf-browser-verification",
        "/cdn-cgi/challenge-platform/",
    )
]
def looks_like_cloudflare_block(body: bytes) -> bool:
    """Report whether *body* looks like a Cloudflare challenge/block page.

    Matching is case-insensitive. An empty body is never a block. A page is
    flagged when it contains one of the challenge texts below, or when it
    references the challenge-platform script together with a challenge token
    variable. A bare ``/cdn-cgi/`` reference alone is not enough.
    """
    if not body:
        return False

    haystack = body.lower()

    # Texts that only appear on the interstitial itself, not on normal pages
    # that merely load Cloudflare assets.
    definitive_markers = (
        b"<title>just a moment...</title>",
        b"attention required!",
        b"enable javascript and cookies to continue",
        b"checking if the site connection is secure",
    )
    if any(marker in haystack for marker in definitive_markers):
        return True

    # Secondary signal: challenge script plus one of its token variables.
    has_challenge_script = b"/cdn-cgi/challenge-platform/" in haystack
    has_challenge_token = (
        b"window._cf_chl_opt" in haystack or b"__cf_chl_rt_tk" in haystack
    )
    return has_challenge_script and has_challenge_token
def get_memory_mb() -> float:
    """Return the peak resident set size (RSS) of this process in MB.

    ``ru_maxrss`` is a high-water mark rather than the current RSS, so the
    value never decreases while the process is alive. Only the calling
    process is measured (``RUSAGE_SELF``); memory used by child processes is
    not included.

    ``ru_maxrss`` is reported in kilobytes on Linux but in bytes on macOS, so
    the unit is normalised per platform before converting to MB.
    """
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform == "darwin":
        return peak / (1024.0 * 1024.0)  # bytes -> MB on macOS
    return peak / 1024.0  # KB -> MB on Linux
def direct_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Fetch *url* with a plain urllib request and return the response body.

    Browser-like headers are sent; a ``Referer`` header is added only when
    *referer* is non-empty. *timeout* is passed to ``urlopen``.

    Raises:
        RuntimeError: if the body looks like a Cloudflare challenge page.
    """
    request_headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        request_headers["Referer"] = referer
    request = urllib.request.Request(url, headers=request_headers)

    # NOTE(review): certificate and hostname verification are both disabled
    # for this request - confirm this is acceptable outside of benchmarking.
    tls_context = ssl.create_default_context()
    tls_context.check_hostname = False
    tls_context.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(
        request, timeout=timeout, context=tls_context
    ) as response:
        payload = response.read()

    if looks_like_cloudflare_block(payload):
        raise RuntimeError("Cloudflare block detected")
    return payload
def _response_to_bytes(response) -> bytes:
    """Extract the page content of a Scrapling response as bytes."""
    body = getattr(response, "body", None)
    if isinstance(body, (bytes, bytearray)):
        return bytes(body)
    if isinstance(body, str):
        return body.encode("utf-8")
    text = getattr(response, "text", None)
    if isinstance(text, str):
        return text.encode("utf-8")
    # Last resort: whatever the response object stringifies to.
    return str(response).encode("utf-8")


def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 90000, wait_ms: int = 500) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher`` and return the body.

    The fetcher is called headless with ``solve_cloudflare`` enabled;
    *timeout_ms* and *wait_ms* are forwarded as its ``timeout`` and ``wait``
    options. A ``Referer`` header is sent only when *referer* is non-empty.

    Raises:
        RuntimeError: if the response status is an integer >= 400, or if the
            returned body still looks like a Cloudflare challenge page. The
            latter check matches the other fetchers in this script, so an
            unsolved challenge is not counted as a successful fetch.
    """
    # Imported lazily so the script still runs when Scrapling is not installed
    # and this method is not selected.
    from scrapling.fetchers import StealthyFetcher

    fetch_kwargs = {
        "headless": True,
        "network_idle": False,
        "google_search": False,
        "solve_cloudflare": True,
        "timeout": timeout_ms,
        "wait": wait_ms,
    }
    if referer:
        fetch_kwargs["extra_headers"] = {"Referer": referer}

    response = StealthyFetcher.fetch(url, **fetch_kwargs)

    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"HTTP {status}")

    body = _response_to_bytes(response)
    if looks_like_cloudflare_block(body):
        raise RuntimeError("Cloudflare block detected")
    return body
def cloakbrowser_fetch(url: str, referer: str = "", timeout_ms: int = 90000) -> bytes:
    """Fetch *url* with a headless CloakBrowser context and return the HTML.

    A fresh context is launched for every call and always closed again. The
    page is loaded with ``wait_until="networkidle"`` and *timeout_ms* as the
    navigation timeout; a ``Referer`` header is sent only when *referer* is
    non-empty. The rendered HTML is returned UTF-8 encoded.

    Raises:
        RuntimeError: if the rendered page looks like a Cloudflare challenge.
    """
    # Imported lazily so the script still runs when CloakBrowser is not
    # installed and this method is not selected.
    from cloakbrowser import launch_context

    ctx = launch_context(headless=True)
    try:
        # Page creation happens inside the try block so that a failure here
        # still closes the launched context instead of leaking it.
        page = ctx.new_page()
        if referer:
            page.set_extra_http_headers({"Referer": referer})
        page.goto(url, timeout=timeout_ms, wait_until="networkidle")
        body = page.content().encode("utf-8")
        if looks_like_cloudflare_block(body):
            raise RuntimeError("Cloudflare block detected")
        return body
    finally:
        ctx.close()
def benchmark_method(name: str, fn, url: str, referer: str, iterations: int = 1):
    """Call ``fn(url, referer)`` *iterations* times and record each run.

    Progress and the outcome of every run are printed as they happen. An
    exception raised by *fn* or a falsy return value marks the run as failed.

    Returns:
        A list with one dict per run, holding the keys ``iteration``,
        ``elapsed_sec``, ``success``, ``error``, ``body_size``,
        ``mem_before_mb``, ``mem_after_mb`` and ``mem_delta_mb``.
    """
    # Collect garbage once up front, before the first memory reading.
    gc.collect()

    runs = []
    for run_no in range(1, iterations + 1):
        print(f" [{name}] iteration {run_no}/{iterations}...", flush=True)

        payload = b""
        failure = None
        mem_start = get_memory_mb()
        started_at = time.monotonic()
        try:
            payload = fn(url, referer)
            if not payload:
                failure = "empty body"
        except Exception as exc:
            # Any failure of the fetch method is recorded, not propagated.
            failure = str(exc)
        duration = time.monotonic() - started_at
        mem_end = get_memory_mb()

        size = len(payload)
        mem_delta = mem_end - mem_start
        record = {
            "iteration": run_no,
            "elapsed_sec": duration,
            "success": failure is None,
            "error": failure,
            "body_size": size,
            "mem_before_mb": mem_start,
            "mem_after_mb": mem_end,
            "mem_delta_mb": mem_delta,
        }
        runs.append(record)

        if failure:
            print(f" FAILED: {failure}", flush=True)
        else:
            print(f" OK in {duration:.2f}s, {size} bytes, mem +{mem_delta:.1f}MB", flush=True)

    return runs
def main() -> int:
    """Parse the command line and benchmark the selected fetch methods.

    Every selected method is run against the club page (``--url``) and the
    search page (``--search-url``), ``--iterations`` times each. ``--methods``
    is a comma-separated list of ``direct``, ``scrapling``, ``cloakbrowser``
    or ``all``; unknown or empty selections are rejected with a usage error
    (exit status 2) instead of silently running nothing.

    Returns:
        0 once all benchmarks have been run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", default="https://www.fotbal.cz/souteze/club/club/7eacd9f0-bfa0-4928-a9b6-936140168f58")
    parser.add_argument("--search-url", default="https://www.fotbal.cz/club/hledej?q=fotbalovy+klub+krnov")
    parser.add_argument("--iterations", type=int, default=1)
    parser.add_argument("--methods", default="all", help="Comma-separated: direct,scrapling,cloakbrowser,all")
    args = parser.parse_args()

    # Insertion order is the execution order: direct, CloakBrowser, Scrapling.
    fetchers = {
        "direct": ("Direct HTTP (urllib)", direct_fetch),
        "cloakbrowser": ("CloakBrowser", cloakbrowser_fetch),
        "scrapling": ("Scrapling", scrapling_fetch),
    }

    requested = [m.strip().lower() for m in args.methods.split(",")]
    requested = [m for m in requested if m]
    unknown = sorted(set(requested) - set(fetchers) - {"all"})
    if unknown:
        valid = ", ".join([*fetchers, "all"])
        parser.error(f"unknown method(s): {', '.join(unknown)} (choose from {valid})")
    if not requested:
        parser.error("--methods must name at least one method")

    test_all = "all" in requested
    selected = [key for key in fetchers if test_all or key in requested]

    rule = "=" * 70
    print(rule)
    print("FACR Scraper Fetch Benchmark")
    print(rule)
    print(f"Python: {sys.version}")
    print(f"Iterations per method: {args.iterations}")
    print()

    urls = [
        ("Club page", args.url),
        ("Search page", args.search_url),
    ]
    for label, url in urls:
        print(f"\n{rule}")
        print(f"Testing: {label}")
        print(f"URL: {url}")
        print(rule)
        for key in selected:
            title, fetch = fetchers[key]
            print(f"\n--- {title} ---")
            benchmark_method(key, fetch, url, "", args.iterations)

    print("\n" + rule)
    print("Benchmark complete.")
    print(rule)
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())