mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
feat(scraper): implement CloakBrowser support and enhance request stealth
Integrate CloakBrowser to improve success rates against Cloudflare challenges and implement more robust request handling in the Go backend.

- Add CloakBrowser integration to Dockerfile and requirements
- Implement domain-specific request semaphores in Go to prevent rate-limiting
- Add shared HTTP client with cookie jar and header preservation for better session management
- Enhance request headers in Go to include modern client hints (Sec-Ch-Ua)
- Add benchmarking scripts to compare fetch methods (urllib vs Scrapling vs CloakBrowser)
- Update docker-compose to support CloakBrowser environment variables
- Optimize Docker image by pre-downloading patched Chromium binaries
This commit is contained in:
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Benchmark script comparing fetch methods:
|
||||
1. Direct urllib (lightweight HTTP)
|
||||
2. Scrapling (StealthyFetcher -> Chromium via patchright)
|
||||
3. CloakBrowser (patched Chromium with stealth)
|
||||
|
||||
Usage:
|
||||
.venv-scrapling/bin/python scripts/benchmark_fetch.py [--url URL] [--iterations N]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import os
|
||||
import resource
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import ssl
|
||||
from pathlib import Path
|
||||
|
||||
# Make the project's ".venv-scrapling" packages importable when this script is
# started with an interpreter that lives outside that virtualenv.
venv = Path(__file__).parent.parent / ".venv-scrapling"
if venv.exists():
    import site

    _venv_lib = venv / "lib"
    # Prefer the site-packages directory that matches the running interpreter
    # instead of assuming the venv was created with one specific Python version.
    _site_dirs = [
        _venv_lib
        / f"python{sys.version_info.major}.{sys.version_info.minor}"
        / "site-packages"
    ]
    if not _site_dirs[0].is_dir():
        # No matching version: fall back to whatever the venv actually contains.
        _site_dirs = sorted(_venv_lib.glob("python*/site-packages"))
    for _site_dir in _site_dirs:
        site.addsitedir(str(_site_dir))
|
||||
|
||||
# User-Agent header sent by the plain urllib fetcher; assembled from its
# space-separated product tokens.
BROWSER_UA = " ".join(
    (
        "Mozilla/5.0",
        "(Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/126.0",
        "Safari/537.36",
    )
)
|
||||
|
||||
# Lower-case byte markers associated with Cloudflare challenge / block pages.
# NOTE(review): no code visible in this section reads this list; the detector
# function keeps its own, stricter set of markers.
CF_SIGNS = [
    # Interstitial page title and visible challenge text.
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    # Markup / script references.
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]
|
||||
|
||||
|
||||
def looks_like_cloudflare_block(body: bytes) -> bool:
    """Report whether *body* looks like a Cloudflare challenge page.

    The match is case-insensitive. A body counts as blocked when it contains
    one of the challenge-page phrases, or when it references the challenge
    platform script *and* carries a challenge token. A bare reference to
    ``/cdn-cgi/challenge-platform/`` is not enough on its own.

    Args:
        body: Raw response bytes; an empty value is never a block.

    Returns:
        True if the body matches a challenge pattern, otherwise False.
    """
    if not body:
        return False

    lowered = body.lower()

    # Phrases taken from the challenge page itself (title / visible text).
    page_markers = (
        b"<title>just a moment...</title>",
        b"attention required!",
        b"enable javascript and cookies to continue",
        b"checking if the site connection is secure",
    )
    if any(marker in lowered for marker in page_markers):
        return True

    # Weaker evidence: the challenge script only counts together with a token.
    has_challenge_script = b"/cdn-cgi/challenge-platform/" in lowered
    has_challenge_token = (
        b"window._cf_chl_opt" in lowered or b"__cf_chl_rt_tk" in lowered
    )
    return has_challenge_script and has_challenge_token
|
||||
|
||||
|
||||
def get_memory_mb() -> float:
    """Return the peak resident set size of this process in MiB.

    The value comes from ``ru_maxrss``, which is a high-water mark: it never
    decreases, so the difference between two readings shows how much the peak
    grew, not how much memory is currently held. Only the calling process is
    measured (``RUSAGE_SELF``); child processes are not included.

    Returns:
        Peak RSS in mebibytes.
    """
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS.
    divisor = 1024.0 * 1024.0 if sys.platform == "darwin" else 1024.0
    return peak / divisor
|
||||
|
||||
|
||||
def direct_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Fetch *url* with a single urllib GET and return the raw body.

    Args:
        url: Address to request.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as bytes.

    Raises:
        RuntimeError: If the body looks like a Cloudflare challenge page.
    """
    request_headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        # Ask for an uncompressed body so the bytes read are the document itself.
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        request_headers["Referer"] = referer
    request = urllib.request.Request(url, headers=request_headers)

    # NOTE(review): hostname and certificate checks are switched off here, so
    # the TLS peer is not authenticated - confirm this is intended.
    tls_context = ssl.create_default_context()
    tls_context.check_hostname = False
    tls_context.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(request, timeout=timeout, context=tls_context) as response:
        payload = response.read()

    if looks_like_cloudflare_block(payload):
        raise RuntimeError("Cloudflare block detected")
    return payload
|
||||
|
||||
|
||||
def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 90000, wait_ms: int = 500) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher`` and return the body.

    Args:
        url: Address to load.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout_ms: Passed to the fetcher as ``timeout``.
        wait_ms: Passed to the fetcher as ``wait``.

    Returns:
        The page body as bytes. Text results are encoded as UTF-8.

    Raises:
        RuntimeError: If the response reports an HTTP status >= 400, or if the
            returned body still looks like a Cloudflare challenge page.
    """
    # Imported here so a missing Scrapling install only fails this method.
    from scrapling.fetchers import StealthyFetcher

    fetch_kwargs = {
        "headless": True,
        "network_idle": False,
        "google_search": False,
        "solve_cloudflare": True,
        "timeout": timeout_ms,
        "wait": wait_ms,
    }
    if referer:
        fetch_kwargs["extra_headers"] = {"Referer": referer}

    response = StealthyFetcher.fetch(url, **fetch_kwargs)

    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"HTTP {status}")

    # The response object may expose the payload as bytes, as str, or only via
    # ``text``; normalise all of them to bytes.
    body = getattr(response, "body", None)
    if isinstance(body, (bytes, bytearray)):
        payload = bytes(body)
    elif isinstance(body, str):
        payload = body.encode("utf-8")
    else:
        text = getattr(response, "text", None)
        if isinstance(text, str):
            payload = text.encode("utf-8")
        else:
            payload = str(response).encode("utf-8")

    # Same check the other fetchers apply: an unsolved challenge page must not
    # be reported as a successful fetch.
    if looks_like_cloudflare_block(payload):
        raise RuntimeError("Cloudflare block detected")
    return payload
|
||||
|
||||
|
||||
def cloakbrowser_fetch(url: str, referer: str = "", timeout_ms: int = 90000) -> bytes:
    """Load *url* in a headless CloakBrowser context and return the page HTML.

    Args:
        url: Address to navigate to.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout_ms: Navigation timeout passed to ``page.goto``.

    Returns:
        The rendered page content encoded as UTF-8 bytes.

    Raises:
        RuntimeError: If the rendered page looks like a Cloudflare challenge.
    """
    # Imported here so a missing CloakBrowser install only fails this method.
    from cloakbrowser import launch_context

    ctx = launch_context(headless=True)
    try:
        # Opening the page inside the try block guarantees the context is
        # closed even when page creation itself fails.
        page = ctx.new_page()

        if referer:
            page.set_extra_http_headers({"Referer": referer})

        page.goto(url, timeout=timeout_ms, wait_until="networkidle")
        body = page.content().encode("utf-8")

        if looks_like_cloudflare_block(body):
            raise RuntimeError("Cloudflare block detected")
        return body
    finally:
        ctx.close()
|
||||
|
||||
|
||||
def benchmark_method(name: str, fn, url: str, referer: str, iterations: int = 1):
    """Run one fetch method repeatedly and collect timing and memory figures.

    Args:
        name: Label used in progress output.
        fn: Callable invoked as ``fn(url, referer)``; expected to return bytes.
        url: Address handed to *fn*.
        referer: Referer value handed to *fn*.
        iterations: Number of times to call *fn*.

    Returns:
        A list with one dict per iteration, holding the keys ``iteration``,
        ``elapsed_sec``, ``success``, ``error``, ``body_size``,
        ``mem_before_mb``, ``mem_after_mb`` and ``mem_delta_mb``.
    """
    gc.collect()
    results = []

    for i in range(iterations):
        print(f" [{name}] iteration {i + 1}/{iterations}...", flush=True)
        mem_before = get_memory_mb()
        start = time.monotonic()
        error = None
        body = b""
        try:
            body = fn(url, referer)
        except Exception as exc:
            # A failing method is a benchmark result, not a reason to abort.
            # Fall back to the exception type so that an exception with an
            # empty message is still reported as a failure below.
            error = str(exc) or type(exc).__name__
        else:
            if not body:
                error = "empty body"
                # Covers a fetcher returning None, which has no len().
                body = b""
        elapsed = time.monotonic() - start
        mem_after = get_memory_mb()
        mem_delta = mem_after - mem_before

        results.append({
            "iteration": i + 1,
            "elapsed_sec": elapsed,
            "success": error is None,
            "error": error,
            "body_size": len(body),
            "mem_before_mb": mem_before,
            "mem_after_mb": mem_after,
            "mem_delta_mb": mem_delta,
        })

        if error is not None:
            print(f" FAILED: {error}", flush=True)
        else:
            print(f" OK in {elapsed:.2f}s, {len(body)} bytes, mem +{mem_delta:.1f}MB", flush=True)

    return results
|
||||
|
||||
|
||||
def main() -> int:
    """Parse command-line options and benchmark each selected fetch method.

    Every selected method is run against the club page URL and then against
    the search page URL. Progress and results are printed to stdout.

    Returns:
        Process exit code; always 0 once the benchmarks have run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", default="https://www.fotbal.cz/souteze/club/club/7eacd9f0-bfa0-4928-a9b6-936140168f58")
    parser.add_argument("--search-url", default="https://www.fotbal.cz/club/hledej?q=fotbalovy+klub+krnov")
    parser.add_argument("--iterations", type=int, default=1)
    parser.add_argument("--methods", default="all", help="Comma-separated: direct,scrapling,cloakbrowser,all")
    args = parser.parse_args()

    methods = [m.strip().lower() for m in args.methods.split(",")]
    test_all = "all" in methods

    # (selector, section title, fetch function) in execution order.
    available = (
        ("direct", "Direct HTTP (urllib)", direct_fetch),
        ("cloakbrowser", "CloakBrowser", cloakbrowser_fetch),
        ("scrapling", "Scrapling", scrapling_fetch),
    )

    # A misspelled method name would otherwise be skipped without any hint.
    known = {"all"} | {selector for selector, _, _ in available}
    unknown = sorted({m for m in methods if m and m not in known})
    if unknown:
        print(f"warning: ignoring unknown method(s): {', '.join(unknown)}", file=sys.stderr)

    print("=" * 70)
    print("FACR Scraper Fetch Benchmark")
    print("=" * 70)
    print(f"Python: {sys.version}")
    print(f"Iterations per method: {args.iterations}")
    print()

    urls = [
        ("Club page", args.url),
        ("Search page", args.search_url),
    ]

    for label, url in urls:
        print(f"\n{'=' * 70}")
        print(f"Testing: {label}")
        print(f"URL: {url}")
        print("=" * 70)

        for selector, title, fetch in available:
            if test_all or selector in methods:
                print(f"\n--- {title} ---")
                benchmark_method(selector, fetch, url, "", args.iterations)

    print("\n" + "=" * 70)
    print("Benchmark complete.")
    print("=" * 70)
    return 0
|
||||
|
||||
|
||||
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user