Files
Tomas Dvorak ed61d8ab8e feat(scraper): implement CloakBrowser support and enhance request stealth
Integrate CloakBrowser to improve success rates against Cloudflare
challenges and implement more robust request handling in the Go backend.

- Add CloakBrowser integration to Dockerfile and requirements
- Implement domain-specific request semaphores in Go to prevent rate-limiting
- Add shared HTTP client with cookie jar and header preservation for
  better session management
- Enhance request headers in Go to include modern client hints (Sec-Ch-Ua)
- Add benchmarking scripts to compare fetch methods (urllib vs Scrapling
  vs CloakBrowser)
- Update docker-compose to support CloakBrowser environment variables
- Optimize Docker image by pre-downloading patched Chromium binaries
2026-05-17 17:52:52 +02:00

139 lines
4.2 KiB
Python

#!/usr/bin/env python3
import argparse
import contextlib
import logging
import ssl
import sys
import urllib.request
# User-Agent header value sent by lightweight_fetch; the literal is split
# across two adjacent strings purely to keep the source lines short.
BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)
# Byte markers that looks_like_cloudflare_block searches for in a response
# body. The body is lowercased before matching, so every entry here must be
# written in lowercase to be able to match.
CF_SIGNS = [
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]
def looks_like_cloudflare_block(body: bytes) -> bool:
    """Report whether *body* contains any marker listed in ``CF_SIGNS``.

    The body is lowercased once and every marker is then looked up as a
    plain substring of that lowercased copy. An empty (falsy) body is never
    reported as a block.
    """
    if not body:
        return False
    haystack = body.lower()
    return any(marker in haystack for marker in CF_SIGNS)
def response_body_bytes(response) -> bytes:
    """Return the payload of *response* as ``bytes``.

    Sources are tried in this order:

    1. ``response.body`` when it is ``bytes``/``bytearray`` (copied to
       ``bytes``) or ``str`` (encoded as UTF-8);
    2. ``response.text`` when it is a ``str`` (encoded as UTF-8);
    3. ``str(response)`` encoded as UTF-8, as a last resort.
    """
    payload = getattr(response, "body", None)
    if isinstance(payload, (bytes, bytearray)):
        return bytes(payload)

    # From here on the result is always some text encoded as UTF-8; pick
    # the first textual source that is available.
    if isinstance(payload, str):
        textual = payload
    else:
        textual = getattr(response, "text", None)
        if not isinstance(textual, str):
            textual = str(response)
    return textual.encode("utf-8")
def lightweight_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Fetch *url* with plain ``urllib`` while sending browser-like headers.

    Args:
        url: Address to request.
        referer: Value for the ``Referer`` header; the header is omitted
            when this is empty.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The raw response body exactly as read from the connection.

    Raises:
        RuntimeError: If the body matches ``looks_like_cloudflare_block``.
        Any exception raised by ``urlopen`` or ``read`` is not caught here
        and propagates to the caller.
    """
    headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        # The body is returned without any decompression step, so ask the
        # server not to compress it.
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        headers["Referer"] = referer
    request = urllib.request.Request(url, headers=headers)

    # NOTE(review): hostname checking and certificate verification are both
    # switched off here, so TLS peers are not authenticated - confirm this
    # is intentional. check_hostname must be cleared before verify_mode can
    # be set to CERT_NONE.
    tls = ssl.create_default_context()
    tls.check_hostname = False
    tls.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(request, timeout=timeout, context=tls) as resp:
        payload = resp.read()

    if looks_like_cloudflare_block(payload):
        raise RuntimeError(" lightweight fetch returned Cloudflare challenge")
    return payload
def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 30000, wait_ms: int = 500) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher``.

    Args:
        url: Address to request.
        referer: Sent as a ``Referer`` extra header when non-empty.
        timeout_ms: Passed to the fetcher as ``timeout``.
        wait_ms: Passed to the fetcher as ``wait``.

    Returns:
        The response payload converted by ``response_body_bytes``.

    Raises:
        RuntimeError: If Scrapling cannot be imported, if the response has
            an integer ``status`` of 400 or above, or if the payload is
            empty.
    """
    # Imported lazily so the rest of the script works without Scrapling.
    try:
        from scrapling.fetchers import StealthyFetcher
    except Exception as exc:
        raise RuntimeError(f"Scrapling import failed: {exc}") from exc

    # Raises the root logger threshold so only errors are logged.
    logging.getLogger().setLevel(logging.ERROR)

    # Increase challenge-solving timeout; network_idle can interfere with
    # ongoing Cloudflare polling so it is disabled.
    options = {
        "headless": True,
        "network_idle": False,
        "google_search": False,
        "solve_cloudflare": True,
        "timeout": timeout_ms,
        "wait": wait_ms,
    }
    if referer:
        options["extra_headers"] = {"Referer": referer}

    # Anything printed to stdout during the fetch is diverted to stderr,
    # keeping stdout free for the page body written by the caller.
    with contextlib.redirect_stdout(sys.stderr):
        response = StealthyFetcher.fetch(url, **options)

    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"Scrapling returned HTTP {status}")

    payload = response_body_bytes(response)
    if payload:
        return payload
    raise RuntimeError("Scrapling returned an empty body")
def main() -> int:
    """Command-line entry point: fetch ``--url`` and write its body to stdout.

    Options: ``--url`` (required), ``--referer`` (default empty),
    ``--timeout-ms`` (default 30000) and ``--wait-ms`` (default 500).

    ``lightweight_fetch`` is tried first, with its timeout capped at 15
    seconds; if that raises, ``scrapling_fetch`` is used. Each failure is
    reported on stderr.

    Returns:
        0 once a body has been written to stdout, 1 if both fetchers fail.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--url", required=True)
    cli.add_argument("--referer", default="")
    cli.add_argument("--timeout-ms", type=int, default=30000)
    cli.add_argument("--wait-ms", type=int, default=500)
    opts = cli.parse_args()

    # 1) Cheap attempt: plain urllib request, no browser involved.
    try:
        page = lightweight_fetch(
            opts.url,
            opts.referer,
            timeout=min(opts.timeout_ms / 1000.0, 15.0),
        )
        sys.stdout.buffer.write(page)
    except Exception as exc:
        print(f"Lightweight fetch failed: {exc}", file=sys.stderr)
    else:
        return 0

    # 2) Fallback: Scrapling, reached only when the cheap attempt failed.
    try:
        page = scrapling_fetch(opts.url, opts.referer, opts.timeout_ms, opts.wait_ms)
        sys.stdout.buffer.write(page)
    except Exception as exc:
        print(f"Scrapling fetch failed: {exc}", file=sys.stderr)
        return 1
    return 0
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())