#!/usr/bin/env python3
"""Fetch a URL and write the raw response body to stdout.

A plain ``urllib`` request with browser-like headers is tried first. If that
fails for any reason (including a response body that looks like a Cloudflare
challenge page), the script falls back to Scrapling's ``StealthyFetcher``.
Diagnostics go to stderr so that stdout carries only the body.
"""
import argparse
import contextlib
import logging
import ssl
import sys
import urllib.request

BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)

# Lower-case byte markers; a body containing any of them is treated as a
# Cloudflare challenge/block page rather than real content.
CF_SIGNS = [
    b"just a moment...",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]


def looks_like_cloudflare_block(body: bytes) -> bool:
    """Return True if *body* contains any marker from ``CF_SIGNS``.

    The comparison is case-insensitive (the body is lower-cased once).
    An empty body is never considered a block page.
    """
    if not body:
        return False
    low = body.lower()
    return any(sig in low for sig in CF_SIGNS)


def response_body_bytes(response) -> bytes:
    """Extract the payload of a fetcher response object as ``bytes``.

    Lookup order: ``response.body`` (bytes/bytearray returned as bytes, str
    encoded as UTF-8), then ``response.text`` (str encoded as UTF-8), and
    finally ``str(response)`` encoded as UTF-8 as a last resort.
    """
    body = getattr(response, "body", None)
    if isinstance(body, (bytes, bytearray)):
        return bytes(body)
    if isinstance(body, str):
        return body.encode("utf-8")

    text = getattr(response, "text", None)
    if isinstance(text, str):
        return text.encode("utf-8")

    return str(response).encode("utf-8")


def lightweight_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Try a lightweight urllib fetch with browser headers first.

    Args:
        url: Address to request.
        referer: Sent as the ``Referer`` header when non-empty.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The raw response body.

    Raises:
        RuntimeError: If the body looks like a Cloudflare challenge page.
        Exception: Anything raised by ``urllib.request.urlopen`` propagates
            unchanged.
    """
    headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        # Ask for an uncompressed body so the bytes can be scanned and
        # emitted without decoding.
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        headers["Referer"] = referer
    req = urllib.request.Request(url, headers=headers)

    # NOTE(review): certificate and hostname verification are switched off
    # here, so HTTPS responses are not authenticated. Behavior is kept as-is;
    # confirm that this is intentional for the targets being fetched.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
        body = resp.read()

    if looks_like_cloudflare_block(body):
        raise RuntimeError(" lightweight fetch returned Cloudflare challenge")
    return body


def scrapling_fetch(
    url: str, referer: str = "", timeout_ms: int = 30000, wait_ms: int = 500
) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher``.

    Args:
        url: Address to request.
        referer: Sent as an extra ``Referer`` header when non-empty.
        timeout_ms: Forwarded to the fetcher as ``timeout``.
        wait_ms: Forwarded to the fetcher as ``wait``.

    Returns:
        The response payload as bytes (see ``response_body_bytes``).

    Raises:
        RuntimeError: If Scrapling cannot be imported, the response reports
            an integer status >= 400, or the extracted body is empty.
    """
    # Imported lazily so the lightweight path works without Scrapling.
    try:
        from scrapling.fetchers import StealthyFetcher
    except Exception as exc:
        raise RuntimeError(f"Scrapling import failed: {exc}") from exc

    # NOTE(review): this raises the ROOT logger threshold for the whole
    # process, not only for Scrapling's loggers.
    logging.getLogger().setLevel(logging.ERROR)

    extra_headers = {}
    if referer:
        extra_headers["Referer"] = referer

    # Increase challenge-solving timeout; network_idle can interfere with
    # ongoing Cloudflare polling so we disable it.
    fetch_kwargs = {
        "headless": True,
        "network_idle": False,
        "google_search": False,
        "solve_cloudflare": True,
        "timeout": timeout_ms,
        "wait": wait_ms,
    }
    if extra_headers:
        fetch_kwargs["extra_headers"] = extra_headers

    # Anything the fetcher prints is diverted to stderr so stdout stays
    # reserved for the response body.
    with contextlib.redirect_stdout(sys.stderr):
        response = StealthyFetcher.fetch(url, **fetch_kwargs)

    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"Scrapling returned HTTP {status}")

    body = response_body_bytes(response)
    if not body:
        raise RuntimeError("Scrapling returned an empty body")
    return body


def _write_body(body: bytes) -> int:
    """Write *body* to binary stdout; return 0 on success, 1 on I/O error."""
    try:
        sys.stdout.buffer.write(body)
        # Flush here so a closed pipe is reported now, not at shutdown.
        sys.stdout.buffer.flush()
    except OSError as exc:
        print(f"Writing output failed: {exc}", file=sys.stderr)
        return 1
    return 0


def main() -> int:
    """Parse CLI arguments, fetch the URL, and emit the body on stdout.

    Returns:
        0 when a body was fetched and written, 1 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", required=True)
    parser.add_argument("--referer", default="")
    parser.add_argument("--timeout-ms", type=int, default=30000)
    parser.add_argument("--wait-ms", type=int, default=500)
    args = parser.parse_args()

    # 1) Try lightweight urllib fetch first (no browser, instant).
    # Only the fetch is guarded: a failed stdout write must not be mistaken
    # for a fetch failure and trigger a second body being written.
    try:
        body = lightweight_fetch(
            args.url,
            args.referer,
            timeout=min(args.timeout_ms / 1000.0, 15.0),
        )
    except Exception as exc:
        print(f"Lightweight fetch failed: {exc}", file=sys.stderr)
    else:
        return _write_body(body)

    # 2) Fall back to Scrapling / Playwright only if lightweight failed.
    try:
        body = scrapling_fetch(args.url, args.referer, args.timeout_ms, args.wait_ms)
    except Exception as exc:
        print(f"Scrapling fetch failed: {exc}", file=sys.stderr)
        return 1
    return _write_body(body)


if __name__ == "__main__":
    raise SystemExit(main())