# FacrScraper/scripts/scrapling_fetch.py

#!/usr/bin/env python3

import argparse
import contextlib
import logging
import ssl
import sys
import urllib.request


# User-Agent header value (a desktop Chrome-on-Windows string) sent by
# lightweight_fetch so the plain urllib request identifies itself as a browser.
BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)

# Byte markers whose presence in a response body is treated as a Cloudflare
# challenge/interstitial page by looks_like_cloudflare_block.
# Entries must be lowercase: the body is lowercased before the substring test.
CF_SIGNS = [
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]


def looks_like_cloudflare_block(body: bytes) -> bool:
    """Return True when *body* contains any known Cloudflare challenge marker.

    The comparison is case-insensitive: the body is lowercased once and each
    entry of ``CF_SIGNS`` is searched for as a substring. An empty (or
    otherwise falsy) body is never considered a challenge page.
    """
    if not body:
        return False
    haystack = body.lower()
    return any(marker in haystack for marker in CF_SIGNS)


def response_body_bytes(response) -> bytes:
    """Extract the payload of a fetcher response object as ``bytes``.

    Lookup order:
      1. ``response.body`` when it is ``bytes``/``bytearray`` (copied to
         ``bytes``) or ``str`` (UTF-8 encoded);
      2. ``response.text`` when it is a ``str`` (UTF-8 encoded);
      3. ``str(response)`` UTF-8 encoded, as a last resort.
    """
    raw = getattr(response, "body", None)
    if isinstance(raw, str):
        return raw.encode("utf-8")
    if isinstance(raw, (bytes, bytearray)):
        return bytes(raw)

    # No usable ``body``: fall back to ``text``, then to the object's string form.
    candidate = getattr(response, "text", None)
    rendered = candidate if isinstance(candidate, str) else str(response)
    return rendered.encode("utf-8")


def lightweight_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Try a lightweight urllib fetch with browser headers first."""
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": BROWSER_UA,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
            "Accept-Encoding": "identity",
            "Connection": "keep-alive",
            **({"Referer": referer} if referer else {}),
        },
    )
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
        body = resp.read()
    if looks_like_cloudflare_block(body):
        raise RuntimeError(" lightweight fetch returned Cloudflare challenge")
    return body


def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 30000, wait_ms: int = 500) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher`` and return the body.

    Scrapling is imported lazily so the script still works for the
    lightweight path when the package is not installed.

    Args:
        url: Address to request.
        referer: Optional ``Referer`` value, forwarded via ``extra_headers``
            only when non-empty.
        timeout_ms: Forwarded as the fetcher's ``timeout`` option.
        wait_ms: Forwarded as the fetcher's ``wait`` option.

    Returns:
        The response payload as bytes (see ``response_body_bytes``).

    Raises:
        RuntimeError: Scrapling cannot be imported, the response carries an
            integer status of 400 or above, or the extracted body is empty.
    """
    try:
        from scrapling.fetchers import StealthyFetcher
    except Exception as exc:
        raise RuntimeError(f"Scrapling import failed: {exc}") from exc

    # Raise the root logger threshold so only errors are logged from here on.
    logging.getLogger().setLevel(logging.ERROR)

    # network_idle is disabled because it can interfere with ongoing
    # Cloudflare polling while the challenge is being solved.
    options = dict(
        headless=True,
        network_idle=False,
        google_search=False,
        solve_cloudflare=True,
        timeout=timeout_ms,
        wait=wait_ms,
    )
    if referer:
        options["extra_headers"] = {"Referer": referer}

    # Anything printed to stdout during the fetch is diverted to stderr.
    with contextlib.redirect_stdout(sys.stderr):
        page = StealthyFetcher.fetch(url, **options)

    http_status = getattr(page, "status", None)
    if isinstance(http_status, int) and http_status >= 400:
        raise RuntimeError(f"Scrapling returned HTTP {http_status}")

    payload = response_body_bytes(page)
    if payload:
        return payload
    raise RuntimeError("Scrapling returned an empty body")


def main() -> int:
    """Command-line entry point: fetch a URL and write its body to stdout.

    Flags: ``--url`` (required), ``--referer``, ``--timeout-ms`` and
    ``--wait-ms``. The cheap urllib fetch is attempted first; if it fails
    for any reason the Scrapling-based fetch is used instead. Failure
    reasons are reported on stderr.

    Returns:
        0 when a body was written to stdout, 1 when both strategies failed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--url", required=True)
    cli.add_argument("--referer", default="")
    cli.add_argument("--timeout-ms", type=int, default=30000)
    cli.add_argument("--wait-ms", type=int, default=500)
    opts = cli.parse_args()

    # Stage 1: plain urllib request, capped at 15 seconds.
    try:
        quick_timeout = min(opts.timeout_ms / 1000.0, 15.0)
        payload = lightweight_fetch(opts.url, opts.referer, timeout=quick_timeout)
        sys.stdout.buffer.write(payload)
        return 0
    except Exception as err:
        print(f"Lightweight fetch failed: {err}", file=sys.stderr)

    # Stage 2: Scrapling fetch, reached only when stage 1 raised.
    exit_code = 1
    try:
        payload = scrapling_fetch(opts.url, opts.referer, opts.timeout_ms, opts.wait_ms)
        sys.stdout.buffer.write(payload)
        exit_code = 0
    except Exception as err:
        print(f"Scrapling fetch failed: {err}", file=sys.stderr)
    return exit_code


# When run as a script, exit the process with main()'s return value
# (0 after a body was written to stdout, 1 after both fetch strategies failed).
if __name__ == "__main__":
    raise SystemExit(main())