refactor: optimize docker image and implement lightweight fetching

This commit improves the overall efficiency and reliability of the scraper by:

- Optimizing the Dockerfile by reducing layers, using `--no-install-recommends`, and consolidating the Playwright installation.
- Adding resource limits (CPU/memory) to the docker-compose configuration.
- Refactoring `main.go` to remove unused Cloudflare client structures and increase the cache TTL.
- Implementing a `lightweight_fetch` mechanism in `scrapling_fetch.py` that uses `urllib` to attempt a fast request before falling back to the heavier Scrapling/Playwright engine.
- Adding Cloudflare challenge detection to the lightweight fetcher.
2026-06-03 20:12:57 +00:00 · 2026-05-11 19:50:59 +02:00
parent a8a4e1acaf
commit aa47f4309f
4 changed files with 474 additions and 440 deletions
@@ -3,7 +3,34 @@
 import argparse
 import contextlib
 import logging
+import ssl
 import sys
+import urllib.request
+
+
+# User-Agent header value sent by lightweight_fetch so the plain urllib
+# request identifies itself as desktop Chrome instead of Python-urllib.
+BROWSER_UA = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
+)
+
+# Byte markers that looks_like_cloudflare_block searches for in a response
+# body. The body is lowercased before matching, so every entry here must be
+# lowercase as well or it can never match.
+CF_SIGNS = [
+    b"<title>just a moment...</title>",
+    b"attention required!",
+    b"enable javascript and cookies to continue",
+    b"checking if the site connection is secure",
+    b"cf-browser-verification",
+    b"/cdn-cgi/challenge-platform/",
+]
+
+
+def looks_like_cloudflare_block(body: bytes) -> bool:
+    """Report whether ``body`` contains a known Cloudflare challenge marker.
+
+    The comparison is case-insensitive: the body is lowercased once and then
+    checked against every entry of ``CF_SIGNS``. An empty body is never
+    treated as a block.
+    """
+    if not body:
+        return False
+    haystack = body.lower()
+    return any(marker in haystack for marker in CF_SIGNS)


 def response_body_bytes(response) -> bytes:
@@ -20,60 +47,90 @@ def response_body_bytes(response) -> bytes:
    return str(response).encode("utf-8")


-def main() -> int:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--url", required=True)
-    parser.add_argument("--referer", default="")
-    parser.add_argument("--timeout-ms", type=int, default=45000)
-    parser.add_argument("--wait-ms", type=int, default=1000)
-    args = parser.parse_args()
+def lightweight_fetch(
+    url: str,
+    referer: str = "",
+    timeout: float = 15.0,
+    *,
+    verify_tls: bool = False,
+) -> bytes:
+    """Fetch ``url`` with a plain urllib request that sends browser-like headers.
+
+    Args:
+        url: Address to request.
+        referer: Optional ``Referer`` header value; omitted when empty.
+        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.
+        verify_tls: Validate the server certificate and hostname. Defaults to
+            ``False`` to preserve the existing behaviour.
+            NOTE(review): with verification off, the response can be altered
+            by a man-in-the-middle; confirm this is really required and
+            prefer ``verify_tls=True`` where possible.
+
+    Returns:
+        The raw, non-empty response body.
+
+    Raises:
+        RuntimeError: If the body is empty or looks like a Cloudflare
+            challenge page.
+        OSError: Propagated from ``urlopen`` on network, timeout or HTTP
+            errors (``urllib.error.URLError`` and ``HTTPError`` are
+            subclasses).
+    """
+    headers = {
+        "User-Agent": BROWSER_UA,
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
+        # urllib does not decompress responses, so ask for an unencoded body.
+        "Accept-Encoding": "identity",
+        "Connection": "keep-alive",
+    }
+    if referer:
+        headers["Referer"] = referer
+    req = urllib.request.Request(url, headers=headers)
+
+    ctx = ssl.create_default_context()
+    if not verify_tls:
+        # check_hostname has to be cleared first; setting CERT_NONE while it
+        # is still enabled raises ValueError.
+        ctx.check_hostname = False
+        ctx.verify_mode = ssl.CERT_NONE
+
+    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
+        body = resp.read()
+
+    # Treat an empty body as a failure, as scrapling_fetch does, so the caller
+    # can fall back instead of reporting success with no output.
+    if not body:
+        raise RuntimeError("lightweight fetch returned an empty body")
+    if looks_like_cloudflare_block(body):
+        raise RuntimeError("lightweight fetch returned Cloudflare challenge")
+    return body

+
+def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 30000, wait_ms: int = 500) -> bytes:
+    """Fetch ``url`` through Scrapling's ``StealthyFetcher`` and return the body.
+
+    Args:
+        url: Address to fetch.
+        referer: Optional ``Referer`` header value; omitted when empty.
+        timeout_ms: Passed to the fetcher as its ``timeout`` option.
+        wait_ms: Passed to the fetcher as its ``wait`` option.
+
+    Returns:
+        The response body as produced by ``response_body_bytes``.
+
+    Raises:
+        RuntimeError: If Scrapling cannot be imported, the response status is
+            400 or above, or the body is empty.
+        Exception: Anything raised by ``StealthyFetcher.fetch`` propagates
+            unchanged.
+    """
+    # Imported lazily so a missing or broken Scrapling install is reported as
+    # a RuntimeError from this call rather than failing at module import.
    try:
        from scrapling.fetchers import StealthyFetcher
    except Exception as exc:
-        print(f"Scrapling import failed: {exc}", file=sys.stderr)
-        return 2
+        raise RuntimeError(f"Scrapling import failed: {exc}") from exc

    logging.getLogger().setLevel(logging.ERROR)

    extra_headers = {}
-    if args.referer:
-        extra_headers["Referer"] = args.referer
+    if referer:
+        extra_headers["Referer"] = referer

    fetch_kwargs = {
        "headless": True,
        "network_idle": True,
        "google_search": False,
        "solve_cloudflare": True,
-        "timeout": args.timeout_ms,
-        "wait": args.wait_ms,
+        "timeout": timeout_ms,
+        "wait": wait_ms,
    }
    if extra_headers:
        fetch_kwargs["extra_headers"] = extra_headers

+    # main() writes the fetched body to stdout, so anything the fetcher prints
+    # to stdout is redirected to stderr for the duration of the fetch.
+    with contextlib.redirect_stdout(sys.stderr):
+        response = StealthyFetcher.fetch(url, **fetch_kwargs)
+
+    # Only an integer status is checked; a missing or non-int status is let through.
+    status = getattr(response, "status", None)
+    if isinstance(status, int) and status >= 400:
+        raise RuntimeError(f"Scrapling returned HTTP {status}")
+
+    body = response_body_bytes(response)
+    if not body:
+        raise RuntimeError("Scrapling returned an empty body")
+    return body
+
+
+def main() -> int:
+    """Parse the CLI arguments, fetch the URL and write the body to stdout.
+
+    ``lightweight_fetch`` is tried first; ``scrapling_fetch`` is used only
+    when the first attempt raises. Failure messages go to stderr.
+
+    Returns:
+        0 after a body has been written, 1 when the Scrapling fallback fails
+        as well.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", required=True)
+    parser.add_argument("--referer", default="")
+    parser.add_argument("--timeout-ms", type=int, default=30000)
+    parser.add_argument("--wait-ms", type=int, default=500)
+    args = parser.parse_args()
+
+    # 1) Try lightweight urllib fetch first (no browser, instant)
    try:
+        # --timeout-ms is converted to seconds and capped at 15 s for this attempt.
+        body = lightweight_fetch(args.url, args.referer, timeout=min(args.timeout_ms / 1000.0, 15.0))
+        # NOTE(review): this write sits inside the try, so an error raised
+        # while writing (e.g. BrokenPipeError) is reported as a fetch failure
+        # and triggers the Scrapling fallback, which then writes again.
+        sys.stdout.buffer.write(body)
+        return 0
+    except Exception as exc:
+        print(f"Lightweight fetch failed: {exc}", file=sys.stderr)
+
+    # 2) Fall back to Scrapling / Playwright only if lightweight failed
+    try:
+        body = scrapling_fetch(args.url, args.referer, args.timeout_ms, args.wait_ms)
+        sys.stdout.buffer.write(body)
+        return 0
    except Exception as exc:
        print(f"Scrapling fetch failed: {exc}", file=sys.stderr)
        return 1

-    status = getattr(response, "status", None)
-    if isinstance(status, int) and status >= 400:
-        print(f"Scrapling returned HTTP {status}", file=sys.stderr)
-        return 1
-
-    body = response_body_bytes(response)
-    if not body:
-        print("Scrapling returned an empty body", file=sys.stderr)
-        return 1
-
-    try:
-        sys.stdout.buffer.write(body)
-    except BrokenPipeError:
-        return 0
-    return 0
-

+# When run as a script, use main()'s return value as the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())