mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
refactor: optimize docker image and implement lightweight fetching
This commit improves the overall efficiency and reliability of the scraper by:
- Optimizing the Dockerfile by reducing layers, using `--no-install-recommends`, and consolidating Playwright installation.
- Adding resource limits (CPU/Memory) to the docker-compose configuration.
- Refactoring `main.go` to remove unused Cloudflare client structures and increasing cache TTL.
- Implementing a `lightweight_fetch` mechanism in `scrapling_fetch.py` using `urllib` to attempt fast requests before falling back to the heavier Scrapling/Playwright engine.
- Adding Cloudflare challenge detection to the lightweight fetcher.
This commit is contained in:
+88
-31
@@ -3,7 +3,34 @@
|
||||
import argparse
|
||||
import contextlib
|
||||
import logging
|
||||
import ssl
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
|
||||
# Desktop Chrome User-Agent string sent with the lightweight urllib request.
BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)

# Byte markers that identify a Cloudflare challenge / block page.
# Every entry must be lower-case: the response body is lower-cased before
# these markers are searched for.
CF_SIGNS = [
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]
|
||||
|
||||
|
||||
def looks_like_cloudflare_block(body: bytes) -> bool:
    """Return True when *body* contains a known Cloudflare challenge marker.

    The body is lower-cased once and then searched for each signature in
    ``CF_SIGNS``. An empty body is never reported as a challenge page.
    """
    if not body:
        return False
    lowered = body.lower()
    return any(sig in lowered for sig in CF_SIGNS)
|
||||
|
||||
|
||||
def response_body_bytes(response) -> bytes:
|
||||
@@ -20,60 +47,90 @@ def response_body_bytes(response) -> bytes:
|
||||
return str(response).encode("utf-8")
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--url", required=True)
|
||||
parser.add_argument("--referer", default="")
|
||||
parser.add_argument("--timeout-ms", type=int, default=45000)
|
||||
parser.add_argument("--wait-ms", type=int, default=1000)
|
||||
args = parser.parse_args()
|
||||
def lightweight_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Fetch *url* with plain ``urllib`` and browser-like headers.

    This is the cheap attempt that involves no browser engine.

    Args:
        url: Address to request.
        referer: Value for the ``Referer`` header; the header is omitted
            when this is empty.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The raw, non-empty response body.

    Raises:
        RuntimeError: If the body is empty or looks like a Cloudflare
            challenge page.
        urllib.error.URLError: Raised by ``urlopen`` on network failures
            and HTTP error statuses.
    """
    headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        # Request an uncompressed body: the bytes are scanned and returned
        # as-is, with no decompression step in between.
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        headers["Referer"] = referer
    req = urllib.request.Request(url, headers=headers)

    # NOTE(review): hostname and certificate verification are switched off,
    # so the peer is not authenticated. Kept as in the original; confirm
    # this is intentional for the scraped hosts.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
        body = resp.read()

    # Treat an empty body as a failure so the caller's fallback is tried
    # instead of emitting nothing.
    if not body:
        raise RuntimeError("lightweight fetch returned an empty body")
    if looks_like_cloudflare_block(body):
        raise RuntimeError(" lightweight fetch returned Cloudflare challenge")
    return body
|
||||
|
||||
|
||||
def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 30000, wait_ms: int = 500) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher``.

    Args:
        url: Address to fetch.
        referer: Value for the ``Referer`` header; no extra headers are
            sent when this is empty.
        timeout_ms: Passed unchanged as the fetcher's ``timeout`` option.
        wait_ms: Passed unchanged as the fetcher's ``wait`` option.

    Returns:
        The non-empty response body as bytes.

    Raises:
        RuntimeError: If Scrapling cannot be imported, the response status
            is 400 or higher, or the body is empty.
    """
    # Imported lazily so that a missing or broken Scrapling install surfaces
    # as an ordinary fetch error rather than failing at module import.
    try:
        from scrapling.fetchers import StealthyFetcher
    except Exception as exc:
        raise RuntimeError(f"Scrapling import failed: {exc}") from exc

    # Raise the root logger threshold so only errors are logged.
    logging.getLogger().setLevel(logging.ERROR)

    fetch_kwargs = {
        "headless": True,
        "network_idle": True,
        "google_search": False,
        "solve_cloudflare": True,
        "timeout": timeout_ms,
        "wait": wait_ms,
    }
    if referer:
        fetch_kwargs["extra_headers"] = {"Referer": referer}

    # stdout carries the fetched page body, so anything printed during the
    # fetch is diverted to stderr.
    with contextlib.redirect_stdout(sys.stderr):
        response = StealthyFetcher.fetch(url, **fetch_kwargs)

    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"Scrapling returned HTTP {status}")

    body = response_body_bytes(response)
    if not body:
        raise RuntimeError("Scrapling returned an empty body")
    return body
|
||||
|
||||
|
||||
def main() -> int:
    """Fetch the URL given on the command line and write its body to stdout.

    The lightweight urllib fetch is tried first; the Scrapling fetch is
    used only when that attempt raises. Diagnostics go to stderr so stdout
    contains nothing but the page body.

    Returns:
        0 when a body was fetched (including when the reader closed the
        pipe early), 1 when both fetch strategies failed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", required=True)
    parser.add_argument("--referer", default="")
    parser.add_argument("--timeout-ms", type=int, default=30000)
    parser.add_argument("--wait-ms", type=int, default=500)
    args = parser.parse_args()

    body = None

    # 1) Try lightweight urllib fetch first (no browser). Its timeout is
    #    capped at 15 seconds so a slow host does not delay the fallback.
    try:
        body = lightweight_fetch(args.url, args.referer, timeout=min(args.timeout_ms / 1000.0, 15.0))
    except Exception as exc:
        print(f"Lightweight fetch failed: {exc}", file=sys.stderr)

    # 2) Fall back to Scrapling / Playwright only if lightweight failed.
    if body is None:
        try:
            body = scrapling_fetch(args.url, args.referer, args.timeout_ms, args.wait_ms)
        except Exception as exc:
            print(f"Scrapling fetch failed: {exc}", file=sys.stderr)
            return 1

    # Write outside the fetch try-blocks: an output error must not be
    # reported as a fetch failure or trigger a second fetch.
    try:
        sys.stdout.buffer.write(body)
    except BrokenPipeError:
        # The consumer closed the pipe early; the fetch itself succeeded.
        return 0
    return 0
|
||||
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
|
||||
Reference in New Issue
Block a user