mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
ed61d8ab8e
Integrate CloakBrowser to improve success rates against Cloudflare challenges and implement more robust request handling in the Go backend. - Add CloakBrowser integration to Dockerfile and requirements - Implement domain-specific request semaphores in Go to prevent rate-limiting - Add shared HTTP client with cookie jar and header preservation for better session management - Enhance request headers in Go to include modern client hints (Sec-Ch-Ua) - Add benchmarking scripts to compare fetch methods (urllib vs Scrapling vs CloakBrowser) - Update docker-compose to support CloakBrowser environment variables - Optimize Docker image by pre-downloading patched Chromium binaries
25 lines
673 B
Python
25 lines
673 B
Python
# Fetch a single URL through CloakBrowser and write the rendered HTML to stdout.
#
# Usage:
#   python <this script> URL
#
# Environment variables:
#   CLOAKBROWSER_TIMEZONE  timezone handed to the browser (default: Europe/Prague)
#   CLOAKBROWSER_LOCALE    locale handed to the browser (default: cs-CZ)
#
# Exit status is non-zero when no URL is given or when the fetch raises.
import os
import sys
import time  # not referenced below; kept so the file's import set is unchanged

from cloakbrowser import launch_context

# Exit with a usage message (status 1) instead of an IndexError traceback
# when the caller forgets the URL argument.
if len(sys.argv) < 2:
    sys.exit(f'usage: {sys.argv[0]} URL')

url = sys.argv[1]
tz = os.environ.get('CLOAKBROWSER_TIMEZONE', 'Europe/Prague')
lc = os.environ.get('CLOAKBROWSER_LOCALE', 'cs-CZ')

ctx = launch_context(
    headless=True,
    timezone=tz,
    locale=lc,
    # NOTE(review): presumably needed to run Chromium inside the Docker
    # image (no sandbox, small /dev/shm) -- confirm against the Dockerfile.
    args=['--no-sandbox', '--disable-dev-shm-usage'],
)

# Note: we intentionally do NOT set a custom Referer here.
# A self-referring Referer (e.g. /club/hledej -> /club/hledej) triggers
# Cloudflare's bot detection even with CloakBrowser's stealth patches.

try:
    # Page creation lives inside the try so the context is closed even if
    # opening the page fails, not only when navigation fails.
    page = ctx.new_page()
    # NOTE(review): timeout is presumably in milliseconds (Playwright
    # convention) -- confirm against the CloakBrowser API.
    page.goto(url, timeout=30000, wait_until='networkidle')
    # end='' so stdout carries exactly the page markup, with no extra newline.
    print(page.content(), end='')
finally:
    ctx.close()