mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
ed61d8ab8e
Integrate CloakBrowser to improve success rates against Cloudflare challenges and implement more robust request handling in the Go backend.

- Add CloakBrowser integration to Dockerfile and requirements
- Implement domain-specific request semaphores in Go to prevent rate-limiting
- Add shared HTTP client with cookie jar and header preservation for better session management
- Enhance request headers in Go to include modern client hints (Sec-Ch-Ua)
- Add benchmarking scripts to compare fetch methods (urllib vs Scrapling vs CloakBrowser)
- Update docker-compose to support CloakBrowser environment variables
- Optimize Docker image by pre-downloading patched Chromium binaries
40 lines
1.2 KiB
YAML
40 lines
1.2 KiB
YAML
# Docker Compose definition for the facr-scraper service.
# The image is built from the current directory and the service is published
# on host port 8686.
#
# NOTE(review): newer Compose releases treat the top-level `version` key as
# obsolete and ignore it; it is kept here for older tooling. Confirm before
# removing.
version: '3.8'

services:
  facr-scraper:
    build: .
    ports:
      # Quoted so the host:container mapping is always read as a string.
      - "8686:8686"
    environment:
      # Entries written as ${VAR:-default} are interpolated by Compose from the
      # host environment and fall back to the default when VAR is unset/empty.
      - LOGOAPI_BASE_URL=${LOGOAPI_BASE_URL:-https://logoapi.sportcreative.eu}
      # No fallback for the Cloudflare credentials: they come solely from the
      # host environment and are never hard-coded in this file.
      - CLOUDFLARE_ACCOUNT_ID=${CLOUDFLARE_ACCOUNT_ID}
      - CLOUDFLARE_API_TOKEN=${CLOUDFLARE_API_TOKEN}
      # Scrapling and CloakBrowser fetch scripts share the same interpreter
      # path; only the script differs.
      - SCRAPLING_PYTHON_BIN=/opt/scrapling/bin/python
      - SCRAPLING_SCRIPT=/opt/scrapling/scripts/scrapling_fetch.py
      - CLOAKBROWSER_PYTHON_BIN=/opt/scrapling/bin/python
      - CLOAKBROWSER_SCRIPT=/opt/scrapling/scripts/cloakbrowser_fetch.py
      - CLOAKBROWSER_TIMEZONE=${CLOAKBROWSER_TIMEZONE:-Europe/Prague}
      - CLOAKBROWSER_LOCALE=${CLOAKBROWSER_LOCALE:-cs-CZ}
      # Defaults to an empty value when not set on the host.
      - DEBUG_SAVE_HTML=${DEBUG_SAVE_HTML:-}
    restart: unless-stopped
    deploy:
      resources:
        # Upper bounds for the container.
        limits:
          cpus: '2.0'
          memory: 4G
        # Resources reserved for the container.
        reservations:
          cpus: '0.5'
          memory: 512M
    volumes:
      # Optional: Mount cache for Playwright browsers
      - playwright_cache:/home/scraper/.cache
    healthcheck:
      # NOTE(review): requires `curl` to be present inside the image; confirm
      # the Dockerfile installs it. `-f` makes curl fail on HTTP error statuses.
      test: ["CMD", "curl", "-f", "http://localhost:8686/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

# Named volume backing the cache mount declared on the service above.
volumes:
  playwright_cache: