# Mirror of https://github.com/Dvorinka/facr-scraper.git
# Synced 2026-06-03 20:12:57 +00:00 (revision ed61d8ab8e)
#
# Change summary: Integrate CloakBrowser to improve success rates against
# Cloudflare challenges and implement more robust request handling in the
# Go backend.
# - Add CloakBrowser integration to Dockerfile and requirements
# - Implement domain-specific request semaphores in Go to prevent rate-limiting
# - Add shared HTTP client with cookie jar and header preservation for better
#   session management
# - Enhance request headers in Go to include modern client hints (Sec-Ch-Ua)
# - Add benchmarking scripts to compare fetch methods
#   (urllib vs Scrapling vs CloakBrowser)
# - Update docker-compose to support CloakBrowser environment variables
# - Optimize Docker image by pre-downloading patched Chromium binaries
#
# Source listing: Docker, 94 lines, 2.9 KiB
# Multi-stage build: Go application plus a Python/Scrapling environment.
#
# Stage "go-builder": compiles the Go program into /app/facr-scraper.
FROM golang:1.24-alpine AS go-builder

# Packages installed for the build stage
RUN apk add --no-cache \
        ca-certificates \
        git \
        tzdata

WORKDIR /app

# Copy only the module manifests first so the dependency download layer
# stays cached until go.mod or go.sum changes.
COPY go.mod go.sum ./
RUN go mod download

# Copy the rest of the build context and compile with cgo disabled.
COPY . .
RUN CGO_ENABLED=0 GOOS=linux \
    go build -a -installsuffix cgo -o facr-scraper .
|
|
|
|
# Stage "python-builder": builds the Scrapling virtualenv at /opt/scrapling
# and downloads the browser binaries that the final stage copies out.
FROM python:3.11-slim AS python-builder

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# System packages for the Playwright installation; the apt package lists are
# removed in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        gnupg \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Create the virtualenv and put its bin directory first on PATH so the
# following pip/playwright/python commands resolve inside it.
RUN python -m venv /opt/scrapling
ENV PATH="/opt/scrapling/bin:$PATH"

# Install the Python requirements (Scrapling and friends) into the venv.
COPY requirements-scrapling.txt .
RUN pip install --no-cache-dir -r requirements-scrapling.txt

# Download Playwright's Chromium together with its OS dependencies.
RUN playwright install chromium --with-deps

# Fetch the CloakBrowser patched Chromium binary at build time so it is not
# downloaded at runtime (saves ~10-20s per cold-start request).
RUN python -m cloakbrowser install

# Point the venv's python and python3 entries at /usr/local/bin/python.
RUN set -e; \
    for interp in python python3; do \
        ln -sf /usr/local/bin/python "/opt/scrapling/bin/${interp}"; \
    done
|
|
|
|
# Final stage: runtime image containing the Go binary, the Scrapling
# virtualenv, the pre-downloaded browser caches and the fetch scripts.
# The service runs as the unprivileged "scraper" user (UID 1000).
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PATH="/opt/scrapling/bin:$PATH"

# Runtime dependencies for both the Go service and the Playwright browsers
# (shared libraries, fonts, curl for the health check). One package per line,
# sorted, so future diffs stay readable.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        fontconfig \
        fonts-liberation \
        fonts-noto-color-emoji \
        fonts-noto-core \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libatspi2.0-0 \
        libcairo2 \
        libcups2 \
        libdbus-1-3 \
        libexpat1 \
        libgbm1 \
        libglib2.0-0 \
        libgobject-2.0-0 \
        libnspr4 \
        libnss3 \
        libpango-1.0-0 \
        libx11-6 \
        libxcb1 \
        libxcomposite1 \
        libxdamage1 \
        libxext6 \
        libxfixes3 \
        libxkbcommon0 \
        libxrandr2 \
        locales \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Create the non-root user before any COPY --chown that refers to it by name.
RUN useradd -m -u 1000 scraper

# Go binary (left root-owned; it only needs to be executable).
COPY --from=go-builder /app/facr-scraper /usr/local/bin/facr-scraper

# Python environment and browser caches.
# Ownership is assigned at copy time with --chown: a later recursive
# `chown -R` would rewrite every one of these files into an additional layer
# and store the browser binaries in the image twice.
COPY --from=python-builder --chown=scraper:scraper /opt/scrapling /opt/scrapling
COPY --from=python-builder --chown=scraper:scraper /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright
COPY --from=python-builder --chown=scraper:scraper /root/.cloakbrowser /home/scraper/.cloakbrowser

# Scrapling and CloakBrowser fetch scripts.
COPY --chown=scraper:scraper \
    scripts/scrapling_fetch.py \
    scripts/cloakbrowser_fetch.py \
    /opt/scrapling/scripts/

# Make sure the cache directory exists and that the top-level directories
# (including any parent directories implicitly created by COPY) belong to the
# runtime user. This chown is intentionally NOT recursive, so the layer only
# records directory metadata.
RUN mkdir -p /home/scraper/.cache \
    && chown scraper:scraper \
        /home/scraper \
        /home/scraper/.cache \
        /home/scraper/.cache/ms-playwright \
        /home/scraper/.cloakbrowser \
        /opt/scrapling \
        /opt/scrapling/scripts

USER scraper
WORKDIR /home/scraper

# Port the service is expected to listen on (documentation only).
EXPOSE 8686

# Health check: the container is unhealthy when GET / on port 8686 fails.
# -sS suppresses the progress meter but still reports errors.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -fsS http://localhost:8686/ || exit 1

CMD ["facr-scraper"]
|