Files
FacrScraper/Dockerfile
T
Tomas Dvorak ed61d8ab8e feat(scraper): implement CloakBrowser support and enhance request stealth
Integrate CloakBrowser to improve success rates against Cloudflare
challenges and implement more robust request handling in the Go backend.

- Add CloakBrowser integration to Dockerfile and requirements
- Implement domain-specific request semaphores in Go to prevent rate-limiting
- Add shared HTTP client with cookie jar and header preservation for
  better session management
- Enhance request headers in Go to include modern client hints (Sec-Ch-Ua)
- Add benchmarking scripts to compare fetch methods (urllib vs Scrapling
  vs CloakBrowser)
- Update docker-compose to support CloakBrowser environment variables
- Optimize Docker image by pre-downloading patched Chromium binaries
2026-05-17 17:52:52 +02:00

94 lines
2.9 KiB
Docker

# Multi-stage build: a Go build stage, a Python/Scrapling build stage, and a
# final runtime stage that combines the artifacts of both.

# Stage "go-builder": compiles the Go application into the facr-scraper binary.
FROM golang:1.24-alpine AS go-builder

# Packages available while building: CA certificates, git and timezone data.
RUN apk add --no-cache \
        ca-certificates \
        git \
        tzdata

WORKDIR /app

# Copy only the module manifests first so the module download layer is reused
# from cache until go.mod or go.sum change.
COPY go.mod go.sum ./
RUN go mod download

# Copy the rest of the build context and compile with cgo disabled for Linux.
COPY . .
RUN CGO_ENABLED=0 GOOS=linux \
    go build -a -installsuffix cgo -o facr-scraper .
# Stage "python-builder": builds the Scrapling virtualenv and downloads the
# browser binaries that the final stage copies out of this stage.
FROM python:3.11-slim AS python-builder

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Download and certificate tooling used during the build; the apt package
# lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        gnupg \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Create the virtualenv and put its bin/ first on PATH so the commands below
# run from inside it.
RUN python -m venv /opt/scrapling
ENV PATH="/opt/scrapling/bin:$PATH"

# Install the Python requirements into the virtualenv.
COPY requirements-scrapling.txt .
RUN pip install --no-cache-dir -r requirements-scrapling.txt

# Download Playwright's Chromium together with its OS-level dependencies.
RUN playwright install chromium --with-deps

# Download CloakBrowser's patched Chromium binary at build time so it is not
# downloaded at runtime (saves ~10-20s per cold-start request).
RUN python -m cloakbrowser install

# Point the virtualenv's python and python3 entries at the base image's
# interpreter in /usr/local/bin.
RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python \
    && ln -sf /usr/local/bin/python /opt/scrapling/bin/python3
# Final stage: runtime image containing the Go binary, the Scrapling
# virtualenv and the pre-downloaded browser binaries.
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PATH="/opt/scrapling/bin:$PATH"

# Install runtime dependencies for both Go and Playwright
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates wget curl \
        libglib2.0-0 libgobject-2.0-0 libnspr4 libnss3 libdbus-1-3 \
        libatk1.0-0 libatk-bridge2.0-0 libcups2 libexpat1 libxcb1 \
        libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
        libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 \
        libasound2 \
        fonts-liberation fonts-noto-color-emoji fonts-noto-core \
        fontconfig locales \
    && rm -rf /var/lib/apt/lists/*

# Create the non-root user and, while the target directories are still empty,
# create them with the right owner. Doing the ownership change here (instead
# of a recursive chown after the copies) avoids a second layer that would
# store the virtualenv and browser binaries again.
RUN useradd -m -u 1000 scraper \
    && mkdir -p /home/scraper/.cache /opt/scrapling/scripts \
    && chown -R scraper:scraper /home/scraper /opt/scrapling

# Copy Go binary
COPY --from=go-builder /app/facr-scraper /usr/local/bin/facr-scraper

# Copy Python environment, owned by the runtime user from the start
COPY --from=python-builder --chown=scraper:scraper /opt/scrapling /opt/scrapling

# Copy Playwright browser cache
COPY --from=python-builder --chown=scraper:scraper /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright

# Copy CloakBrowser patched Chromium binary cache
COPY --from=python-builder --chown=scraper:scraper /root/.cloakbrowser /home/scraper/.cloakbrowser

# Copy scrapling and cloakbrowser scripts
COPY --chown=scraper:scraper scripts/scrapling_fetch.py /opt/scrapling/scripts/scrapling_fetch.py
COPY --chown=scraper:scraper scripts/cloakbrowser_fetch.py /opt/scrapling/scripts/cloakbrowser_fetch.py

# Drop privileges for everything that runs in the container
USER scraper
WORKDIR /home/scraper

# Expose port
EXPOSE 8686

# Health check: the container is unhealthy when the HTTP request to port 8686
# fails or returns an error status (curl -f).
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8686/ || exit 1

CMD ["facr-scraper"]