refactor: optimize docker image and implement lightweight fetching

This commit improves the overall efficiency and reliability of the scraper by:

- Optimizing the Dockerfile by reducing layers, using `--no-install-recommends`, and consolidating Playwright installation.
- Adding resource limits (CPU/Memory) to the docker-compose configuration.
- Refactoring `main.go`: removing the Cloudflare Browser Rendering client together with its crawl fallback, and increasing the page cache TTL from 5 to 15 minutes.
- Implementing a `lightweight_fetch` mechanism in `scrapling_fetch.py` using `urllib` to attempt fast requests before falling back to the heavier Scrapling/Playwright engine.
- Adding Cloudflare challenge detection to the lightweight fetcher.
This commit is contained in:
Tomas Dvorak
2026-05-11 19:50:59 +02:00
parent a8a4e1acaf
commit aa47f4309f
4 changed files with 474 additions and 440 deletions
+20 -38
View File
@@ -14,12 +14,12 @@ RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o facr-scraper .
# Python stage for Scrapling # Python stage for Scrapling
FROM python:3.11-slim AS python-builder FROM python:3.11-slim AS python-builder
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# Install system dependencies for Playwright # Install system dependencies for Playwright
RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y --no-install-recommends \
wget \ wget curl ca-certificates gnupg \
gnupg \
ca-certificates \
curl \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Create virtual environment and install Scrapling # Create virtual environment and install Scrapling
@@ -28,44 +28,27 @@ ENV PATH="/opt/scrapling/bin:$PATH"
COPY requirements-scrapling.txt . COPY requirements-scrapling.txt .
RUN pip install --no-cache-dir -r requirements-scrapling.txt RUN pip install --no-cache-dir -r requirements-scrapling.txt
# Install Playwright browsers # Install Playwright browsers with deps in one layer
RUN playwright install chromium RUN playwright install chromium --with-deps
RUN playwright install-deps
# Fix Python symlinks # Fix Python symlinks
RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python \
RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python3 && ln -sf /usr/local/bin/python /opt/scrapling/bin/python3
# Final stage # Final stage
FROM python:3.11-slim FROM python:3.11-slim
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PATH="/opt/scrapling/bin:$PATH"
# Install runtime dependencies for both Go and Playwright # Install runtime dependencies for both Go and Playwright
RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \ ca-certificates wget curl \
wget \ libglib2.0-0 libgobject-2.0-0 libnspr4 libnss3 libdbus-1-3 \
curl \ libatk1.0-0 libatk-bridge2.0-0 libcups2 libexpat1 libxcb1 \
gnupg \ libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
libglib2.0-0 \ libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 \
libgobject-2.0-0 \
libnspr4 \
libnss3 \
libdbus-1-3 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libexpat1 \
libxcb1 \
libxkbcommon0 \
libatspi2.0-0 \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libcairo2 \
libpango-1.0-0 \
libasound2 \ libasound2 \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
@@ -80,13 +63,12 @@ COPY --from=python-builder /opt/scrapling /opt/scrapling
# Copy Playwright browser cache # Copy Playwright browser cache
COPY --from=python-builder /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright COPY --from=python-builder /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright
ENV PATH="/opt/scrapling/bin:$PATH"
# Copy scrapling script # Copy scrapling script
COPY scripts/scrapling_fetch.py /opt/scrapling/scripts/scrapling_fetch.py COPY scripts/scrapling_fetch.py /opt/scrapling/scripts/scrapling_fetch.py
# Create cache directory for Playwright # Create cache directory for Playwright
RUN mkdir -p /home/scraper/.cache && chown -R scraper:scraper /home/scraper RUN mkdir -p /home/scraper/.cache && chown -R scraper:scraper /home/scraper /opt/scrapling
USER scraper USER scraper
WORKDIR /home/scraper WORKDIR /home/scraper
+8
View File
@@ -13,6 +13,14 @@ services:
- SCRAPLING_SCRIPT=/opt/scrapling/scripts/scrapling_fetch.py - SCRAPLING_SCRIPT=/opt/scrapling/scripts/scrapling_fetch.py
- DEBUG_SAVE_HTML=${DEBUG_SAVE_HTML:-} - DEBUG_SAVE_HTML=${DEBUG_SAVE_HTML:-}
restart: unless-stopped restart: unless-stopped
deploy:
resources:
limits:
cpus: '2.0'
memory: 4G
reservations:
cpus: '0.5'
memory: 512M
volumes: volumes:
# Optional: Mount cache for Playwright browsers # Optional: Mount cache for Playwright browsers
- playwright_cache:/home/scraper/.cache - playwright_cache:/home/scraper/.cache
+273 -286
View File
@@ -16,6 +16,8 @@ import (
"regexp" "regexp"
"strings" "strings"
"sync" "sync"
"sync/atomic"
"syscall"
"time" "time"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
@@ -32,48 +34,6 @@ type Competition struct {
Table *CompetitionTable `json:"table,omitempty"` Table *CompetitionTable `json:"table,omitempty"`
} }
// Cloudflare Browser Rendering API structures
type CloudflareCrawlRequest struct {
URL string `json:"url"`
Limit int `json:"limit,omitempty"`
Depth int `json:"depth,omitempty"`
Formats []string `json:"formats,omitempty"`
Render bool `json:"render,omitempty"`
Source string `json:"source,omitempty"`
Options map[string]interface{} `json:"options,omitempty"`
}
type CloudflareCrawlResponse struct {
Success bool `json:"success"`
Result string `json:"result"` // job ID
}
type CloudflareCrawlJob struct {
ID string `json:"id"`
Status string `json:"status"`
BrowserSecondsUsed float64 `json:"browserSecondsUsed"`
Total int `json:"total"`
Finished int `json:"finished"`
Records []CloudflareCrawlRecord `json:"records"`
Cursor string `json:"cursor,omitempty"`
}
type CloudflareCrawlRecord struct {
URL string `json:"url"`
Status string `json:"status"`
Markdown string `json:"markdown,omitempty"`
HTML string `json:"html,omitempty"`
JSON interface{} `json:"json,omitempty"`
Metadata map[string]interface{} `json:"metadata"`
}
type CloudflareClient struct {
AccountID string
APIToken string
BaseURL string
Client *http.Client
}
type fetchOptions struct { type fetchOptions struct {
Referer string Referer string
} }
@@ -103,211 +63,95 @@ type cacheEntry struct {
timestamp time.Time timestamp time.Time
} }
const cacheTTL = 5 * time.Minute const cacheTTL = 15 * time.Minute
// NewCloudflareClient creates a new Cloudflare Browser Rendering API client // domainBreakers is a per-domain circuit breaker map so failures on one site
func NewCloudflareClient() *CloudflareClient { // don't block Scrapling for unrelated sites.
accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID")) var domainBreakers struct {
apiToken := strings.TrimSpace(os.Getenv("CLOUDFLARE_API_TOKEN")) mu sync.RWMutex
breakers map[string]*circuitBreaker
if accountID == "" || apiToken == "" {
return nil
}
return &CloudflareClient{
AccountID: accountID,
APIToken: apiToken,
BaseURL: "https://api.cloudflare.com/client/v4",
Client: &http.Client{
Timeout: 30 * time.Second,
},
}
} }
// StartCrawl initiates a crawl job // scraplingSem limits concurrent Chromium launches to avoid zombie processes
func (c *CloudflareClient) StartCrawl(ctx context.Context, req CloudflareCrawlRequest) (string, error) { // and resource exhaustion.
if c == nil { var scraplingSem = newSemaphore(2)
return "", fmt.Errorf("Cloudflare client not initialized")
}
// Set defaults type circuitBreaker struct {
if req.Limit == 0 { failures int32
req.Limit = 10 lastFail time.Time
} threshold int
if req.Depth == 0 { timeout time.Duration
req.Depth = 1 mu sync.Mutex
}
if len(req.Formats) == 0 {
req.Formats = []string{"html", "markdown"}
}
if req.Source == "" {
req.Source = "all"
}
// Restrict to specific URL patterns for fotbal.cz to avoid crawling unrelated content
if req.Options == nil {
req.Options = make(map[string]interface{})
}
// Only crawl URLs from the same domain and specific paths
includePatterns := []string{
"https://www.fotbal.cz/**",
}
excludePatterns := []string{
"**/api/**",
"**/static/**",
"**/media/**",
}
req.Options["includePatterns"] = includePatterns
req.Options["excludePatterns"] = excludePatterns
req.Options["includeExternalLinks"] = false
req.Options["includeSubdomains"] = false
body, err := json.Marshal(req)
if err != nil {
return "", fmt.Errorf("failed to marshal request: %w", err)
}
url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl", c.BaseURL, c.AccountID)
httpReq, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
httpReq.Header.Set("Content-Type", "application/json")
resp, err := c.Client.Do(httpReq)
if err != nil {
return "", fmt.Errorf("failed to send request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
return "", fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
}
var crawlResp CloudflareCrawlResponse
if err := json.NewDecoder(resp.Body).Decode(&crawlResp); err != nil {
return "", fmt.Errorf("failed to decode response: %w", err)
}
if !crawlResp.Success {
return "", fmt.Errorf("API returned unsuccessful response")
}
return crawlResp.Result, nil
} }
// GetCrawlResults retrieves the results of a crawl job func getDomainBreaker(domain string) *circuitBreaker {
func (c *CloudflareClient) GetCrawlResults(ctx context.Context, jobID string, limit int) (*CloudflareCrawlJob, error) { domainBreakers.mu.RLock()
if c == nil { if cb, ok := domainBreakers.breakers[domain]; ok {
return nil, fmt.Errorf("Cloudflare client not initialized") domainBreakers.mu.RUnlock()
return cb
} }
domainBreakers.mu.RUnlock()
url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl/%s", c.BaseURL, c.AccountID, jobID) domainBreakers.mu.Lock()
if limit > 0 { defer domainBreakers.mu.Unlock()
url += fmt.Sprintf("?limit=%d", limit) if cb, ok := domainBreakers.breakers[domain]; ok {
return cb
} }
cb := &circuitBreaker{
httpReq, err := http.NewRequestWithContext(ctx, "GET", url, nil) threshold: 15,
if err != nil { timeout: 30 * time.Minute,
return nil, fmt.Errorf("failed to create request: %w", err)
} }
if domainBreakers.breakers == nil {
httpReq.Header.Set("Authorization", "Bearer "+c.APIToken) domainBreakers.breakers = make(map[string]*circuitBreaker)
resp, err := c.Client.Do(httpReq)
if err != nil {
return nil, fmt.Errorf("failed to send request: %w", err)
} }
defer resp.Body.Close() domainBreakers.breakers[domain] = cb
return cb
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
}
var result struct {
Success bool `json:"success"`
Result CloudflareCrawlJob `json:"result"`
}
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return nil, fmt.Errorf("failed to decode response: %w", err)
}
if !result.Success {
return nil, fmt.Errorf("API returned unsuccessful response")
}
return &result.Result, nil
} }
// WaitForCrawlCompletion waits for a crawl job to complete and returns the results func (cb *circuitBreaker) RecordFailure() {
func (c *CloudflareClient) WaitForCrawlCompletion(ctx context.Context, jobID string, maxAttempts int, delay time.Duration) (*CloudflareCrawlJob, error) { atomic.AddInt32(&cb.failures, 1)
if c == nil { cb.mu.Lock()
return nil, fmt.Errorf("Cloudflare client not initialized") cb.lastFail = time.Now()
} cb.mu.Unlock()
}
for i := 0; i < maxAttempts; i++ { func (cb *circuitBreaker) RecordSuccess() {
job, err := c.GetCrawlResults(ctx, jobID, 1) // Use limit=1 for status checks atomic.StoreInt32(&cb.failures, 0)
if err != nil { }
return nil, err
}
if job.Status != "running" { func (cb *circuitBreaker) IsOpen() bool {
// Get full results if atomic.LoadInt32(&cb.failures) < int32(cb.threshold) {
fullJob, err := c.GetCrawlResults(ctx, jobID, 0) // No limit for full results return false
if err != nil {
return nil, err
}
return fullJob, nil
} }
cb.mu.Lock()
last := cb.lastFail
cb.mu.Unlock()
return time.Since(last) < cb.timeout
}
// semaphore limits concurrent operations.
type semaphore struct {
ch chan struct{}
}
func newSemaphore(n int) *semaphore {
return &semaphore{ch: make(chan struct{}, n)}
}
func (s *semaphore) Acquire(ctx context.Context) error {
select { select {
case s.ch <- struct{}{}:
return nil
case <-ctx.Done(): case <-ctx.Done():
return nil, ctx.Err() return ctx.Err()
case <-time.After(delay):
continue
} }
}
return nil, fmt.Errorf("crawl job did not complete within timeout")
} }
// CrawlURL performs a complete crawl operation for a single URL func (s *semaphore) Release() {
func (c *CloudflareClient) CrawlURL(ctx context.Context, url string) (*CloudflareCrawlJob, error) { select {
if c == nil { case <-s.ch:
return nil, fmt.Errorf("Cloudflare client not initialized") default:
} }
req := CloudflareCrawlRequest{
URL: url,
Limit: 1, // Only crawl the specific URL
Depth: 0, // Don't follow links
Formats: []string{"html", "markdown"},
Render: true,
Source: "links", // Only crawl the specific URL, not sitemaps
}
jobID, err := c.StartCrawl(ctx, req)
if err != nil {
return nil, fmt.Errorf("failed to start crawl: %w", err)
}
// Wait for completion with reasonable timeout
ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
defer cancel()
job, err := c.WaitForCrawlCompletion(ctx, jobID, 24, 5*time.Second)
if err != nil {
return nil, fmt.Errorf("failed to wait for crawl completion: %w", err)
}
return job, nil
} }
func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) { func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) {
@@ -364,11 +208,12 @@ func compactErrorText(s string) string {
return s return s
} }
func fetchPageDirect(url string, opts fetchOptions) ([]byte, error) { func fetchPageDirectOnce(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
req, err := newBrowserRequest(url, opts) req, err := newBrowserRequest(url, opts)
if err != nil { if err != nil {
return nil, err return nil, err
} }
req = req.WithContext(ctx)
client := &http.Client{Timeout: 15 * time.Second} client := &http.Client{Timeout: 15 * time.Second}
resp, err := client.Do(req) resp, err := client.Do(req)
@@ -392,12 +237,31 @@ func fetchPageDirect(url string, opts fetchOptions) ([]byte, error) {
return body, nil return body, nil
} }
func fetchPageWithWget(url string, opts fetchOptions) ([]byte, error) { func fetchPageDirect(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
var lastErr error
for attempt := 0; attempt < 3; attempt++ {
if attempt > 0 {
select {
case <-time.After(time.Duration(attempt) * time.Second):
case <-ctx.Done():
return nil, ctx.Err()
}
}
body, err := fetchPageDirectOnce(ctx, url, opts)
if err == nil {
return body, nil
}
lastErr = err
}
return nil, fmt.Errorf("direct fetch failed after 3 attempts: %w", lastErr)
}
func fetchPageWithWget(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
if _, err := exec.LookPath("wget"); err != nil { if _, err := exec.LookPath("wget"); err != nil {
return nil, fmt.Errorf("wget not available: %w", err) return nil, fmt.Errorf("wget not available: %w", err)
} }
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) ctx, cancel := context.WithTimeout(ctx, 20*time.Second)
defer cancel() defer cancel()
args := []string{ args := []string{
@@ -509,6 +373,52 @@ func ensureEmbeddedScraplingHelper() (string, error) {
return embeddedScraplingHelperFile, nil return embeddedScraplingHelperFile, nil
} }
// fetchPageWithCurl downloads url by shelling out to the curl binary, sending
// the same browser-like headers as the other fetchers in this file.
//
// The request is bounded twice: curl itself gives up after 15 seconds
// (--max-time), and the process is killed if the 20-second context derived
// from ctx expires or ctx is cancelled by the caller.
//
// It returns the response body on success. It returns an error when curl is
// not installed, when curl exits non-zero (including HTTP status >= 400, see
// --fail below), when the body is empty, or when the body looks like a
// Cloudflare challenge page.
func fetchPageWithCurl(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
	if _, err := exec.LookPath("curl"); err != nil {
		return nil, fmt.Errorf("curl not available: %w", err)
	}

	ctx, cancel := context.WithTimeout(ctx, 20*time.Second)
	defer cancel()

	args := []string{
		// -s: no progress meter, -S: still print errors, -L: follow redirects.
		"-sSL",
		// Without --fail curl exits 0 on HTTP 4xx/5xx and the error page would
		// be returned (and cached by the caller) as if it were real content.
		"--fail",
		"--max-time", "15",
		"-A", browserUserAgent,
		"-H", "Accept: " + browserAccept,
		"-H", "Accept-Language: " + browserAcceptLanguage,
		"-H", "Connection: keep-alive",
	}
	if opts.Referer != "" {
		args = append(args, "-H", "Referer: "+opts.Referer)
	}
	args = append(args, url)

	cmd := exec.CommandContext(ctx, "curl", args...)
	var stdout bytes.Buffer
	var stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// When the context expired or was cancelled the process is killed and
		// err is just "signal: killed"; surface the context error instead so
		// callers can recognise it with errors.Is.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, fmt.Errorf("curl request failed: %w", ctxErr)
		}
		details := compactErrorText(stderr.String())
		if details == "" {
			details = compactErrorText(err.Error())
		}
		return nil, fmt.Errorf("curl request failed: %s", details)
	}

	body := stdout.Bytes()
	if len(body) == 0 {
		return nil, fmt.Errorf("curl returned an empty body")
	}
	if looksLikeCloudflareBlock(body) {
		return nil, fmt.Errorf("curl returned a Cloudflare challenge page")
	}

	return body, nil
}
func findScraplingHelperScript() (string, error) { func findScraplingHelperScript() (string, error) {
cwd, _ := os.Getwd() cwd, _ := os.Getwd()
@@ -553,7 +463,17 @@ func findScraplingPython() string {
) )
} }
func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) { func fetchPageWithScrapling(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
parsedURL, err := neturl.Parse(url)
if err != nil {
return nil, fmt.Errorf("Scrapling skipped: invalid URL: %w", err)
}
domain := parsedURL.Host
if getDomainBreaker(domain).IsOpen() {
return nil, fmt.Errorf("Scrapling skipped: circuit breaker is open for %s", domain)
}
pythonBin := findScraplingPython() pythonBin := findScraplingPython()
if pythonBin == "" { if pythonBin == "" {
return nil, fmt.Errorf("Scrapling skipped: no Python runtime found") return nil, fmt.Errorf("Scrapling skipped: no Python runtime found")
@@ -564,10 +484,16 @@ func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
return nil, fmt.Errorf("Scrapling skipped: %w", err) return nil, fmt.Errorf("Scrapling skipped: %w", err)
} }
ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) // Acquire global Scrapling semaphore to limit concurrent Chromium launches
if err := scraplingSem.Acquire(ctx); err != nil {
return nil, fmt.Errorf("Scrapling skipped: %w", err)
}
defer scraplingSem.Release()
ctx, cancel := context.WithTimeout(ctx, 90*time.Second)
defer cancel() defer cancel()
args := []string{helperScript, "--url", url} args := []string{helperScript, "--url", url, "--timeout-ms", "60000", "--wait-ms", "500"}
if opts.Referer != "" { if opts.Referer != "" {
args = append(args, "--referer", opts.Referer) args = append(args, "--referer", opts.Referer)
} }
@@ -577,32 +503,51 @@ func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
var stderr bytes.Buffer var stderr bytes.Buffer
cmd.Stdout = &stdout cmd.Stdout = &stdout
cmd.Stderr = &stderr cmd.Stderr = &stderr
// Run in a new process group so we can kill all Chromium children on timeout
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
if err := cmd.Run(); err != nil { if err := cmd.Run(); err != nil {
// Kill the entire process group including Chromium children
if cmd.Process != nil {
syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}
details := compactErrorText(stderr.String()) details := compactErrorText(stderr.String())
if details == "" { if details == "" {
details = compactErrorText(err.Error()) details = compactErrorText(err.Error())
} }
// Don't count context cancellations (client timeouts) or infrastructure failures
if ctx.Err() == nil && !strings.Contains(details, "Executable doesn't exist") {
getDomainBreaker(domain).RecordFailure()
}
return nil, fmt.Errorf("Scrapling request failed: %s", details) return nil, fmt.Errorf("Scrapling request failed: %s", details)
} }
body := stdout.Bytes() body := stdout.Bytes()
if len(body) == 0 { if len(body) == 0 {
getDomainBreaker(domain).RecordFailure()
return nil, fmt.Errorf("Scrapling returned an empty body") return nil, fmt.Errorf("Scrapling returned an empty body")
} }
if looksLikeCloudflareBlock(body) { if looksLikeCloudflareBlock(body) {
getDomainBreaker(domain).RecordFailure()
return nil, fmt.Errorf("Scrapling returned a Cloudflare challenge page") return nil, fmt.Errorf("Scrapling returned a Cloudflare challenge page")
} }
getDomainBreaker(domain).RecordSuccess()
return body, nil return body, nil
} }
func fetchPageWithFallback(url string) ([]byte, error) { func fetchPageWithFallback(ctx context.Context, url string) ([]byte, error) {
return fetchPageWithFallbackOptions(url, fetchOptions{}) return fetchPageWithFallbackOptions(ctx, url, fetchOptions{})
} }
// fetchPageWithFallback tries Go HTTP first, then wget, then Scrapling, then Cloudflare Browser Rendering. // fetchPageWithFallback tries Go HTTP first, then curl/wget, then Scrapling.
func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) { // When direct HTTP returns a Cloudflare block, curl/wget are skipped since they
// will just return the same challenge page and waste ~20 seconds.
func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
if err := ctx.Err(); err != nil {
return nil, err
}
// Check cache first // Check cache first
pageCacheMu.RLock() pageCacheMu.RLock()
if entry, ok := pageCache[url]; ok { if entry, ok := pageCache[url]; ok {
@@ -614,14 +559,29 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
} }
pageCacheMu.RUnlock() pageCacheMu.RUnlock()
body, err := fetchPageDirect(url, opts) body, err := fetchPageDirect(ctx, url, opts)
if err == nil { if err == nil {
cachePage(url, body) cachePage(url, body)
return body, nil return body, nil
} }
log.Printf("Direct request failed for %s: %v", url, err) log.Printf("Direct request failed for %s: %v", url, err)
body, err = fetchPageWithWget(url, opts) // If direct HTTP returned a Cloudflare block, skip curl/wget time-wasters
// and go straight to Scrapling which can solve the challenge.
if strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "Cloudflare") {
log.Printf("Skipping curl/wget fallbacks for %s: direct HTTP hit Cloudflare wall", url)
goto scraplingFallback
}
body, err = fetchPageWithCurl(ctx, url, opts)
if err == nil {
log.Printf("Successfully retrieved content via curl for %s", url)
cachePage(url, body)
return body, nil
}
log.Printf("curl fallback failed for %s: %v", url, err)
body, err = fetchPageWithWget(ctx, url, opts)
if err == nil { if err == nil {
log.Printf("Successfully retrieved content via wget for %s", url) log.Printf("Successfully retrieved content via wget for %s", url)
cachePage(url, body) cachePage(url, body)
@@ -629,7 +589,8 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
} }
log.Printf("wget fallback failed for %s: %v", url, err) log.Printf("wget fallback failed for %s: %v", url, err)
body, err = fetchPageWithScrapling(url, opts) scraplingFallback:
body, err = fetchPageWithScrapling(ctx, url, opts)
if err == nil { if err == nil {
log.Printf("Successfully retrieved content via Scrapling for %s", url) log.Printf("Successfully retrieved content via Scrapling for %s", url)
cachePage(url, body) cachePage(url, body)
@@ -637,32 +598,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
} }
log.Printf("Scrapling fallback failed for %s: %v", url, err) log.Printf("Scrapling fallback failed for %s: %v", url, err)
if cfClient := NewCloudflareClient(); cfClient != nil { return nil, fmt.Errorf("all fetch methods failed for %s: %w", url, err)
log.Printf("Attempting Cloudflare crawl fallback for %s", url)
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
job, err := cfClient.CrawlURL(ctx, url)
if err != nil {
log.Printf("Cloudflare crawl failed for %s: %v", url, err)
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare crawl failed: %w", err)
}
if len(job.Records) > 0 && job.Records[0].Status == "completed" {
body := []byte(job.Records[0].HTML)
if looksLikeCloudflareBlock(body) {
return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
}
log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
cachePage(url, body)
return body, nil
}
log.Printf("Cloudflare crawl returned no completed records for %s", url)
return nil, fmt.Errorf("Cloudflare crawl returned no completed records")
}
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
} }
func cachePage(url string, body []byte) { func cachePage(url string, body []byte) {
@@ -674,13 +610,13 @@ func cachePage(url string, body []byte) {
// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz // parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}). // competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
// It filters to only include matches involving the given clubName if provided. // It filters to only include matches involving the given clubName if provided.
func parseCompetitionMatchesFromFotbal(pageURL, clubType, clubName, clubID string) []Match { func parseCompetitionMatchesFromFotbal(ctx context.Context, pageURL, clubType, clubName, clubID string) []Match {
pageURL = strings.TrimSpace(pageURL) pageURL = strings.TrimSpace(pageURL)
if pageURL == "" { if pageURL == "" {
return nil return nil
} }
body, err := fetchPageWithFallback(pageURL) body, err := fetchPageWithFallback(ctx, pageURL)
if err != nil { if err != nil {
log.Printf("fotbal.cz matches fetch failed for %s: %v", pageURL, err) log.Printf("fotbal.cz matches fetch failed for %s: %v", pageURL, err)
return nil return nil
@@ -846,8 +782,13 @@ func parseCompetitionMatchesFromFotbal(pageURL, clubType, clubName, clubID strin
} }
// parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback. // parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback.
func parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID string) []Match { func parseCompetitionMatchesFromIS(ctx context.Context, detailURL, clubType, clubName, clubID string) []Match {
resp, err := http.Get(detailURL) req, err := http.NewRequestWithContext(ctx, "GET", detailURL, nil)
if err != nil {
log.Printf("IS matches request error for %s: %v", detailURL, err)
return nil
}
resp, err := http.DefaultClient.Do(req)
if err != nil { if err != nil {
log.Printf("IS matches fetch error for %s: %v", detailURL, err) log.Printf("IS matches fetch error for %s: %v", detailURL, err)
return nil return nil
@@ -1275,14 +1216,14 @@ func getClubSearch(w http.ResponseWriter, r *http.Request) {
vals.Set("q", q) vals.Set("q", q)
searchURL := "https://www.fotbal.cz/club/hledej?" + vals.Encode() searchURL := "https://www.fotbal.cz/club/hledej?" + vals.Encode()
fetchSearchPage := func(url string) ([]byte, error) { fetchSearchPage := func(ctx context.Context, url string) ([]byte, error) {
return fetchPageWithFallbackOptions(url, fetchOptions{ return fetchPageWithFallbackOptions(ctx, url, fetchOptions{
Referer: "https://www.fotbal.cz/club/hledej", Referer: "https://www.fotbal.cz/club/hledej",
}) })
} }
// Try direct HTTP request first // Try direct HTTP request first
body, err := fetchSearchPage(searchURL) body, err := fetchSearchPage(r.Context(), searchURL)
if err != nil { if err != nil {
log.Printf("Direct search request failed for %s: %v", searchURL, err) log.Printf("Direct search request failed for %s: %v", searchURL, err)
@@ -1298,7 +1239,7 @@ func getClubSearch(w http.ResponseWriter, r *http.Request) {
} }
} }
body, err = fetchSearchPage(searchURL2) body, err = fetchSearchPage(r.Context(), searchURL2)
if err != nil { if err != nil {
log.Printf("Retried search request failed for %s: %v", searchURL2, err) log.Printf("Retried search request failed for %s: %v", searchURL2, err)
// Return empty results instead of error // Return empty results instead of error
@@ -1409,7 +1350,7 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
} }
url := fmt.Sprintf("%s/%s", baseURL, clubID) url := fmt.Sprintf("%s/%s", baseURL, clubID)
body, err := fetchPageWithFallback(url) body, err := fetchPageWithFallback(r.Context(), url)
if err != nil { if err != nil {
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError) http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
return return
@@ -1459,26 +1400,43 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
}) })
}) })
// For each competition, fetch the standings tables from is.fotbal.cz // For each competition, fetch the standings tables from is.fotbal.cz concurrently
sem := newSemaphore(4)
var wg sync.WaitGroup
var mu sync.Mutex
for i := range competitions { for i := range competitions {
comp := &competitions[i] wg.Add(1)
go func(idx int) {
defer wg.Done()
if err := sem.Acquire(r.Context()); err != nil {
return
}
defer sem.Release()
comp := &competitions[idx]
tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam) tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
resp, err := http.Get(tableURL) req, err := http.NewRequestWithContext(r.Context(), "GET", tableURL, nil)
if err != nil {
log.Printf("error creating request for competition table %s: %v", comp.ID, err)
return
}
resp, err := http.DefaultClient.Do(req)
if err != nil { if err != nil {
log.Printf("error fetching competition table for %s: %v", comp.ID, err) log.Printf("error fetching competition table for %s: %v", comp.ID, err)
continue return
} }
defer resp.Body.Close() defer resp.Body.Close()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
log.Printf("non-200 response for %s: %d", comp.ID, resp.StatusCode) log.Printf("non-200 response for %s: %d", comp.ID, resp.StatusCode)
continue return
} }
docTable, err := goquery.NewDocumentFromReader(resp.Body) docTable, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil { if err != nil {
log.Printf("error parsing table HTML for %s: %v", comp.ID, err) log.Printf("error parsing table HTML for %s: %v", comp.ID, err)
continue return
} }
// Parse section: Tabulka celková (only overall) // Parse section: Tabulka celková (only overall)
@@ -1532,8 +1490,12 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
} }
overall = parseSection("Tabulka celková") overall = parseSection("Tabulka celková")
mu.Lock()
comp.Table = &CompetitionTable{Overall: overall} comp.Table = &CompetitionTable{Overall: overall}
mu.Unlock()
}(i)
} }
wg.Wait()
clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text()) clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text())
clubURL := strings.TrimSpace(doc.Find("h1.H4 a").First().AttrOr("href", "")) clubURL := strings.TrimSpace(doc.Find("h1.H4 a").First().AttrOr("href", ""))
@@ -1583,7 +1545,7 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
} }
url := fmt.Sprintf("%s/%s", baseURL, clubID) url := fmt.Sprintf("%s/%s", baseURL, clubID)
body, err := fetchPageWithFallback(url) body, err := fetchPageWithFallback(r.Context(), url)
if err != nil { if err != nil {
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError) http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
return return
@@ -1637,21 +1599,37 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
competitions = append(competitions, Competition{ID: compID, Code: code, Name: name, TeamCount: teamCount, MatchesLink: tableLink}) competitions = append(competitions, Competition{ID: compID, Code: code, Name: name, TeamCount: teamCount, MatchesLink: tableLink})
}) })
// For each competition, fetch matches // For each competition, fetch matches concurrently with limits
sem := newSemaphore(4)
var wg sync.WaitGroup
var mu sync.Mutex
for i := range competitions { for i := range competitions {
comp := &competitions[i] wg.Add(1)
go func(idx int) {
defer wg.Done()
if err := sem.Acquire(r.Context()); err != nil {
return
}
defer sem.Release()
comp := &competitions[idx]
matchesLink := comp.MatchesLink matchesLink := comp.MatchesLink
// 1) Try parsing from the public fotbal.cz competition page (matches_link) // 1) Try parsing from the public fotbal.cz competition page (matches_link)
matches := parseCompetitionMatchesFromFotbal(matchesLink, clubType, clubName, clubID) matches := parseCompetitionMatchesFromFotbal(r.Context(), matchesLink, clubType, clubName, clubID)
// Always try IS as well and prefer it if it provides at least as many matches // Always try IS as well and prefer it if it provides at least as many matches
detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam) detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
isMatches := parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID) isMatches := parseCompetitionMatchesFromIS(r.Context(), detailURL, clubType, clubName, clubID)
// Prefer IS whenever it yields any results, as IS often contains alias team names // Prefer IS whenever it yields any results, as IS often contains alias team names
if len(isMatches) > 0 { if len(isMatches) > 0 {
matches = isMatches matches = isMatches
} }
mu.Lock()
comp.Matches = matches comp.Matches = matches
mu.Unlock()
}(i)
} }
wg.Wait()
clubInfo := ClubInfo{ clubInfo := ClubInfo{
Name: clubName, Name: clubName,
@@ -1679,9 +1657,18 @@ func main() {
http.Redirect(w, r, "/club/football/"+vars["id"], http.StatusMovedPermanently) http.Redirect(w, r, "/club/football/"+vars["id"], http.StatusMovedPermanently)
}).Methods("GET") }).Methods("GET")
r.HandleFunc("/", docsHandler) r.HandleFunc("/", docsHandler)
port := "0.0.0.0:8686"
fmt.Printf("Server running on http://%s\n", port) addr := "0.0.0.0:8686"
log.Fatal(http.ListenAndServe(port, r)) srv := &http.Server{
Addr: addr,
Handler: r,
ReadTimeout: 30 * time.Second,
WriteTimeout: 5 * time.Minute,
IdleTimeout: 120 * time.Second,
MaxHeaderBytes: 1 << 20,
}
fmt.Printf("Server running on http://%s\n", addr)
log.Fatal(srv.ListenAndServe())
} }
// docsHandler serves a simple HTML API documentation at the root endpoint. // docsHandler serves a simple HTML API documentation at the root endpoint.
+88 -31
View File
@@ -3,7 +3,34 @@
import argparse import argparse
import contextlib import contextlib
import logging import logging
import ssl
import sys import sys
import urllib.request
# User-Agent header value sent by the lightweight urllib fetch in this file.
# The string identifies the client as desktop Chrome 126 on 64-bit Windows.
BROWSER_UA = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)
# Byte markers used to recognise a Cloudflare challenge / block page.
# They are compared against a lowercased response body, so every entry
# must itself be written in lowercase.
CF_SIGNS = [
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]


def looks_like_cloudflare_block(body: bytes) -> bool:
    """Report whether ``body`` contains any marker listed in ``CF_SIGNS``.

    Args:
        body: Raw response body. An empty (or falsy) body is never treated
            as a block page.

    Returns:
        True if at least one marker occurs in the body, compared
        case-insensitively; False otherwise.
    """
    if not body:
        return False
    # Lowercase once so the scan is case-insensitive without re-folding
    # the body for every marker.
    lowered = body.lower()
    return any(marker in lowered for marker in CF_SIGNS)
def response_body_bytes(response) -> bytes: def response_body_bytes(response) -> bytes:
@@ -20,60 +47,90 @@ def response_body_bytes(response) -> bytes:
return str(response).encode("utf-8") return str(response).encode("utf-8")
def lightweight_fetch(
    url: str,
    referer: str = "",
    timeout: float = 15.0,
    verify_tls: bool = True,
) -> bytes:
    """Fetch ``url`` with plain urllib and browser-like headers (no browser engine).

    Args:
        url: Address to request.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout: Timeout in seconds passed to ``urlopen``.
        verify_tls: When True (default) the server certificate and hostname
            are verified. Pass False only for hosts with a known-broken
            certificate chain; that disables all protection against
            on-path tampering.

    Returns:
        The raw, non-empty response body.

    Raises:
        RuntimeError: The body is empty or looks like a Cloudflare challenge.
        urllib.error.URLError: Network, TLS or HTTP errors raised by
            ``urlopen`` (``HTTPError`` is a subclass).
    """
    headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        # urllib does not decompress responses, so ask for an unencoded body.
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        headers["Referer"] = referer
    req = urllib.request.Request(url, headers=headers)

    # The default context verifies the certificate chain and the hostname.
    ctx = ssl.create_default_context()
    if not verify_tls:
        # check_hostname must be cleared before verify_mode can be CERT_NONE.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
        body = resp.read()

    # Same rule as the Scrapling path: an empty body is a failure, so the
    # caller can fall back instead of emitting nothing.
    if not body:
        raise RuntimeError("lightweight fetch returned an empty body")
    if looks_like_cloudflare_block(body):
        raise RuntimeError(" lightweight fetch returned Cloudflare challenge")
    return body
def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 30000, wait_ms: int = 500) -> bytes:
    """Fetch ``url`` through Scrapling's ``StealthyFetcher`` and return the body bytes.

    Args:
        url: Address to request.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout_ms: Value forwarded as the fetcher's ``timeout`` option.
        wait_ms: Value forwarded as the fetcher's ``wait`` option.

    Returns:
        The non-empty response body as bytes.

    Raises:
        RuntimeError: Scrapling cannot be imported, the response status is
            an integer >= 400, or the body is empty.
    """
    # Imported lazily so that a missing Scrapling install only affects this path.
    try:
        from scrapling.fetchers import StealthyFetcher
    except Exception as exc:
        raise RuntimeError(f"Scrapling import failed: {exc}") from exc

    # Raise the root logger threshold to ERROR for the rest of the process.
    logging.getLogger().setLevel(logging.ERROR)

    options = dict(
        headless=True,
        network_idle=True,
        google_search=False,
        solve_cloudflare=True,
        timeout=timeout_ms,
        wait=wait_ms,
    )
    if referer:
        options["extra_headers"] = {"Referer": referer}

    # Anything the fetcher prints goes to stderr, keeping stdout free for the body.
    with contextlib.redirect_stdout(sys.stderr):
        page = StealthyFetcher.fetch(url, **options)

    code = getattr(page, "status", None)
    if isinstance(code, int) and code >= 400:
        raise RuntimeError(f"Scrapling returned HTTP {code}")

    payload = response_body_bytes(page)
    if not payload:
        raise RuntimeError("Scrapling returned an empty body")
    return payload
def _write_stdout(body: bytes) -> int:
    """Write ``body`` to stdout as raw bytes and return the process exit code.

    A closed pipe (the reader went away) is treated as a clean exit (0);
    any other write error is reported on stderr and yields 1.
    """
    try:
        sys.stdout.buffer.write(body)
        # Flush here so a closed pipe surfaces now, not at interpreter exit.
        sys.stdout.buffer.flush()
    except BrokenPipeError:
        # Point stdout at devnull so the interpreter's final flush does not
        # raise again on the already-closed pipe.
        import os

        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        return 0
    except OSError as exc:
        print(f"Writing response failed: {exc}", file=sys.stderr)
        return 1
    return 0


def main() -> int:
    """Command-line entry point: fetch ``--url`` and write the body to stdout.

    The lightweight urllib fetch is tried first; the Scrapling / Playwright
    fetch runs only if that fetch itself fails. Output errors never trigger
    the fallback, so a body is written at most once.

    Returns:
        0 when a body was fetched (including when the reader closed the
        pipe), 1 when both fetchers failed or stdout could not be written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", required=True)
    parser.add_argument("--referer", default="")
    parser.add_argument("--timeout-ms", type=int, default=30000)
    parser.add_argument("--wait-ms", type=int, default=500)
    args = parser.parse_args()

    # 1) Try lightweight urllib fetch first (no browser), capped at 15 seconds.
    try:
        body = lightweight_fetch(args.url, args.referer, timeout=min(args.timeout_ms / 1000.0, 15.0))
    except Exception as exc:
        print(f"Lightweight fetch failed: {exc}", file=sys.stderr)
    else:
        return _write_stdout(body)

    # 2) Fall back to Scrapling / Playwright only if the lightweight fetch failed.
    try:
        body = scrapling_fetch(args.url, args.referer, args.timeout_ms, args.wait_ms)
    except Exception as exc:
        print(f"Scrapling fetch failed: {exc}", file=sys.stderr)
        return 1
    return _write_stdout(body)
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())