refactor: optimize docker image and implement lightweight fetching

This commit improves the overall efficiency and reliability of the scraper by:
- Optimizing the Dockerfile: fewer layers, `--no-install-recommends`, and a consolidated Playwright installation.
- Adding resource limits (CPU/memory) to the docker-compose configuration.
- Refactoring `main.go` to remove the unused Cloudflare client structures and to increase the cache TTL.
- Implementing a `lightweight_fetch` mechanism in `scrapling_fetch.py` that uses `urllib` to attempt a fast request before falling back to the heavier Scrapling/Playwright engine.
- Adding Cloudflare challenge detection to the lightweight fetcher.
2026-07-29 05:53:49 +00:00 · 2026-05-11 19:50:59 +02:00
parent a8a4e1acaf
commit aa47f4309f
4 changed files with 474 additions and 440 deletions
@@ -14,12 +14,12 @@ RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o facr-scraper .
 # Python stage for Scrapling
 FROM python:3.11-slim AS python-builder
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 # Install system dependencies for Playwright
-RUN apt-get update && apt-get install -y \
+RUN apt-get update && apt-get install -y --no-install-recommends \
-    wget \
+    wget curl ca-certificates gnupg \
    gnupg \
    ca-certificates \
    curl \
    && rm -rf /var/lib/apt/lists/*
 # Create virtual environment and install Scrapling
@@ -28,44 +28,27 @@ ENV PATH="/opt/scrapling/bin:$PATH"
 COPY requirements-scrapling.txt .
 RUN pip install --no-cache-dir -r requirements-scrapling.txt
-# Install Playwright browsers
+# Install Playwright browsers with deps in one layer
-RUN playwright install chromium
+RUN playwright install chromium --with-deps
 RUN playwright install-deps
 # Fix Python symlinks
-RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python
+RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python \
-RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python3
+    && ln -sf /usr/local/bin/python /opt/scrapling/bin/python3
 # Final stage
 FROM python:3.11-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 ENV PATH="/opt/scrapling/bin:$PATH"
 # Install runtime dependencies for both Go and Playwright
-RUN apt-get update && apt-get install -y \
+RUN apt-get update && apt-get install -y --no-install-recommends \
-    ca-certificates \
+    ca-certificates wget curl \
-    wget \
+    libglib2.0-0 libgobject-2.0-0 libnspr4 libnss3 libdbus-1-3 \
-    curl \
+    libatk1.0-0 libatk-bridge2.0-0 libcups2 libexpat1 libxcb1 \
-    gnupg \
+    libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
-    libglib2.0-0 \
+    libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 \
    libgobject-2.0-0 \
    libnspr4 \
    libnss3 \
    libdbus-1-3 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libexpat1 \
    libxcb1 \
    libxkbcommon0 \
    libatspi2.0-0 \
    libx11-6 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libcairo2 \
    libpango-1.0-0 \
    libasound2 \
    && rm -rf /var/lib/apt/lists/*
@@ -80,13 +63,12 @@ COPY --from=python-builder /opt/scrapling /opt/scrapling
 # Copy Playwright browser cache
 COPY --from=python-builder /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright
 ENV PATH="/opt/scrapling/bin:$PATH"
 # Copy scrapling script
 COPY scripts/scrapling_fetch.py /opt/scrapling/scripts/scrapling_fetch.py
 # Create cache directory for Playwright
-RUN mkdir -p /home/scraper/.cache && chown -R scraper:scraper /home/scraper
+RUN mkdir -p /home/scraper/.cache && chown -R scraper:scraper /home/scraper /opt/scrapling
 USER scraper
 WORKDIR /home/scraper
@@ -13,6 +13,14 @@ services:
      - SCRAPLING_SCRIPT=/opt/scrapling/scripts/scrapling_fetch.py
      - DEBUG_SAVE_HTML=${DEBUG_SAVE_HTML:-}
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '0.5'
          memory: 512M
    volumes:
      # Optional: Mount cache for Playwright browsers
      - playwright_cache:/home/scraper/.cache
@@ -16,6 +16,8 @@ import (
 	"regexp"
 	"strings"
 	"sync"
 	"sync/atomic"
 	"syscall"
 	"time"
 	"github.com/PuerkitoBio/goquery"
@@ -32,48 +34,6 @@ type Competition struct {
 	Table       *CompetitionTable `json:"table,omitempty"`
 }
 // Cloudflare Browser Rendering API structures
 type CloudflareCrawlRequest struct {
 	URL     string                 `json:"url"`
 	Limit   int                    `json:"limit,omitempty"`
 	Depth   int                    `json:"depth,omitempty"`
 	Formats []string               `json:"formats,omitempty"`
 	Render  bool                   `json:"render,omitempty"`
 	Source  string                 `json:"source,omitempty"`
 	Options map[string]interface{} `json:"options,omitempty"`
 }
 type CloudflareCrawlResponse struct {
 	Success bool   `json:"success"`
 	Result  string `json:"result"` // job ID
 }
 type CloudflareCrawlJob struct {
 	ID                 string                  `json:"id"`
 	Status             string                  `json:"status"`
 	BrowserSecondsUsed float64                 `json:"browserSecondsUsed"`
 	Total              int                     `json:"total"`
 	Finished           int                     `json:"finished"`
 	Records            []CloudflareCrawlRecord `json:"records"`
 	Cursor             string                  `json:"cursor,omitempty"`
 }
 type CloudflareCrawlRecord struct {
 	URL      string                 `json:"url"`
 	Status   string                 `json:"status"`
 	Markdown string                 `json:"markdown,omitempty"`
 	HTML     string                 `json:"html,omitempty"`
 	JSON     interface{}            `json:"json,omitempty"`
 	Metadata map[string]interface{} `json:"metadata"`
 }
 type CloudflareClient struct {
 	AccountID string
 	APIToken  string
 	BaseURL   string
 	Client    *http.Client
 }
 type fetchOptions struct {
 	Referer string
 }
@@ -103,211 +63,95 @@ type cacheEntry struct {
 	timestamp time.Time
 }
-const cacheTTL = 5 * time.Minute
+const cacheTTL = 15 * time.Minute
-// NewCloudflareClient creates a new Cloudflare Browser Rendering API client
+// domainBreakers is a per-domain circuit breaker map so failures on one site
-func NewCloudflareClient() *CloudflareClient {
+// don't block Scrapling for unrelated sites.
-	accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID"))
+var domainBreakers struct {
-	apiToken := strings.TrimSpace(os.Getenv("CLOUDFLARE_API_TOKEN"))
+	mu       sync.RWMutex
-
+	breakers map[string]*circuitBreaker
 	if accountID == "" || apiToken == "" {
 		return nil
 	}
 	return &CloudflareClient{
 		AccountID: accountID,
 		APIToken:  apiToken,
 		BaseURL:   "https://api.cloudflare.com/client/v4",
 		Client: &http.Client{
 			Timeout: 30 * time.Second,
 		},
 	}
 }
-// StartCrawl initiates a crawl job
+// scraplingSem limits concurrent Chromium launches to avoid zombie processes
-func (c *CloudflareClient) StartCrawl(ctx context.Context, req CloudflareCrawlRequest) (string, error) {
+// and resource exhaustion.
-	if c == nil {
+var scraplingSem = newSemaphore(2)
 		return "", fmt.Errorf("Cloudflare client not initialized")
 	}
-	// Set defaults
+type circuitBreaker struct {
-	if req.Limit == 0 {
+	failures  int32
-		req.Limit = 10
+	lastFail  time.Time
-	}
+	threshold int
-	if req.Depth == 0 {
+	timeout   time.Duration
-		req.Depth = 1
+	mu        sync.Mutex
 	}
 	if len(req.Formats) == 0 {
 		req.Formats = []string{"html", "markdown"}
 	}
 	if req.Source == "" {
 		req.Source = "all"
 	}
 	// Restrict to specific URL patterns for fotbal.cz to avoid crawling unrelated content
 	if req.Options == nil {
 		req.Options = make(map[string]interface{})
 	}
 	// Only crawl URLs from the same domain and specific paths
 	includePatterns := []string{
 		"https://www.fotbal.cz/**",
 	}
 	excludePatterns := []string{
 		"**/api/**",
 		"**/static/**",
 		"**/media/**",
 	}
 	req.Options["includePatterns"] = includePatterns
 	req.Options["excludePatterns"] = excludePatterns
 	req.Options["includeExternalLinks"] = false
 	req.Options["includeSubdomains"] = false
 	body, err := json.Marshal(req)
 	if err != nil {
 		return "", fmt.Errorf("failed to marshal request: %w", err)
 	}
 	url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl", c.BaseURL, c.AccountID)
 	httpReq, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
 	if err != nil {
 		return "", fmt.Errorf("failed to create request: %w", err)
 	}
 	httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
 	httpReq.Header.Set("Content-Type", "application/json")
 	resp, err := c.Client.Do(httpReq)
 	if err != nil {
 		return "", fmt.Errorf("failed to send request: %w", err)
 	}
 	defer resp.Body.Close()
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		return "", fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
 	}
 	var crawlResp CloudflareCrawlResponse
 	if err := json.NewDecoder(resp.Body).Decode(&crawlResp); err != nil {
 		return "", fmt.Errorf("failed to decode response: %w", err)
 	}
 	if !crawlResp.Success {
 		return "", fmt.Errorf("API returned unsuccessful response")
 	}
 	return crawlResp.Result, nil
 }
-// GetCrawlResults retrieves the results of a crawl job
+func getDomainBreaker(domain string) *circuitBreaker {
-func (c *CloudflareClient) GetCrawlResults(ctx context.Context, jobID string, limit int) (*CloudflareCrawlJob, error) {
+	domainBreakers.mu.RLock()
-	if c == nil {
+	if cb, ok := domainBreakers.breakers[domain]; ok {
-		return nil, fmt.Errorf("Cloudflare client not initialized")
+		domainBreakers.mu.RUnlock()
 		return cb
 	}
 	domainBreakers.mu.RUnlock()
-	url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl/%s", c.BaseURL, c.AccountID, jobID)
+	domainBreakers.mu.Lock()
-	if limit > 0 {
+	defer domainBreakers.mu.Unlock()
-		url += fmt.Sprintf("?limit=%d", limit)
+	if cb, ok := domainBreakers.breakers[domain]; ok {
 		return cb
 	}
-
+	cb := &circuitBreaker{
-	httpReq, err := http.NewRequestWithContext(ctx, "GET", url, nil)
+		threshold: 15,
-	if err != nil {
+		timeout:   30 * time.Minute,
 		return nil, fmt.Errorf("failed to create request: %w", err)
 	}
-
+	if domainBreakers.breakers == nil {
-	httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
+		domainBreakers.breakers = make(map[string]*circuitBreaker)
 	resp, err := c.Client.Do(httpReq)
 	if err != nil {
 		return nil, fmt.Errorf("failed to send request: %w", err)
 	}
-	defer resp.Body.Close()
+	domainBreakers.breakers[domain] = cb
-
+	return cb
 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		return nil, fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
 	}
 	var result struct {
 		Success bool               `json:"success"`
 		Result  CloudflareCrawlJob `json:"result"`
 	}
 	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
 		return nil, fmt.Errorf("failed to decode response: %w", err)
 	}
 	if !result.Success {
 		return nil, fmt.Errorf("API returned unsuccessful response")
 	}
 	return &result.Result, nil
 }
-// WaitForCrawlCompletion waits for a crawl job to complete and returns the results
+func (cb *circuitBreaker) RecordFailure() {
-func (c *CloudflareClient) WaitForCrawlCompletion(ctx context.Context, jobID string, maxAttempts int, delay time.Duration) (*CloudflareCrawlJob, error) {
+	atomic.AddInt32(&cb.failures, 1)
-	if c == nil {
+	cb.mu.Lock()
-		return nil, fmt.Errorf("Cloudflare client not initialized")
+	cb.lastFail = time.Now()
-	}
+	cb.mu.Unlock()
 }
-	for i := 0; i < maxAttempts; i++ {
+func (cb *circuitBreaker) RecordSuccess() {
-		job, err := c.GetCrawlResults(ctx, jobID, 1) // Use limit=1 for status checks
+	atomic.StoreInt32(&cb.failures, 0)
-		if err != nil {
+}
 			return nil, err
 		}
-		if job.Status != "running" {
+func (cb *circuitBreaker) IsOpen() bool {
-			// Get full results
+	if atomic.LoadInt32(&cb.failures) < int32(cb.threshold) {
-			fullJob, err := c.GetCrawlResults(ctx, jobID, 0) // No limit for full results
+		return false
 			if err != nil {
 				return nil, err
 			}
 			return fullJob, nil
 	}
 	cb.mu.Lock()
 	last := cb.lastFail
 	cb.mu.Unlock()
 	return time.Since(last) < cb.timeout
 }
 // semaphore is a counting semaphore built on a buffered channel of empty
 // structs; the channel's capacity is fixed when the semaphore is created.
 type semaphore struct {
 	ch chan struct{}
 }
 // newSemaphore allocates a semaphore whose underlying channel has capacity n.
 func newSemaphore(n int) *semaphore {
 	s := new(semaphore)
 	s.ch = make(chan struct{}, n)
 	return s
 }
 func (s *semaphore) Acquire(ctx context.Context) error {
 	select {
 	case s.ch <- struct{}{}:
 		return nil
 	case <-ctx.Done():
-			return nil, ctx.Err()
+		return ctx.Err()
 		case <-time.After(delay):
 			continue
 	}
 	}
 	return nil, fmt.Errorf("crawl job did not complete within timeout")
 }
-// CrawlURL performs a complete crawl operation for a single URL
+func (s *semaphore) Release() {
-func (c *CloudflareClient) CrawlURL(ctx context.Context, url string) (*CloudflareCrawlJob, error) {
+	select {
-	if c == nil {
+	case <-s.ch:
-		return nil, fmt.Errorf("Cloudflare client not initialized")
+	default:
 	}
 	req := CloudflareCrawlRequest{
 		URL:     url,
 		Limit:   1, // Only crawl the specific URL
 		Depth:   0, // Don't follow links
 		Formats: []string{"html", "markdown"},
 		Render:  true,
 		Source:  "links", // Only crawl the specific URL, not sitemaps
 	}
 	jobID, err := c.StartCrawl(ctx, req)
 	if err != nil {
 		return nil, fmt.Errorf("failed to start crawl: %w", err)
 	}
 	// Wait for completion with reasonable timeout
 	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
 	defer cancel()
 	job, err := c.WaitForCrawlCompletion(ctx, jobID, 24, 5*time.Second)
 	if err != nil {
 		return nil, fmt.Errorf("failed to wait for crawl completion: %w", err)
 	}
 	return job, nil
 }
 func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) {
@@ -364,11 +208,12 @@ func compactErrorText(s string) string {
 	return s
 }
-func fetchPageDirect(url string, opts fetchOptions) ([]byte, error) {
+func fetchPageDirectOnce(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
 	req, err := newBrowserRequest(url, opts)
 	if err != nil {
 		return nil, err
 	}
 	req = req.WithContext(ctx)
 	client := &http.Client{Timeout: 15 * time.Second}
 	resp, err := client.Do(req)
@@ -392,12 +237,31 @@ func fetchPageDirect(url string, opts fetchOptions) ([]byte, error) {
 	return body, nil
 }
-func fetchPageWithWget(url string, opts fetchOptions) ([]byte, error) {
+func fetchPageDirect(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
 	var lastErr error
 	for attempt := 0; attempt < 3; attempt++ {
 		if attempt > 0 {
 			select {
 			case <-time.After(time.Duration(attempt) * time.Second):
 			case <-ctx.Done():
 				return nil, ctx.Err()
 			}
 		}
 		body, err := fetchPageDirectOnce(ctx, url, opts)
 		if err == nil {
 			return body, nil
 		}
 		lastErr = err
 	}
 	return nil, fmt.Errorf("direct fetch failed after 3 attempts: %w", lastErr)
 }
 func fetchPageWithWget(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
 	if _, err := exec.LookPath("wget"); err != nil {
 		return nil, fmt.Errorf("wget not available: %w", err)
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
+	ctx, cancel := context.WithTimeout(ctx, 20*time.Second)
 	defer cancel()
 	args := []string{
@@ -509,6 +373,52 @@ func ensureEmbeddedScraplingHelper() (string, error) {
 	return embeddedScraplingHelperFile, nil
 }
 // fetchPageWithCurl downloads url by running the external curl binary and
 // returns the response body.
 //
 // The request carries the User-Agent, Accept and Accept-Language values from
 // browserUserAgent, browserAccept and browserAcceptLanguage, plus a Referer
 // header when opts.Referer is non-empty. Redirects are followed (-L).
 //
 // An error is returned when:
 //   - curl cannot be found on PATH;
 //   - curl exits non-zero (network failure, timeout, cancelled ctx, or an
 //     HTTP status >= 400, which --fail turns into a non-zero exit);
 //   - the body is empty;
 //   - looksLikeCloudflareBlock reports the body as a challenge page.
 func fetchPageWithCurl(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
 	if _, err := exec.LookPath("curl"); err != nil {
 		return nil, fmt.Errorf("curl not available: %w", err)
 	}
 	// Outer bound for the whole subprocess; it is deliberately longer than
 	// curl's own --max-time so curl normally gets to report its own timeout.
 	ctx, cancel := context.WithTimeout(ctx, 20*time.Second)
 	defer cancel()
 	args := []string{
 		"-sSL",
 		// Without --fail curl exits 0 on HTTP 4xx/5xx and prints the error
 		// page to stdout, which would then be returned as a valid body.
 		"--fail",
 		"--max-time", "15",
 		"-A", browserUserAgent,
 		"-H", "Accept: " + browserAccept,
 		"-H", "Accept-Language: " + browserAcceptLanguage,
 		"-H", "Connection: keep-alive",
 	}
 	if opts.Referer != "" {
 		args = append(args, "-H", "Referer: "+opts.Referer)
 	}
 	args = append(args, url)
 	cmd := exec.CommandContext(ctx, "curl", args...)
 	var stdout bytes.Buffer
 	var stderr bytes.Buffer
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr
 	if err := cmd.Run(); err != nil {
 		// Prefer curl's own diagnostic (-S keeps it on stderr despite -s);
 		// fall back to the Go-level error when stderr is empty.
 		details := compactErrorText(stderr.String())
 		if details == "" {
 			details = compactErrorText(err.Error())
 		}
 		return nil, fmt.Errorf("curl request failed: %s", details)
 	}
 	body := stdout.Bytes()
 	if len(body) == 0 {
 		return nil, fmt.Errorf("curl returned an empty body")
 	}
 	if looksLikeCloudflareBlock(body) {
 		return nil, fmt.Errorf("curl returned a Cloudflare challenge page")
 	}
 	return body, nil
 }
 func findScraplingHelperScript() (string, error) {
 	cwd, _ := os.Getwd()
@@ -553,7 +463,17 @@ func findScraplingPython() string {
 	)
 }
-func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
+func fetchPageWithScrapling(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
 	parsedURL, err := neturl.Parse(url)
 	if err != nil {
 		return nil, fmt.Errorf("Scrapling skipped: invalid URL: %w", err)
 	}
 	domain := parsedURL.Host
 	if getDomainBreaker(domain).IsOpen() {
 		return nil, fmt.Errorf("Scrapling skipped: circuit breaker is open for %s", domain)
 	}
 	pythonBin := findScraplingPython()
 	if pythonBin == "" {
 		return nil, fmt.Errorf("Scrapling skipped: no Python runtime found")
@@ -564,10 +484,16 @@ func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
 		return nil, fmt.Errorf("Scrapling skipped: %w", err)
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
+	// Acquire global Scrapling semaphore to limit concurrent Chromium launches
 	if err := scraplingSem.Acquire(ctx); err != nil {
 		return nil, fmt.Errorf("Scrapling skipped: %w", err)
 	}
 	defer scraplingSem.Release()
 	ctx, cancel := context.WithTimeout(ctx, 90*time.Second)
 	defer cancel()
-	args := []string{helperScript, "--url", url}
+	args := []string{helperScript, "--url", url, "--timeout-ms", "60000", "--wait-ms", "500"}
 	if opts.Referer != "" {
 		args = append(args, "--referer", opts.Referer)
 	}
@@ -577,32 +503,51 @@ func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
 	var stderr bytes.Buffer
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr
 	// Run in a new process group so we can kill all Chromium children on timeout
 	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	if err := cmd.Run(); err != nil {
 		// Kill the entire process group including Chromium children
 		if cmd.Process != nil {
 			syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
 		}
 		details := compactErrorText(stderr.String())
 		if details == "" {
 			details = compactErrorText(err.Error())
 		}
 		// Don't count context cancellations (client timeouts) or infrastructure failures
 		if ctx.Err() == nil && !strings.Contains(details, "Executable doesn't exist") {
 			getDomainBreaker(domain).RecordFailure()
 		}
 		return nil, fmt.Errorf("Scrapling request failed: %s", details)
 	}
 	body := stdout.Bytes()
 	if len(body) == 0 {
 		getDomainBreaker(domain).RecordFailure()
 		return nil, fmt.Errorf("Scrapling returned an empty body")
 	}
 	if looksLikeCloudflareBlock(body) {
 		getDomainBreaker(domain).RecordFailure()
 		return nil, fmt.Errorf("Scrapling returned a Cloudflare challenge page")
 	}
 	getDomainBreaker(domain).RecordSuccess()
 	return body, nil
 }
-func fetchPageWithFallback(url string) ([]byte, error) {
+func fetchPageWithFallback(ctx context.Context, url string) ([]byte, error) {
-	return fetchPageWithFallbackOptions(url, fetchOptions{})
+	return fetchPageWithFallbackOptions(ctx, url, fetchOptions{})
 }
-// fetchPageWithFallback tries Go HTTP first, then wget, then Scrapling, then Cloudflare Browser Rendering.
+// fetchPageWithFallback tries Go HTTP first, then curl/wget, then Scrapling.
-func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) {
+// When direct HTTP returns a Cloudflare block, curl/wget are skipped since they
 // will just return the same challenge page and waste ~20 seconds.
 func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
 	if err := ctx.Err(); err != nil {
 		return nil, err
 	}
 	// Check cache first
 	pageCacheMu.RLock()
 	if entry, ok := pageCache[url]; ok {
@@ -614,14 +559,29 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
 	}
 	pageCacheMu.RUnlock()
-	body, err := fetchPageDirect(url, opts)
+	body, err := fetchPageDirect(ctx, url, opts)
 	if err == nil {
 		cachePage(url, body)
 		return body, nil
 	}
 	log.Printf("Direct request failed for %s: %v", url, err)
-	body, err = fetchPageWithWget(url, opts)
+	// If direct HTTP returned a Cloudflare block, skip curl/wget time-wasters
 	// and go straight to Scrapling which can solve the challenge.
 	if strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "Cloudflare") {
 		log.Printf("Skipping curl/wget fallbacks for %s: direct HTTP hit Cloudflare wall", url)
 		goto scraplingFallback
 	}
 	body, err = fetchPageWithCurl(ctx, url, opts)
 	if err == nil {
 		log.Printf("Successfully retrieved content via curl for %s", url)
 		cachePage(url, body)
 		return body, nil
 	}
 	log.Printf("curl fallback failed for %s: %v", url, err)
 	body, err = fetchPageWithWget(ctx, url, opts)
 	if err == nil {
 		log.Printf("Successfully retrieved content via wget for %s", url)
 		cachePage(url, body)
@@ -629,7 +589,8 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
 	}
 	log.Printf("wget fallback failed for %s: %v", url, err)
-	body, err = fetchPageWithScrapling(url, opts)
+scraplingFallback:
 	body, err = fetchPageWithScrapling(ctx, url, opts)
 	if err == nil {
 		log.Printf("Successfully retrieved content via Scrapling for %s", url)
 		cachePage(url, body)
@@ -637,32 +598,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
 	}
 	log.Printf("Scrapling fallback failed for %s: %v", url, err)
-	if cfClient := NewCloudflareClient(); cfClient != nil {
+	return nil, fmt.Errorf("all fetch methods failed for %s: %w", url, err)
 		log.Printf("Attempting Cloudflare crawl fallback for %s", url)
 		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 		defer cancel()
 		job, err := cfClient.CrawlURL(ctx, url)
 		if err != nil {
 			log.Printf("Cloudflare crawl failed for %s: %v", url, err)
 			return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare crawl failed: %w", err)
 		}
 		if len(job.Records) > 0 && job.Records[0].Status == "completed" {
 			body := []byte(job.Records[0].HTML)
 			if looksLikeCloudflareBlock(body) {
 				return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
 			}
 			log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
 			cachePage(url, body)
 			return body, nil
 		}
 		log.Printf("Cloudflare crawl returned no completed records for %s", url)
 		return nil, fmt.Errorf("Cloudflare crawl returned no completed records")
 	}
 	return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
 }
 func cachePage(url string, body []byte) {
@@ -674,13 +610,13 @@ func cachePage(url string, body []byte) {
 // parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
 // competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
 // It filters to only include matches involving the given clubName if provided.
-func parseCompetitionMatchesFromFotbal(pageURL, clubType, clubName, clubID string) []Match {
+func parseCompetitionMatchesFromFotbal(ctx context.Context, pageURL, clubType, clubName, clubID string) []Match {
 	pageURL = strings.TrimSpace(pageURL)
 	if pageURL == "" {
 		return nil
 	}
-	body, err := fetchPageWithFallback(pageURL)
+	body, err := fetchPageWithFallback(ctx, pageURL)
 	if err != nil {
 		log.Printf("fotbal.cz matches fetch failed for %s: %v", pageURL, err)
 		return nil
@@ -846,8 +782,13 @@ func parseCompetitionMatchesFromFotbal(pageURL, clubType, clubName, clubID strin
 }
 // parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback.
-func parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID string) []Match {
+func parseCompetitionMatchesFromIS(ctx context.Context, detailURL, clubType, clubName, clubID string) []Match {
-	resp, err := http.Get(detailURL)
+	req, err := http.NewRequestWithContext(ctx, "GET", detailURL, nil)
 	if err != nil {
 		log.Printf("IS matches request error for %s: %v", detailURL, err)
 		return nil
 	}
 	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
 		log.Printf("IS matches fetch error for %s: %v", detailURL, err)
 		return nil
@@ -1275,14 +1216,14 @@ func getClubSearch(w http.ResponseWriter, r *http.Request) {
 	vals.Set("q", q)
 	searchURL := "https://www.fotbal.cz/club/hledej?" + vals.Encode()
-	fetchSearchPage := func(url string) ([]byte, error) {
+	fetchSearchPage := func(ctx context.Context, url string) ([]byte, error) {
-		return fetchPageWithFallbackOptions(url, fetchOptions{
+		return fetchPageWithFallbackOptions(ctx, url, fetchOptions{
 			Referer: "https://www.fotbal.cz/club/hledej",
 		})
 	}
 	// Try direct HTTP request first
-	body, err := fetchSearchPage(searchURL)
+	body, err := fetchSearchPage(r.Context(), searchURL)
 	if err != nil {
 		log.Printf("Direct search request failed for %s: %v", searchURL, err)
@@ -1298,7 +1239,7 @@ func getClubSearch(w http.ResponseWriter, r *http.Request) {
 			}
 		}
-		body, err = fetchSearchPage(searchURL2)
+		body, err = fetchSearchPage(r.Context(), searchURL2)
 		if err != nil {
 			log.Printf("Retried search request failed for %s: %v", searchURL2, err)
 			// Return empty results instead of error
@@ -1409,7 +1350,7 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
 	}
 	url := fmt.Sprintf("%s/%s", baseURL, clubID)
-	body, err := fetchPageWithFallback(url)
+	body, err := fetchPageWithFallback(r.Context(), url)
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
 		return
@@ -1459,26 +1400,43 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
 		})
 	})
-	// For each competition, fetch the standings tables from is.fotbal.cz
+	// For each competition, fetch the standings tables from is.fotbal.cz concurrently
 	sem := newSemaphore(4)
 	var wg sync.WaitGroup
 	var mu sync.Mutex
 	for i := range competitions {
-		comp := &competitions[i]
+		wg.Add(1)
 		go func(idx int) {
 			defer wg.Done()
 			if err := sem.Acquire(r.Context()); err != nil {
 				return
 			}
 			defer sem.Release()
 			comp := &competitions[idx]
 			tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
-		resp, err := http.Get(tableURL)
+			req, err := http.NewRequestWithContext(r.Context(), "GET", tableURL, nil)
 			if err != nil {
 				log.Printf("error creating request for competition table %s: %v", comp.ID, err)
 				return
 			}
 			resp, err := http.DefaultClient.Do(req)
 			if err != nil {
 				log.Printf("error fetching competition table for %s: %v", comp.ID, err)
-			continue
+				return
 			}
 			defer resp.Body.Close()
 			if resp.StatusCode != http.StatusOK {
 				log.Printf("non-200 response for %s: %d", comp.ID, resp.StatusCode)
-			continue
+				return
 			}
 			docTable, err := goquery.NewDocumentFromReader(resp.Body)
 			if err != nil {
 				log.Printf("error parsing table HTML for %s: %v", comp.ID, err)
-			continue
+				return
 			}
 			// Parse section: Tabulka celková (only overall)
@@ -1532,8 +1490,12 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
 			}
 			overall = parseSection("Tabulka celková")
 			mu.Lock()
 			comp.Table = &CompetitionTable{Overall: overall}
 			mu.Unlock()
 		}(i)
 	}
 	wg.Wait()
 	clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text())
 	clubURL := strings.TrimSpace(doc.Find("h1.H4 a").First().AttrOr("href", ""))
@@ -1583,7 +1545,7 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
 	}
 	url := fmt.Sprintf("%s/%s", baseURL, clubID)
-	body, err := fetchPageWithFallback(url)
+	body, err := fetchPageWithFallback(r.Context(), url)
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
 		return
@@ -1637,21 +1599,37 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
 		competitions = append(competitions, Competition{ID: compID, Code: code, Name: name, TeamCount: teamCount, MatchesLink: tableLink})
 	})
-	// For each competition, fetch matches
+	// For each competition, fetch matches concurrently with limits
 	sem := newSemaphore(4)
 	var wg sync.WaitGroup
 	var mu sync.Mutex
 	for i := range competitions {
-		comp := &competitions[i]
+		wg.Add(1)
 		go func(idx int) {
 			defer wg.Done()
 			if err := sem.Acquire(r.Context()); err != nil {
 				return
 			}
 			defer sem.Release()
 			comp := &competitions[idx]
 			matchesLink := comp.MatchesLink
 			// 1) Try parsing from the public fotbal.cz competition page (matches_link)
-		matches := parseCompetitionMatchesFromFotbal(matchesLink, clubType, clubName, clubID)
+			matches := parseCompetitionMatchesFromFotbal(r.Context(), matchesLink, clubType, clubName, clubID)
 			// Always try IS as well and prefer it if it provides at least as many matches
 			detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
-		isMatches := parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID)
+			isMatches := parseCompetitionMatchesFromIS(r.Context(), detailURL, clubType, clubName, clubID)
 			// Prefer IS whenever it yields any results, as IS often contains alias team names
 			if len(isMatches) > 0 {
 				matches = isMatches
 			}
 			mu.Lock()
 			comp.Matches = matches
 			mu.Unlock()
 		}(i)
 	}
 	wg.Wait()
 	clubInfo := ClubInfo{
 		Name:           clubName,
@@ -1679,9 +1657,18 @@ func main() {
 		http.Redirect(w, r, "/club/football/"+vars["id"], http.StatusMovedPermanently)
 	}).Methods("GET")
 	r.HandleFunc("/", docsHandler)
-	port := "0.0.0.0:8686"
+
-	fmt.Printf("Server running on http://%s\n", port)
+	addr := "0.0.0.0:8686"
-	log.Fatal(http.ListenAndServe(port, r))
+	srv := &http.Server{
 		Addr:           addr,
 		Handler:        r,
 		ReadTimeout:    30 * time.Second,
 		WriteTimeout:   5 * time.Minute,
 		IdleTimeout:    120 * time.Second,
 		MaxHeaderBytes: 1 << 20,
 	}
 	fmt.Printf("Server running on http://%s\n", addr)
 	log.Fatal(srv.ListenAndServe())
 }
 // docsHandler serves a simple HTML API documentation at the root endpoint.
@@ -3,7 +3,34 @@
 import argparse
 import contextlib
 import logging
 import ssl
 import sys
 import urllib.request
 BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
 )
 # Byte markers that identify a Cloudflare challenge/interstitial page.
 # All entries are lowercase: looks_like_cloudflare_block() lowercases the
 # response body before searching for them.
 CF_SIGNS = [
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
 ]
 def looks_like_cloudflare_block(body: bytes) -> bool:
    """Report whether ``body`` contains any marker listed in ``CF_SIGNS``.

    The body is lowercased before the search. A falsy body (empty bytes or
    ``None``) is never reported as a block.
    """
    if body:
        lowered = body.lower()
        return any(marker in lowered for marker in CF_SIGNS)
    return False
 def response_body_bytes(response) -> bytes:
@@ -20,60 +47,90 @@ def response_body_bytes(response) -> bytes:
    return str(response).encode("utf-8")
-def main() -> int:
+def lightweight_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
-    parser = argparse.ArgumentParser()
+    """Try a lightweight urllib fetch with browser headers first."""
-    parser.add_argument("--url", required=True)
+    req = urllib.request.Request(
-    parser.add_argument("--referer", default="")
+        url,
-    parser.add_argument("--timeout-ms", type=int, default=45000)
+        headers={
-    parser.add_argument("--wait-ms", type=int, default=1000)
+            "User-Agent": BROWSER_UA,
-    args = parser.parse_args()
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
            "Accept-Encoding": "identity",
            "Connection": "keep-alive",
            **({"Referer": referer} if referer else {}),
        },
    )
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
        body = resp.read()
    if looks_like_cloudflare_block(body):
        raise RuntimeError(" lightweight fetch returned Cloudflare challenge")
    return body
 def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 30000, wait_ms: int = 500) -> bytes:
    try:
        from scrapling.fetchers import StealthyFetcher
    except Exception as exc:
-        print(f"Scrapling import failed: {exc}", file=sys.stderr)
+        raise RuntimeError(f"Scrapling import failed: {exc}") from exc
        return 2
    logging.getLogger().setLevel(logging.ERROR)
    extra_headers = {}
-    if args.referer:
+    if referer:
-        extra_headers["Referer"] = args.referer
+        extra_headers["Referer"] = referer
    fetch_kwargs = {
        "headless": True,
        "network_idle": True,
        "google_search": False,
        "solve_cloudflare": True,
-        "timeout": args.timeout_ms,
+        "timeout": timeout_ms,
-        "wait": args.wait_ms,
+        "wait": wait_ms,
    }
    if extra_headers:
        fetch_kwargs["extra_headers"] = extra_headers
    try:
    with contextlib.redirect_stdout(sys.stderr):
-            response = StealthyFetcher.fetch(args.url, **fetch_kwargs)
+        response = StealthyFetcher.fetch(url, **fetch_kwargs)
    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"Scrapling returned HTTP {status}")
    body = response_body_bytes(response)
    if not body:
        raise RuntimeError("Scrapling returned an empty body")
    return body
 def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", required=True)
    parser.add_argument("--referer", default="")
    parser.add_argument("--timeout-ms", type=int, default=30000)
    parser.add_argument("--wait-ms", type=int, default=500)
    args = parser.parse_args()
    # 1) Try lightweight urllib fetch first (no browser, instant)
    try:
        body = lightweight_fetch(args.url, args.referer, timeout=min(args.timeout_ms / 1000.0, 15.0))
        sys.stdout.buffer.write(body)
        return 0
    except Exception as exc:
        print(f"Lightweight fetch failed: {exc}", file=sys.stderr)
    # 2) Fall back to Scrapling / Playwright only if lightweight failed
    try:
        body = scrapling_fetch(args.url, args.referer, args.timeout_ms, args.wait_ms)
        sys.stdout.buffer.write(body)
        return 0
    except Exception as exc:
        print(f"Scrapling fetch failed: {exc}", file=sys.stderr)
        return 1
    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        print(f"Scrapling returned HTTP {status}", file=sys.stderr)
        return 1
    body = response_body_bytes(response)
    if not body:
        print("Scrapling returned an empty body", file=sys.stderr)
        return 1
    try:
        sys.stdout.buffer.write(body)
    except BrokenPipeError:
        return 0
    return 0
 if __name__ == "__main__":
    raise SystemExit(main())