mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
refactor: optimize docker image and implement lightweight fetching
This commit improves the overall efficiency and reliability of the scraper by:
- Optimizing the Dockerfile by reducing layers, using `--no-install-recommends`, and consolidating Playwright installation.
- Adding resource limits (CPU/Memory) to the docker-compose configuration.
- Refactoring `main.go` to remove unused Cloudflare client structures and increase the cache TTL.
- Implementing a `lightweight_fetch` mechanism in `scrapling_fetch.py` using `urllib` to attempt fast requests before falling back to the heavier Scrapling/Playwright engine.
- Adding Cloudflare challenge detection to the lightweight fetcher.
This commit is contained in:
+20
-38
@@ -14,12 +14,12 @@ RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o facr-scraper .
|
|||||||
# Python stage for Scrapling
|
# Python stage for Scrapling
|
||||||
FROM python:3.11-slim AS python-builder
|
FROM python:3.11-slim AS python-builder
|
||||||
|
|
||||||
|
ENV PYTHONDONTWRITEBYTECODE=1
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
# Install system dependencies for Playwright
|
# Install system dependencies for Playwright
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
wget \
|
wget curl ca-certificates gnupg \
|
||||||
gnupg \
|
|
||||||
ca-certificates \
|
|
||||||
curl \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Create virtual environment and install Scrapling
|
# Create virtual environment and install Scrapling
|
||||||
@@ -28,44 +28,27 @@ ENV PATH="/opt/scrapling/bin:$PATH"
|
|||||||
COPY requirements-scrapling.txt .
|
COPY requirements-scrapling.txt .
|
||||||
RUN pip install --no-cache-dir -r requirements-scrapling.txt
|
RUN pip install --no-cache-dir -r requirements-scrapling.txt
|
||||||
|
|
||||||
# Install Playwright browsers
|
# Install Playwright browsers with deps in one layer
|
||||||
RUN playwright install chromium
|
RUN playwright install chromium --with-deps
|
||||||
RUN playwright install-deps
|
|
||||||
|
|
||||||
# Fix Python symlinks
|
# Fix Python symlinks
|
||||||
RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python
|
RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python \
|
||||||
RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python3
|
&& ln -sf /usr/local/bin/python /opt/scrapling/bin/python3
|
||||||
|
|
||||||
# Final stage
|
# Final stage
|
||||||
FROM python:3.11-slim
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
ENV PYTHONDONTWRITEBYTECODE=1
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
ENV PATH="/opt/scrapling/bin:$PATH"
|
||||||
|
|
||||||
# Install runtime dependencies for both Go and Playwright
|
# Install runtime dependencies for both Go and Playwright
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
ca-certificates \
|
ca-certificates wget curl \
|
||||||
wget \
|
libglib2.0-0 libgobject-2.0-0 libnspr4 libnss3 libdbus-1-3 \
|
||||||
curl \
|
libatk1.0-0 libatk-bridge2.0-0 libcups2 libexpat1 libxcb1 \
|
||||||
gnupg \
|
libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
|
||||||
libglib2.0-0 \
|
libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 \
|
||||||
libgobject-2.0-0 \
|
|
||||||
libnspr4 \
|
|
||||||
libnss3 \
|
|
||||||
libdbus-1-3 \
|
|
||||||
libatk1.0-0 \
|
|
||||||
libatk-bridge2.0-0 \
|
|
||||||
libcups2 \
|
|
||||||
libexpat1 \
|
|
||||||
libxcb1 \
|
|
||||||
libxkbcommon0 \
|
|
||||||
libatspi2.0-0 \
|
|
||||||
libx11-6 \
|
|
||||||
libxcomposite1 \
|
|
||||||
libxdamage1 \
|
|
||||||
libxext6 \
|
|
||||||
libxfixes3 \
|
|
||||||
libxrandr2 \
|
|
||||||
libgbm1 \
|
|
||||||
libcairo2 \
|
|
||||||
libpango-1.0-0 \
|
|
||||||
libasound2 \
|
libasound2 \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
@@ -80,13 +63,12 @@ COPY --from=python-builder /opt/scrapling /opt/scrapling
|
|||||||
|
|
||||||
# Copy Playwright browser cache
|
# Copy Playwright browser cache
|
||||||
COPY --from=python-builder /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright
|
COPY --from=python-builder /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright
|
||||||
ENV PATH="/opt/scrapling/bin:$PATH"
|
|
||||||
|
|
||||||
# Copy scrapling script
|
# Copy scrapling script
|
||||||
COPY scripts/scrapling_fetch.py /opt/scrapling/scripts/scrapling_fetch.py
|
COPY scripts/scrapling_fetch.py /opt/scrapling/scripts/scrapling_fetch.py
|
||||||
|
|
||||||
# Create cache directory for Playwright
|
# Create cache directory for Playwright
|
||||||
RUN mkdir -p /home/scraper/.cache && chown -R scraper:scraper /home/scraper
|
RUN mkdir -p /home/scraper/.cache && chown -R scraper:scraper /home/scraper /opt/scrapling
|
||||||
|
|
||||||
USER scraper
|
USER scraper
|
||||||
WORKDIR /home/scraper
|
WORKDIR /home/scraper
|
||||||
|
|||||||
@@ -13,6 +13,14 @@ services:
|
|||||||
- SCRAPLING_SCRIPT=/opt/scrapling/scripts/scrapling_fetch.py
|
- SCRAPLING_SCRIPT=/opt/scrapling/scripts/scrapling_fetch.py
|
||||||
- DEBUG_SAVE_HTML=${DEBUG_SAVE_HTML:-}
|
- DEBUG_SAVE_HTML=${DEBUG_SAVE_HTML:-}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: '2.0'
|
||||||
|
memory: 4G
|
||||||
|
reservations:
|
||||||
|
cpus: '0.5'
|
||||||
|
memory: 512M
|
||||||
volumes:
|
volumes:
|
||||||
# Optional: Mount cache for Playwright browsers
|
# Optional: Mount cache for Playwright browsers
|
||||||
- playwright_cache:/home/scraper/.cache
|
- playwright_cache:/home/scraper/.cache
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
@@ -32,48 +34,6 @@ type Competition struct {
|
|||||||
Table *CompetitionTable `json:"table,omitempty"`
|
Table *CompetitionTable `json:"table,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cloudflare Browser Rendering API structures
|
|
||||||
type CloudflareCrawlRequest struct {
|
|
||||||
URL string `json:"url"`
|
|
||||||
Limit int `json:"limit,omitempty"`
|
|
||||||
Depth int `json:"depth,omitempty"`
|
|
||||||
Formats []string `json:"formats,omitempty"`
|
|
||||||
Render bool `json:"render,omitempty"`
|
|
||||||
Source string `json:"source,omitempty"`
|
|
||||||
Options map[string]interface{} `json:"options,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type CloudflareCrawlResponse struct {
|
|
||||||
Success bool `json:"success"`
|
|
||||||
Result string `json:"result"` // job ID
|
|
||||||
}
|
|
||||||
|
|
||||||
type CloudflareCrawlJob struct {
|
|
||||||
ID string `json:"id"`
|
|
||||||
Status string `json:"status"`
|
|
||||||
BrowserSecondsUsed float64 `json:"browserSecondsUsed"`
|
|
||||||
Total int `json:"total"`
|
|
||||||
Finished int `json:"finished"`
|
|
||||||
Records []CloudflareCrawlRecord `json:"records"`
|
|
||||||
Cursor string `json:"cursor,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type CloudflareCrawlRecord struct {
|
|
||||||
URL string `json:"url"`
|
|
||||||
Status string `json:"status"`
|
|
||||||
Markdown string `json:"markdown,omitempty"`
|
|
||||||
HTML string `json:"html,omitempty"`
|
|
||||||
JSON interface{} `json:"json,omitempty"`
|
|
||||||
Metadata map[string]interface{} `json:"metadata"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type CloudflareClient struct {
|
|
||||||
AccountID string
|
|
||||||
APIToken string
|
|
||||||
BaseURL string
|
|
||||||
Client *http.Client
|
|
||||||
}
|
|
||||||
|
|
||||||
type fetchOptions struct {
|
type fetchOptions struct {
|
||||||
Referer string
|
Referer string
|
||||||
}
|
}
|
||||||
@@ -103,211 +63,95 @@ type cacheEntry struct {
|
|||||||
timestamp time.Time
|
timestamp time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
const cacheTTL = 5 * time.Minute
|
const cacheTTL = 15 * time.Minute
|
||||||
|
|
||||||
// NewCloudflareClient creates a new Cloudflare Browser Rendering API client
|
// domainBreakers is a per-domain circuit breaker map so failures on one site
|
||||||
func NewCloudflareClient() *CloudflareClient {
|
// don't block Scrapling for unrelated sites.
|
||||||
accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID"))
|
var domainBreakers struct {
|
||||||
apiToken := strings.TrimSpace(os.Getenv("CLOUDFLARE_API_TOKEN"))
|
mu sync.RWMutex
|
||||||
|
breakers map[string]*circuitBreaker
|
||||||
if accountID == "" || apiToken == "" {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return &CloudflareClient{
|
|
||||||
AccountID: accountID,
|
|
||||||
APIToken: apiToken,
|
|
||||||
BaseURL: "https://api.cloudflare.com/client/v4",
|
|
||||||
Client: &http.Client{
|
|
||||||
Timeout: 30 * time.Second,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// StartCrawl initiates a crawl job
|
// scraplingSem limits concurrent Chromium launches to avoid zombie processes
|
||||||
func (c *CloudflareClient) StartCrawl(ctx context.Context, req CloudflareCrawlRequest) (string, error) {
|
// and resource exhaustion.
|
||||||
if c == nil {
|
var scraplingSem = newSemaphore(2)
|
||||||
return "", fmt.Errorf("Cloudflare client not initialized")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set defaults
|
type circuitBreaker struct {
|
||||||
if req.Limit == 0 {
|
failures int32
|
||||||
req.Limit = 10
|
lastFail time.Time
|
||||||
}
|
threshold int
|
||||||
if req.Depth == 0 {
|
timeout time.Duration
|
||||||
req.Depth = 1
|
mu sync.Mutex
|
||||||
}
|
|
||||||
if len(req.Formats) == 0 {
|
|
||||||
req.Formats = []string{"html", "markdown"}
|
|
||||||
}
|
|
||||||
if req.Source == "" {
|
|
||||||
req.Source = "all"
|
|
||||||
}
|
|
||||||
|
|
||||||
// Restrict to specific URL patterns for fotbal.cz to avoid crawling unrelated content
|
|
||||||
if req.Options == nil {
|
|
||||||
req.Options = make(map[string]interface{})
|
|
||||||
}
|
|
||||||
|
|
||||||
// Only crawl URLs from the same domain and specific paths
|
|
||||||
includePatterns := []string{
|
|
||||||
"https://www.fotbal.cz/**",
|
|
||||||
}
|
|
||||||
excludePatterns := []string{
|
|
||||||
"**/api/**",
|
|
||||||
"**/static/**",
|
|
||||||
"**/media/**",
|
|
||||||
}
|
|
||||||
|
|
||||||
req.Options["includePatterns"] = includePatterns
|
|
||||||
req.Options["excludePatterns"] = excludePatterns
|
|
||||||
req.Options["includeExternalLinks"] = false
|
|
||||||
req.Options["includeSubdomains"] = false
|
|
||||||
|
|
||||||
body, err := json.Marshal(req)
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("failed to marshal request: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl", c.BaseURL, c.AccountID)
|
|
||||||
httpReq, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("failed to create request: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
|
|
||||||
httpReq.Header.Set("Content-Type", "application/json")
|
|
||||||
|
|
||||||
resp, err := c.Client.Do(httpReq)
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("failed to send request: %w", err)
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusOK {
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
|
||||||
return "", fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
|
|
||||||
}
|
|
||||||
|
|
||||||
var crawlResp CloudflareCrawlResponse
|
|
||||||
if err := json.NewDecoder(resp.Body).Decode(&crawlResp); err != nil {
|
|
||||||
return "", fmt.Errorf("failed to decode response: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if !crawlResp.Success {
|
|
||||||
return "", fmt.Errorf("API returned unsuccessful response")
|
|
||||||
}
|
|
||||||
|
|
||||||
return crawlResp.Result, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetCrawlResults retrieves the results of a crawl job
|
func getDomainBreaker(domain string) *circuitBreaker {
|
||||||
func (c *CloudflareClient) GetCrawlResults(ctx context.Context, jobID string, limit int) (*CloudflareCrawlJob, error) {
|
domainBreakers.mu.RLock()
|
||||||
if c == nil {
|
if cb, ok := domainBreakers.breakers[domain]; ok {
|
||||||
return nil, fmt.Errorf("Cloudflare client not initialized")
|
domainBreakers.mu.RUnlock()
|
||||||
|
return cb
|
||||||
}
|
}
|
||||||
|
domainBreakers.mu.RUnlock()
|
||||||
|
|
||||||
url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl/%s", c.BaseURL, c.AccountID, jobID)
|
domainBreakers.mu.Lock()
|
||||||
if limit > 0 {
|
defer domainBreakers.mu.Unlock()
|
||||||
url += fmt.Sprintf("?limit=%d", limit)
|
if cb, ok := domainBreakers.breakers[domain]; ok {
|
||||||
|
return cb
|
||||||
}
|
}
|
||||||
|
cb := &circuitBreaker{
|
||||||
httpReq, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
threshold: 15,
|
||||||
if err != nil {
|
timeout: 30 * time.Minute,
|
||||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
||||||
}
|
}
|
||||||
|
if domainBreakers.breakers == nil {
|
||||||
httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
|
domainBreakers.breakers = make(map[string]*circuitBreaker)
|
||||||
|
|
||||||
resp, err := c.Client.Do(httpReq)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
domainBreakers.breakers[domain] = cb
|
||||||
|
return cb
|
||||||
if resp.StatusCode != http.StatusOK {
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
|
||||||
return nil, fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
|
|
||||||
}
|
|
||||||
|
|
||||||
var result struct {
|
|
||||||
Success bool `json:"success"`
|
|
||||||
Result CloudflareCrawlJob `json:"result"`
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to decode response: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if !result.Success {
|
|
||||||
return nil, fmt.Errorf("API returned unsuccessful response")
|
|
||||||
}
|
|
||||||
|
|
||||||
return &result.Result, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// WaitForCrawlCompletion waits for a crawl job to complete and returns the results
|
func (cb *circuitBreaker) RecordFailure() {
|
||||||
func (c *CloudflareClient) WaitForCrawlCompletion(ctx context.Context, jobID string, maxAttempts int, delay time.Duration) (*CloudflareCrawlJob, error) {
|
atomic.AddInt32(&cb.failures, 1)
|
||||||
if c == nil {
|
cb.mu.Lock()
|
||||||
return nil, fmt.Errorf("Cloudflare client not initialized")
|
cb.lastFail = time.Now()
|
||||||
}
|
cb.mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
for i := 0; i < maxAttempts; i++ {
|
func (cb *circuitBreaker) RecordSuccess() {
|
||||||
job, err := c.GetCrawlResults(ctx, jobID, 1) // Use limit=1 for status checks
|
atomic.StoreInt32(&cb.failures, 0)
|
||||||
if err != nil {
|
}
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if job.Status != "running" {
|
func (cb *circuitBreaker) IsOpen() bool {
|
||||||
// Get full results
|
if atomic.LoadInt32(&cb.failures) < int32(cb.threshold) {
|
||||||
fullJob, err := c.GetCrawlResults(ctx, jobID, 0) // No limit for full results
|
return false
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return fullJob, nil
|
|
||||||
}
|
}
|
||||||
|
cb.mu.Lock()
|
||||||
|
last := cb.lastFail
|
||||||
|
cb.mu.Unlock()
|
||||||
|
return time.Since(last) < cb.timeout
|
||||||
|
}
|
||||||
|
|
||||||
|
// semaphore limits concurrent operations.
|
||||||
|
type semaphore struct {
|
||||||
|
ch chan struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func newSemaphore(n int) *semaphore {
|
||||||
|
return &semaphore{ch: make(chan struct{}, n)}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *semaphore) Acquire(ctx context.Context) error {
|
||||||
select {
|
select {
|
||||||
|
case s.ch <- struct{}{}:
|
||||||
|
return nil
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return nil, ctx.Err()
|
return ctx.Err()
|
||||||
case <-time.After(delay):
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return nil, fmt.Errorf("crawl job did not complete within timeout")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// CrawlURL performs a complete crawl operation for a single URL
|
func (s *semaphore) Release() {
|
||||||
func (c *CloudflareClient) CrawlURL(ctx context.Context, url string) (*CloudflareCrawlJob, error) {
|
select {
|
||||||
if c == nil {
|
case <-s.ch:
|
||||||
return nil, fmt.Errorf("Cloudflare client not initialized")
|
default:
|
||||||
}
|
}
|
||||||
|
|
||||||
req := CloudflareCrawlRequest{
|
|
||||||
URL: url,
|
|
||||||
Limit: 1, // Only crawl the specific URL
|
|
||||||
Depth: 0, // Don't follow links
|
|
||||||
Formats: []string{"html", "markdown"},
|
|
||||||
Render: true,
|
|
||||||
Source: "links", // Only crawl the specific URL, not sitemaps
|
|
||||||
}
|
|
||||||
|
|
||||||
jobID, err := c.StartCrawl(ctx, req)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to start crawl: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait for completion with reasonable timeout
|
|
||||||
ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
job, err := c.WaitForCrawlCompletion(ctx, jobID, 24, 5*time.Second)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to wait for crawl completion: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return job, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) {
|
func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) {
|
||||||
@@ -364,11 +208,12 @@ func compactErrorText(s string) string {
|
|||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
func fetchPageDirect(url string, opts fetchOptions) ([]byte, error) {
|
func fetchPageDirectOnce(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
req, err := newBrowserRequest(url, opts)
|
req, err := newBrowserRequest(url, opts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
client := &http.Client{Timeout: 15 * time.Second}
|
client := &http.Client{Timeout: 15 * time.Second}
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
@@ -392,12 +237,31 @@ func fetchPageDirect(url string, opts fetchOptions) ([]byte, error) {
|
|||||||
return body, nil
|
return body, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func fetchPageWithWget(url string, opts fetchOptions) ([]byte, error) {
|
func fetchPageDirect(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
|
var lastErr error
|
||||||
|
for attempt := 0; attempt < 3; attempt++ {
|
||||||
|
if attempt > 0 {
|
||||||
|
select {
|
||||||
|
case <-time.After(time.Duration(attempt) * time.Second):
|
||||||
|
case <-ctx.Done():
|
||||||
|
return nil, ctx.Err()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
body, err := fetchPageDirectOnce(ctx, url, opts)
|
||||||
|
if err == nil {
|
||||||
|
return body, nil
|
||||||
|
}
|
||||||
|
lastErr = err
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("direct fetch failed after 3 attempts: %w", lastErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
func fetchPageWithWget(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
if _, err := exec.LookPath("wget"); err != nil {
|
if _, err := exec.LookPath("wget"); err != nil {
|
||||||
return nil, fmt.Errorf("wget not available: %w", err)
|
return nil, fmt.Errorf("wget not available: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
|
ctx, cancel := context.WithTimeout(ctx, 20*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
args := []string{
|
args := []string{
|
||||||
@@ -509,6 +373,52 @@ func ensureEmbeddedScraplingHelper() (string, error) {
|
|||||||
return embeddedScraplingHelperFile, nil
|
return embeddedScraplingHelperFile, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fetchPageWithCurl(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
|
if _, err := exec.LookPath("curl"); err != nil {
|
||||||
|
return nil, fmt.Errorf("curl not available: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, 20*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
args := []string{
|
||||||
|
"-sSL",
|
||||||
|
"--max-time", "15",
|
||||||
|
"-A", browserUserAgent,
|
||||||
|
"-H", "Accept: " + browserAccept,
|
||||||
|
"-H", "Accept-Language: " + browserAcceptLanguage,
|
||||||
|
"-H", "Connection: keep-alive",
|
||||||
|
}
|
||||||
|
if opts.Referer != "" {
|
||||||
|
args = append(args, "-H", "Referer: "+opts.Referer)
|
||||||
|
}
|
||||||
|
args = append(args, url)
|
||||||
|
|
||||||
|
cmd := exec.CommandContext(ctx, "curl", args...)
|
||||||
|
var stdout bytes.Buffer
|
||||||
|
var stderr bytes.Buffer
|
||||||
|
cmd.Stdout = &stdout
|
||||||
|
cmd.Stderr = &stderr
|
||||||
|
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
details := compactErrorText(stderr.String())
|
||||||
|
if details == "" {
|
||||||
|
details = compactErrorText(err.Error())
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("curl request failed: %s", details)
|
||||||
|
}
|
||||||
|
|
||||||
|
body := stdout.Bytes()
|
||||||
|
if len(body) == 0 {
|
||||||
|
return nil, fmt.Errorf("curl returned an empty body")
|
||||||
|
}
|
||||||
|
if looksLikeCloudflareBlock(body) {
|
||||||
|
return nil, fmt.Errorf("curl returned a Cloudflare challenge page")
|
||||||
|
}
|
||||||
|
|
||||||
|
return body, nil
|
||||||
|
}
|
||||||
|
|
||||||
func findScraplingHelperScript() (string, error) {
|
func findScraplingHelperScript() (string, error) {
|
||||||
cwd, _ := os.Getwd()
|
cwd, _ := os.Getwd()
|
||||||
|
|
||||||
@@ -553,7 +463,17 @@ func findScraplingPython() string {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
|
func fetchPageWithScrapling(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
|
parsedURL, err := neturl.Parse(url)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Scrapling skipped: invalid URL: %w", err)
|
||||||
|
}
|
||||||
|
domain := parsedURL.Host
|
||||||
|
|
||||||
|
if getDomainBreaker(domain).IsOpen() {
|
||||||
|
return nil, fmt.Errorf("Scrapling skipped: circuit breaker is open for %s", domain)
|
||||||
|
}
|
||||||
|
|
||||||
pythonBin := findScraplingPython()
|
pythonBin := findScraplingPython()
|
||||||
if pythonBin == "" {
|
if pythonBin == "" {
|
||||||
return nil, fmt.Errorf("Scrapling skipped: no Python runtime found")
|
return nil, fmt.Errorf("Scrapling skipped: no Python runtime found")
|
||||||
@@ -564,10 +484,16 @@ func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
|
|||||||
return nil, fmt.Errorf("Scrapling skipped: %w", err)
|
return nil, fmt.Errorf("Scrapling skipped: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
|
// Acquire global Scrapling semaphore to limit concurrent Chromium launches
|
||||||
|
if err := scraplingSem.Acquire(ctx); err != nil {
|
||||||
|
return nil, fmt.Errorf("Scrapling skipped: %w", err)
|
||||||
|
}
|
||||||
|
defer scraplingSem.Release()
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, 90*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
args := []string{helperScript, "--url", url}
|
args := []string{helperScript, "--url", url, "--timeout-ms", "60000", "--wait-ms", "500"}
|
||||||
if opts.Referer != "" {
|
if opts.Referer != "" {
|
||||||
args = append(args, "--referer", opts.Referer)
|
args = append(args, "--referer", opts.Referer)
|
||||||
}
|
}
|
||||||
@@ -577,32 +503,51 @@ func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
|
|||||||
var stderr bytes.Buffer
|
var stderr bytes.Buffer
|
||||||
cmd.Stdout = &stdout
|
cmd.Stdout = &stdout
|
||||||
cmd.Stderr = &stderr
|
cmd.Stderr = &stderr
|
||||||
|
// Run in a new process group so we can kill all Chromium children on timeout
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
|
||||||
if err := cmd.Run(); err != nil {
|
if err := cmd.Run(); err != nil {
|
||||||
|
// Kill the entire process group including Chromium children
|
||||||
|
if cmd.Process != nil {
|
||||||
|
syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
details := compactErrorText(stderr.String())
|
details := compactErrorText(stderr.String())
|
||||||
if details == "" {
|
if details == "" {
|
||||||
details = compactErrorText(err.Error())
|
details = compactErrorText(err.Error())
|
||||||
}
|
}
|
||||||
|
// Don't count context cancellations (client timeouts) or infrastructure failures
|
||||||
|
if ctx.Err() == nil && !strings.Contains(details, "Executable doesn't exist") {
|
||||||
|
getDomainBreaker(domain).RecordFailure()
|
||||||
|
}
|
||||||
return nil, fmt.Errorf("Scrapling request failed: %s", details)
|
return nil, fmt.Errorf("Scrapling request failed: %s", details)
|
||||||
}
|
}
|
||||||
|
|
||||||
body := stdout.Bytes()
|
body := stdout.Bytes()
|
||||||
if len(body) == 0 {
|
if len(body) == 0 {
|
||||||
|
getDomainBreaker(domain).RecordFailure()
|
||||||
return nil, fmt.Errorf("Scrapling returned an empty body")
|
return nil, fmt.Errorf("Scrapling returned an empty body")
|
||||||
}
|
}
|
||||||
if looksLikeCloudflareBlock(body) {
|
if looksLikeCloudflareBlock(body) {
|
||||||
|
getDomainBreaker(domain).RecordFailure()
|
||||||
return nil, fmt.Errorf("Scrapling returned a Cloudflare challenge page")
|
return nil, fmt.Errorf("Scrapling returned a Cloudflare challenge page")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
getDomainBreaker(domain).RecordSuccess()
|
||||||
return body, nil
|
return body, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func fetchPageWithFallback(url string) ([]byte, error) {
|
func fetchPageWithFallback(ctx context.Context, url string) ([]byte, error) {
|
||||||
return fetchPageWithFallbackOptions(url, fetchOptions{})
|
return fetchPageWithFallbackOptions(ctx, url, fetchOptions{})
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchPageWithFallback tries Go HTTP first, then wget, then Scrapling, then Cloudflare Browser Rendering.
|
// fetchPageWithFallback tries Go HTTP first, then curl/wget, then Scrapling.
|
||||||
func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) {
|
// When direct HTTP returns a Cloudflare block, curl/wget are skipped since they
|
||||||
|
// will just return the same challenge page and waste ~20 seconds.
|
||||||
|
func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
// Check cache first
|
// Check cache first
|
||||||
pageCacheMu.RLock()
|
pageCacheMu.RLock()
|
||||||
if entry, ok := pageCache[url]; ok {
|
if entry, ok := pageCache[url]; ok {
|
||||||
@@ -614,14 +559,29 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
|||||||
}
|
}
|
||||||
pageCacheMu.RUnlock()
|
pageCacheMu.RUnlock()
|
||||||
|
|
||||||
body, err := fetchPageDirect(url, opts)
|
body, err := fetchPageDirect(ctx, url, opts)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
cachePage(url, body)
|
cachePage(url, body)
|
||||||
return body, nil
|
return body, nil
|
||||||
}
|
}
|
||||||
log.Printf("Direct request failed for %s: %v", url, err)
|
log.Printf("Direct request failed for %s: %v", url, err)
|
||||||
|
|
||||||
body, err = fetchPageWithWget(url, opts)
|
// If direct HTTP returned a Cloudflare block, skip curl/wget time-wasters
|
||||||
|
// and go straight to Scrapling which can solve the challenge.
|
||||||
|
if strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "Cloudflare") {
|
||||||
|
log.Printf("Skipping curl/wget fallbacks for %s: direct HTTP hit Cloudflare wall", url)
|
||||||
|
goto scraplingFallback
|
||||||
|
}
|
||||||
|
|
||||||
|
body, err = fetchPageWithCurl(ctx, url, opts)
|
||||||
|
if err == nil {
|
||||||
|
log.Printf("Successfully retrieved content via curl for %s", url)
|
||||||
|
cachePage(url, body)
|
||||||
|
return body, nil
|
||||||
|
}
|
||||||
|
log.Printf("curl fallback failed for %s: %v", url, err)
|
||||||
|
|
||||||
|
body, err = fetchPageWithWget(ctx, url, opts)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
log.Printf("Successfully retrieved content via wget for %s", url)
|
log.Printf("Successfully retrieved content via wget for %s", url)
|
||||||
cachePage(url, body)
|
cachePage(url, body)
|
||||||
@@ -629,7 +589,8 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
|||||||
}
|
}
|
||||||
log.Printf("wget fallback failed for %s: %v", url, err)
|
log.Printf("wget fallback failed for %s: %v", url, err)
|
||||||
|
|
||||||
body, err = fetchPageWithScrapling(url, opts)
|
scraplingFallback:
|
||||||
|
body, err = fetchPageWithScrapling(ctx, url, opts)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
log.Printf("Successfully retrieved content via Scrapling for %s", url)
|
log.Printf("Successfully retrieved content via Scrapling for %s", url)
|
||||||
cachePage(url, body)
|
cachePage(url, body)
|
||||||
@@ -637,32 +598,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
|||||||
}
|
}
|
||||||
log.Printf("Scrapling fallback failed for %s: %v", url, err)
|
log.Printf("Scrapling fallback failed for %s: %v", url, err)
|
||||||
|
|
||||||
if cfClient := NewCloudflareClient(); cfClient != nil {
|
return nil, fmt.Errorf("all fetch methods failed for %s: %w", url, err)
|
||||||
log.Printf("Attempting Cloudflare crawl fallback for %s", url)
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
job, err := cfClient.CrawlURL(ctx, url)
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("Cloudflare crawl failed for %s: %v", url, err)
|
|
||||||
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare crawl failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(job.Records) > 0 && job.Records[0].Status == "completed" {
|
|
||||||
body := []byte(job.Records[0].HTML)
|
|
||||||
if looksLikeCloudflareBlock(body) {
|
|
||||||
return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
|
|
||||||
}
|
|
||||||
log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
|
|
||||||
cachePage(url, body)
|
|
||||||
return body, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Printf("Cloudflare crawl returned no completed records for %s", url)
|
|
||||||
return nil, fmt.Errorf("Cloudflare crawl returned no completed records")
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func cachePage(url string, body []byte) {
|
func cachePage(url string, body []byte) {
|
||||||
@@ -674,13 +610,13 @@ func cachePage(url string, body []byte) {
|
|||||||
// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
|
// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
|
||||||
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
|
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
|
||||||
// It filters to only include matches involving the given clubName if provided.
|
// It filters to only include matches involving the given clubName if provided.
|
||||||
func parseCompetitionMatchesFromFotbal(pageURL, clubType, clubName, clubID string) []Match {
|
func parseCompetitionMatchesFromFotbal(ctx context.Context, pageURL, clubType, clubName, clubID string) []Match {
|
||||||
pageURL = strings.TrimSpace(pageURL)
|
pageURL = strings.TrimSpace(pageURL)
|
||||||
if pageURL == "" {
|
if pageURL == "" {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
body, err := fetchPageWithFallback(pageURL)
|
body, err := fetchPageWithFallback(ctx, pageURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("fotbal.cz matches fetch failed for %s: %v", pageURL, err)
|
log.Printf("fotbal.cz matches fetch failed for %s: %v", pageURL, err)
|
||||||
return nil
|
return nil
|
||||||
@@ -846,8 +782,13 @@ func parseCompetitionMatchesFromFotbal(pageURL, clubType, clubName, clubID strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
// parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback.
|
// parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback.
|
||||||
func parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID string) []Match {
|
func parseCompetitionMatchesFromIS(ctx context.Context, detailURL, clubType, clubName, clubID string) []Match {
|
||||||
resp, err := http.Get(detailURL)
|
req, err := http.NewRequestWithContext(ctx, "GET", detailURL, nil)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("IS matches request error for %s: %v", detailURL, err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
resp, err := http.DefaultClient.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("IS matches fetch error for %s: %v", detailURL, err)
|
log.Printf("IS matches fetch error for %s: %v", detailURL, err)
|
||||||
return nil
|
return nil
|
||||||
@@ -1275,14 +1216,14 @@ func getClubSearch(w http.ResponseWriter, r *http.Request) {
|
|||||||
vals.Set("q", q)
|
vals.Set("q", q)
|
||||||
searchURL := "https://www.fotbal.cz/club/hledej?" + vals.Encode()
|
searchURL := "https://www.fotbal.cz/club/hledej?" + vals.Encode()
|
||||||
|
|
||||||
fetchSearchPage := func(url string) ([]byte, error) {
|
fetchSearchPage := func(ctx context.Context, url string) ([]byte, error) {
|
||||||
return fetchPageWithFallbackOptions(url, fetchOptions{
|
return fetchPageWithFallbackOptions(ctx, url, fetchOptions{
|
||||||
Referer: "https://www.fotbal.cz/club/hledej",
|
Referer: "https://www.fotbal.cz/club/hledej",
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try direct HTTP request first
|
// Try direct HTTP request first
|
||||||
body, err := fetchSearchPage(searchURL)
|
body, err := fetchSearchPage(r.Context(), searchURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Direct search request failed for %s: %v", searchURL, err)
|
log.Printf("Direct search request failed for %s: %v", searchURL, err)
|
||||||
|
|
||||||
@@ -1298,7 +1239,7 @@ func getClubSearch(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
body, err = fetchSearchPage(searchURL2)
|
body, err = fetchSearchPage(r.Context(), searchURL2)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Retried search request failed for %s: %v", searchURL2, err)
|
log.Printf("Retried search request failed for %s: %v", searchURL2, err)
|
||||||
// Return empty results instead of error
|
// Return empty results instead of error
|
||||||
@@ -1409,7 +1350,7 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
url := fmt.Sprintf("%s/%s", baseURL, clubID)
|
url := fmt.Sprintf("%s/%s", baseURL, clubID)
|
||||||
body, err := fetchPageWithFallback(url)
|
body, err := fetchPageWithFallback(r.Context(), url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
|
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -1459,26 +1400,43 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
// For each competition, fetch the standings tables from is.fotbal.cz
|
// For each competition, fetch the standings tables from is.fotbal.cz concurrently
|
||||||
|
sem := newSemaphore(4)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var mu sync.Mutex
|
||||||
|
|
||||||
for i := range competitions {
|
for i := range competitions {
|
||||||
comp := &competitions[i]
|
wg.Add(1)
|
||||||
|
go func(idx int) {
|
||||||
|
defer wg.Done()
|
||||||
|
if err := sem.Acquire(r.Context()); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer sem.Release()
|
||||||
|
|
||||||
|
comp := &competitions[idx]
|
||||||
tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
||||||
resp, err := http.Get(tableURL)
|
req, err := http.NewRequestWithContext(r.Context(), "GET", tableURL, nil)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("error creating request for competition table %s: %v", comp.ID, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
resp, err := http.DefaultClient.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("error fetching competition table for %s: %v", comp.ID, err)
|
log.Printf("error fetching competition table for %s: %v", comp.ID, err)
|
||||||
continue
|
return
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusOK {
|
if resp.StatusCode != http.StatusOK {
|
||||||
log.Printf("non-200 response for %s: %d", comp.ID, resp.StatusCode)
|
log.Printf("non-200 response for %s: %d", comp.ID, resp.StatusCode)
|
||||||
continue
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
docTable, err := goquery.NewDocumentFromReader(resp.Body)
|
docTable, err := goquery.NewDocumentFromReader(resp.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("error parsing table HTML for %s: %v", comp.ID, err)
|
log.Printf("error parsing table HTML for %s: %v", comp.ID, err)
|
||||||
continue
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse section: Tabulka celková (only overall)
|
// Parse section: Tabulka celková (only overall)
|
||||||
@@ -1532,8 +1490,12 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
overall = parseSection("Tabulka celková")
|
overall = parseSection("Tabulka celková")
|
||||||
|
mu.Lock()
|
||||||
comp.Table = &CompetitionTable{Overall: overall}
|
comp.Table = &CompetitionTable{Overall: overall}
|
||||||
|
mu.Unlock()
|
||||||
|
}(i)
|
||||||
}
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text())
|
clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text())
|
||||||
clubURL := strings.TrimSpace(doc.Find("h1.H4 a").First().AttrOr("href", ""))
|
clubURL := strings.TrimSpace(doc.Find("h1.H4 a").First().AttrOr("href", ""))
|
||||||
@@ -1583,7 +1545,7 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
url := fmt.Sprintf("%s/%s", baseURL, clubID)
|
url := fmt.Sprintf("%s/%s", baseURL, clubID)
|
||||||
body, err := fetchPageWithFallback(url)
|
body, err := fetchPageWithFallback(r.Context(), url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
|
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -1637,21 +1599,37 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
|
|||||||
competitions = append(competitions, Competition{ID: compID, Code: code, Name: name, TeamCount: teamCount, MatchesLink: tableLink})
|
competitions = append(competitions, Competition{ID: compID, Code: code, Name: name, TeamCount: teamCount, MatchesLink: tableLink})
|
||||||
})
|
})
|
||||||
|
|
||||||
// For each competition, fetch matches
|
// For each competition, fetch matches concurrently with limits
|
||||||
|
sem := newSemaphore(4)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var mu sync.Mutex
|
||||||
|
|
||||||
for i := range competitions {
|
for i := range competitions {
|
||||||
comp := &competitions[i]
|
wg.Add(1)
|
||||||
|
go func(idx int) {
|
||||||
|
defer wg.Done()
|
||||||
|
if err := sem.Acquire(r.Context()); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer sem.Release()
|
||||||
|
|
||||||
|
comp := &competitions[idx]
|
||||||
matchesLink := comp.MatchesLink
|
matchesLink := comp.MatchesLink
|
||||||
// 1) Try parsing from the public fotbal.cz competition page (matches_link)
|
// 1) Try parsing from the public fotbal.cz competition page (matches_link)
|
||||||
matches := parseCompetitionMatchesFromFotbal(matchesLink, clubType, clubName, clubID)
|
matches := parseCompetitionMatchesFromFotbal(r.Context(), matchesLink, clubType, clubName, clubID)
|
||||||
// Always try IS as well and prefer it if it provides at least as many matches
|
// Always try IS as well and prefer it if it provides at least as many matches
|
||||||
detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
||||||
isMatches := parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID)
|
isMatches := parseCompetitionMatchesFromIS(r.Context(), detailURL, clubType, clubName, clubID)
|
||||||
// Prefer IS whenever it yields any results, as IS often contains alias team names
|
// Prefer IS whenever it yields any results, as IS often contains alias team names
|
||||||
if len(isMatches) > 0 {
|
if len(isMatches) > 0 {
|
||||||
matches = isMatches
|
matches = isMatches
|
||||||
}
|
}
|
||||||
|
mu.Lock()
|
||||||
comp.Matches = matches
|
comp.Matches = matches
|
||||||
|
mu.Unlock()
|
||||||
|
}(i)
|
||||||
}
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
clubInfo := ClubInfo{
|
clubInfo := ClubInfo{
|
||||||
Name: clubName,
|
Name: clubName,
|
||||||
@@ -1679,9 +1657,18 @@ func main() {
|
|||||||
http.Redirect(w, r, "/club/football/"+vars["id"], http.StatusMovedPermanently)
|
http.Redirect(w, r, "/club/football/"+vars["id"], http.StatusMovedPermanently)
|
||||||
}).Methods("GET")
|
}).Methods("GET")
|
||||||
r.HandleFunc("/", docsHandler)
|
r.HandleFunc("/", docsHandler)
|
||||||
port := "0.0.0.0:8686"
|
|
||||||
fmt.Printf("Server running on http://%s\n", port)
|
addr := "0.0.0.0:8686"
|
||||||
log.Fatal(http.ListenAndServe(port, r))
|
srv := &http.Server{
|
||||||
|
Addr: addr,
|
||||||
|
Handler: r,
|
||||||
|
ReadTimeout: 30 * time.Second,
|
||||||
|
WriteTimeout: 5 * time.Minute,
|
||||||
|
IdleTimeout: 120 * time.Second,
|
||||||
|
MaxHeaderBytes: 1 << 20,
|
||||||
|
}
|
||||||
|
fmt.Printf("Server running on http://%s\n", addr)
|
||||||
|
log.Fatal(srv.ListenAndServe())
|
||||||
}
|
}
|
||||||
|
|
||||||
// docsHandler serves a simple HTML API documentation at the root endpoint.
|
// docsHandler serves a simple HTML API documentation at the root endpoint.
|
||||||
|
|||||||
+88
-31
@@ -3,7 +3,34 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import contextlib
|
import contextlib
|
||||||
import logging
|
import logging
|
||||||
|
import ssl
|
||||||
import sys
|
import sys
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
|
||||||
|
# Desktop Chrome User-Agent string. lightweight_fetch() sends it as the
# "User-Agent" header so the plain urllib request resembles browser traffic.
BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)
|
||||||
|
|
||||||
|
# Byte markers that identify a Cloudflare challenge / interstitial page.
# They are compared against a lower-cased copy of the body, so every entry
# must itself be lower case.
CF_SIGNS = [
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]


def looks_like_cloudflare_block(body: bytes) -> bool:
    """Return True when *body* contains one of the CF_SIGNS markers.

    The comparison is case-insensitive: the body is lower-cased once and each
    marker is searched for as a plain substring.

    Args:
        body: Raw response body.

    Returns:
        False for an empty/None body or when no marker is present, True as
        soon as any marker is found.
    """
    if not body:
        return False
    lowered = body.lower()
    return any(marker in lowered for marker in CF_SIGNS)
|
||||||
|
|
||||||
|
|
||||||
def response_body_bytes(response) -> bytes:
|
def response_body_bytes(response) -> bytes:
|
||||||
@@ -20,60 +47,90 @@ def response_body_bytes(response) -> bytes:
|
|||||||
return str(response).encode("utf-8")
|
return str(response).encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
def lightweight_fetch(
    url: str,
    referer: str = "",
    timeout: float = 15.0,
    *,
    verify_tls: bool = True,
) -> bytes:
    """Fetch *url* with a plain urllib request carrying browser-like headers.

    This is the cheap first attempt made before the Scrapling/Playwright
    fallback; any exception raised here signals the caller to fall back.

    Args:
        url: Absolute http(s) URL to fetch.
        referer: Optional value for the "Referer" header; omitted when empty.
        timeout: Socket timeout in seconds passed to ``urlopen``.
        verify_tls: Verify the server certificate and host name (default).
            Pass False only to deliberately talk to a host with a broken
            certificate chain.

    Returns:
        The raw (non-empty) response body.

    Raises:
        ValueError: if *url* does not use the http or https scheme.
        RuntimeError: if the body is empty or looks like a Cloudflare
            challenge page.
        urllib.error.URLError: on network / HTTP errors raised by urllib.
    """
    from urllib.parse import urlsplit

    # urlopen() also understands file: and ftp: URLs; the URL comes from the
    # command line, so restrict it to web schemes before touching the network.
    scheme = urlsplit(url).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"unsupported URL scheme for lightweight fetch: {scheme!r}")

    headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        # Ask for an uncompressed body: urllib does not decode gzip/br itself.
        "Accept-Encoding": "identity",
        # No "Connection: keep-alive": urllib always sends "Connection: close".
    }
    if referer:
        headers["Referer"] = referer
    req = urllib.request.Request(url, headers=headers)

    # Certificate and host-name checks stay enabled unless explicitly disabled.
    ctx = ssl.create_default_context()
    if not verify_tls:
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
        body = resp.read()

    # Treat an empty body as a failure, matching scrapling_fetch(), so the
    # caller falls back instead of emitting nothing.
    if not body:
        raise RuntimeError("lightweight fetch returned an empty body")
    if looks_like_cloudflare_block(body):
        raise RuntimeError("lightweight fetch returned Cloudflare challenge")
    return body
|
||||||
|
|
||||||
|
|
||||||
|
def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 30000, wait_ms: int = 500) -> bytes:
    """Fetch *url* through Scrapling's ``StealthyFetcher`` and return the body.

    Scrapling is imported lazily so that the lightweight path works even when
    the package is not importable.

    Args:
        url: URL to fetch.
        referer: Optional "Referer" header; sent via ``extra_headers`` when set.
        timeout_ms: Value passed to the fetcher as ``timeout``.
        wait_ms: Value passed to the fetcher as ``wait``.

    Returns:
        The non-empty response body as bytes.

    Raises:
        RuntimeError: if Scrapling cannot be imported, the response reports
            an integer status >= 400, or the body is empty.
    """
    try:
        from scrapling.fetchers import StealthyFetcher
    except Exception as exc:
        raise RuntimeError(f"Scrapling import failed: {exc}") from exc

    # Only errors should be logged (root logger level is raised to ERROR).
    logging.getLogger().setLevel(logging.ERROR)

    fetch_kwargs = {
        "headless": True,
        "network_idle": True,
        "google_search": False,
        "solve_cloudflare": True,
        "timeout": timeout_ms,
        "wait": wait_ms,
    }
    if referer:
        fetch_kwargs["extra_headers"] = {"Referer": referer}

    # Anything printed during the fetch is redirected to stderr so it cannot
    # mix with the page body, which main() writes to stdout.
    with contextlib.redirect_stdout(sys.stderr):
        response = StealthyFetcher.fetch(url, **fetch_kwargs)

    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"Scrapling returned HTTP {status}")

    body = response_body_bytes(response)
    if not body:
        raise RuntimeError("Scrapling returned an empty body")
    return body
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None) -> int:
    """Fetch a URL and write its body to stdout.

    A lightweight urllib fetch is tried first; the Scrapling/Playwright
    fetch is used only when that fails. Diagnostics go to stderr.

    Args:
        argv: Optional argument list (without the program name). Defaults to
            ``sys.argv[1:]`` when None.

    Returns:
        0 when a body was fetched (including when the reader closed the pipe
        before the body was fully written), 1 when both fetch methods failed
        or the body could not be written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", required=True)
    parser.add_argument("--referer", default="")
    parser.add_argument("--timeout-ms", type=int, default=30000)
    parser.add_argument("--wait-ms", type=int, default=500)
    args = parser.parse_args(argv)

    body = None

    # 1) Try lightweight urllib fetch first (no browser), capped at 15 s.
    try:
        body = lightweight_fetch(args.url, args.referer, timeout=min(args.timeout_ms / 1000.0, 15.0))
    except Exception as exc:
        print(f"Lightweight fetch failed: {exc}", file=sys.stderr)

    # 2) Fall back to Scrapling / Playwright only if lightweight failed.
    if body is None:
        try:
            body = scrapling_fetch(args.url, args.referer, args.timeout_ms, args.wait_ms)
        except Exception as exc:
            print(f"Scrapling fetch failed: {exc}", file=sys.stderr)
            return 1

    # Writing is kept outside the fetch try-blocks: a stdout failure must not
    # be mistaken for a fetch failure (which would start the browser fallback
    # and could append a second document to already-written output).
    try:
        sys.stdout.buffer.write(body)
        sys.stdout.buffer.flush()
    except BrokenPipeError:
        # The reader went away; nothing more can be delivered.
        return 0
    except OSError as exc:
        print(f"Failed to write body to stdout: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
|
|||||||
Reference in New Issue
Block a user