package main

import (
	"bytes"
	"context"
	_ "embed"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/http/cookiejar"
	neturl "net/url"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"strings"
	"sync"
	"sync/atomic"
	"syscall"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gorilla/mux"
	"golang.org/x/net/publicsuffix"
)

// Competition is the JSON shape of one competition: its identifiers, a link to
// its matches and, when present, the matches and table themselves.
type Competition struct {
	ID          string            `json:"id"`
	Code        string            `json:"code"`
	Name        string            `json:"name"`
	TeamCount   string            `json:"team_count"`
	MatchesLink string            `json:"matches_link"`
	Matches     []Match           `json:"matches,omitempty"`
	Table       *CompetitionTable `json:"table,omitempty"`
}

// fetchOptions carries optional per-request settings for the page fetchers.
type fetchOptions struct {
	// Referer, when non-empty, is forwarded as the Referer of the request.
	Referer string
}

// Header values shared by every fetch strategy, and the location of the
// Scrapling helper script relative to the working/executable directory.
const (
	browserUserAgent      = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
	browserAccept         = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
	browserAcceptLanguage = "cs-CZ,cs;q=0.9,en;q=0.8"
	scraplingHelperPath   = "scripts/scrapling_fetch.py"
)

// embeddedScraplingHelper is the helper script compiled into the binary; it is
// written to a temporary file when no on-disk copy of the script is found.
//
//go:embed scripts/scrapling_fetch.py
var embeddedScraplingHelper string

var (
	// State of the one-time extraction of embeddedScraplingHelper to disk.
	embeddedScraplingHelperOnce sync.Once
	embeddedScraplingHelperFile string
	embeddedScraplingHelperErr  error

	// Simple in-memory cache for fetched pages
	pageCache   = make(map[string]*cacheEntry)
	pageCacheMu sync.RWMutex

	// Club response cache for expensive multi-fetch endpoints
	clubCache   = make(map[string]*clubCacheEntry)
	clubCacheMu sync.RWMutex
)

// cacheEntry is one fetched page body together with the time it was stored.
type cacheEntry struct {
	body      []byte
	timestamp time.Time
}

// clubCacheEntry is one cached club response together with the time it was
// stored.
type clubCacheEntry struct {
	data      []byte
	timestamp time.Time
}

// cacheTTL is how long a pageCache entry is served before it is refetched.
const cacheTTL = 15 * time.Minute

// clubCacheTTL is presumably the equivalent lifetime for clubCache entries;
// its use is outside this part of the file — TODO confirm.
const clubCacheTTL = 30 * time.Minute

// domainBreakers is a per-domain circuit breaker map so failures on one site
// don't block Scrapling for unrelated sites.
var domainBreakers struct {
	mu       sync.RWMutex
	breakers map[string]*circuitBreaker
}

// scraplingSem limits concurrent Chromium launches to avoid zombie processes
// and resource exhaustion.
var scraplingSem = newSemaphore(2) // domainReqSem limits concurrent requests to the same domain to avoid // triggering Cloudflare rate-limiting. var domainReqSem struct { mu sync.RWMutex sems map[string]*semaphore } func getDomainReqSem(domain string) *semaphore { domainReqSem.mu.RLock() if s, ok := domainReqSem.sems[domain]; ok { domainReqSem.mu.RUnlock() return s } domainReqSem.mu.RUnlock() domainReqSem.mu.Lock() defer domainReqSem.mu.Unlock() if s, ok := domainReqSem.sems[domain]; ok { return s } s := newSemaphore(1) if domainReqSem.sems == nil { domainReqSem.sems = make(map[string]*semaphore) } domainReqSem.sems[domain] = s return s } // sharedHTTPClient is a reusable client with a cookie jar so that cookies // (including any Cloudflare clearance) survive across requests. var sharedHTTPClient = func() *http.Client { jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List}) if err != nil { log.Printf("failed to create cookie jar, falling back to default client: %v", err) return &http.Client{Timeout: 15 * time.Second} } return &http.Client{ Timeout: 15 * time.Second, Jar: jar, CheckRedirect: func(req *http.Request, via []*http.Request) error { if len(via) >= 10 { return fmt.Errorf("stopped after 10 redirects") } // Preserve headers across redirects for _, h := range []string{"User-Agent", "Accept", "Accept-Language", "Referer", "Sec-Ch-Ua", "Sec-Ch-Ua-Mobile", "Sec-Ch-Ua-Platform", "Sec-Fetch-Dest", "Sec-Fetch-Mode", "Sec-Fetch-Site", "Upgrade-Insecure-Requests"} { if v := via[len(via)-1].Header.Get(h); v != "" { req.Header.Set(h, v) } } return nil }, } }() type circuitBreaker struct { failures int32 lastFail time.Time threshold int timeout time.Duration mu sync.Mutex } func getDomainBreaker(domain string) *circuitBreaker { domainBreakers.mu.RLock() if cb, ok := domainBreakers.breakers[domain]; ok { domainBreakers.mu.RUnlock() return cb } domainBreakers.mu.RUnlock() domainBreakers.mu.Lock() defer domainBreakers.mu.Unlock() if cb, ok := 
domainBreakers.breakers[domain]; ok { return cb } cb := &circuitBreaker{ threshold: 15, timeout: 30 * time.Minute, } if domainBreakers.breakers == nil { domainBreakers.breakers = make(map[string]*circuitBreaker) } domainBreakers.breakers[domain] = cb return cb } func (cb *circuitBreaker) RecordFailure() { atomic.AddInt32(&cb.failures, 1) cb.mu.Lock() cb.lastFail = time.Now() cb.mu.Unlock() } func (cb *circuitBreaker) RecordSuccess() { atomic.StoreInt32(&cb.failures, 0) } func (cb *circuitBreaker) IsOpen() bool { if atomic.LoadInt32(&cb.failures) < int32(cb.threshold) { return false } cb.mu.Lock() last := cb.lastFail cb.mu.Unlock() return time.Since(last) < cb.timeout } // semaphore limits concurrent operations. type semaphore struct { ch chan struct{} } func newSemaphore(n int) *semaphore { return &semaphore{ch: make(chan struct{}, n)} } func (s *semaphore) Acquire(ctx context.Context) error { select { case s.ch <- struct{}{}: return nil case <-ctx.Done(): return ctx.Err() } } func (s *semaphore) Release() { select { case <-s.ch: default: } } func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) { req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } req.Header.Set("User-Agent", browserUserAgent) req.Header.Set("Accept", browserAccept) req.Header.Set("Accept-Language", browserAcceptLanguage) req.Header.Set("Accept-Encoding", "gzip, deflate, br") req.Header.Set("Connection", "keep-alive") req.Header.Set("Upgrade-Insecure-Requests", "1") req.Header.Set("Sec-Ch-Ua", `"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"`) req.Header.Set("Sec-Ch-Ua-Mobile", "?0") req.Header.Set("Sec-Ch-Ua-Platform", `"Windows"`) req.Header.Set("Sec-Fetch-Dest", "document") req.Header.Set("Sec-Fetch-Mode", "navigate") req.Header.Set("Sec-Fetch-Site", "none") req.Header.Set("Sec-Fetch-User", "?1") req.Header.Set("DNT", "1") req.Header.Set("Cache-Control", "max-age=0") if opts.Referer != 
"" { req.Header.Set("Referer", opts.Referer) req.Header.Set("Sec-Fetch-Site", "same-origin") } return req, nil } func looksLikeCloudflareBlock(body []byte) bool { if len(body) == 0 { return false } lower := strings.ToLower(string(body)) hardSignals := []string{ "just a moment...", "attention required!", "enable javascript and cookies to continue", "checking if the site connection is secure", "cf-browser-verification", } for _, signal := range hardSignals { if strings.Contains(lower, signal) { return true } } if strings.Contains(lower, "/cdn-cgi/challenge-platform/") && (strings.Contains(lower, "window._cf_chl_opt") || strings.Contains(lower, "__cf_chl_rt_tk") || strings.Contains(lower, "cf_chl_seq_")) { return true } return false } func compactErrorText(s string) string { s = strings.Join(strings.Fields(strings.TrimSpace(s)), " ") if len(s) > 220 { return s[:217] + "..." } return s } func fetchPageDirectOnce(ctx context.Context, url string, opts fetchOptions) ([]byte, error) { parsed, err := neturl.Parse(url) if err != nil { return nil, fmt.Errorf("invalid URL: %w", err) } // Serialize requests per domain to avoid triggering rate limits. 
sem := getDomainReqSem(parsed.Host) if err := sem.Acquire(ctx); err != nil { return nil, err } defer sem.Release() req, err := newBrowserRequest(url, opts) if err != nil { return nil, err } req = req.WithContext(ctx) resp, err := sharedHTTPClient.Do(req) if err != nil { return nil, fmt.Errorf("direct request failed: %w", err) } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read response body: %w", err) } if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("direct request returned HTTP %d", resp.StatusCode) } if looksLikeCloudflareBlock(body) { return nil, fmt.Errorf("direct request returned a Cloudflare challenge page") } return body, nil } func fetchPageDirect(ctx context.Context, url string, opts fetchOptions) ([]byte, error) { var lastErr error for attempt := 0; attempt < 3; attempt++ { if attempt > 0 { select { case <-time.After(time.Duration(attempt) * time.Second): case <-ctx.Done(): return nil, ctx.Err() } } body, err := fetchPageDirectOnce(ctx, url, opts) if err == nil { return body, nil } lastErr = err } return nil, fmt.Errorf("direct fetch failed after 3 attempts: %w", lastErr) } func fetchPageWithWget(ctx context.Context, url string, opts fetchOptions) ([]byte, error) { if _, err := exec.LookPath("wget"); err != nil { return nil, fmt.Errorf("wget not available: %w", err) } ctx, cancel := context.WithTimeout(ctx, 20*time.Second) defer cancel() args := []string{ "--quiet", "--tries=1", "--timeout=15", "--max-redirect=10", "--output-document=-", "--user-agent=" + browserUserAgent, "--header=Accept: " + browserAccept, "--header=Accept-Language: " + browserAcceptLanguage, } if opts.Referer != "" { args = append(args, "--header=Referer: "+opts.Referer) } args = append(args, url) cmd := exec.CommandContext(ctx, "wget", args...) 
var stdout bytes.Buffer var stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr if err := cmd.Run(); err != nil { details := compactErrorText(stderr.String()) if details == "" { details = compactErrorText(err.Error()) } return nil, fmt.Errorf("wget request failed: %s", details) } body := stdout.Bytes() if len(body) == 0 { return nil, fmt.Errorf("wget returned an empty body") } if looksLikeCloudflareBlock(body) { return nil, fmt.Errorf("wget returned a Cloudflare challenge page") } return body, nil } func firstExistingFile(paths ...string) string { for _, path := range paths { path = strings.TrimSpace(path) if path == "" { continue } if info, err := os.Stat(path); err == nil && !info.IsDir() { return path } } return "" } func firstExecutable(paths ...string) string { for _, path := range paths { path = strings.TrimSpace(path) if path == "" { continue } if strings.ContainsRune(path, os.PathSeparator) { if info, err := os.Stat(path); err == nil && !info.IsDir() { return path } continue } if resolved, err := exec.LookPath(path); err == nil { return resolved } } return "" } func ensureEmbeddedScraplingHelper() (string, error) { embeddedScraplingHelperOnce.Do(func() { if strings.TrimSpace(embeddedScraplingHelper) == "" { embeddedScraplingHelperErr = fmt.Errorf("embedded Scrapling helper is empty") return } file, err := os.CreateTemp("", "facr-scrapling-*.py") if err != nil { embeddedScraplingHelperErr = fmt.Errorf("create embedded Scrapling helper: %w", err) return } defer file.Close() if _, err := file.WriteString(embeddedScraplingHelper); err != nil { embeddedScraplingHelperErr = fmt.Errorf("write embedded Scrapling helper: %w", err) return } if err := file.Chmod(0600); err != nil { embeddedScraplingHelperErr = fmt.Errorf("chmod embedded Scrapling helper: %w", err) return } embeddedScraplingHelperFile = file.Name() }) if embeddedScraplingHelperErr != nil { return "", embeddedScraplingHelperErr } if embeddedScraplingHelperFile == "" { return "", 
fmt.Errorf("embedded Scrapling helper path is empty") } return embeddedScraplingHelperFile, nil } func fetchPageWithCurl(ctx context.Context, url string, opts fetchOptions) ([]byte, error) { if _, err := exec.LookPath("curl"); err != nil { return nil, fmt.Errorf("curl not available: %w", err) } ctx, cancel := context.WithTimeout(ctx, 20*time.Second) defer cancel() args := []string{ "-sSL", "--max-time", "15", "-A", browserUserAgent, "-H", "Accept: " + browserAccept, "-H", "Accept-Language: " + browserAcceptLanguage, "-H", "Connection: keep-alive", } if opts.Referer != "" { args = append(args, "-H", "Referer: "+opts.Referer) } args = append(args, url) cmd := exec.CommandContext(ctx, "curl", args...) var stdout bytes.Buffer var stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr if err := cmd.Run(); err != nil { details := compactErrorText(stderr.String()) if details == "" { details = compactErrorText(err.Error()) } return nil, fmt.Errorf("curl request failed: %s", details) } body := stdout.Bytes() if len(body) == 0 { return nil, fmt.Errorf("curl returned an empty body") } if looksLikeCloudflareBlock(body) { return nil, fmt.Errorf("curl returned a Cloudflare challenge page") } return body, nil } func findScraplingHelperScript() (string, error) { cwd, _ := os.Getwd() exePath, _ := os.Executable() exeDir := "" if exePath != "" { exeDir = filepath.Dir(exePath) } if path := firstExistingFile( os.Getenv("SCRAPLING_SCRIPT"), filepath.Join(cwd, scraplingHelperPath), filepath.Join(exeDir, scraplingHelperPath), ); path != "" { return path, nil } return ensureEmbeddedScraplingHelper() } func findScraplingPython() string { cwd, _ := os.Getwd() exePath, _ := os.Executable() exeDir := "" if exePath != "" { exeDir = filepath.Dir(exePath) } return firstExecutable( os.Getenv("SCRAPLING_PYTHON_BIN"), filepath.Join(cwd, ".venv-scrapling", "bin", "python3"), filepath.Join(cwd, ".venv-scrapling", "bin", "python"), filepath.Join(cwd, ".venv", "bin", "python3"), 
filepath.Join(cwd, ".venv", "bin", "python"), filepath.Join(exeDir, ".venv-scrapling", "bin", "python3"), filepath.Join(exeDir, ".venv-scrapling", "bin", "python"), filepath.Join(exeDir, ".venv", "bin", "python3"), filepath.Join(exeDir, ".venv", "bin", "python"), "python3", "python", ) } func findCloakBrowserPython() string { cwd, _ := os.Getwd() exePath, _ := os.Executable() exeDir := "" if exePath != "" { exeDir = filepath.Dir(exePath) } return firstExecutable( os.Getenv("CLOAKBROWSER_PYTHON_BIN"), filepath.Join(cwd, ".venv-scrapling", "bin", "python3"), filepath.Join(cwd, ".venv-scrapling", "bin", "python"), filepath.Join(cwd, ".venv", "bin", "python3"), filepath.Join(cwd, ".venv", "bin", "python"), filepath.Join(exeDir, ".venv-scrapling", "bin", "python3"), filepath.Join(exeDir, ".venv-scrapling", "bin", "python"), filepath.Join(exeDir, ".venv", "bin", "python3"), filepath.Join(exeDir, ".venv", "bin", "python"), "python3", "python", ) } func findCloakBrowserScript() (string, error) { cwd, _ := os.Getwd() candidates := []string{ os.Getenv("CLOAKBROWSER_SCRIPT"), filepath.Join(cwd, "scripts", "cloakbrowser_fetch.py"), filepath.Join(cwd, "cloakbrowser_fetch.py"), "/opt/scrapling/scripts/cloakbrowser_fetch.py", } exePath, _ := os.Executable() if exePath != "" { exeDir := filepath.Dir(exePath) candidates = append(candidates, filepath.Join(exeDir, "scripts", "cloakbrowser_fetch.py"), filepath.Join(exeDir, "cloakbrowser_fetch.py"), ) } for _, p := range candidates { if p != "" { if _, err := os.Stat(p); err == nil { return p, nil } } } return "", fmt.Errorf("cloakbrowser_fetch.py not found") } // fetchPageWithCloakBrowser uses the CloakBrowser patched Chromium to fetch // pages that are blocked by Cloudflare. It is ~3x faster than Scrapling for // fotbal.cz because it passes bot detection without triggering challenge loops. 
func fetchPageWithCloakBrowser(ctx context.Context, url string, opts fetchOptions) ([]byte, error) { parsedURL, err := neturl.Parse(url) if err != nil { return nil, fmt.Errorf("CloakBrowser skipped: invalid URL: %w", err) } domain := parsedURL.Host if getDomainBreaker(domain).IsOpen() { return nil, fmt.Errorf("CloakBrowser skipped: circuit breaker is open for %s", domain) } pythonBin := findCloakBrowserPython() if pythonBin == "" { return nil, fmt.Errorf("CloakBrowser skipped: no Python runtime found") } helperScript, err := findCloakBrowserScript() if err != nil { return nil, fmt.Errorf("CloakBrowser skipped: %w", err) } // Acquire global Scrapling semaphore to limit concurrent Chromium launches if err := scraplingSem.Acquire(ctx); err != nil { return nil, fmt.Errorf("CloakBrowser skipped: %w", err) } defer scraplingSem.Release() ctx, cancel := context.WithTimeout(ctx, 45*time.Second) defer cancel() args := []string{helperScript, url} if opts.Referer != "" { args = append(args, opts.Referer) } cmd := exec.CommandContext(ctx, pythonBin, args...) 
var stdout bytes.Buffer var stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} if err := cmd.Run(); err != nil { if cmd.Process != nil { syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) } details := compactErrorText(stderr.String()) if details == "" { details = compactErrorText(err.Error()) } if ctx.Err() == nil { getDomainBreaker(domain).RecordFailure() } return nil, fmt.Errorf("CloakBrowser request failed: %s", details) } body := stdout.Bytes() if len(body) == 0 { getDomainBreaker(domain).RecordFailure() return nil, fmt.Errorf("CloakBrowser returned an empty body") } if looksLikeCloudflareBlock(body) { getDomainBreaker(domain).RecordFailure() return nil, fmt.Errorf("CloakBrowser returned a Cloudflare challenge page") } getDomainBreaker(domain).RecordSuccess() return body, nil } func fetchPageWithScrapling(ctx context.Context, url string, opts fetchOptions) ([]byte, error) { parsedURL, err := neturl.Parse(url) if err != nil { return nil, fmt.Errorf("Scrapling skipped: invalid URL: %w", err) } domain := parsedURL.Host if getDomainBreaker(domain).IsOpen() { return nil, fmt.Errorf("Scrapling skipped: circuit breaker is open for %s", domain) } pythonBin := findScraplingPython() if pythonBin == "" { return nil, fmt.Errorf("Scrapling skipped: no Python runtime found") } helperScript, err := findScraplingHelperScript() if err != nil { return nil, fmt.Errorf("Scrapling skipped: %w", err) } // Acquire global Scrapling semaphore to limit concurrent Chromium launches if err := scraplingSem.Acquire(ctx); err != nil { return nil, fmt.Errorf("Scrapling skipped: %w", err) } defer scraplingSem.Release() ctx, cancel := context.WithTimeout(ctx, 120*time.Second) defer cancel() args := []string{helperScript, "--url", url, "--timeout-ms", "90000", "--wait-ms", "500"} if opts.Referer != "" { args = append(args, "--referer", opts.Referer) } cmd := exec.CommandContext(ctx, pythonBin, args...) 
var stdout bytes.Buffer var stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr // Run in a new process group so we can kill all Chromium children on timeout cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} if err := cmd.Run(); err != nil { // Kill the entire process group including Chromium children if cmd.Process != nil { syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) } details := compactErrorText(stderr.String()) if details == "" { details = compactErrorText(err.Error()) } // Don't count context cancellations (client timeouts) or infrastructure failures if ctx.Err() == nil && !strings.Contains(details, "Executable doesn't exist") { getDomainBreaker(domain).RecordFailure() } return nil, fmt.Errorf("Scrapling request failed: %s", details) } body := stdout.Bytes() if len(body) == 0 { getDomainBreaker(domain).RecordFailure() return nil, fmt.Errorf("Scrapling returned an empty body") } if looksLikeCloudflareBlock(body) { getDomainBreaker(domain).RecordFailure() return nil, fmt.Errorf("Scrapling returned a Cloudflare challenge page") } getDomainBreaker(domain).RecordSuccess() return body, nil } func fetchPageWithFallback(ctx context.Context, url string) ([]byte, error) { return fetchPageWithFallbackOptions(ctx, url, fetchOptions{}) } // fetchPageWithFallback tries Go HTTP first, then curl/wget, then CloakBrowser, // then Scrapling. When direct HTTP returns a Cloudflare block, curl/wget are // skipped since they will just return the same challenge page and waste ~20s. 
func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOptions) ([]byte, error) { if err := ctx.Err(); err != nil { return nil, err } // Check cache first pageCacheMu.RLock() if entry, ok := pageCache[url]; ok { if time.Since(entry.timestamp) < cacheTTL { pageCacheMu.RUnlock() log.Printf("Cache hit for %s", url) return entry.body, nil } } pageCacheMu.RUnlock() // Try direct HTTP first body, err := fetchPageDirect(ctx, url, opts) if err == nil { cachePage(url, body) return body, nil } log.Printf("Direct request failed for %s: %v", url, err) // If direct HTTP returned a Cloudflare block, skip curl/wget time-wasters // and go straight to CloakBrowser which can solve the challenge silently. if strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "Cloudflare") { log.Printf("Skipping curl/wget fallbacks for %s: direct HTTP hit Cloudflare wall", url) goto cloakBrowserFallback } body, err = fetchPageWithCurl(ctx, url, opts) if err == nil { log.Printf("Successfully retrieved content via curl for %s", url) cachePage(url, body) return body, nil } log.Printf("curl fallback failed for %s: %v", url, err) body, err = fetchPageWithWget(ctx, url, opts) if err == nil { log.Printf("Successfully retrieved content via wget for %s", url) cachePage(url, body) return body, nil } log.Printf("wget fallback failed for %s: %v", url, err) cloakBrowserFallback: body, err = fetchPageWithCloakBrowser(ctx, url, opts) if err == nil { log.Printf("Successfully retrieved content via CloakBrowser for %s", url) cachePage(url, body) return body, nil } log.Printf("CloakBrowser fallback failed for %s: %v", url, err) body, err = fetchPageWithScrapling(ctx, url, opts) if err == nil { log.Printf("Successfully retrieved content via Scrapling for %s", url) cachePage(url, body) return body, nil } log.Printf("Scrapling fallback failed for %s: %v", url, err) return nil, fmt.Errorf("all fetch methods failed for %s: %w", url, err) } // detachedContext returns a 
context.Background() with a generous timeout so // goroutines aren't all killed when r.Context() is cancelled. func detachedContext(timeout time.Duration) (context.Context, context.CancelFunc) { return context.WithTimeout(context.Background(), timeout) } func cachePage(url string, body []byte) { pageCacheMu.Lock() pageCache[url] = &cacheEntry{body: body, timestamp: time.Now()} pageCacheMu.Unlock() } // parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz // competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}). // It filters to only include matches involving the given clubName if provided. func parseCompetitionMatchesFromFotbal(ctx context.Context, pageURL, clubType, clubName, clubID string) []Match { pageURL = strings.TrimSpace(pageURL) if pageURL == "" { return nil } body, err := fetchPageWithFallback(ctx, pageURL) if err != nil { log.Printf("fotbal.cz matches fetch failed for %s: %v", pageURL, err) return nil } // If we still don't have body content, return nil if len(body) == 0 { log.Printf("No content available for %s", pageURL) return nil } // Debug: save full HTML if env toggled if os.Getenv("DEBUG_SAVE_HTML") != "" { // derive a friendly filename from last URL path segment comp := pageURL if i := strings.LastIndex(comp, "/"); i >= 0 && i+1 < len(comp) { comp = comp[i+1:] } fname := fmt.Sprintf("fotbal_comp_%s.html", comp) if err := os.WriteFile(fname, body, 0644); err != nil { log.Printf("failed writing debug HTML %s: %v", fname, err) } else { log.Printf("saved debug HTML: %s", fname) } } doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) if err != nil { log.Printf("fotbal.cz matches parse error for %s: %v", pageURL, err) return nil } var matches []Match // Sections per round doc.Find("section.js-matchRoundSection li.MatchRound").Each(func(_ int, li *goquery.Selection) { a := li.Find("a.MatchRound-match").First() if a.Length() == 0 { return } // Teams teamNames := []string{} 
li.Find("a.MatchRound-match ul li span.H7").Each(func(_ int, s *goquery.Selection) { t := strings.TrimSpace(s.Text()) if t != "" { teamNames = append(teamNames, t) } }) if len(teamNames) < 2 { return } home := teamNames[0] away := teamNames[1] // Try to extract team IDs from img URLs if present imgIDs := []string{} li.Find("a.MatchRound-match img").Each(func(_ int, img *goquery.Selection) { src := strings.TrimSpace(img.AttrOr("src", "")) if src == "" { return } if id := extractUUIDFromHref(src); id != "" { imgIDs = append(imgIDs, id) } }) homeID, awayID := "", "" if len(imgIDs) >= 1 { homeID = imgIDs[0] } if len(imgIDs) >= 2 { awayID = imgIDs[1] } // Score score := strings.TrimSpace(a.Find("strong.H4").First().Text()) if re := regexp.MustCompile(`\s*([0-9]+)\s*:\s*([0-9]+)\s*`); re != nil { if m := re.FindStringSubmatch(score); len(m) == 3 { score = fmt.Sprintf("%s:%s", m[1], m[2]) } } // Meta: date, match id in meta list and link dateText := "" li.Find(".MatchRound-meta p").Each(func(_ int, p *goquery.Selection) { label := strings.TrimSpace(p.Find("strong").First().Text()) txt := strings.TrimSpace(p.Text()) if strings.HasPrefix(strings.ToLower(label), "datum") { // Remove label from text dateText = strings.TrimSpace(strings.ReplaceAll(txt, label+":", "")) } }) // Venue from details, if available venue := "" li.Find(".js-matchRoundDetails li p").Each(func(_ int, p *goquery.Selection) { label := strings.TrimSpace(p.Find("strong").First().Text()) txt := strings.TrimSpace(p.Text()) if strings.HasPrefix(strings.ToLower(label), "hřiště") || strings.HasPrefix(strings.ToLower(label), "hriste") { venue = strings.TrimSpace(strings.ReplaceAll(txt, label+":", "")) } }) // Match ID from the anchor href matchID := extractUUIDFromHref(a.AttrOr("href", "")) reportURL := "" if matchID != "" { if strings.EqualFold(clubType, "futsal") { reportURL = fmt.Sprintf("https://www.fotbal.cz/futsal/zapasy/futsal/%s", matchID) } else { reportURL = 
fmt.Sprintf("https://www.fotbal.cz/souteze/zapasy/zapas/%s", matchID) } } // Filter by club involvement: prefer UUID match, fallback to name matching including simplified token if clubName != "" || clubID != "" { involved := false // If we could extract team UUIDs, match by ID first (robust against aliases) if clubID != "" && (strings.EqualFold(homeID, clubID) || strings.EqualFold(awayID, clubID)) { involved = true } else if clubName != "" { // Fallback to fuzzy full-name matching involved = strings.EqualFold(home, clubName) || strings.EqualFold(away, clubName) || containsFold(clubName, home) || containsFold(clubName, away) || containsFold(home, clubName) || containsFold(away, clubName) // As a last resort, try matching a simplified token (e.g., city) of the club name if !involved { token := simplifyClubQuery(clubName) if token != "" && (containsFold(home, token) || containsFold(away, token)) { involved = true } } } if !involved { return } } // Backfill IDs for current club if missing if homeID == "" { if strings.EqualFold(home, clubName) || containsFold(home, clubName) || containsFold(clubName, home) { homeID = clubID } else { token := simplifyClubQuery(clubName) if token != "" && containsFold(home, token) { homeID = clubID } } } if awayID == "" { if strings.EqualFold(away, clubName) || containsFold(away, clubName) || containsFold(clubName, away) { awayID = clubID } else { token := simplifyClubQuery(clubName) if token != "" && containsFold(away, token) { awayID = clubID } } } homeLogo := getLogo(home, homeID) awayLogo := getLogo(away, awayID) matches = append(matches, Match{ DateTime: dateText, Home: home, HomeID: homeID, HomeLogoURL: homeLogo, Away: away, AwayID: awayID, AwayLogoURL: awayLogo, Score: score, Venue: venue, MatchID: matchID, ReportURL: reportURL, FACRLink: reportURL, }) }) return matches } // parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback. 
func parseCompetitionMatchesFromIS(ctx context.Context, detailURL, clubType, clubName, clubID string) []Match { parsed, err := neturl.Parse(detailURL) if err != nil { log.Printf("IS matches invalid URL %s: %v", detailURL, err) return nil } sem := getDomainReqSem(parsed.Host) if err := sem.Acquire(ctx); err != nil { log.Printf("IS matches domain semaphore error for %s: %v", detailURL, err) return nil } defer sem.Release() req, err := newBrowserRequest(detailURL, fetchOptions{}) if err != nil { log.Printf("IS matches request error for %s: %v", detailURL, err) return nil } req = req.WithContext(ctx) resp, err := sharedHTTPClient.Do(req) if err != nil { log.Printf("IS matches fetch error for %s: %v", detailURL, err) return nil } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { log.Printf("IS matches non-200 for %s: %d", detailURL, resp.StatusCode) return nil } // Read body so we can optionally save and then parse from memory body, err := io.ReadAll(resp.Body) if err != nil { log.Printf("IS matches read error for %s: %v", detailURL, err) return nil } if os.Getenv("DEBUG_SAVE_HTML") != "" { // name the file using the req (competition id) if present fname := "is_detail.html" if u, err := neturl.Parse(detailURL); err == nil { req := u.Query().Get("req") sport := u.Query().Get("sport") if req != "" { fname = fmt.Sprintf("is_comp_%s_%s.html", req, sport) } } if err := os.WriteFile(fname, body, 0644); err != nil { log.Printf("failed writing debug IS HTML %s: %v", fname, err) } else { log.Printf("saved debug IS HTML: %s", fname) } } docDetail, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) if err != nil { log.Printf("IS matches parse error for %s: %v", detailURL, err) return nil } var matches []Match totalRows := 0 keptRows := 0 docDetail.Find("table.soutez-zapasy tr").Each(func(_ int, s *goquery.Selection) { if s.Find("th").Length() > 0 { return } tds := s.Find("td") if tds.Length() < 5 { return } totalRows++ getText := func(sel *goquery.Selection) 
string { return strings.TrimSpace(sel.Text()) } dt := getText(tds.Eq(0)) rawHome := getText(tds.Eq(1)) if idx := strings.Index(rawHome, "("); idx >= 0 { rawHome = strings.TrimSpace(rawHome[:idx]) } rawAway := getText(tds.Eq(2)) if idx := strings.Index(rawAway, "("); idx >= 0 { rawAway = strings.TrimSpace(rawAway[:idx]) } homeID := extractUUIDFromHref(tds.Eq(1).Find("a").First().AttrOr("href", "")) awayID := extractUUIDFromHref(tds.Eq(2).Find("a").First().AttrOr("href", "")) rawScore := getText(tds.Eq(3)) score := "" if re := regexp.MustCompile(`(\d+)\s*:\s*(\d+)`); re != nil { if m := re.FindStringSubmatch(rawScore); len(m) == 3 { score = fmt.Sprintf("%s:%s", m[1], m[2]) } } venue := "" if tds.Length() > 4 { venue = getText(tds.Eq(4)) } var reportURL, matchID string var isReportHref, isDelegHref string // Use the last column for links to be robust to optional columns tds.Eq(tds.Length() - 1).Find("a").Each(func(_ int, a *goquery.Selection) { href := strings.TrimSpace(a.AttrOr("href", "")) if href == "" { return } if u, err := neturl.Parse(href); err == nil { if id := u.Query().Get("zapas"); id != "" { matchID = id } } // Capture specific IS links if strings.Contains(href, "zapis-o-utkani-report.aspx") { isReportHref = resolveISURL(href) } if strings.Contains(href, "zapas-delegace-report.aspx") { isDelegHref = resolveISURL(href) } }) if matchID != "" { if strings.EqualFold(clubType, "futsal") { reportURL = fmt.Sprintf("https://www.fotbal.cz/futsal/zapasy/futsal/%s", matchID) } else { reportURL = fmt.Sprintf("https://www.fotbal.cz/souteze/zapasy/zapas/%s", matchID) } } // Canonical fotbal.cz link facrLink := reportURL // Filter by club involvement: prefer UUID match, fallback to name matching with simplified token if clubName != "" || clubID != "" { involved := false if clubID != "" && (strings.EqualFold(homeID, clubID) || strings.EqualFold(awayID, clubID)) { involved = true } else if clubName != "" { involved = strings.EqualFold(rawHome, clubName) || 
strings.EqualFold(rawAway, clubName) || containsFold(clubName, rawHome) || containsFold(clubName, rawAway) || containsFold(rawHome, clubName) || containsFold(rawAway, clubName) if !involved { token := simplifyClubQuery(clubName) if token != "" && (containsFold(rawHome, token) || containsFold(rawAway, token)) { involved = true } } } if !involved { return } } keptRows++ if homeID == "" { if strings.EqualFold(rawHome, clubName) || containsFold(rawHome, clubName) || containsFold(clubName, rawHome) { homeID = clubID } else { token := simplifyClubQuery(clubName) if token != "" && containsFold(rawHome, token) { homeID = clubID } } } if awayID == "" { if strings.EqualFold(rawAway, clubName) || containsFold(rawAway, clubName) || containsFold(clubName, rawAway) { awayID = clubID } else { token := simplifyClubQuery(clubName) if token != "" && containsFold(rawAway, token) { awayID = clubID } } } homeLogo := getLogo(rawHome, homeID) awayLogo := getLogo(rawAway, awayID) matches = append(matches, Match{DateTime: dt, Home: rawHome, HomeID: homeID, HomeLogoURL: homeLogo, Away: rawAway, AwayID: awayID, AwayLogoURL: awayLogo, Score: score, Venue: venue, MatchID: matchID, ReportURL: func() string { if isReportHref != "" { return isReportHref } return reportURL }(), FACRLink: facrLink, DelegationURL: isDelegHref}) }) if os.Getenv("DEBUG_SAVE_HTML") != "" { log.Printf("IS parse summary for %s: total rows=%d, kept=%d", detailURL, totalRows, keptRows) } return matches } var logoCache = map[string]string{} type logoAPISearchResult struct { ID string `json:"id"` Name string `json:"name"` LogoURL string `json:"logo_url"` HasLocalLogo bool `json:"has_local_logo"` } type searchAPIResult struct { Results []struct { Name string `json:"name"` LogoURL string `json:"logo_url"` } `json:"results"` } // a simplified search token like "krnov" to improve chances of finding a logo. 
func simplifyClubQuery(name string) string { s := strings.TrimSpace(name) if s == "" { return "" } parts := strings.Fields(s) if len(parts) == 0 { return "" } // Walk from the end to find a meaningful token (avoid legal suffixes like "z.s.") stop := map[string]struct{}{ "z.s.": {}, "z.s": {}, "zs": {}, "zapsany": {}, "zapsaný": {}, "spolek": {}, "o.s.": {}, "o.s": {}, "os": {}, "a.s.": {}, "a.s": {}, "as": {}, "s.r.o.": {}, "s.r.o": {}, "sro": {}, } for i := len(parts) - 1; i >= 0; i-- { tok := parts[i] tok = strings.Trim(tok, ",.;:-()[]{}\"'`“”’") lt := strings.ToLower(tok) if _, banned := stop[lt]; banned { continue } // prefer tokens with letters and length >= 3 letters := regexp.MustCompile(`[a-zA-Zá-žÁ-Ž]`).MatchString if len([]rune(lt)) >= 3 && letters(lt) { return lt } } // Fallback to last token sanitized last := strings.Trim(parts[len(parts)-1], ",.;:-()[]{}\"'`“”’") return strings.ToLower(last) } func getLogoFromLogoAPI(teamName string, teamID string) string { base := strings.TrimSpace(os.Getenv("LOGOAPI_BASE_URL")) if base == "" { base = "https://logoapi.sportcreative.eu" } base = strings.TrimRight(base, "/") name := strings.TrimSpace(teamName) if name == "" { return "" } cacheKey := "logoapi|" + strings.ToLower(name) if v, ok := logoCache[cacheKey]; ok { return v } client := &http.Client{Timeout: 5 * time.Second} doSearch := func(q string) ([]logoAPISearchResult, bool) { q = strings.TrimSpace(q) if q == "" { return nil, false } u := fmt.Sprintf("%s/clubs/search-with-logos?q=%s", base, neturl.QueryEscape(q)) resp, err := client.Get(u) if err != nil { return nil, false } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { io.Copy(io.Discard, resp.Body) return nil, false } var payload []logoAPISearchResult if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { // Non-JSON or invalid response – treat as no result return nil, false } return payload, true } // Search strictly by full club name; if it yields nothing usable, let caller 
fall back to FACR. payload, ok := doSearch(name) if !ok || len(payload) == 0 { logoCache[cacheKey] = "" return "" } // Normalize names for comparison (case-insensitive, strip common legal/sport suffixes). normalize := func(s string) string { s = strings.ToLower(strings.TrimSpace(s)) if s == "" { return s } parts := strings.Fields(s) stop := map[string]struct{}{ "fotbal": {}, "futsal": {}, "z.s.": {}, "z.s": {}, "zs": {}, "o.s.": {}, "o.s": {}, "os": {}, "a.s.": {}, "a.s": {}, "as": {}, "s.r.o.": {}, "s.r.o": {}, "sro": {}, } end := len(parts) for end > 0 { if _, banned := stop[parts[end-1]]; banned { end-- continue } break } if end != len(parts) { parts = parts[:end] } return strings.Join(parts, " ") } want := normalize(name) var withLogo []logoAPISearchResult for _, r := range payload { if r.HasLocalLogo { withLogo = append(withLogo, r) } } if len(withLogo) == 0 { logoCache[cacheKey] = "" return "" } // Only accept a logo when the normalized club name matches; avoid arbitrary first-result picks. for _, r := range withLogo { if normalize(r.Name) == want { logoCache[cacheKey] = r.LogoURL return r.LogoURL } } // No strong match – treat as "no logo" so upstream can fall back to FACR assets. 
logoCache[cacheKey] = "" return "" } func getLogoBySearch(name string) string { key := strings.ToLower(strings.TrimSpace(name)) if key == "" { return "" } if v, ok := logoCache[key]; ok { return v } client := &http.Client{Timeout: 60 * time.Second} // Prefer simplified last-word token (e.g., "krnov") to improve hit rate for logos query := simplifyClubQuery(name) if query == "" { query = name } doSearch := func(q string) (searchAPIResult, bool) { url := fmt.Sprintf("http://0.0.0.0:8686/club/search?q=%s", neturl.QueryEscape(q)) resp, err := client.Get(url) if err != nil { return searchAPIResult{}, false } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { io.Copy(io.Discard, resp.Body) return searchAPIResult{}, false } var payload searchAPIResult if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { return searchAPIResult{}, false } return payload, true } payload, ok := doSearch(query) if !ok || len(payload.Results) == 0 { // Fallback to full name if simplified token yields nothing payload, ok = doSearch(name) if !ok { return "" } } // pick best match: exact (case-insensitive), then contains, else first best := "" for _, r := range payload.Results { if strings.EqualFold(strings.TrimSpace(r.Name), strings.TrimSpace(name)) { best = r.LogoURL break } } if best == "" { for _, r := range payload.Results { rname := strings.ToLower(r.Name) if strings.Contains(rname, key) || strings.Contains(key, rname) { best = r.LogoURL break } } } if best == "" && len(payload.Results) > 0 { best = payload.Results[0].LogoURL } logoCache[key] = best return best } func getLogo(teamName string, teamID string) string { placeholder := "https://www.fotbal.cz/dist/img/logo-club-empty.svg" name := strings.ToLower(strings.TrimSpace(teamName)) if name == "" || strings.Contains(name, "volno") || strings.Contains(name, "volný los") || strings.Contains(name, "volny los") || strings.Contains(name, "bye") { return placeholder } if logo := getLogoFromLogoAPI(teamName, teamID); 
logo != "" { return logo } // If we have a team ID, construct the official logo URL directly. // This avoids wrong matches for duplicate names (e.g., multiple "Ořechov"). if tid := strings.TrimSpace(teamID); tid != "" { return fmt.Sprintf("https://is1.fotbal.cz/media/kluby/%s/%s_crop.jpg", tid, tid) } // Otherwise, try the local FACR-based search endpoint by name. if logo := getLogoBySearch(teamName); logo != "" { return logo } // No ID and no search hit -> placeholder return placeholder } // CompetitionTable holds standings sections; currently only Overall is used type CompetitionTable struct { Overall []TableRow `json:"overall"` } // ClubInfo is the response for club info and tables endpoints type ClubInfo struct { Name string `json:"name"` ClubID string `json:"club_id"` ClubType string `json:"club_type"` ClubInternalID string `json:"club_internal_id,omitempty"` URL string `json:"url,omitempty"` LogoURL string `json:"logo_url,omitempty"` Address string `json:"address,omitempty"` Category string `json:"category,omitempty"` Competitions []Competition `json:"competitions"` } // SearchResult represents one club from fotbal.cz search type SearchResult struct { Name string `json:"name"` ClubID string `json:"club_id"` ClubType string `json:"club_type"` // football or futsal URL string `json:"url"` LogoURL string `json:"logo_url"` Category string `json:"category,omitempty"` Address string `json:"address,omitempty"` } // getClubSearch queries fotbal.cz club search and returns results with logo func getClubSearch(w http.ResponseWriter, r *http.Request) { q := strings.TrimSpace(r.URL.Query().Get("q")) if q == "" { http.Error(w, "query parameter 'q' is required", http.StatusBadRequest) return } // Build search URL vals := neturl.Values{} vals.Set("q", q) searchURL := "https://www.fotbal.cz/club/hledej?" 
+ vals.Encode() fetchSearchPage := func(ctx context.Context, url string) ([]byte, error) { return fetchPageWithFallbackOptions(ctx, url, fetchOptions{ Referer: "https://www.fotbal.cz/club/hledej", }) } // Try direct HTTP request first body, err := fetchSearchPage(r.Context(), searchURL) if err != nil { log.Printf("Direct search request failed for %s: %v", searchURL, err) // Retry with quoted query for short tokens searchURL2 := searchURL tokens := strings.Fields(q) for _, t := range tokens { if len([]rune(t)) <= 2 { vals2 := neturl.Values{} vals2.Set("q", "\""+q+"\"") searchURL2 = "https://www.fotbal.cz/club/hledej?" + vals2.Encode() break } } body, err = fetchSearchPage(r.Context(), searchURL2) if err != nil { log.Printf("Retried search request failed for %s: %v", searchURL2, err) // Return empty results instead of error w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]any{ "query": q, "count": 0, "results": []SearchResult{}, }) return } } doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) if err != nil { http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError) return } var results []SearchResult // The page lists clubs in section "Výsledky hledání" as li.ListItemSplit doc.Find("li.ListItemSplit").Each(func(_ int, li *goquery.Selection) { a := li.Find("a.Link--inverted").First() href, _ := a.Attr("href") if href == "" { return } name := strings.TrimSpace(a.Find("span.H7").First().Text()) if name == "" { // fallback to link text name = strings.TrimSpace(a.Text()) } img := a.Find("img").First() logoURL, _ := img.Attr("src") // Category category := strings.TrimSpace(li.Find(".ClubCategories .BadgeCategory").First().Text()) // Address address := strings.TrimSpace(li.Find(".ClubAddress p").First().Text()) // Infer club type from href clubType := "football" if strings.Contains(strings.ToLower(href), "/futsal/") { clubType = "futsal" } // Extract club ID from last path segment // e.g., 
https://www.fotbal.cz/futsal/club/club/{uuid} parts := strings.Split(strings.TrimRight(href, "/"), "/") clubID := "" if len(parts) > 0 { clubID = parts[len(parts)-1] } // Normalize URL (ensure absolute) if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") { href = "https://www.fotbal.cz" + href } // Prefer logoapi / local logo when available if l := strings.TrimSpace(getLogo(name, clubID)); l != "" { logoURL = l } results = append(results, SearchResult{ Name: name, ClubID: clubID, ClubType: clubType, URL: href, LogoURL: logoURL, Category: category, Address: address, }) }) w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]any{ "query": q, "count": len(results), "results": results, }) } // getClubTables returns club info with competition standings tables (no matches) func getClubTables(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) clubID := vars["id"] clubType := vars["type"] if clubID == "" { http.Error(w, "Club ID is required", http.StatusBadRequest) return } // Check club response cache cacheKey := "table:" + clubType + ":" + clubID clubCacheMu.RLock() if entry, ok := clubCache[cacheKey]; ok { if time.Since(entry.timestamp) < clubCacheTTL { clubCacheMu.RUnlock() w.Header().Set("Content-Type", "application/json") w.Write(entry.data) log.Printf("Club cache hit for %s", cacheKey) return } } clubCacheMu.RUnlock() // Validate club type var baseURL string var sportParam string switch clubType { case "football": baseURL = "https://www.fotbal.cz/souteze/club/club" sportParam = "fotbal" case "futsal": baseURL = "https://www.fotbal.cz/futsal/club/club" sportParam = "futsal" default: http.Error(w, "Invalid club type. 
Use 'football' or 'futsal'.", http.StatusBadRequest) return } url := fmt.Sprintf("%s/%s", baseURL, clubID) body, err := fetchPageWithFallback(r.Context(), url) if err != nil { http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError) return } doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) if err != nil { http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError) return } // Extract club internal ID clubInternalID := "" doc.Find("section").Each(func(i int, s *goquery.Selection) { headerText := s.Find("h3 span").First().Text() if strings.TrimSpace(headerText) == "ID klubu" { clubInternalID = strings.TrimSpace(s.Find("ul li").First().Text()) } }) // Extract competitions var competitions []Competition doc.Find("table.Table tbody tr").Each(func(i int, s *goquery.Selection) { code := strings.TrimSpace(s.Find("td:first-child").Text()) nameLink := s.Find("td:nth-child(2) a") name := strings.TrimSpace(nameLink.Text()) teamCount := strings.TrimSpace(s.Find("td:nth-child(3)").Text()) // Extract competition ID from the link parts := strings.Split(nameLink.AttrOr("href", ""), "/") compID := "" if len(parts) >= 2 { compID = parts[len(parts)-1] } // Build public table link depending on clubType tableLink := "" if strings.EqualFold(clubType, "futsal") { tableLink = fmt.Sprintf("https://www.fotbal.cz/futsal/futsal/table/%s", compID) } else { tableLink = fmt.Sprintf("https://www.fotbal.cz/souteze/turnaje/table/%s", compID) } competitions = append(competitions, Competition{ ID: compID, Code: code, Name: name, TeamCount: teamCount, MatchesLink: tableLink, }) }) // For each competition, fetch the standings tables from is.fotbal.cz concurrently sem := newSemaphore(4) var wg sync.WaitGroup var mu sync.Mutex for i := range competitions { wg.Add(1) go func(idx int) { defer wg.Done() if err := sem.Acquire(r.Context()); err != nil { return } defer sem.Release() ctx, cancel := detachedContext(30 * 
time.Second) defer cancel() comp := &competitions[idx] tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam) req, err := http.NewRequestWithContext(ctx, "GET", tableURL, nil) if err != nil { log.Printf("error creating request for competition table %s: %v", comp.ID, err) return } resp, err := http.DefaultClient.Do(req) if err != nil { log.Printf("error fetching competition table for %s: %v", comp.ID, err) return } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { log.Printf("non-200 response for %s: %d", comp.ID, resp.StatusCode) return } docTable, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { log.Printf("error parsing table HTML for %s: %v", comp.ID, err) return } // Parse section: Tabulka celková (only overall) var overall []TableRow parseSection := func(headerText string) []TableRow { var rows []TableRow // Find the h3 with matching text, then the following .list.tabulky table docTable.Find("h3").EachWithBreak(func(_ int, h3 *goquery.Selection) bool { if strings.EqualFold(strings.TrimSpace(h3.Text()), headerText) { list := h3.NextAllFiltered("div.list.tabulky").First() if list.Length() == 0 { return false } table := list.Find("table.vysledky-tabulky tbody") table.Find("tr").Each(func(_ int, tr *goquery.Selection) { // skip header rows containing th if tr.Find("th").Length() > 0 { return } tds := tr.Find("td") if tds.Length() < 8 { return } get := func(i int) string { return strings.TrimSpace(tds.Eq(i).Text()) } rank := get(0) team := get(1) teamID := extractUUIDFromHref(tds.Eq(1).Find("a").First().AttrOr("href", "")) played := get(2) wins := get(3) draws := get(4) losses := get(5) scoreRaw := get(6) // normalize score like "5 : 0" -> "5:0" score := scoreRaw if re := regexp.MustCompile(`\s*([0-9]+)\s*:\s*([0-9]+)\s*`); re != nil { if m := re.FindStringSubmatch(scoreRaw); len(m) == 3 { score = fmt.Sprintf("%s:%s", m[1], m[2]) } } points := get(7) rows = append(rows, 
TableRow{ Rank: rank, Team: team, TeamID: teamID, TeamLogoURL: getLogo(team, teamID), Played: played, Wins: wins, Draws: draws, Losses: losses, Score: score, Points: points, }) }) return false } return true }) return rows } overall = parseSection("Tabulka celková") mu.Lock() comp.Table = &CompetitionTable{Overall: overall} mu.Unlock() }(i) } wg.Wait() clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text()) clubURL := strings.TrimSpace(doc.Find("h1.H4 a").First().AttrOr("href", "")) logoURL := strings.TrimSpace(doc.Find("img.Logo").First().AttrOr("src", "")) if l := strings.TrimSpace(getLogo(clubName, clubID)); l != "" { logoURL = l } category := strings.TrimSpace(doc.Find("section").First().Find("h3 span").First().Text()) address := strings.TrimSpace(doc.Find("section").First().Find("ul li").First().Text()) clubInfo := ClubInfo{ Name: clubName, ClubID: clubID, ClubType: clubType, ClubInternalID: clubInternalID, URL: clubURL, LogoURL: logoURL, Address: address, Category: category, Competitions: competitions, } var buf bytes.Buffer if err := json.NewEncoder(&buf).Encode(clubInfo); err != nil { http.Error(w, fmt.Sprintf("JSON encode error: %v", err), http.StatusInternalServerError) return } data := buf.Bytes() clubCacheMu.Lock() clubCache[cacheKey] = &clubCacheEntry{data: data, timestamp: time.Now()} clubCacheMu.Unlock() w.Header().Set("Content-Type", "application/json") w.Write(data) } // getClubInfo returns club info with competitions and matches func getClubInfo(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) clubID := vars["id"] clubType := vars["type"] if clubID == "" { http.Error(w, "Club ID is required", http.StatusBadRequest) return } // Check club response cache cacheKey := "info:" + clubType + ":" + clubID clubCacheMu.RLock() if entry, ok := clubCache[cacheKey]; ok { if time.Since(entry.timestamp) < clubCacheTTL { clubCacheMu.RUnlock() w.Header().Set("Content-Type", "application/json") w.Write(entry.data) log.Printf("Club cache hit 
for %s", cacheKey) return } } clubCacheMu.RUnlock() var baseURL, sportParam string switch clubType { case "football": baseURL = "https://www.fotbal.cz/souteze/club/club" sportParam = "fotbal" case "futsal": baseURL = "https://www.fotbal.cz/futsal/club/club" sportParam = "futsal" default: http.Error(w, "Invalid club type. Use 'football' or 'futsal'.", http.StatusBadRequest) return } url := fmt.Sprintf("%s/%s", baseURL, clubID) body, err := fetchPageWithFallback(r.Context(), url) if err != nil { http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError) return } doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) if err != nil { http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError) return } clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text()) // Basic club metadata clubURL := fmt.Sprintf("%s/%s", baseURL, clubID) logoURL := getLogo(clubName, clubID) if logoURL == "" { logoURL = fmt.Sprintf("https://is1.fotbal.cz/media/kluby/%s/%s_crop.jpg", clubID, clubID) } category := "Fotbal" if strings.EqualFold(clubType, "futsal") { category = "Futsal" } // Internal ID clubInternalID := "" doc.Find("section").Each(func(_ int, s *goquery.Selection) { if strings.TrimSpace(s.Find("h3 span").First().Text()) == "ID klubu" { clubInternalID = strings.TrimSpace(s.Find("ul li").First().Text()) } }) // Address (best-effort) address := strings.TrimSpace(doc.Find(".ClubAddress p").First().Text()) // Competitions list var competitions []Competition doc.Find("table.Table tbody tr").Each(func(_ int, tr *goquery.Selection) { code := strings.TrimSpace(tr.Find("td:first-child").Text()) nameLink := tr.Find("td:nth-child(2) a") name := strings.TrimSpace(nameLink.Text()) teamCount := strings.TrimSpace(tr.Find("td:nth-child(3)").Text()) parts := strings.Split(strings.TrimSpace(nameLink.AttrOr("href", "")), "/") compID := "" if len(parts) >= 2 { compID = parts[len(parts)-1] } // Public table URL for 
convenience tableLink := "" if strings.EqualFold(clubType, "futsal") { tableLink = fmt.Sprintf("https://www.fotbal.cz/futsal/futsal/table/%s", compID) } else { tableLink = fmt.Sprintf("https://www.fotbal.cz/souteze/turnaje/table/%s", compID) } competitions = append(competitions, Competition{ID: compID, Code: code, Name: name, TeamCount: teamCount, MatchesLink: tableLink}) }) // For each competition, fetch matches concurrently with limits sem := newSemaphore(4) var wg sync.WaitGroup var mu sync.Mutex for i := range competitions { wg.Add(1) go func(idx int) { defer wg.Done() if err := sem.Acquire(r.Context()); err != nil { return } defer sem.Release() ctx, cancel := detachedContext(120 * time.Second) defer cancel() comp := &competitions[idx] matchesLink := comp.MatchesLink // 1) Try parsing from the public fotbal.cz competition page (matches_link) matches := parseCompetitionMatchesFromFotbal(ctx, matchesLink, clubType, clubName, clubID) // Always try IS as well and prefer it if it provides at least as many matches detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam) isMatches := parseCompetitionMatchesFromIS(ctx, detailURL, clubType, clubName, clubID) // Prefer IS whenever it yields any results, as IS often contains alias team names if len(isMatches) > 0 { matches = isMatches } mu.Lock() comp.Matches = matches mu.Unlock() }(i) } wg.Wait() clubInfo := ClubInfo{ Name: clubName, ClubID: clubID, ClubType: clubType, ClubInternalID: clubInternalID, URL: clubURL, LogoURL: logoURL, Address: address, Category: category, Competitions: competitions, } var buf bytes.Buffer if err := json.NewEncoder(&buf).Encode(clubInfo); err != nil { http.Error(w, fmt.Sprintf("JSON encode error: %v", err), http.StatusInternalServerError) return } data := buf.Bytes() clubCacheMu.Lock() clubCache[cacheKey] = &clubCacheEntry{data: data, timestamp: time.Now()} clubCacheMu.Unlock() w.Header().Set("Content-Type", "application/json") 
w.Write(data) } func main() { r := mux.NewRouter() r.HandleFunc("/club/{type}/{id}", getClubInfo).Methods("GET") r.HandleFunc("/club/{type}/{id}/table", getClubTables).Methods("GET") r.HandleFunc("/club/search", getClubSearch).Methods("GET") r.HandleFunc("/club/{id:[0-9a-fA-F-]+}", func(w http.ResponseWriter, r *http.Request) { vars := mux.Vars(r) http.Redirect(w, r, "/club/football/"+vars["id"], http.StatusMovedPermanently) }).Methods("GET") r.HandleFunc("/", docsHandler) addr := "0.0.0.0:8686" srv := &http.Server{ Addr: addr, Handler: r, ReadTimeout: 30 * time.Second, WriteTimeout: 10 * time.Minute, IdleTimeout: 120 * time.Second, MaxHeaderBytes: 1 << 20, } fmt.Printf("Server running on http://%s\n", addr) log.Fatal(srv.ListenAndServe()) } // docsHandler serves a simple HTML API documentation at the root endpoint. func docsHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html; charset=utf-8") io.WriteString(w, ` FACR Scraper API Docs

FACR Scraper API

Status: ok — server is running.

Search Clubs

GET /club/search?q=QUERY

Find clubs on fotbal.cz. Supports football and futsal clubs.

Example: /club/search?q=Sparta

Response shape
{
  "query": "Sparta",
  "count": 2,
  "results": [
    {
      "name": "AC Sparta Praha",
      "club_id": "",
      "club_type": "football",
      "url": "https://www.fotbal.cz/...",
      "logo_url": "https://.../logo.png",
      "category": "Muži",
      "address": "..."
    }
  ]
}

Club Info + Matches

GET /club/{type}/{id}

Example: /club/football/{id}

Response shape
{
  "name": "AC Sparta Praha",
  "club_id": "00000000-0000-0000-0000-000000000000",
  "club_type": "football",
  "club_internal_id": "123456",
  "url": "https://www.fotbal.cz/...",
  "logo_url": "https://is1.fotbal.cz/media/kluby/.../logo.jpg",
  "address": "Milady Horákové 98, 160 00 Praha 6",
  "category": "Muži A",
  "competitions": [
    {
      "id": "12345",
      "code": "1. LIGA",
      "name": "Fortuna Liga",
      "team_count": "16",
      "matches_link": "https://www.fotbal.cz/...",
      "matches": [
        {
          "date_time": "12.08.2023 18:00",
          "home": "AC Sparta Praha",
          "home_id": "00000000-0000-0000-0000-000000000000",
          "home_logo_url": "https://.../sparta.png",
          "away": "SK Slavia Praha",
          "away_id": "11111111-1111-1111-1111-111111111111",
          "away_logo_url": "https://.../slavia.png",
          "score": "2:1",
          "venue": "Stadion Letná",
          "match_id": "match12345",
          "report_url": "https://www.fotbal.cz/..."
        }
      ]
    }
  ]
}

Club Tables (Standings)

GET /club/{type}/{id}/table

Returns standings (overall table) for each competition of the club.

Example: /club/football/{id}/table

Response shape
{
  "name": "AC Sparta Praha",
  "club_id": "00000000-0000-0000-0000-000000000000",
  "club_type": "football",
  "club_internal_id": "123456",
  "url": "https://www.fotbal.cz/...",
  "logo_url": "https://is1.fotbal.cz/media/kluby/.../logo.jpg",
  "competitions": [
    {
      "id": "12345",
      "code": "1. LIGA",
      "name": "Fortuna Liga",
      "team_count": "16",
      "matches_link": "https://www.fotbal.cz/...",
      "table": {
        "overall": [
          {
            "rank": "1",
            "team": "AC Sparta Praha",
            "team_id": "00000000-0000-0000-0000-000000000000",
            "team_logo_url": "https://.../sparta.png",
            "played": "10",
            "wins": "8",
            "draws": "2",
            "losses": "0",
            "score": "25:5",
            "points": "26"
          },
          {
            "rank": "2",
            "team": "SK Slavia Praha",
            "team_id": "11111111-1111-1111-1111-111111111111",
            "team_logo_url": "https://.../slavia.png",
            "played": "10",
            "wins": "7",
            "draws": "2",
            "losses": "1",
            "score": "20:8",
            "points": "23"
          }
        ]
      }
    }
  ]
}

Shortcuts

GET /club/{id} → redirects to /club/football/{id}

`) } func containsFold(s, substr string) bool { s = strings.ToLower(strings.TrimSpace(s)) substr = strings.ToLower(strings.TrimSpace(substr)) if substr == "" { return false } return strings.Contains(s, substr) } // extractUUIDFromHref finds the first UUID-like token in an href and returns it. func extractUUIDFromHref(href string) string { href = strings.TrimSpace(href) if href == "" { return "" } re := regexp.MustCompile(`[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`) if m := re.FindString(href); m != "" { return m } // Fallback: some links may end with ID after slash; take last path token if it looks like hex+hyphenated parts := strings.Split(href, "/") if len(parts) > 0 { cand := parts[len(parts)-1] if re.MatchString(cand) { return cand } } return "" } type Match struct { DateTime string `json:"date_time"` Home string `json:"home"` HomeID string `json:"home_id,omitempty"` HomeLogoURL string `json:"home_logo_url,omitempty"` Away string `json:"away"` AwayID string `json:"away_id,omitempty"` AwayLogoURL string `json:"away_logo_url,omitempty"` Score string `json:"score"` Venue string `json:"venue"` Note string `json:"note,omitempty"` MatchID string `json:"match_id"` ReportURL string `json:"report_url,omitempty"` FACRLink string `json:"facr_link,omitempty"` DelegationURL string `json:"delegation_url,omitempty"` } // TableRow represents one row in a standings table type TableRow struct { Rank string `json:"rank"` Team string `json:"team"` TeamID string `json:"team_id,omitempty"` TeamLogoURL string `json:"team_logo_url,omitempty"` Played string `json:"played"` Wins string `json:"wins"` Draws string `json:"draws"` Losses string `json:"losses"` Score string `json:"score"` Points string `json:"points"` } // resolveISURL makes relative IS links absolute against https://is.fotbal.cz/public/ func resolveISURL(href string) string { href = strings.TrimSpace(href) if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") { if u, 
err := neturl.Parse(href); err == nil { u.Scheme = "https" u.Host = "is.fotbal.cz" if !strings.HasPrefix(u.Path, "/public/") { if strings.HasPrefix(u.Path, "/zapasy/") { u.Path = "/public" + u.Path } } q := u.Query() q.Del("discipline") u.RawQuery = q.Encode() return u.String() } return href } href = strings.TrimPrefix(href, "./") for strings.HasPrefix(href, "../") { href = strings.TrimPrefix(href, "../") } if strings.HasPrefix(href, "/") { href = strings.TrimPrefix(href, "/") } path := "/public/" + href u := neturl.URL{Scheme: "https", Host: "is.fotbal.cz", Path: path} return u.String() }