Files
FacrScraper/main.go
T
Tomas Dvorak a9a89bed7c update
2026-03-20 16:17:39 +01:00

1958 lines
58 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"bytes"
"context"
_ "embed"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
neturl "net/url"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gorilla/mux"
)
// Competition describes one competition as it is serialized to JSON.
// Matches and Table are optional: both are tagged omitempty and are left out
// of the JSON output when empty or nil.
type Competition struct {
ID string `json:"id"`
Code string `json:"code"`
Name string `json:"name"`
// TeamCount is kept as a string rather than a number.
TeamCount string `json:"team_count"`
MatchesLink string `json:"matches_link"`
Matches []Match `json:"matches,omitempty"`
Table *CompetitionTable `json:"table,omitempty"`
}
// CloudflareCrawlRequest is the JSON body that StartCrawl posts to the
// Cloudflare Browser Rendering crawl endpoint.
// Every field except URL is tagged omitempty, so zero values (0, false, "",
// nil) are not sent; StartCrawl replaces several of them with defaults first.
type CloudflareCrawlRequest struct {
URL string `json:"url"`
Limit int `json:"limit,omitempty"`
Depth int `json:"depth,omitempty"`
Formats []string `json:"formats,omitempty"`
Render bool `json:"render,omitempty"`
Source string `json:"source,omitempty"`
Options map[string]interface{} `json:"options,omitempty"`
}
// CloudflareCrawlResponse is the JSON envelope decoded by StartCrawl after a
// crawl job has been submitted.
type CloudflareCrawlResponse struct {
Success bool `json:"success"`
Result string `json:"result"` // job ID
}
// CloudflareCrawlJob is the "result" object decoded by GetCrawlResults.
// WaitForCrawlCompletion keeps polling while Status equals "running".
type CloudflareCrawlJob struct {
ID string `json:"id"`
Status string `json:"status"`
BrowserSecondsUsed float64 `json:"browserSecondsUsed"`
Total int `json:"total"`
Finished int `json:"finished"`
Records []CloudflareCrawlRecord `json:"records"`
// NOTE(review): Cursor is decoded but never read in the visible code;
// presumably a pagination token — confirm against the API documentation.
Cursor string `json:"cursor,omitempty"`
}
// CloudflareCrawlRecord is a single crawled page inside a CloudflareCrawlJob.
// The fallback fetcher only uses a record whose Status is "completed" and takes
// its HTML field as the page body.
type CloudflareCrawlRecord struct {
URL string `json:"url"`
Status string `json:"status"`
Markdown string `json:"markdown,omitempty"`
HTML string `json:"html,omitempty"`
JSON interface{} `json:"json,omitempty"`
Metadata map[string]interface{} `json:"metadata"`
}
// CloudflareClient holds the credentials and HTTP client used to call the
// Cloudflare Browser Rendering API. Its methods return an error when called on
// a nil receiver.
type CloudflareClient struct {
// AccountID is interpolated into the request path.
AccountID string
// APIToken is sent as a Bearer token in the Authorization header.
APIToken string
// BaseURL is the API root without a trailing slash.
BaseURL string
Client *http.Client
}
// fetchOptions carries optional per-request settings for the page fetchers.
type fetchOptions struct {
// Referer, when non-empty, is sent as the Referer header (or passed to the
// helper via --referer).
Referer string
}
// Browser-like request headers shared by the direct and wget fetchers, plus the
// relative location of the Scrapling helper script.
const (
browserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
browserAccept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
browserAcceptLanguage = "cs-CZ,cs;q=0.9,en;q=0.8"
// scraplingHelperPath is joined onto the working directory and the
// executable directory when looking for the helper on disk.
scraplingHelperPath = "scripts/scrapling_fetch.py"
)
// embeddedScraplingHelper holds the contents of the Scrapling helper script,
// embedded at build time. It is written to a temp file when no copy of the
// script is found on disk.
//go:embed scripts/scrapling_fetch.py
var embeddedScraplingHelper string
var (
// embeddedScraplingHelperOnce ensures the embedded helper is written to
// disk at most once; the resulting path and error are kept in the two
// variables below.
embeddedScraplingHelperOnce sync.Once
embeddedScraplingHelperFile string
embeddedScraplingHelperErr error
// Simple in-memory cache for fetched pages, keyed by URL.
// pageCacheMu guards every access to pageCache.
pageCache = make(map[string]*cacheEntry)
pageCacheMu sync.RWMutex
)
// cacheEntry is one cached page: the fetched body and the time it was stored.
type cacheEntry struct {
body []byte
timestamp time.Time
}
// cacheTTL is how long a cached page is served before it is fetched again.
const cacheTTL = 5 * time.Minute
// NewCloudflareClient builds a Cloudflare Browser Rendering API client from the
// CLOUDFLARE_ACCOUNT_ID and CLOUDFLARE_API_TOKEN environment variables.
//
// It returns nil when either variable is unset or contains only whitespace.
// The returned client uses a 30 second HTTP timeout.
func NewCloudflareClient() *CloudflareClient {
	account := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID"))
	token := strings.TrimSpace(os.Getenv("CLOUDFLARE_API_TOKEN"))
	if len(account) == 0 || len(token) == 0 {
		return nil
	}

	client := new(CloudflareClient)
	client.AccountID = account
	client.APIToken = token
	client.BaseURL = "https://api.cloudflare.com/client/v4"
	client.Client = &http.Client{Timeout: 30 * time.Second}
	return client
}
// StartCrawl submits a crawl job to the Cloudflare Browser Rendering API and
// returns the job ID reported by the API.
//
// Zero-valued request fields are replaced with defaults: Limit 10, Depth 1,
// Formats ["html", "markdown"], Source "all". A Depth of 0 therefore cannot be
// requested through this method. The crawl is always restricted to
// www.fotbal.cz; the include/exclude patterns and the external-link and
// subdomain switches override any same-named entries supplied in req.Options.
//
// The caller's req.Options map is never modified: the options are copied into
// a fresh map before the restrictions are applied.
//
// An error is returned when the receiver is nil, the request cannot be built or
// sent, the API answers with a non-200 status, the response cannot be decoded,
// or the API reports success=false.
func (c *CloudflareClient) StartCrawl(ctx context.Context, req CloudflareCrawlRequest) (string, error) {
	if c == nil {
		return "", fmt.Errorf("Cloudflare client not initialized")
	}

	// Set defaults
	if req.Limit == 0 {
		req.Limit = 10
	}
	if req.Depth == 0 {
		req.Depth = 1
	}
	if len(req.Formats) == 0 {
		req.Formats = []string{"html", "markdown"}
	}
	if req.Source == "" {
		req.Source = "all"
	}

	// req is a copy, but req.Options still aliases the caller's map. Copy it so
	// the restrictions below do not leak back into the caller's data.
	options := make(map[string]interface{}, len(req.Options)+4)
	for key, value := range req.Options {
		options[key] = value
	}
	// Restrict to fotbal.cz and skip API/static/media paths to avoid crawling
	// unrelated content.
	options["includePatterns"] = []string{
		"https://www.fotbal.cz/**",
	}
	options["excludePatterns"] = []string{
		"**/api/**",
		"**/static/**",
		"**/media/**",
	}
	options["includeExternalLinks"] = false
	options["includeSubdomains"] = false
	req.Options = options

	payload, err := json.Marshal(req)
	if err != nil {
		return "", fmt.Errorf("failed to marshal request: %w", err)
	}

	url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl", c.BaseURL, c.AccountID)
	httpReq, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(payload))
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}
	httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
	httpReq.Header.Set("Content-Type", "application/json")

	resp, err := c.Client.Do(httpReq)
	if err != nil {
		return "", fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the error message, so a read
		// failure is deliberately ignored.
		errBody, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(errBody))
	}

	var crawlResp CloudflareCrawlResponse
	if err := json.NewDecoder(resp.Body).Decode(&crawlResp); err != nil {
		return "", fmt.Errorf("failed to decode response: %w", err)
	}
	if !crawlResp.Success {
		return "", fmt.Errorf("API returned unsuccessful response")
	}
	return crawlResp.Result, nil
}
// GetCrawlResults fetches the current state of crawl job jobID.
//
// When limit is greater than zero it is sent as the "limit" query parameter;
// otherwise no query string is added. The decoded job is returned on success.
// An error is returned for a nil receiver, a transport failure, a non-200
// status, an undecodable body, or a response with success=false.
func (c *CloudflareClient) GetCrawlResults(ctx context.Context, jobID string, limit int) (*CloudflareCrawlJob, error) {
	if c == nil {
		return nil, fmt.Errorf("Cloudflare client not initialized")
	}

	endpoint := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl/%s", c.BaseURL, c.AccountID, jobID)
	if limit > 0 {
		endpoint = fmt.Sprintf("%s?limit=%d", endpoint, limit)
	}

	request, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	request.Header.Set("Authorization", "Bearer "+c.APIToken)

	response, err := c.Client.Do(request)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		raw, _ := io.ReadAll(response.Body)
		return nil, fmt.Errorf("API request failed with status %d: %s", response.StatusCode, string(raw))
	}

	// The API wraps the job in a {success, result} envelope.
	envelope := struct {
		Success bool               `json:"success"`
		Result  CloudflareCrawlJob `json:"result"`
	}{}
	if decodeErr := json.NewDecoder(response.Body).Decode(&envelope); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode response: %w", decodeErr)
	}
	if !envelope.Success {
		return nil, fmt.Errorf("API returned unsuccessful response")
	}
	return &envelope.Result, nil
}
// WaitForCrawlCompletion polls crawl job jobID until its status is no longer
// "running", then fetches and returns the job without a record limit.
//
// At most maxAttempts status checks are made, with delay between them. Any
// status other than "running" ends the wait, so the caller must inspect the
// returned job's status and records. An error is returned when a request fails,
// when ctx is done while waiting, or when every attempt still saw "running".
func (c *CloudflareClient) WaitForCrawlCompletion(ctx context.Context, jobID string, maxAttempts int, delay time.Duration) (*CloudflareCrawlJob, error) {
	if c == nil {
		return nil, fmt.Errorf("Cloudflare client not initialized")
	}

	for attempt := 0; attempt < maxAttempts; attempt++ {
		// limit=1 keeps the status probe small.
		probe, probeErr := c.GetCrawlResults(ctx, jobID, 1)
		if probeErr != nil {
			return nil, probeErr
		}

		if probe.Status == "running" {
			pause := time.After(delay)
			select {
			case <-pause:
				continue
			case <-ctx.Done():
				return nil, ctx.Err()
			}
		}

		// Job left the running state: fetch it again without a limit.
		complete, fetchErr := c.GetCrawlResults(ctx, jobID, 0)
		if fetchErr != nil {
			return nil, fetchErr
		}
		return complete, nil
	}

	return nil, fmt.Errorf("crawl job did not complete within timeout")
}
// CrawlURL runs a complete crawl for a single URL: it starts a job with
// Limit 1, rendering enabled and Source "links", then waits up to two minutes
// (24 polls, 5 seconds apart) for it to leave the running state.
//
// NOTE(review): the request sets Depth to 0, but StartCrawl treats a zero Depth
// as unset and replaces it with 1, so the job is actually submitted with
// depth 1.
func (c *CloudflareClient) CrawlURL(ctx context.Context, url string) (*CloudflareCrawlJob, error) {
	if c == nil {
		return nil, fmt.Errorf("Cloudflare client not initialized")
	}

	var request CloudflareCrawlRequest
	request.URL = url
	request.Limit = 1 // Only crawl the specific URL
	request.Depth = 0 // Don't follow links
	request.Formats = []string{"html", "markdown"}
	request.Render = true
	request.Source = "links" // Only crawl the specific URL, not sitemaps

	id, startErr := c.StartCrawl(ctx, request)
	if startErr != nil {
		return nil, fmt.Errorf("failed to start crawl: %w", startErr)
	}

	// Bound the wait independently of whatever deadline the caller passed in.
	waitCtx, stop := context.WithTimeout(ctx, 2*time.Minute)
	defer stop()

	result, waitErr := c.WaitForCrawlCompletion(waitCtx, id, 24, 5*time.Second)
	if waitErr != nil {
		return nil, fmt.Errorf("failed to wait for crawl completion: %w", waitErr)
	}
	return result, nil
}
// newBrowserRequest creates a GET request for url carrying the browser-like
// User-Agent, Accept and Accept-Language headers. A Referer header is added
// only when opts.Referer is non-empty.
func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	headers := [][2]string{
		{"User-Agent", browserUserAgent},
		{"Accept", browserAccept},
		{"Accept-Language", browserAcceptLanguage},
	}
	if opts.Referer != "" {
		headers = append(headers, [2]string{"Referer", opts.Referer})
	}
	for _, header := range headers {
		request.Header.Set(header[0], header[1])
	}
	return request, nil
}
// looksLikeCloudflareBlock reports whether body looks like a Cloudflare
// challenge or block page rather than real content.
//
// The check is case-insensitive. A body matches when it contains any one of the
// unambiguous challenge phrases, or when it references the challenge-platform
// script path together with at least one challenge token name. An empty body
// never matches.
func looksLikeCloudflareBlock(body []byte) bool {
	if len(body) == 0 {
		return false
	}

	page := strings.ToLower(string(body))
	has := func(needle string) bool { return strings.Contains(page, needle) }

	// Phrases that only appear on challenge/block pages.
	switch {
	case has("<title>just a moment...</title>"),
		has("attention required!"),
		has("enable javascript and cookies to continue"),
		has("checking if the site connection is secure"),
		has("cf-browser-verification"):
		return true
	}

	// The script path alone is not conclusive; require a challenge token too.
	if !has("/cdn-cgi/challenge-platform/") {
		return false
	}
	return has("window._cf_chl_opt") || has("__cf_chl_rt_tk") || has("cf_chl_seq_")
}
// compactErrorText collapses every run of whitespace in s into a single space,
// trims the ends, and shortens the result to at most 220 bytes.
//
// A longer string is cut at 217 bytes or fewer and suffixed with "...". The cut
// is moved back to the nearest UTF-8 rune boundary so a multi-byte character is
// never split, which would otherwise produce invalid UTF-8 in log and error
// messages.
func compactErrorText(s string) string {
	const (
		maxLen   = 220
		ellipsis = "..."
	)

	s = strings.Join(strings.Fields(strings.TrimSpace(s)), " ")
	if len(s) <= maxLen {
		return s
	}

	cut := maxLen - len(ellipsis)
	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step back until
	// the cut lands on the first byte of a rune.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut] + ellipsis
}
// fetchPageDirect downloads url with Go's HTTP client using browser-like
// headers and a 15 second timeout.
//
// It returns the response body only for an HTTP 200 answer that does not look
// like a Cloudflare challenge page; every other outcome is an error.
func fetchPageDirect(url string, opts fetchOptions) ([]byte, error) {
	request, err := newBrowserRequest(url, opts)
	if err != nil {
		return nil, err
	}

	httpClient := http.Client{Timeout: 15 * time.Second}
	response, err := httpClient.Do(request)
	if err != nil {
		return nil, fmt.Errorf("direct request failed: %w", err)
	}
	defer response.Body.Close()

	// The body is read before the status is checked.
	payload, readErr := io.ReadAll(response.Body)
	switch {
	case readErr != nil:
		return nil, fmt.Errorf("failed to read response body: %w", readErr)
	case response.StatusCode != http.StatusOK:
		return nil, fmt.Errorf("direct request returned HTTP %d", response.StatusCode)
	case looksLikeCloudflareBlock(payload):
		return nil, fmt.Errorf("direct request returned a Cloudflare challenge page")
	}
	return payload, nil
}
// fetchPageWithWget downloads url by running the wget binary with the same
// browser-like headers as the direct fetcher.
//
// wget is given one try, a 15 second network timeout and up to 10 redirects;
// the whole process is killed after 20 seconds. An error is returned when wget
// is not installed, exits with a failure, prints nothing, or returns what looks
// like a Cloudflare challenge page.
func fetchPageWithWget(url string, opts fetchOptions) ([]byte, error) {
	if _, lookErr := exec.LookPath("wget"); lookErr != nil {
		return nil, fmt.Errorf("wget not available: %w", lookErr)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
	defer cancel()

	headers := []string{
		"Accept: " + browserAccept,
		"Accept-Language: " + browserAcceptLanguage,
	}
	if opts.Referer != "" {
		headers = append(headers, "Referer: "+opts.Referer)
	}

	argv := []string{
		"--quiet",
		"--tries=1",
		"--timeout=15",
		"--max-redirect=10",
		"--output-document=-",
		"--user-agent=" + browserUserAgent,
	}
	for _, header := range headers {
		argv = append(argv, "--header="+header)
	}
	argv = append(argv, url)

	var out, errOut bytes.Buffer
	cmd := exec.CommandContext(ctx, "wget", argv...)
	cmd.Stdout, cmd.Stderr = &out, &errOut

	if runErr := cmd.Run(); runErr != nil {
		// Prefer wget's own stderr; fall back to the process error.
		reason := compactErrorText(errOut.String())
		if reason == "" {
			reason = compactErrorText(runErr.Error())
		}
		return nil, fmt.Errorf("wget request failed: %s", reason)
	}

	page := out.Bytes()
	switch {
	case len(page) == 0:
		return nil, fmt.Errorf("wget returned an empty body")
	case looksLikeCloudflareBlock(page):
		return nil, fmt.Errorf("wget returned a Cloudflare challenge page")
	}
	return page, nil
}
// firstExistingFile returns the first entry of paths that, after trimming
// surrounding whitespace, names an existing non-directory. Blank entries are
// skipped. It returns "" when nothing qualifies.
func firstExistingFile(paths ...string) string {
	for _, raw := range paths {
		candidate := strings.TrimSpace(raw)
		if candidate == "" {
			continue
		}
		info, statErr := os.Stat(candidate)
		if statErr != nil || info.IsDir() {
			continue
		}
		return candidate
	}
	return ""
}
// firstExecutable returns the first usable entry of paths, or "" when none is.
//
// An entry containing a path separator is taken as a file path and accepted
// when it exists and is not a directory (its permission bits are not checked).
// A bare name is looked up on PATH and the resolved location is returned.
// Blank entries are skipped.
func firstExecutable(paths ...string) string {
	for _, raw := range paths {
		candidate := strings.TrimSpace(raw)
		switch {
		case candidate == "":
			// nothing to try
		case strings.ContainsRune(candidate, os.PathSeparator):
			info, statErr := os.Stat(candidate)
			if statErr == nil && !info.IsDir() {
				return candidate
			}
		default:
			if located, lookErr := exec.LookPath(candidate); lookErr == nil {
				return located
			}
		}
	}
	return ""
}
// ensureEmbeddedScraplingHelper writes the embedded Scrapling helper script to
// a temporary file the first time it is called and returns that file's path.
// Later calls return the cached path or the cached error without retrying.
func ensureEmbeddedScraplingHelper() (string, error) {
	embeddedScraplingHelperOnce.Do(func() {
		embeddedScraplingHelperFile, embeddedScraplingHelperErr = writeEmbeddedScraplingHelper()
	})
	if embeddedScraplingHelperErr != nil {
		return "", embeddedScraplingHelperErr
	}
	if embeddedScraplingHelperFile == "" {
		return "", fmt.Errorf("embedded Scrapling helper path is empty")
	}
	return embeddedScraplingHelperFile, nil
}

// writeEmbeddedScraplingHelper stores the embedded helper script in a new temp
// file with mode 0600 and returns its path. On any failure the partly written
// file is removed so nothing is left behind in the temp directory.
func writeEmbeddedScraplingHelper() (string, error) {
	if strings.TrimSpace(embeddedScraplingHelper) == "" {
		return "", fmt.Errorf("embedded Scrapling helper is empty")
	}

	file, err := os.CreateTemp("", "facr-scrapling-*.py")
	if err != nil {
		return "", fmt.Errorf("create embedded Scrapling helper: %w", err)
	}
	path := file.Name()

	// discard closes and deletes the temp file before reporting the failure.
	discard := func(action string, cause error) (string, error) {
		_ = file.Close()
		_ = os.Remove(path)
		return "", fmt.Errorf("%s embedded Scrapling helper: %w", action, cause)
	}

	if _, err := file.WriteString(embeddedScraplingHelper); err != nil {
		return discard("write", err)
	}
	if err := file.Chmod(0600); err != nil {
		return discard("chmod", err)
	}
	// A failed Close can mean the data never reached the disk, so it is
	// reported instead of being dropped.
	if err := file.Close(); err != nil {
		_ = os.Remove(path)
		return "", fmt.Errorf("close embedded Scrapling helper: %w", err)
	}
	return path, nil
}
func findScraplingHelperScript() (string, error) {
cwd, _ := os.Getwd()
exePath, _ := os.Executable()
exeDir := ""
if exePath != "" {
exeDir = filepath.Dir(exePath)
}
if path := firstExistingFile(
os.Getenv("SCRAPLING_SCRIPT"),
filepath.Join(cwd, scraplingHelperPath),
filepath.Join(exeDir, scraplingHelperPath),
); path != "" {
return path, nil
}
return ensureEmbeddedScraplingHelper()
}
// findScraplingPython picks the Python interpreter used to run the Scrapling
// helper, returning "" when none is found.
//
// Candidates are tried in this order: the SCRAPLING_PYTHON_BIN environment
// variable; bin/python3 then bin/python in .venv-scrapling and then .venv under
// the working directory; the same four under the executable's directory; and
// finally python3 and python from PATH.
func findScraplingPython() string {
	workDir, _ := os.Getwd()

	var binDir string
	if exe, _ := os.Executable(); exe != "" {
		binDir = filepath.Dir(exe)
	}

	candidates := []string{os.Getenv("SCRAPLING_PYTHON_BIN")}
	for _, root := range []string{workDir, binDir} {
		for _, venv := range []string{".venv-scrapling", ".venv"} {
			for _, interpreter := range []string{"python3", "python"} {
				candidates = append(candidates, filepath.Join(root, venv, "bin", interpreter))
			}
		}
	}
	candidates = append(candidates, "python3", "python")

	return firstExecutable(candidates...)
}
// fetchPageWithScrapling downloads url by running the Scrapling helper script
// with a Python interpreter and returning whatever the script prints to stdout.
//
// The helper receives "--url <url>" and, when opts.Referer is set,
// "--referer <referer>". The process is killed after 90 seconds. An error is
// returned when no interpreter or script is available, the script fails, prints
// nothing, or prints what looks like a Cloudflare challenge page.
func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
	interpreter := findScraplingPython()
	if interpreter == "" {
		return nil, fmt.Errorf("Scrapling skipped: no Python runtime found")
	}
	script, scriptErr := findScraplingHelperScript()
	if scriptErr != nil {
		return nil, fmt.Errorf("Scrapling skipped: %w", scriptErr)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
	defer cancel()

	argv := []string{script, "--url", url}
	if opts.Referer != "" {
		argv = append(argv, "--referer", opts.Referer)
	}

	var out, errOut bytes.Buffer
	cmd := exec.CommandContext(ctx, interpreter, argv...)
	cmd.Stdout, cmd.Stderr = &out, &errOut

	if runErr := cmd.Run(); runErr != nil {
		// Prefer the helper's stderr; fall back to the process error.
		reason := compactErrorText(errOut.String())
		if reason == "" {
			reason = compactErrorText(runErr.Error())
		}
		return nil, fmt.Errorf("Scrapling request failed: %s", reason)
	}

	page := out.Bytes()
	switch {
	case len(page) == 0:
		return nil, fmt.Errorf("Scrapling returned an empty body")
	case looksLikeCloudflareBlock(page):
		return nil, fmt.Errorf("Scrapling returned a Cloudflare challenge page")
	}
	return page, nil
}
func fetchPageWithFallback(url string) ([]byte, error) {
return fetchPageWithFallbackOptions(url, fetchOptions{})
}
// fetchPageWithFallbackOptions returns the body of url, trying in order: the
// in-memory page cache, Go's HTTP client, wget, the Scrapling helper, and
// finally the Cloudflare Browser Rendering API (only when its credentials are
// configured).
//
// A successful fetch from any source is stored in the cache under url; the
// cache key does not include opts. An empty HTML record from Cloudflare is
// treated as a failure, like the empty-body checks in the wget and Scrapling
// fetchers, so an empty page is never cached.
//
// The returned error describes the last stage that failed; failures of earlier
// stages are only logged.
func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) {
	// Check cache first. Entries are replaced, never modified in place, so the
	// pointer may be used after the lock is released.
	pageCacheMu.RLock()
	entry, cached := pageCache[url]
	pageCacheMu.RUnlock()
	if cached && time.Since(entry.timestamp) < cacheTTL {
		log.Printf("Cache hit for %s", url)
		return entry.body, nil
	}

	body, err := fetchPageDirect(url, opts)
	if err == nil {
		cachePage(url, body)
		return body, nil
	}
	log.Printf("Direct request failed for %s: %v", url, err)

	body, err = fetchPageWithWget(url, opts)
	if err == nil {
		log.Printf("Successfully retrieved content via wget for %s", url)
		cachePage(url, body)
		return body, nil
	}
	log.Printf("wget fallback failed for %s: %v", url, err)

	body, err = fetchPageWithScrapling(url, opts)
	if err == nil {
		log.Printf("Successfully retrieved content via Scrapling for %s", url)
		cachePage(url, body)
		return body, nil
	}
	log.Printf("Scrapling fallback failed for %s: %v", url, err)

	cfClient := NewCloudflareClient()
	if cfClient == nil {
		return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
	}

	log.Printf("Attempting Cloudflare crawl fallback for %s", url)
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
	defer cancel()

	job, err := cfClient.CrawlURL(ctx, url)
	if err != nil {
		log.Printf("Cloudflare crawl failed for %s: %v", url, err)
		return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare crawl failed: %w", err)
	}

	if len(job.Records) == 0 || job.Records[0].Status != "completed" {
		log.Printf("Cloudflare crawl returned no completed records for %s", url)
		return nil, fmt.Errorf("Cloudflare crawl returned no completed records")
	}

	body = []byte(job.Records[0].HTML)
	if len(body) == 0 {
		// A completed record without HTML is not cached or returned as success.
		log.Printf("Cloudflare crawl returned an empty body for %s", url)
		return nil, fmt.Errorf("Cloudflare crawl returned an empty body")
	}
	if looksLikeCloudflareBlock(body) {
		return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
	}

	log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
	cachePage(url, body)
	return body, nil
}
func cachePage(url string, body []byte) {
pageCacheMu.Lock()
pageCache[url] = &cacheEntry{body: body, timestamp: time.Now()}
pageCacheMu.Unlock()
}
// fotbalMatchScoreRe extracts the two goal counts from a score cell such as
// " 3 : 1 ". It is compiled once instead of once per parsed match row.
var fotbalMatchScoreRe = regexp.MustCompile(`\s*([0-9]+)\s*:\s*([0-9]+)\s*`)

// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
//
// When clubName or clubID is non-empty, only matches involving that club are
// returned: a team UUID equal to clubID wins; otherwise the team names are
// compared with clubName (equality, containment in either direction, and
// finally the simplified token from simplifyClubQuery). With both empty, every
// match on the page is returned.
//
// clubType selects the report URL layout: "futsal" (case-insensitive) uses the
// futsal path, anything else the regular match path.
//
// The function returns nil when pageURL is blank, the page cannot be fetched,
// the body is empty, or the HTML cannot be parsed. When the DEBUG_SAVE_HTML
// environment variable is set, the fetched HTML is also written to a file in
// the working directory.
func parseCompetitionMatchesFromFotbal(pageURL, clubType, clubName, clubID string) []Match {
	pageURL = strings.TrimSpace(pageURL)
	if pageURL == "" {
		return nil
	}

	body, err := fetchPageWithFallback(pageURL)
	if err != nil {
		log.Printf("fotbal.cz matches fetch failed for %s: %v", pageURL, err)
		return nil
	}
	// If we still don't have body content, return nil
	if len(body) == 0 {
		log.Printf("No content available for %s", pageURL)
		return nil
	}

	// Debug: save full HTML if env toggled
	if os.Getenv("DEBUG_SAVE_HTML") != "" {
		// derive a friendly filename from last URL path segment
		comp := pageURL
		if i := strings.LastIndex(comp, "/"); i >= 0 && i+1 < len(comp) {
			comp = comp[i+1:]
		}
		fname := fmt.Sprintf("fotbal_comp_%s.html", comp)
		if err := os.WriteFile(fname, body, 0644); err != nil {
			log.Printf("failed writing debug HTML %s: %v", fname, err)
		} else {
			log.Printf("saved debug HTML: %s", fname)
		}
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		log.Printf("fotbal.cz matches parse error for %s: %v", pageURL, err)
		return nil
	}

	// backfillID returns the ID to use for team: the scraped one when present,
	// otherwise clubID if the team name matches the requested club.
	// The name comparison is skipped when no club name was supplied; an empty
	// needle handed to containsFold (defined elsewhere) would presumably match
	// every team and stamp clubID onto the opponent.
	backfillID := func(currentID, team string) string {
		if currentID != "" || clubName == "" {
			return currentID
		}
		if strings.EqualFold(team, clubName) || containsFold(team, clubName) || containsFold(clubName, team) {
			return clubID
		}
		if token := simplifyClubQuery(clubName); token != "" && containsFold(team, token) {
			return clubID
		}
		return currentID
	}

	var matches []Match

	// Sections per round
	doc.Find("section.js-matchRoundSection li.MatchRound").Each(func(_ int, li *goquery.Selection) {
		a := li.Find("a.MatchRound-match").First()
		if a.Length() == 0 {
			return
		}

		// Teams: the first two non-empty names are home and away.
		var teamNames []string
		li.Find("a.MatchRound-match ul li span.H7").Each(func(_ int, s *goquery.Selection) {
			if t := strings.TrimSpace(s.Text()); t != "" {
				teamNames = append(teamNames, t)
			}
		})
		if len(teamNames) < 2 {
			return
		}
		home := teamNames[0]
		away := teamNames[1]

		// Try to extract team IDs from img URLs if present
		var imgIDs []string
		li.Find("a.MatchRound-match img").Each(func(_ int, img *goquery.Selection) {
			src := strings.TrimSpace(img.AttrOr("src", ""))
			if src == "" {
				return
			}
			if id := extractUUIDFromHref(src); id != "" {
				imgIDs = append(imgIDs, id)
			}
		})
		homeID, awayID := "", ""
		if len(imgIDs) >= 1 {
			homeID = imgIDs[0]
		}
		if len(imgIDs) >= 2 {
			awayID = imgIDs[1]
		}

		// Score: normalized to "H:A" when it parses, otherwise kept verbatim.
		score := strings.TrimSpace(a.Find("strong.H4").First().Text())
		if m := fotbalMatchScoreRe.FindStringSubmatch(score); len(m) == 3 {
			score = fmt.Sprintf("%s:%s", m[1], m[2])
		}

		// Meta: date from the meta list
		dateText := ""
		li.Find(".MatchRound-meta p").Each(func(_ int, p *goquery.Selection) {
			label := strings.TrimSpace(p.Find("strong").First().Text())
			txt := strings.TrimSpace(p.Text())
			if strings.HasPrefix(strings.ToLower(label), "datum") {
				// Remove label from text
				dateText = strings.TrimSpace(strings.ReplaceAll(txt, label+":", ""))
			}
		})

		// Venue from details, if available
		venue := ""
		li.Find(".js-matchRoundDetails li p").Each(func(_ int, p *goquery.Selection) {
			label := strings.TrimSpace(p.Find("strong").First().Text())
			txt := strings.TrimSpace(p.Text())
			lowerLabel := strings.ToLower(label)
			if strings.HasPrefix(lowerLabel, "hřiště") || strings.HasPrefix(lowerLabel, "hriste") {
				venue = strings.TrimSpace(strings.ReplaceAll(txt, label+":", ""))
			}
		})

		// Match ID from the anchor href
		matchID := extractUUIDFromHref(a.AttrOr("href", ""))
		reportURL := ""
		if matchID != "" {
			if strings.EqualFold(clubType, "futsal") {
				reportURL = fmt.Sprintf("https://www.fotbal.cz/futsal/zapasy/futsal/%s", matchID)
			} else {
				reportURL = fmt.Sprintf("https://www.fotbal.cz/souteze/zapasy/zapas/%s", matchID)
			}
		}

		// Filter by club involvement: prefer UUID match, fallback to name matching including simplified token
		if clubName != "" || clubID != "" {
			involved := false
			// If we could extract team UUIDs, match by ID first (robust against aliases)
			if clubID != "" && (strings.EqualFold(homeID, clubID) || strings.EqualFold(awayID, clubID)) {
				involved = true
			} else if clubName != "" {
				// Fallback to fuzzy full-name matching
				involved = strings.EqualFold(home, clubName) || strings.EqualFold(away, clubName) ||
					containsFold(clubName, home) || containsFold(clubName, away) ||
					containsFold(home, clubName) || containsFold(away, clubName)
				// As a last resort, try matching a simplified token (e.g., city) of the club name
				if !involved {
					token := simplifyClubQuery(clubName)
					if token != "" && (containsFold(home, token) || containsFold(away, token)) {
						involved = true
					}
				}
			}
			if !involved {
				return
			}
		}

		// Backfill IDs for current club if missing
		homeID = backfillID(homeID, home)
		awayID = backfillID(awayID, away)

		homeLogo := getLogo(home, homeID)
		awayLogo := getLogo(away, awayID)

		matches = append(matches, Match{
			DateTime: dateText,
			Home:     home, HomeID: homeID, HomeLogoURL: homeLogo,
			Away: away, AwayID: awayID, AwayLogoURL: awayLogo,
			Score:     score,
			Venue:     venue,
			MatchID:   matchID,
			ReportURL: reportURL,
			FACRLink:  reportURL,
		})
	})

	return matches
}
// isPortalScoreRe extracts the two goal counts from an IS portal score cell.
// It is compiled once instead of once per parsed table row.
var isPortalScoreRe = regexp.MustCompile(`(\d+)\s*:\s*(\d+)`)

// parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback.
//
// Rows of "table.soutez-zapasy" with at least five cells are read as: date,
// home, away, score, venue; links are taken from the last cell. Text after an
// opening parenthesis is dropped from team names. When clubName or clubID is
// non-empty only that club's matches are kept, matching by UUID first and by
// name otherwise.
//
// ReportURL is the IS report link when the row has one, otherwise the
// fotbal.cz URL built from the match ID; FACRLink is always the fotbal.cz URL.
// clubType "futsal" (case-insensitive) selects the futsal URL layout.
//
// The request uses a 15 second timeout so a stalled server cannot block the
// caller indefinitely. The function returns nil on any fetch, status, read or
// parse failure. With DEBUG_SAVE_HTML set, the page is saved to the working
// directory and a row summary is logged.
func parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID string) []Match {
	client := &http.Client{Timeout: 15 * time.Second}
	resp, err := client.Get(detailURL)
	if err != nil {
		log.Printf("IS matches fetch error for %s: %v", detailURL, err)
		return nil
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		log.Printf("IS matches non-200 for %s: %d", detailURL, resp.StatusCode)
		return nil
	}

	// Read body so we can optionally save and then parse from memory
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Printf("IS matches read error for %s: %v", detailURL, err)
		return nil
	}

	debug := os.Getenv("DEBUG_SAVE_HTML") != ""
	if debug {
		// name the file using the req (competition id) if present
		fname := "is_detail.html"
		if u, err := neturl.Parse(detailURL); err == nil {
			req := u.Query().Get("req")
			sport := u.Query().Get("sport")
			if req != "" {
				fname = fmt.Sprintf("is_comp_%s_%s.html", req, sport)
			}
		}
		if err := os.WriteFile(fname, body, 0644); err != nil {
			log.Printf("failed writing debug IS HTML %s: %v", fname, err)
		} else {
			log.Printf("saved debug IS HTML: %s", fname)
		}
	}

	docDetail, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		log.Printf("IS matches parse error for %s: %v", detailURL, err)
		return nil
	}

	getText := func(sel *goquery.Selection) string { return strings.TrimSpace(sel.Text()) }

	// teamName returns the cell text up to the first "(".
	teamName := func(sel *goquery.Selection) string {
		name := getText(sel)
		if idx := strings.Index(name, "("); idx >= 0 {
			name = strings.TrimSpace(name[:idx])
		}
		return name
	}

	// backfillID returns the ID to use for team: the scraped one when present,
	// otherwise clubID if the team name matches the requested club.
	// The name comparison is skipped when no club name was supplied; an empty
	// needle handed to containsFold (defined elsewhere) would presumably match
	// every team and stamp clubID onto the opponent.
	backfillID := func(currentID, team string) string {
		if currentID != "" || clubName == "" {
			return currentID
		}
		if strings.EqualFold(team, clubName) || containsFold(team, clubName) || containsFold(clubName, team) {
			return clubID
		}
		if token := simplifyClubQuery(clubName); token != "" && containsFold(team, token) {
			return clubID
		}
		return currentID
	}

	var matches []Match
	totalRows := 0
	keptRows := 0

	docDetail.Find("table.soutez-zapasy tr").Each(func(_ int, s *goquery.Selection) {
		if s.Find("th").Length() > 0 {
			return
		}
		tds := s.Find("td")
		if tds.Length() < 5 {
			return
		}
		totalRows++

		dt := getText(tds.Eq(0))
		rawHome := teamName(tds.Eq(1))
		rawAway := teamName(tds.Eq(2))
		homeID := extractUUIDFromHref(tds.Eq(1).Find("a").First().AttrOr("href", ""))
		awayID := extractUUIDFromHref(tds.Eq(2).Find("a").First().AttrOr("href", ""))

		score := ""
		if m := isPortalScoreRe.FindStringSubmatch(getText(tds.Eq(3))); len(m) == 3 {
			score = fmt.Sprintf("%s:%s", m[1], m[2])
		}

		// The row is known to have at least five cells here.
		venue := getText(tds.Eq(4))

		var reportURL, matchID string
		var isReportHref, isDelegHref string
		// Use the last column for links to be robust to optional columns
		tds.Eq(tds.Length() - 1).Find("a").Each(func(_ int, a *goquery.Selection) {
			href := strings.TrimSpace(a.AttrOr("href", ""))
			if href == "" {
				return
			}
			if u, err := neturl.Parse(href); err == nil {
				if id := u.Query().Get("zapas"); id != "" {
					matchID = id
				}
			}
			// Capture specific IS links
			if strings.Contains(href, "zapis-o-utkani-report.aspx") {
				isReportHref = resolveISURL(href)
			}
			if strings.Contains(href, "zapas-delegace-report.aspx") {
				isDelegHref = resolveISURL(href)
			}
		})
		if matchID != "" {
			if strings.EqualFold(clubType, "futsal") {
				reportURL = fmt.Sprintf("https://www.fotbal.cz/futsal/zapasy/futsal/%s", matchID)
			} else {
				reportURL = fmt.Sprintf("https://www.fotbal.cz/souteze/zapasy/zapas/%s", matchID)
			}
		}
		// Canonical fotbal.cz link
		facrLink := reportURL

		// Filter by club involvement: prefer UUID match, fallback to name matching with simplified token
		if clubName != "" || clubID != "" {
			involved := false
			if clubID != "" && (strings.EqualFold(homeID, clubID) || strings.EqualFold(awayID, clubID)) {
				involved = true
			} else if clubName != "" {
				involved = strings.EqualFold(rawHome, clubName) || strings.EqualFold(rawAway, clubName) ||
					containsFold(clubName, rawHome) || containsFold(clubName, rawAway) ||
					containsFold(rawHome, clubName) || containsFold(rawAway, clubName)
				if !involved {
					token := simplifyClubQuery(clubName)
					if token != "" && (containsFold(rawHome, token) || containsFold(rawAway, token)) {
						involved = true
					}
				}
			}
			if !involved {
				return
			}
		}
		keptRows++

		homeID = backfillID(homeID, rawHome)
		awayID = backfillID(awayID, rawAway)

		homeLogo := getLogo(rawHome, homeID)
		awayLogo := getLogo(rawAway, awayID)

		// Prefer the IS report link when the row provides one.
		report := reportURL
		if isReportHref != "" {
			report = isReportHref
		}

		matches = append(matches, Match{
			DateTime:      dt,
			Home:          rawHome,
			HomeID:        homeID,
			HomeLogoURL:   homeLogo,
			Away:          rawAway,
			AwayID:        awayID,
			AwayLogoURL:   awayLogo,
			Score:         score,
			Venue:         venue,
			MatchID:       matchID,
			ReportURL:     report,
			FACRLink:      facrLink,
			DelegationURL: isDelegHref,
		})
	})

	if debug {
		log.Printf("IS parse summary for %s: total rows=%d, kept=%d", detailURL, totalRows, keptRows)
	}
	return matches
}
// logoCache memoizes logo lookups for the lifetime of the process. Keys are
// either the lower-cased team name or that name prefixed with "logoapi|"; an
// empty value records that no logo was found.
// NOTE(review): unlike pageCache, this map is read and written without a
// mutex — confirm it is never touched from concurrent goroutines.
var logoCache = map[string]string{}
// logoAPISearchResult is one element of the JSON array returned by the logo
// API's clubs/search-with-logos endpoint.
type logoAPISearchResult struct {
ID string `json:"id"`
Name string `json:"name"`
LogoURL string `json:"logo_url"`
// HasLocalLogo must be true for a result to be considered at all.
HasLocalLogo bool `json:"has_local_logo"`
}
// searchAPIResult is the JSON object returned by the local club search
// endpoint queried in getLogoBySearch.
type searchAPIResult struct {
Results []struct {
Name string `json:"name"`
LogoURL string `json:"logo_url"`
} `json:"results"`
}
// clubQueryLetterRe matches a single ASCII or accented Latin letter. It is
// compiled once here rather than on every loop iteration.
var clubQueryLetterRe = regexp.MustCompile(`[a-zA-Zá-žÁ-Ž]`)

// clubQueryStopWords lists lower-cased legal-form suffixes (after punctuation
// trimming) that never make a useful search token.
var clubQueryStopWords = map[string]struct{}{
	"z.s.": {}, "z.s": {}, "zs": {}, "zapsany": {}, "zapsaný": {}, "spolek": {},
	"o.s.": {}, "o.s": {}, "os": {}, "a.s.": {}, "a.s": {}, "as": {},
	"s.r.o.": {}, "s.r.o": {}, "sro": {},
}

// clubQueryTrimCutset is the punctuation stripped from both ends of a token.
const clubQueryTrimCutset = ",.;:-()[]{}\"'`“”’"

// simplifyClubQuery reduces a club name to a simplified search token like
// "krnov" to improve the chances of finding a logo.
//
// It walks the whitespace-separated words from last to first, strips
// surrounding punctuation, lower-cases each word, skips legal-form suffixes
// such as "z.s." or "s.r.o.", and returns the first word that is at least three
// characters long and contains a letter. When no word qualifies, the last word
// (trimmed and lower-cased) is returned. A blank name yields "".
func simplifyClubQuery(name string) string {
	parts := strings.Fields(name)
	if len(parts) == 0 {
		return ""
	}

	// Walk from the end to find a meaningful token (avoid legal suffixes like "z.s.")
	for i := len(parts) - 1; i >= 0; i-- {
		token := strings.ToLower(strings.Trim(parts[i], clubQueryTrimCutset))
		if _, banned := clubQueryStopWords[token]; banned {
			continue
		}
		// prefer tokens with letters and length >= 3 (counted in runes)
		if len([]rune(token)) >= 3 && clubQueryLetterRe.MatchString(token) {
			return token
		}
	}

	// Fallback to last token sanitized
	return strings.ToLower(strings.Trim(parts[len(parts)-1], clubQueryTrimCutset))
}
// getLogoFromLogoAPI asks the external logo API for teamName's logo and returns
// its URL, or "" when no trustworthy logo is available.
//
// The API root comes from LOGOAPI_BASE_URL (defaulting to
// https://logoapi.sportcreative.eu). Only results flagged as having a local
// logo are considered, and one is accepted only when its name equals teamName
// after lower-casing and stripping trailing sport/legal-form words. The first
// such result wins.
//
// Every outcome, including request failures and misses, is stored in logoCache
// under "logoapi|<lower-cased name>", so a name is queried at most once per
// process. teamID is currently unused.
func getLogoFromLogoAPI(teamName string, teamID string) string {
	apiRoot := strings.TrimSpace(os.Getenv("LOGOAPI_BASE_URL"))
	if apiRoot == "" {
		apiRoot = "https://logoapi.sportcreative.eu"
	}
	apiRoot = strings.TrimRight(apiRoot, "/")

	club := strings.TrimSpace(teamName)
	if club == "" {
		return ""
	}

	cacheKey := "logoapi|" + strings.ToLower(club)
	if cached, hit := logoCache[cacheKey]; hit {
		return cached
	}
	// remember stores the outcome (possibly "") and hands it back.
	remember := func(logo string) string {
		logoCache[cacheKey] = logo
		return logo
	}

	httpClient := &http.Client{Timeout: 5 * time.Second}

	// search runs one query; the bool is false on any transport, status or
	// decoding problem.
	search := func(term string) ([]logoAPISearchResult, bool) {
		term = strings.TrimSpace(term)
		if term == "" {
			return nil, false
		}
		endpoint := fmt.Sprintf("%s/clubs/search-with-logos?q=%s", apiRoot, neturl.QueryEscape(term))
		resp, err := httpClient.Get(endpoint)
		if err != nil {
			return nil, false
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			io.Copy(io.Discard, resp.Body)
			return nil, false
		}
		var results []logoAPISearchResult
		if decodeErr := json.NewDecoder(resp.Body).Decode(&results); decodeErr != nil {
			// Non-JSON or invalid response: treat as no result
			return nil, false
		}
		return results, true
	}

	// Search strictly by full club name; if it yields nothing usable, let caller fall back to FACR.
	results, ok := search(club)
	if !ok || len(results) == 0 {
		return remember("")
	}

	// Words dropped from the END of a name before comparison.
	trailingNoise := map[string]struct{}{
		"fotbal": {}, "futsal": {},
		"z.s.": {}, "z.s": {}, "zs": {},
		"o.s.": {}, "o.s": {}, "os": {},
		"a.s.": {}, "a.s": {}, "as": {},
		"s.r.o.": {}, "s.r.o": {}, "sro": {},
	}
	// canonical lower-cases a name, collapses whitespace and strips trailing
	// noise words.
	canonical := func(raw string) string {
		words := strings.Fields(strings.ToLower(strings.TrimSpace(raw)))
		for len(words) > 0 {
			if _, noise := trailingNoise[words[len(words)-1]]; !noise {
				break
			}
			words = words[:len(words)-1]
		}
		return strings.Join(words, " ")
	}

	// Only accept a logo when the normalized club name matches; avoid arbitrary first-result picks.
	wanted := canonical(club)
	for _, candidate := range results {
		if !candidate.HasLocalLogo {
			continue
		}
		if canonical(candidate.Name) == wanted {
			return remember(candidate.LogoURL)
		}
	}

	// No strong match: treat as "no logo" so upstream can fall back to FACR assets.
	return remember("")
}
// getLogoBySearch looks up a logo URL for name through the club search
// endpoint at http://localhost:8686/club/search.
//
// The simplified token from simplifyClubQuery is searched first; when that
// request fails or returns nothing, the full name is searched. Among the
// results it prefers a case-insensitive exact name match, then a result whose
// name contains (or is contained in) the lower-cased name, and finally the
// first result. The chosen URL, which may be "", is stored in logoCache under
// the lower-cased name. When the full-name request itself fails, "" is returned
// without caching.
func getLogoBySearch(name string) string {
	wanted := strings.TrimSpace(name)
	key := strings.ToLower(wanted)
	if key == "" {
		return ""
	}
	if cached, hit := logoCache[key]; hit {
		return cached
	}

	httpClient := &http.Client{Timeout: 5 * time.Second}

	// search runs one query; the bool is false on any transport, status or
	// decoding problem.
	search := func(term string) (searchAPIResult, bool) {
		var none searchAPIResult
		endpoint := fmt.Sprintf("http://localhost:8686/club/search?q=%s", neturl.QueryEscape(term))
		resp, err := httpClient.Get(endpoint)
		if err != nil {
			return none, false
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			io.Copy(io.Discard, resp.Body)
			return none, false
		}
		var decoded searchAPIResult
		if decodeErr := json.NewDecoder(resp.Body).Decode(&decoded); decodeErr != nil {
			return none, false
		}
		return decoded, true
	}

	// Prefer simplified last-word token (e.g., "krnov") to improve hit rate for logos
	term := simplifyClubQuery(name)
	if term == "" {
		term = name
	}

	found, ok := search(term)
	if !ok || len(found.Results) == 0 {
		// Fallback to full name if simplified token yields nothing
		if found, ok = search(name); !ok {
			return ""
		}
	}

	// pick best match: exact (case-insensitive), then contains, else first.
	// An empty LogoURL on a match falls through to the next stage.
	logo := ""
	for i := range found.Results {
		if strings.EqualFold(strings.TrimSpace(found.Results[i].Name), wanted) {
			logo = found.Results[i].LogoURL
			break
		}
	}
	if logo == "" {
		for i := range found.Results {
			lowered := strings.ToLower(found.Results[i].Name)
			if strings.Contains(lowered, key) || strings.Contains(key, lowered) {
				logo = found.Results[i].LogoURL
				break
			}
		}
	}
	if logo == "" && len(found.Results) > 0 {
		logo = found.Results[0].LogoURL
	}

	logoCache[key] = logo
	return logo
}
// getLogo returns the logo URL to show for a team.
//
// Resolution order:
//  1. the fotbal.cz empty-club placeholder for blank names and for names
//     containing "volno", "volný los", "volny los" or "bye";
//  2. whatever getLogoFromLogoAPI returns, when non-empty;
//  3. the is1.fotbal.cz crop URL built from teamID, when teamID is non-blank
//     (avoids wrong matches for clubs that share a name);
//  4. whatever getLogoBySearch returns, when non-empty;
//  5. the placeholder.
func getLogo(teamName string, teamID string) string {
	const placeholder = "https://www.fotbal.cz/dist/img/logo-club-empty.svg"

	lowered := strings.ToLower(strings.TrimSpace(teamName))
	if lowered == "" {
		return placeholder
	}
	for _, marker := range []string{"volno", "volný los", "volny los", "bye"} {
		if strings.Contains(lowered, marker) {
			return placeholder
		}
	}

	if fromAPI := getLogoFromLogoAPI(teamName, teamID); fromAPI != "" {
		return fromAPI
	}

	// With a team ID the official logo URL can be built directly.
	if id := strings.TrimSpace(teamID); id != "" {
		return fmt.Sprintf("https://is1.fotbal.cz/media/kluby/%s/%s_crop.jpg", id, id)
	}

	// No ID: try the name-based search endpoint before giving up.
	if found := getLogoBySearch(teamName); found != "" {
		return found
	}
	return placeholder
}
// CompetitionTable groups the standings sections of one competition.
// Only the overall table is populated at present.
type CompetitionTable struct {
	// Overall is the overall standings table, one entry per table row.
	Overall []TableRow `json:"overall"`
}
// ClubInfo is the JSON payload written by the club info and club tables
// handlers.
type ClubInfo struct {
	Name           string        `json:"name"`                       // club name as scraped from the club page
	ClubID         string        `json:"club_id"`                    // id taken from the request path
	ClubType       string        `json:"club_type"`                  // "football" or "futsal"
	ClubInternalID string        `json:"club_internal_id,omitempty"` // value of the page's "ID klubu" section
	URL            string        `json:"url,omitempty"`
	LogoURL        string        `json:"logo_url,omitempty"`
	Address        string        `json:"address,omitempty"`
	Category       string        `json:"category,omitempty"`
	Competitions   []Competition `json:"competitions"` // competitions listed on the club page
}
// SearchResult is a single club entry scraped from the fotbal.cz club search
// page.
type SearchResult struct {
	Name     string `json:"name"`
	ClubID   string `json:"club_id"`   // last path segment of the club link
	ClubType string `json:"club_type"` // football or futsal
	URL      string `json:"url"`       // absolute link to the club page
	LogoURL  string `json:"logo_url"`
	Category string `json:"category,omitempty"`
	Address  string `json:"address,omitempty"`
}
// getClubSearch queries fotbal.cz club search and returns results with logo
func getClubSearch(w http.ResponseWriter, r *http.Request) {
q := strings.TrimSpace(r.URL.Query().Get("q"))
if q == "" {
http.Error(w, "query parameter 'q' is required", http.StatusBadRequest)
return
}
// Build search URL
vals := neturl.Values{}
vals.Set("q", q)
searchURL := "https://www.fotbal.cz/club/hledej?" + vals.Encode()
fetchSearchPage := func(url string) ([]byte, error) {
return fetchPageWithFallbackOptions(url, fetchOptions{
Referer: "https://www.fotbal.cz/club/hledej",
})
}
// Try direct HTTP request first
body, err := fetchSearchPage(searchURL)
if err != nil {
log.Printf("Direct search request failed for %s: %v", searchURL, err)
// Retry with quoted query for short tokens
searchURL2 := searchURL
tokens := strings.Fields(q)
for _, t := range tokens {
if len([]rune(t)) <= 2 {
vals2 := neturl.Values{}
vals2.Set("q", "\""+q+"\"")
searchURL2 = "https://www.fotbal.cz/club/hledej?" + vals2.Encode()
break
}
}
body, err = fetchSearchPage(searchURL2)
if err != nil {
log.Printf("Retried search request failed for %s: %v", searchURL2, err)
// Return empty results instead of error
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]any{
"query": q,
"count": 0,
"results": []SearchResult{},
})
return
}
}
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
if err != nil {
http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError)
return
}
var results []SearchResult
// The page lists clubs in section "Výsledky hledání" as li.ListItemSplit
doc.Find("li.ListItemSplit").Each(func(_ int, li *goquery.Selection) {
a := li.Find("a.Link--inverted").First()
href, _ := a.Attr("href")
if href == "" {
return
}
name := strings.TrimSpace(a.Find("span.H7").First().Text())
if name == "" {
// fallback to link text
name = strings.TrimSpace(a.Text())
}
img := a.Find("img").First()
logoURL, _ := img.Attr("src")
// Category
category := strings.TrimSpace(li.Find(".ClubCategories .BadgeCategory").First().Text())
// Address
address := strings.TrimSpace(li.Find(".ClubAddress p").First().Text())
// Infer club type from href
clubType := "football"
if strings.Contains(strings.ToLower(href), "/futsal/") {
clubType = "futsal"
}
// Extract club ID from last path segment
// e.g., https://www.fotbal.cz/futsal/club/club/{uuid}
parts := strings.Split(strings.TrimRight(href, "/"), "/")
clubID := ""
if len(parts) > 0 {
clubID = parts[len(parts)-1]
}
// Normalize URL (ensure absolute)
if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
href = "https://www.fotbal.cz" + href
}
// Prefer logoapi / local logo when available
if l := strings.TrimSpace(getLogo(name, clubID)); l != "" {
logoURL = l
}
results = append(results, SearchResult{
Name: name,
ClubID: clubID,
ClubType: clubType,
URL: href,
LogoURL: logoURL,
Category: category,
Address: address,
})
})
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]any{
"query": q,
"count": len(results),
"results": results,
})
}
// getClubTables returns club info with competition standings tables (no matches)
func getClubTables(w http.ResponseWriter, r *http.Request) {
vars := mux.Vars(r)
clubID := vars["id"]
clubType := vars["type"]
if clubID == "" {
http.Error(w, "Club ID is required", http.StatusBadRequest)
return
}
// Validate club type
var baseURL string
var sportParam string
switch clubType {
case "football":
baseURL = "https://www.fotbal.cz/souteze/club/club"
sportParam = "fotbal"
case "futsal":
baseURL = "https://www.fotbal.cz/futsal/club/club"
sportParam = "futsal"
default:
http.Error(w, "Invalid club type. Use 'football' or 'futsal'.", http.StatusBadRequest)
return
}
url := fmt.Sprintf("%s/%s", baseURL, clubID)
body, err := fetchPageWithFallback(url)
if err != nil {
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
return
}
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
if err != nil {
http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError)
return
}
// Extract club internal ID
clubInternalID := ""
doc.Find("section").Each(func(i int, s *goquery.Selection) {
headerText := s.Find("h3 span").First().Text()
if strings.TrimSpace(headerText) == "ID klubu" {
clubInternalID = strings.TrimSpace(s.Find("ul li").First().Text())
}
})
// Extract competitions
var competitions []Competition
doc.Find("table.Table tbody tr").Each(func(i int, s *goquery.Selection) {
code := strings.TrimSpace(s.Find("td:first-child").Text())
nameLink := s.Find("td:nth-child(2) a")
name := strings.TrimSpace(nameLink.Text())
teamCount := strings.TrimSpace(s.Find("td:nth-child(3)").Text())
// Extract competition ID from the link
parts := strings.Split(nameLink.AttrOr("href", ""), "/")
compID := ""
if len(parts) >= 2 {
compID = parts[len(parts)-1]
}
// Build public table link depending on clubType
tableLink := ""
if strings.EqualFold(clubType, "futsal") {
tableLink = fmt.Sprintf("https://www.fotbal.cz/futsal/futsal/table/%s", compID)
} else {
tableLink = fmt.Sprintf("https://www.fotbal.cz/souteze/turnaje/table/%s", compID)
}
competitions = append(competitions, Competition{
ID: compID,
Code: code,
Name: name,
TeamCount: teamCount,
MatchesLink: tableLink,
})
})
// For each competition, fetch the standings tables from is.fotbal.cz
for i := range competitions {
comp := &competitions[i]
tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
resp, err := http.Get(tableURL)
if err != nil {
log.Printf("error fetching competition table for %s: %v", comp.ID, err)
continue
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
log.Printf("non-200 response for %s: %d", comp.ID, resp.StatusCode)
continue
}
docTable, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
log.Printf("error parsing table HTML for %s: %v", comp.ID, err)
continue
}
// Parse section: Tabulka celková (only overall)
var overall []TableRow
parseSection := func(headerText string) []TableRow {
var rows []TableRow
// Find the h3 with matching text, then the following .list.tabulky table
docTable.Find("h3").EachWithBreak(func(_ int, h3 *goquery.Selection) bool {
if strings.EqualFold(strings.TrimSpace(h3.Text()), headerText) {
list := h3.NextAllFiltered("div.list.tabulky").First()
if list.Length() == 0 {
return false
}
table := list.Find("table.vysledky-tabulky tbody")
table.Find("tr").Each(func(_ int, tr *goquery.Selection) {
// skip header rows containing th
if tr.Find("th").Length() > 0 {
return
}
tds := tr.Find("td")
if tds.Length() < 8 {
return
}
get := func(i int) string { return strings.TrimSpace(tds.Eq(i).Text()) }
rank := get(0)
team := get(1)
teamID := extractUUIDFromHref(tds.Eq(1).Find("a").First().AttrOr("href", ""))
played := get(2)
wins := get(3)
draws := get(4)
losses := get(5)
scoreRaw := get(6)
// normalize score like "5 : 0" -> "5:0"
score := scoreRaw
if re := regexp.MustCompile(`\s*([0-9]+)\s*:\s*([0-9]+)\s*`); re != nil {
if m := re.FindStringSubmatch(scoreRaw); len(m) == 3 {
score = fmt.Sprintf("%s:%s", m[1], m[2])
}
}
points := get(7)
rows = append(rows, TableRow{
Rank: rank, Team: team, TeamID: teamID, TeamLogoURL: getLogo(team, teamID), Played: played, Wins: wins, Draws: draws, Losses: losses, Score: score, Points: points,
})
})
return false
}
return true
})
return rows
}
overall = parseSection("Tabulka celková")
comp.Table = &CompetitionTable{Overall: overall}
}
clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text())
clubURL := strings.TrimSpace(doc.Find("h1.H4 a").First().AttrOr("href", ""))
logoURL := strings.TrimSpace(doc.Find("img.Logo").First().AttrOr("src", ""))
if l := strings.TrimSpace(getLogo(clubName, clubID)); l != "" {
logoURL = l
}
category := strings.TrimSpace(doc.Find("section").First().Find("h3 span").First().Text())
address := strings.TrimSpace(doc.Find("section").First().Find("ul li").First().Text())
clubInfo := ClubInfo{
Name: clubName,
ClubID: clubID,
ClubType: clubType,
ClubInternalID: clubInternalID,
URL: clubURL,
LogoURL: logoURL,
Address: address,
Category: category,
Competitions: competitions,
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(clubInfo)
}
// getClubInfo serves club details together with the matches of every
// competition listed on the club's fotbal.cz page.
//
// The club type from the route must be "football" or "futsal"; any other
// value, or an empty id, is answered with 400. Failing to fetch or parse the
// club page is answered with 500. For each competition both the public
// fotbal.cz page and the IS detail page are parsed, and the IS matches are
// used whenever IS yields at least one.
func getClubInfo(w http.ResponseWriter, r *http.Request) {
	routeVars := mux.Vars(r)
	clubID, clubType := routeVars["id"], routeVars["type"]
	if clubID == "" {
		http.Error(w, "Club ID is required", http.StatusBadRequest)
		return
	}

	// Per-sport constants: club page base URL, IS "sport" parameter, display
	// category and prefix of the public standings-table link.
	sports := map[string]struct{ clubBase, isSport, category, tablePrefix string }{
		"football": {
			clubBase:    "https://www.fotbal.cz/souteze/club/club",
			isSport:     "fotbal",
			category:    "Fotbal",
			tablePrefix: "https://www.fotbal.cz/souteze/turnaje/table/",
		},
		"futsal": {
			clubBase:    "https://www.fotbal.cz/futsal/club/club",
			isSport:     "futsal",
			category:    "Futsal",
			tablePrefix: "https://www.fotbal.cz/futsal/futsal/table/",
		},
	}
	sport, supported := sports[clubType]
	if !supported {
		http.Error(w, "Invalid club type. Use 'football' or 'futsal'.", http.StatusBadRequest)
		return
	}

	clubURL := sport.clubBase + "/" + clubID
	page, err := fetchPageWithFallback(clubURL)
	if err != nil {
		http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
		return
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(page))
	if err != nil {
		http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError)
		return
	}

	info := ClubInfo{
		Name:     strings.TrimSpace(doc.Find("h1.H4 span").First().Text()),
		ClubID:   clubID,
		ClubType: clubType,
		URL:      clubURL,
		Category: sport.category,
	}

	// Logo: whatever getLogo resolves, else the official crop URL.
	info.LogoURL = getLogo(info.Name, clubID)
	if info.LogoURL == "" {
		info.LogoURL = fmt.Sprintf("https://is1.fotbal.cz/media/kluby/%s/%s_crop.jpg", clubID, clubID)
	}

	// Internal ID lives in the section headed "ID klubu"; the last such
	// section wins.
	doc.Find("section").Each(func(_ int, section *goquery.Selection) {
		heading := strings.TrimSpace(section.Find("h3 span").First().Text())
		if heading != "ID klubu" {
			return
		}
		info.ClubInternalID = strings.TrimSpace(section.Find("ul li").First().Text())
	})

	// Address (best-effort).
	info.Address = strings.TrimSpace(doc.Find(".ClubAddress p").First().Text())

	// One Competition per row of the competitions table.
	var competitions []Competition
	doc.Find("table.Table tbody tr").Each(func(_ int, row *goquery.Selection) {
		link := row.Find("td:nth-child(2) a")

		// The competition ID is the last segment of the link, provided the
		// link has at least one slash.
		id := ""
		if segments := strings.Split(strings.TrimSpace(link.AttrOr("href", "")), "/"); len(segments) >= 2 {
			id = segments[len(segments)-1]
		}

		competitions = append(competitions, Competition{
			ID:          id,
			Code:        strings.TrimSpace(row.Find("td:first-child").Text()),
			Name:        strings.TrimSpace(link.Text()),
			TeamCount:   strings.TrimSpace(row.Find("td:nth-child(3)").Text()),
			MatchesLink: sport.tablePrefix + id, // public table URL, for convenience
		})
	})

	// Fill in matches. The public page is parsed first, then IS; IS replaces
	// the public result whenever it returns anything.
	for i := range competitions {
		comp := &competitions[i]
		publicMatches := parseCompetitionMatchesFromFotbal(comp.MatchesLink, clubType, info.Name, clubID)
		detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sport.isSport)
		isMatches := parseCompetitionMatchesFromIS(detailURL, clubType, info.Name, clubID)
		if len(isMatches) > 0 {
			comp.Matches = isMatches
		} else {
			comp.Matches = publicMatches
		}
	}
	info.Competitions = competitions

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(info)
}
// main registers the API routes and serves them on port 8686.
//
// The server is built explicitly rather than via http.ListenAndServe so that
// it gets a header-read timeout and an idle timeout; without them a client
// that never finishes its request headers can hold a connection open
// indefinitely. No WriteTimeout is set because the club handlers fetch several
// upstream pages per request and their duration is not bounded here.
func main() {
	r := mux.NewRouter()
	r.HandleFunc("/club/{type}/{id}", getClubInfo).Methods("GET")
	r.HandleFunc("/club/{type}/{id}/table", getClubTables).Methods("GET")
	r.HandleFunc("/club/search", getClubSearch).Methods("GET")
	// Shortcut: a bare hex/hyphen id redirects to the football variant.
	r.HandleFunc("/club/{id:[0-9a-fA-F-]+}", func(w http.ResponseWriter, r *http.Request) {
		vars := mux.Vars(r)
		http.Redirect(w, r, "/club/football/"+vars["id"], http.StatusMovedPermanently)
	}).Methods("GET")
	r.HandleFunc("/", docsHandler)

	port := ":8686"
	srv := &http.Server{
		Addr:              port,
		Handler:           r,
		ReadHeaderTimeout: 10 * time.Second,
		IdleTimeout:       120 * time.Second,
	}
	fmt.Printf("Server running on http://localhost%s\n", port)
	log.Fatal(srv.ListenAndServe())
}
// docsHandler serves a static HTML page documenting the API. It sets the
// Content-Type to text/html (UTF-8) and writes the page; the request itself is
// not inspected.
//
// The sample JSON inside the <pre> blocks is HTML, so literal angle brackets
// there must be written as entities (&lt;uuid&gt;); a raw <uuid> would be
// parsed as an unknown element and disappear from the rendered page.
func docsHandler(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	// A write error here means the client went away; there is nothing useful
	// left to do with it.
	_, _ = io.WriteString(w, `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>FACR Scraper API Docs</title>
<style>
:root { color-scheme: light dark; }
body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; margin: 0; padding: 24px; line-height: 1.5; }
header { margin-bottom: 24px; }
code, pre { background: rgba(127,127,127,.15); padding: .2em .4em; border-radius: 4px; }
pre { padding: 12px; overflow: auto; }
.ep { margin: 18px 0; padding: 16px; border-left: 4px solid #4f46e5; background: rgba(79,70,229,.08); border-radius: 6px; }
h1 { margin: 0 0 8px; font-size: 1.6rem; }
h2 { margin: 22px 0 8px; font-size: 1.2rem; }
a { color: #2563eb; text-decoration: none; }
a:hover { text-decoration: underline; }
ul { padding-left: 18px; }
footer { margin-top: 28px; font-size: .9rem; opacity: .8; }
</style>
<link rel="icon" href="data:," />
<meta http-equiv="Cache-Control" content="no-store" />
<meta name="robots" content="noindex" />
<script>
function ex(id, url) { const el = document.getElementById(id); el.textContent = window.location.origin + url; el.href = url; }
window.addEventListener('DOMContentLoaded', ()=>{
ex('ex-search', '/club/search?q=Sparta');
ex('ex-info', '/club/football/00000000-0000-0000-0000-000000000000');
ex('ex-table', '/club/football/00000000-0000-0000-0000-000000000000/table');
});
</script>
</head>
<body>
<header>
<h1>FACR Scraper API</h1>
<p>Status: <code>ok</code> — server is running.</p>
</header>
<section class="ep">
<h2>Search Clubs</h2>
<p><strong>GET</strong> <code>/club/search?q=QUERY</code></p>
<p>Find clubs on fotbal.cz. Supports football and futsal clubs.</p>
<p>Example: <a id="ex-search" href="/club/search?q=Sparta">/club/search?q=Sparta</a></p>
<details>
<summary>Response shape</summary>
<pre>{
"query": "Sparta",
"count": 2,
"results": [
{
"name": "AC Sparta Praha",
"club_id": "&lt;uuid&gt;",
"club_type": "football",
"url": "https://www.fotbal.cz/...",
"logo_url": "https://.../logo.png",
"category": "Muži",
"address": "..."
}
]
}</pre>
</details>
</section>
<section class="ep">
<h2>Club Info + Matches</h2>
<p><strong>GET</strong> <code>/club/{type}/{id}</code></p>
<ul>
<li><code>{type}</code>: <code>football</code> | <code>futsal</code></li>
<li><code>{id}</code>: club UUID from fotbal.cz</li>
</ul>
<p>Example: <a id="ex-info" href="/club/football/00000000-0000-0000-0000-000000000000">/club/football/{id}</a></p>
<details>
<summary>Response shape</summary>
<pre>{
"name": "AC Sparta Praha",
"club_id": "00000000-0000-0000-0000-000000000000",
"club_type": "football",
"club_internal_id": "123456",
"url": "https://www.fotbal.cz/...",
"logo_url": "https://is1.fotbal.cz/media/kluby/.../logo.jpg",
"address": "Milady Horákové 98, 160 00 Praha 6",
"category": "Muži A",
"competitions": [
{
"id": "12345",
"code": "1. LIGA",
"name": "Fortuna Liga",
"team_count": "16",
"matches_link": "https://www.fotbal.cz/...",
"matches": [
{
"date_time": "12.08.2023 18:00",
"home": "AC Sparta Praha",
"home_id": "00000000-0000-0000-0000-000000000000",
"home_logo_url": "https://.../sparta.png",
"away": "SK Slavia Praha",
"away_id": "11111111-1111-1111-1111-111111111111",
"away_logo_url": "https://.../slavia.png",
"score": "2:1",
"venue": "Stadion Letná",
"match_id": "match12345",
"report_url": "https://www.fotbal.cz/..."
}
]
}
]
}</pre>
</details>
</section>
<section class="ep">
<h2>Club Tables (Standings)</h2>
<p><strong>GET</strong> <code>/club/{type}/{id}/table</code></p>
<p>Returns standings (overall table) for each competition of the club.</p>
<p>Example: <a id="ex-table" href="/club/football/00000000-0000-0000-0000-000000000000/table">/club/football/{id}/table</a></p>
<details>
<summary>Response shape</summary>
<pre>{
"name": "AC Sparta Praha",
"club_id": "00000000-0000-0000-0000-000000000000",
"club_type": "football",
"club_internal_id": "123456",
"url": "https://www.fotbal.cz/...",
"logo_url": "https://is1.fotbal.cz/media/kluby/.../logo.jpg",
"competitions": [
{
"id": "12345",
"code": "1. LIGA",
"name": "Fortuna Liga",
"team_count": "16",
"matches_link": "https://www.fotbal.cz/...",
"table": {
"overall": [
{
"rank": "1",
"team": "AC Sparta Praha",
"team_id": "00000000-0000-0000-0000-000000000000",
"team_logo_url": "https://.../sparta.png",
"played": "10",
"wins": "8",
"draws": "2",
"losses": "0",
"score": "25:5",
"points": "26"
},
{
"rank": "2",
"team": "SK Slavia Praha",
"team_id": "11111111-1111-1111-1111-111111111111",
"team_logo_url": "https://.../slavia.png",
"played": "10",
"wins": "7",
"draws": "2",
"losses": "1",
"score": "20:8",
"points": "23"
}
]
}
}
]
}</pre>
</details>
</section>
<section class="ep">
<h2>Shortcuts</h2>
<p><strong>GET</strong> <code>/club/{id}</code> → redirects to <code>/club/football/{id}</code></p>
</section>
<footer>
<p>Tip: Use a reverse proxy in production and set proper timeouts. This API scrapes public pages and may be rate-limited upstream.</p>
</footer>
</body>
</html>`)
}
// containsFold reports whether substr occurs within s, comparing the
// lower-cased forms after trimming surrounding whitespace from both
// arguments. A substr that is empty after trimming never matches.
func containsFold(s, substr string) bool {
	needle := strings.ToLower(strings.TrimSpace(substr))
	if needle == "" {
		return false
	}
	haystack := strings.ToLower(strings.TrimSpace(s))
	return strings.Contains(haystack, needle)
}
// hrefUUIDPattern matches a canonical 8-4-4-4-12 hexadecimal UUID anywhere in
// a string. Compiled once at package level rather than on every call.
var hrefUUIDPattern = regexp.MustCompile(`[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)

// extractUUIDFromHref finds the first UUID-like token in an href and returns
// it, or "" when href is blank or contains no such token.
//
// The pattern is unanchored, so a UUID in any position (path segment, query
// value, ...) is found; no separate "last path segment" check is needed.
func extractUUIDFromHref(href string) string {
	href = strings.TrimSpace(href)
	if href == "" {
		return ""
	}
	return hrefUUIDPattern.FindString(href)
}
// Match describes one fixture or result within a competition. Every value is
// kept as the string scraped from the source page; optional fields are left
// out of the JSON when empty.
type Match struct {
	DateTime      string `json:"date_time"`
	Home          string `json:"home"`
	HomeID        string `json:"home_id,omitempty"`
	HomeLogoURL   string `json:"home_logo_url,omitempty"`
	Away          string `json:"away"`
	AwayID        string `json:"away_id,omitempty"`
	AwayLogoURL   string `json:"away_logo_url,omitempty"`
	Score         string `json:"score"`
	Venue         string `json:"venue"`
	Note          string `json:"note,omitempty"`
	MatchID       string `json:"match_id"`
	ReportURL     string `json:"report_url,omitempty"`
	FACRLink      string `json:"facr_link,omitempty"`
	DelegationURL string `json:"delegation_url,omitempty"`
}
// TableRow is a single line of a standings table. All columns are kept as the
// strings scraped from the page rather than parsed into numbers.
type TableRow struct {
	Rank        string `json:"rank"`
	Team        string `json:"team"`
	TeamID      string `json:"team_id,omitempty"`       // UUID taken from the team link, when present
	TeamLogoURL string `json:"team_logo_url,omitempty"` // logo resolved for the team
	Played      string `json:"played"`
	Wins        string `json:"wins"`
	Draws       string `json:"draws"`
	Losses      string `json:"losses"`
	Score       string `json:"score"` // goals for:against, e.g. "5:0"
	Points      string `json:"points"`
}
// resolveISURL turns an IS link into an absolute https://is.fotbal.cz URL.
//
// Absolute http(s) links are rewritten to scheme https and host is.fotbal.cz;
// a path beginning with /zapasy/ gains the /public prefix, the "discipline"
// query parameter is dropped and the remaining query is re-encoded. An
// absolute link that cannot be parsed is returned unchanged.
//
// Relative links are resolved against /public/: a leading "./", any number of
// leading "../" and one leading "/" are stripped first. Query string and
// fragment are split off before the path is assembled so that "?" and "#"
// keep their meaning; placing them in URL.Path would emit them as %3F / %23.
func resolveISURL(href string) string {
	href = strings.TrimSpace(href)

	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		u, err := neturl.Parse(href)
		if err != nil {
			return href
		}
		u.Scheme = "https"
		u.Host = "is.fotbal.cz"
		// A /zapasy/ path can never already start with /public/, so this
		// single check is enough.
		if strings.HasPrefix(u.Path, "/zapasy/") {
			u.Path = "/public" + u.Path
		}
		q := u.Query()
		q.Del("discipline")
		u.RawQuery = q.Encode()
		return u.String()
	}

	rel := strings.TrimPrefix(href, "./")
	for strings.HasPrefix(rel, "../") {
		rel = strings.TrimPrefix(rel, "../")
	}
	rel = strings.TrimPrefix(rel, "/")

	// Fragment first, then query, following URL syntax.
	rel, fragment, _ := strings.Cut(rel, "#")
	relPath, rawQuery, _ := strings.Cut(rel, "?")

	u := neturl.URL{
		Scheme:   "https",
		Host:     "is.fotbal.cz",
		Path:     "/public/" + relPath,
		RawQuery: rawQuery,
		Fragment: fragment,
	}
	return u.String()
}