mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
1958 lines
58 KiB
Go
1958 lines
58 KiB
Go
package main
|
||
|
||
import (
|
||
"bytes"
|
||
"context"
|
||
_ "embed"
|
||
"encoding/json"
|
||
"fmt"
|
||
"io"
|
||
"log"
|
||
"net/http"
|
||
neturl "net/url"
|
||
"os"
|
||
"os/exec"
|
||
"path/filepath"
|
||
"regexp"
|
||
"strings"
|
||
"sync"
|
||
"time"
|
||
|
||
"github.com/PuerkitoBio/goquery"
|
||
"github.com/gorilla/mux"
|
||
)
|
||
|
||
type Competition struct {
|
||
ID string `json:"id"`
|
||
Code string `json:"code"`
|
||
Name string `json:"name"`
|
||
TeamCount string `json:"team_count"`
|
||
MatchesLink string `json:"matches_link"`
|
||
Matches []Match `json:"matches,omitempty"`
|
||
Table *CompetitionTable `json:"table,omitempty"`
|
||
}
|
||
|
||
// Cloudflare Browser Rendering API structures
|
||
// CloudflareCrawlRequest is the JSON payload sent when starting a crawl job.
// Every field except URL is omitted from the payload when it holds its zero
// value.
type CloudflareCrawlRequest struct {
	URL     string   `json:"url"`
	Limit   int      `json:"limit,omitempty"`
	Depth   int      `json:"depth,omitempty"`
	Formats []string `json:"formats,omitempty"`
	Render  bool     `json:"render,omitempty"`
	Source  string   `json:"source,omitempty"`

	// Options carries free-form extra settings keyed by option name.
	Options map[string]interface{} `json:"options,omitempty"`
}
|
||
|
||
// CloudflareCrawlResponse is the JSON envelope decoded after submitting a
// crawl request.
type CloudflareCrawlResponse struct {
	// Success mirrors the "success" flag of the response.
	Success bool `json:"success"`
	// Result holds the job ID.
	Result string `json:"result"`
}
|
||
|
||
type CloudflareCrawlJob struct {
|
||
ID string `json:"id"`
|
||
Status string `json:"status"`
|
||
BrowserSecondsUsed float64 `json:"browserSecondsUsed"`
|
||
Total int `json:"total"`
|
||
Finished int `json:"finished"`
|
||
Records []CloudflareCrawlRecord `json:"records"`
|
||
Cursor string `json:"cursor,omitempty"`
|
||
}
|
||
|
||
// CloudflareCrawlRecord is a single crawled-page entry inside a crawl job.
// The content fields (Markdown, HTML, JSON) are omitted from JSON when empty.
type CloudflareCrawlRecord struct {
	URL    string `json:"url"`
	Status string `json:"status"`

	Markdown string      `json:"markdown,omitempty"`
	HTML     string      `json:"html,omitempty"`
	JSON     interface{} `json:"json,omitempty"`

	// Metadata is an untyped key/value bag decoded as-is.
	Metadata map[string]interface{} `json:"metadata"`
}
|
||
|
||
// CloudflareClient bundles the credentials, API base URL and HTTP client used
// to talk to the Cloudflare Browser Rendering API.
type CloudflareClient struct {
	AccountID string       // account identifier placed in the request path
	APIToken  string       // sent as a Bearer token in the Authorization header
	BaseURL   string       // API root, without a trailing slash
	Client    *http.Client // HTTP client performing the requests
}
|
||
|
||
// fetchOptions holds optional per-request settings for the page fetchers.
type fetchOptions struct {
	// Referer, when non-empty, is sent as the Referer request header.
	Referer string
}
|
||
|
||
// Header values that make outgoing requests resemble a desktop browser, plus
// the relative location of the Scrapling helper script.
const (
	// browserUserAgent is sent as the User-Agent header.
	browserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0 Safari/537.36"
	// browserAccept is sent as the Accept header.
	browserAccept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
	// browserAcceptLanguage is sent as the Accept-Language header.
	browserAcceptLanguage = "cs-CZ,cs;q=0.9,en;q=0.8"
	// scraplingHelperPath is the helper script path, relative to a base directory.
	scraplingHelperPath = "scripts/scrapling_fetch.py"
)
|
||
|
||
//go:embed scripts/scrapling_fetch.py
|
||
var embeddedScraplingHelper string
|
||
|
||
var (
|
||
embeddedScraplingHelperOnce sync.Once
|
||
embeddedScraplingHelperFile string
|
||
embeddedScraplingHelperErr error
|
||
|
||
// Simple in-memory cache for fetched pages
|
||
pageCache = make(map[string]*cacheEntry)
|
||
pageCacheMu sync.RWMutex
|
||
)
|
||
|
||
// cacheEntry is one cached page: the raw body and the time it was stored.
type cacheEntry struct {
	body      []byte
	timestamp time.Time
}
|
||
|
||
// cacheTTL is how long a cached page is considered fresh.
const cacheTTL time.Duration = 5 * time.Minute
|
||
|
||
// NewCloudflareClient creates a new Cloudflare Browser Rendering API client
|
||
func NewCloudflareClient() *CloudflareClient {
|
||
accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID"))
|
||
apiToken := strings.TrimSpace(os.Getenv("CLOUDFLARE_API_TOKEN"))
|
||
|
||
if accountID == "" || apiToken == "" {
|
||
return nil
|
||
}
|
||
|
||
return &CloudflareClient{
|
||
AccountID: accountID,
|
||
APIToken: apiToken,
|
||
BaseURL: "https://api.cloudflare.com/client/v4",
|
||
Client: &http.Client{
|
||
Timeout: 30 * time.Second,
|
||
},
|
||
}
|
||
}
|
||
|
||
// StartCrawl initiates a crawl job
|
||
func (c *CloudflareClient) StartCrawl(ctx context.Context, req CloudflareCrawlRequest) (string, error) {
|
||
if c == nil {
|
||
return "", fmt.Errorf("Cloudflare client not initialized")
|
||
}
|
||
|
||
// Set defaults
|
||
if req.Limit == 0 {
|
||
req.Limit = 10
|
||
}
|
||
if req.Depth == 0 {
|
||
req.Depth = 1
|
||
}
|
||
if len(req.Formats) == 0 {
|
||
req.Formats = []string{"html", "markdown"}
|
||
}
|
||
if req.Source == "" {
|
||
req.Source = "all"
|
||
}
|
||
|
||
// Restrict to specific URL patterns for fotbal.cz to avoid crawling unrelated content
|
||
if req.Options == nil {
|
||
req.Options = make(map[string]interface{})
|
||
}
|
||
|
||
// Only crawl URLs from the same domain and specific paths
|
||
includePatterns := []string{
|
||
"https://www.fotbal.cz/**",
|
||
}
|
||
excludePatterns := []string{
|
||
"**/api/**",
|
||
"**/static/**",
|
||
"**/media/**",
|
||
}
|
||
|
||
req.Options["includePatterns"] = includePatterns
|
||
req.Options["excludePatterns"] = excludePatterns
|
||
req.Options["includeExternalLinks"] = false
|
||
req.Options["includeSubdomains"] = false
|
||
|
||
body, err := json.Marshal(req)
|
||
if err != nil {
|
||
return "", fmt.Errorf("failed to marshal request: %w", err)
|
||
}
|
||
|
||
url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl", c.BaseURL, c.AccountID)
|
||
httpReq, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
|
||
if err != nil {
|
||
return "", fmt.Errorf("failed to create request: %w", err)
|
||
}
|
||
|
||
httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
|
||
httpReq.Header.Set("Content-Type", "application/json")
|
||
|
||
resp, err := c.Client.Do(httpReq)
|
||
if err != nil {
|
||
return "", fmt.Errorf("failed to send request: %w", err)
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
if resp.StatusCode != http.StatusOK {
|
||
body, _ := io.ReadAll(resp.Body)
|
||
return "", fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
|
||
}
|
||
|
||
var crawlResp CloudflareCrawlResponse
|
||
if err := json.NewDecoder(resp.Body).Decode(&crawlResp); err != nil {
|
||
return "", fmt.Errorf("failed to decode response: %w", err)
|
||
}
|
||
|
||
if !crawlResp.Success {
|
||
return "", fmt.Errorf("API returned unsuccessful response")
|
||
}
|
||
|
||
return crawlResp.Result, nil
|
||
}
|
||
|
||
// GetCrawlResults retrieves the results of a crawl job
|
||
func (c *CloudflareClient) GetCrawlResults(ctx context.Context, jobID string, limit int) (*CloudflareCrawlJob, error) {
|
||
if c == nil {
|
||
return nil, fmt.Errorf("Cloudflare client not initialized")
|
||
}
|
||
|
||
url := fmt.Sprintf("%s/accounts/%s/browser-rendering/crawl/%s", c.BaseURL, c.AccountID, jobID)
|
||
if limit > 0 {
|
||
url += fmt.Sprintf("?limit=%d", limit)
|
||
}
|
||
|
||
httpReq, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||
}
|
||
|
||
httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
|
||
|
||
resp, err := c.Client.Do(httpReq)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
if resp.StatusCode != http.StatusOK {
|
||
body, _ := io.ReadAll(resp.Body)
|
||
return nil, fmt.Errorf("API request failed with status %d: %s", resp.StatusCode, string(body))
|
||
}
|
||
|
||
var result struct {
|
||
Success bool `json:"success"`
|
||
Result CloudflareCrawlJob `json:"result"`
|
||
}
|
||
|
||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||
return nil, fmt.Errorf("failed to decode response: %w", err)
|
||
}
|
||
|
||
if !result.Success {
|
||
return nil, fmt.Errorf("API returned unsuccessful response")
|
||
}
|
||
|
||
return &result.Result, nil
|
||
}
|
||
|
||
// WaitForCrawlCompletion waits for a crawl job to complete and returns the results
|
||
func (c *CloudflareClient) WaitForCrawlCompletion(ctx context.Context, jobID string, maxAttempts int, delay time.Duration) (*CloudflareCrawlJob, error) {
|
||
if c == nil {
|
||
return nil, fmt.Errorf("Cloudflare client not initialized")
|
||
}
|
||
|
||
for i := 0; i < maxAttempts; i++ {
|
||
job, err := c.GetCrawlResults(ctx, jobID, 1) // Use limit=1 for status checks
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
if job.Status != "running" {
|
||
// Get full results
|
||
fullJob, err := c.GetCrawlResults(ctx, jobID, 0) // No limit for full results
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return fullJob, nil
|
||
}
|
||
|
||
select {
|
||
case <-ctx.Done():
|
||
return nil, ctx.Err()
|
||
case <-time.After(delay):
|
||
continue
|
||
}
|
||
}
|
||
|
||
return nil, fmt.Errorf("crawl job did not complete within timeout")
|
||
}
|
||
|
||
// CrawlURL performs a complete crawl operation for a single URL
|
||
func (c *CloudflareClient) CrawlURL(ctx context.Context, url string) (*CloudflareCrawlJob, error) {
|
||
if c == nil {
|
||
return nil, fmt.Errorf("Cloudflare client not initialized")
|
||
}
|
||
|
||
req := CloudflareCrawlRequest{
|
||
URL: url,
|
||
Limit: 1, // Only crawl the specific URL
|
||
Depth: 0, // Don't follow links
|
||
Formats: []string{"html", "markdown"},
|
||
Render: true,
|
||
Source: "links", // Only crawl the specific URL, not sitemaps
|
||
}
|
||
|
||
jobID, err := c.StartCrawl(ctx, req)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to start crawl: %w", err)
|
||
}
|
||
|
||
// Wait for completion with reasonable timeout
|
||
ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
|
||
defer cancel()
|
||
|
||
job, err := c.WaitForCrawlCompletion(ctx, jobID, 24, 5*time.Second)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to wait for crawl completion: %w", err)
|
||
}
|
||
|
||
return job, nil
|
||
}
|
||
|
||
func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) {
|
||
req, err := http.NewRequest("GET", url, nil)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||
}
|
||
|
||
req.Header.Set("User-Agent", browserUserAgent)
|
||
req.Header.Set("Accept", browserAccept)
|
||
req.Header.Set("Accept-Language", browserAcceptLanguage)
|
||
if opts.Referer != "" {
|
||
req.Header.Set("Referer", opts.Referer)
|
||
}
|
||
|
||
return req, nil
|
||
}
|
||
|
||
// looksLikeCloudflareBlock reports whether body appears to be a Cloudflare
// challenge/interstitial page rather than real content.
//
// The check is case-insensitive. It returns true when the body contains any of
// a set of unambiguous marker phrases, or when it references the
// "/cdn-cgi/challenge-platform/" path together with at least one of the
// challenge script tokens. An empty body is never treated as a block.
func looksLikeCloudflareBlock(body []byte) bool {
	if len(body) == 0 {
		return false
	}

	page := strings.ToLower(string(body))
	containsAny := func(needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(page, needle) {
				return true
			}
		}
		return false
	}

	// Markers that are conclusive on their own.
	if containsAny(
		"<title>just a moment...</title>",
		"attention required!",
		"enable javascript and cookies to continue",
		"checking if the site connection is secure",
		"cf-browser-verification",
	) {
		return true
	}

	// The challenge-platform path alone is not enough; it must be accompanied
	// by one of the challenge script tokens.
	if !containsAny("/cdn-cgi/challenge-platform/") {
		return false
	}
	return containsAny("window._cf_chl_opt", "__cf_chl_rt_tk", "cf_chl_seq_")
}
|
||
|
||
// compactErrorText collapses every run of whitespace in s into a single space,
// trims the ends, and shortens the result to at most 220 bytes. Shortened text
// ends in "...". The cut is made at a byte offset, so a multi-byte UTF-8
// character split by it is dropped rather than emitted as a broken byte
// sequence.
func compactErrorText(s string) string {
	const (
		maxLen   = 220
		ellipsis = "..."
	)

	s = strings.Join(strings.Fields(s), " ")
	if len(s) <= maxLen {
		return s
	}

	// Slicing by bytes can leave a partial rune at the end; ToValidUTF8 with
	// an empty replacement removes it (and any other invalid bytes).
	head := strings.ToValidUTF8(s[:maxLen-len(ellipsis)], "")
	return head + ellipsis
}
|
||
|
||
func fetchPageDirect(url string, opts fetchOptions) ([]byte, error) {
|
||
req, err := newBrowserRequest(url, opts)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
client := &http.Client{Timeout: 15 * time.Second}
|
||
resp, err := client.Do(req)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("direct request failed: %w", err)
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
body, err := io.ReadAll(resp.Body)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to read response body: %w", err)
|
||
}
|
||
|
||
if resp.StatusCode != http.StatusOK {
|
||
return nil, fmt.Errorf("direct request returned HTTP %d", resp.StatusCode)
|
||
}
|
||
if looksLikeCloudflareBlock(body) {
|
||
return nil, fmt.Errorf("direct request returned a Cloudflare challenge page")
|
||
}
|
||
|
||
return body, nil
|
||
}
|
||
|
||
func fetchPageWithWget(url string, opts fetchOptions) ([]byte, error) {
|
||
if _, err := exec.LookPath("wget"); err != nil {
|
||
return nil, fmt.Errorf("wget not available: %w", err)
|
||
}
|
||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
|
||
defer cancel()
|
||
|
||
args := []string{
|
||
"--quiet",
|
||
"--tries=1",
|
||
"--timeout=15",
|
||
"--max-redirect=10",
|
||
"--output-document=-",
|
||
"--user-agent=" + browserUserAgent,
|
||
"--header=Accept: " + browserAccept,
|
||
"--header=Accept-Language: " + browserAcceptLanguage,
|
||
}
|
||
if opts.Referer != "" {
|
||
args = append(args, "--header=Referer: "+opts.Referer)
|
||
}
|
||
args = append(args, url)
|
||
|
||
cmd := exec.CommandContext(ctx, "wget", args...)
|
||
var stdout bytes.Buffer
|
||
var stderr bytes.Buffer
|
||
cmd.Stdout = &stdout
|
||
cmd.Stderr = &stderr
|
||
|
||
if err := cmd.Run(); err != nil {
|
||
details := compactErrorText(stderr.String())
|
||
if details == "" {
|
||
details = compactErrorText(err.Error())
|
||
}
|
||
return nil, fmt.Errorf("wget request failed: %s", details)
|
||
}
|
||
|
||
body := stdout.Bytes()
|
||
if len(body) == 0 {
|
||
return nil, fmt.Errorf("wget returned an empty body")
|
||
}
|
||
if looksLikeCloudflareBlock(body) {
|
||
return nil, fmt.Errorf("wget returned a Cloudflare challenge page")
|
||
}
|
||
|
||
return body, nil
|
||
}
|
||
|
||
// firstExistingFile returns the first candidate (after trimming whitespace)
// that exists on disk and is not a directory. Blank candidates are ignored.
// It returns "" when no candidate qualifies.
func firstExistingFile(paths ...string) string {
	for _, raw := range paths {
		candidate := strings.TrimSpace(raw)
		if candidate == "" {
			continue
		}

		info, statErr := os.Stat(candidate)
		if statErr != nil || info.IsDir() {
			continue
		}
		return candidate
	}

	return ""
}
|
||
|
||
// firstExecutable returns the first usable candidate from paths, or "" when
// there is none. Blank candidates are skipped. A candidate containing a path
// separator is treated as a file path and accepted if it exists and is not a
// directory; any other candidate is treated as a command name and resolved
// through PATH, in which case the resolved location is returned.
func firstExecutable(paths ...string) string {
	for _, raw := range paths {
		candidate := strings.TrimSpace(raw)
		switch {
		case candidate == "":
			// nothing to check
		case strings.ContainsRune(candidate, os.PathSeparator):
			info, statErr := os.Stat(candidate)
			if statErr == nil && !info.IsDir() {
				return candidate
			}
		default:
			if resolved, lookErr := exec.LookPath(candidate); lookErr == nil {
				return resolved
			}
		}
	}

	return ""
}
|
||
|
||
func ensureEmbeddedScraplingHelper() (string, error) {
|
||
embeddedScraplingHelperOnce.Do(func() {
|
||
if strings.TrimSpace(embeddedScraplingHelper) == "" {
|
||
embeddedScraplingHelperErr = fmt.Errorf("embedded Scrapling helper is empty")
|
||
return
|
||
}
|
||
|
||
file, err := os.CreateTemp("", "facr-scrapling-*.py")
|
||
if err != nil {
|
||
embeddedScraplingHelperErr = fmt.Errorf("create embedded Scrapling helper: %w", err)
|
||
return
|
||
}
|
||
defer file.Close()
|
||
|
||
if _, err := file.WriteString(embeddedScraplingHelper); err != nil {
|
||
embeddedScraplingHelperErr = fmt.Errorf("write embedded Scrapling helper: %w", err)
|
||
return
|
||
}
|
||
if err := file.Chmod(0600); err != nil {
|
||
embeddedScraplingHelperErr = fmt.Errorf("chmod embedded Scrapling helper: %w", err)
|
||
return
|
||
}
|
||
|
||
embeddedScraplingHelperFile = file.Name()
|
||
})
|
||
|
||
if embeddedScraplingHelperErr != nil {
|
||
return "", embeddedScraplingHelperErr
|
||
}
|
||
if embeddedScraplingHelperFile == "" {
|
||
return "", fmt.Errorf("embedded Scrapling helper path is empty")
|
||
}
|
||
return embeddedScraplingHelperFile, nil
|
||
}
|
||
|
||
func findScraplingHelperScript() (string, error) {
|
||
cwd, _ := os.Getwd()
|
||
|
||
exePath, _ := os.Executable()
|
||
exeDir := ""
|
||
if exePath != "" {
|
||
exeDir = filepath.Dir(exePath)
|
||
}
|
||
|
||
if path := firstExistingFile(
|
||
os.Getenv("SCRAPLING_SCRIPT"),
|
||
filepath.Join(cwd, scraplingHelperPath),
|
||
filepath.Join(exeDir, scraplingHelperPath),
|
||
); path != "" {
|
||
return path, nil
|
||
}
|
||
|
||
return ensureEmbeddedScraplingHelper()
|
||
}
|
||
|
||
func findScraplingPython() string {
|
||
cwd, _ := os.Getwd()
|
||
|
||
exePath, _ := os.Executable()
|
||
exeDir := ""
|
||
if exePath != "" {
|
||
exeDir = filepath.Dir(exePath)
|
||
}
|
||
|
||
return firstExecutable(
|
||
os.Getenv("SCRAPLING_PYTHON_BIN"),
|
||
filepath.Join(cwd, ".venv-scrapling", "bin", "python3"),
|
||
filepath.Join(cwd, ".venv-scrapling", "bin", "python"),
|
||
filepath.Join(cwd, ".venv", "bin", "python3"),
|
||
filepath.Join(cwd, ".venv", "bin", "python"),
|
||
filepath.Join(exeDir, ".venv-scrapling", "bin", "python3"),
|
||
filepath.Join(exeDir, ".venv-scrapling", "bin", "python"),
|
||
filepath.Join(exeDir, ".venv", "bin", "python3"),
|
||
filepath.Join(exeDir, ".venv", "bin", "python"),
|
||
"python3",
|
||
"python",
|
||
)
|
||
}
|
||
|
||
func fetchPageWithScrapling(url string, opts fetchOptions) ([]byte, error) {
|
||
pythonBin := findScraplingPython()
|
||
if pythonBin == "" {
|
||
return nil, fmt.Errorf("Scrapling skipped: no Python runtime found")
|
||
}
|
||
|
||
helperScript, err := findScraplingHelperScript()
|
||
if err != nil {
|
||
return nil, fmt.Errorf("Scrapling skipped: %w", err)
|
||
}
|
||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
|
||
defer cancel()
|
||
|
||
args := []string{helperScript, "--url", url}
|
||
if opts.Referer != "" {
|
||
args = append(args, "--referer", opts.Referer)
|
||
}
|
||
|
||
cmd := exec.CommandContext(ctx, pythonBin, args...)
|
||
var stdout bytes.Buffer
|
||
var stderr bytes.Buffer
|
||
cmd.Stdout = &stdout
|
||
cmd.Stderr = &stderr
|
||
|
||
if err := cmd.Run(); err != nil {
|
||
details := compactErrorText(stderr.String())
|
||
if details == "" {
|
||
details = compactErrorText(err.Error())
|
||
}
|
||
return nil, fmt.Errorf("Scrapling request failed: %s", details)
|
||
}
|
||
|
||
body := stdout.Bytes()
|
||
if len(body) == 0 {
|
||
return nil, fmt.Errorf("Scrapling returned an empty body")
|
||
}
|
||
if looksLikeCloudflareBlock(body) {
|
||
return nil, fmt.Errorf("Scrapling returned a Cloudflare challenge page")
|
||
}
|
||
|
||
return body, nil
|
||
}
|
||
|
||
func fetchPageWithFallback(url string) ([]byte, error) {
|
||
return fetchPageWithFallbackOptions(url, fetchOptions{})
|
||
}
|
||
|
||
// fetchPageWithFallback tries Go HTTP first, then wget, then Scrapling, then Cloudflare Browser Rendering.
|
||
func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) {
|
||
// Check cache first
|
||
pageCacheMu.RLock()
|
||
if entry, ok := pageCache[url]; ok {
|
||
if time.Since(entry.timestamp) < cacheTTL {
|
||
pageCacheMu.RUnlock()
|
||
log.Printf("Cache hit for %s", url)
|
||
return entry.body, nil
|
||
}
|
||
}
|
||
pageCacheMu.RUnlock()
|
||
|
||
body, err := fetchPageDirect(url, opts)
|
||
if err == nil {
|
||
cachePage(url, body)
|
||
return body, nil
|
||
}
|
||
log.Printf("Direct request failed for %s: %v", url, err)
|
||
|
||
body, err = fetchPageWithWget(url, opts)
|
||
if err == nil {
|
||
log.Printf("Successfully retrieved content via wget for %s", url)
|
||
cachePage(url, body)
|
||
return body, nil
|
||
}
|
||
log.Printf("wget fallback failed for %s: %v", url, err)
|
||
|
||
body, err = fetchPageWithScrapling(url, opts)
|
||
if err == nil {
|
||
log.Printf("Successfully retrieved content via Scrapling for %s", url)
|
||
cachePage(url, body)
|
||
return body, nil
|
||
}
|
||
log.Printf("Scrapling fallback failed for %s: %v", url, err)
|
||
|
||
if cfClient := NewCloudflareClient(); cfClient != nil {
|
||
log.Printf("Attempting Cloudflare crawl fallback for %s", url)
|
||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
||
defer cancel()
|
||
|
||
job, err := cfClient.CrawlURL(ctx, url)
|
||
if err != nil {
|
||
log.Printf("Cloudflare crawl failed for %s: %v", url, err)
|
||
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare crawl failed: %w", err)
|
||
}
|
||
|
||
if len(job.Records) > 0 && job.Records[0].Status == "completed" {
|
||
body := []byte(job.Records[0].HTML)
|
||
if looksLikeCloudflareBlock(body) {
|
||
return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
|
||
}
|
||
log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
|
||
cachePage(url, body)
|
||
return body, nil
|
||
}
|
||
|
||
log.Printf("Cloudflare crawl returned no completed records for %s", url)
|
||
return nil, fmt.Errorf("Cloudflare crawl returned no completed records")
|
||
}
|
||
|
||
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
|
||
}
|
||
|
||
func cachePage(url string, body []byte) {
|
||
pageCacheMu.Lock()
|
||
pageCache[url] = &cacheEntry{body: body, timestamp: time.Now()}
|
||
pageCacheMu.Unlock()
|
||
}
|
||
|
||
// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
|
||
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
|
||
// It filters to only include matches involving the given clubName if provided.
|
||
func parseCompetitionMatchesFromFotbal(pageURL, clubType, clubName, clubID string) []Match {
|
||
pageURL = strings.TrimSpace(pageURL)
|
||
if pageURL == "" {
|
||
return nil
|
||
}
|
||
|
||
body, err := fetchPageWithFallback(pageURL)
|
||
if err != nil {
|
||
log.Printf("fotbal.cz matches fetch failed for %s: %v", pageURL, err)
|
||
return nil
|
||
}
|
||
|
||
// If we still don't have body content, return nil
|
||
if len(body) == 0 {
|
||
log.Printf("No content available for %s", pageURL)
|
||
return nil
|
||
}
|
||
// Debug: save full HTML if env toggled
|
||
if os.Getenv("DEBUG_SAVE_HTML") != "" {
|
||
// derive a friendly filename from last URL path segment
|
||
comp := pageURL
|
||
if i := strings.LastIndex(comp, "/"); i >= 0 && i+1 < len(comp) {
|
||
comp = comp[i+1:]
|
||
}
|
||
fname := fmt.Sprintf("fotbal_comp_%s.html", comp)
|
||
if err := os.WriteFile(fname, body, 0644); err != nil {
|
||
log.Printf("failed writing debug HTML %s: %v", fname, err)
|
||
} else {
|
||
log.Printf("saved debug HTML: %s", fname)
|
||
}
|
||
}
|
||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||
if err != nil {
|
||
log.Printf("fotbal.cz matches parse error for %s: %v", pageURL, err)
|
||
return nil
|
||
}
|
||
|
||
var matches []Match
|
||
// Sections per round
|
||
doc.Find("section.js-matchRoundSection li.MatchRound").Each(func(_ int, li *goquery.Selection) {
|
||
a := li.Find("a.MatchRound-match").First()
|
||
if a.Length() == 0 {
|
||
return
|
||
}
|
||
// Teams
|
||
teamNames := []string{}
|
||
li.Find("a.MatchRound-match ul li span.H7").Each(func(_ int, s *goquery.Selection) {
|
||
t := strings.TrimSpace(s.Text())
|
||
if t != "" {
|
||
teamNames = append(teamNames, t)
|
||
}
|
||
})
|
||
if len(teamNames) < 2 {
|
||
return
|
||
}
|
||
home := teamNames[0]
|
||
away := teamNames[1]
|
||
// Try to extract team IDs from img URLs if present
|
||
imgIDs := []string{}
|
||
li.Find("a.MatchRound-match img").Each(func(_ int, img *goquery.Selection) {
|
||
src := strings.TrimSpace(img.AttrOr("src", ""))
|
||
if src == "" {
|
||
return
|
||
}
|
||
if id := extractUUIDFromHref(src); id != "" {
|
||
imgIDs = append(imgIDs, id)
|
||
}
|
||
})
|
||
homeID, awayID := "", ""
|
||
if len(imgIDs) >= 1 {
|
||
homeID = imgIDs[0]
|
||
}
|
||
if len(imgIDs) >= 2 {
|
||
awayID = imgIDs[1]
|
||
}
|
||
// Score
|
||
score := strings.TrimSpace(a.Find("strong.H4").First().Text())
|
||
if re := regexp.MustCompile(`\s*([0-9]+)\s*:\s*([0-9]+)\s*`); re != nil {
|
||
if m := re.FindStringSubmatch(score); len(m) == 3 {
|
||
score = fmt.Sprintf("%s:%s", m[1], m[2])
|
||
}
|
||
}
|
||
// Meta: date, match id in meta list and link
|
||
dateText := ""
|
||
li.Find(".MatchRound-meta p").Each(func(_ int, p *goquery.Selection) {
|
||
label := strings.TrimSpace(p.Find("strong").First().Text())
|
||
txt := strings.TrimSpace(p.Text())
|
||
if strings.HasPrefix(strings.ToLower(label), "datum") {
|
||
// Remove label from text
|
||
dateText = strings.TrimSpace(strings.ReplaceAll(txt, label+":", ""))
|
||
}
|
||
})
|
||
// Venue from details, if available
|
||
venue := ""
|
||
li.Find(".js-matchRoundDetails li p").Each(func(_ int, p *goquery.Selection) {
|
||
label := strings.TrimSpace(p.Find("strong").First().Text())
|
||
txt := strings.TrimSpace(p.Text())
|
||
if strings.HasPrefix(strings.ToLower(label), "hřiště") || strings.HasPrefix(strings.ToLower(label), "hriste") {
|
||
venue = strings.TrimSpace(strings.ReplaceAll(txt, label+":", ""))
|
||
}
|
||
})
|
||
// Match ID from the anchor href
|
||
matchID := extractUUIDFromHref(a.AttrOr("href", ""))
|
||
reportURL := ""
|
||
if matchID != "" {
|
||
if strings.EqualFold(clubType, "futsal") {
|
||
reportURL = fmt.Sprintf("https://www.fotbal.cz/futsal/zapasy/futsal/%s", matchID)
|
||
} else {
|
||
reportURL = fmt.Sprintf("https://www.fotbal.cz/souteze/zapasy/zapas/%s", matchID)
|
||
}
|
||
}
|
||
// Filter by club involvement: prefer UUID match, fallback to name matching including simplified token
|
||
if clubName != "" || clubID != "" {
|
||
involved := false
|
||
// If we could extract team UUIDs, match by ID first (robust against aliases)
|
||
if clubID != "" && (strings.EqualFold(homeID, clubID) || strings.EqualFold(awayID, clubID)) {
|
||
involved = true
|
||
} else if clubName != "" {
|
||
// Fallback to fuzzy full-name matching
|
||
involved = strings.EqualFold(home, clubName) || strings.EqualFold(away, clubName) ||
|
||
containsFold(clubName, home) || containsFold(clubName, away) ||
|
||
containsFold(home, clubName) || containsFold(away, clubName)
|
||
// As a last resort, try matching a simplified token (e.g., city) of the club name
|
||
if !involved {
|
||
token := simplifyClubQuery(clubName)
|
||
if token != "" && (containsFold(home, token) || containsFold(away, token)) {
|
||
involved = true
|
||
}
|
||
}
|
||
}
|
||
if !involved {
|
||
return
|
||
}
|
||
}
|
||
// Backfill IDs for current club if missing
|
||
if homeID == "" {
|
||
if strings.EqualFold(home, clubName) || containsFold(home, clubName) || containsFold(clubName, home) {
|
||
homeID = clubID
|
||
} else {
|
||
token := simplifyClubQuery(clubName)
|
||
if token != "" && containsFold(home, token) {
|
||
homeID = clubID
|
||
}
|
||
}
|
||
}
|
||
if awayID == "" {
|
||
if strings.EqualFold(away, clubName) || containsFold(away, clubName) || containsFold(clubName, away) {
|
||
awayID = clubID
|
||
} else {
|
||
token := simplifyClubQuery(clubName)
|
||
if token != "" && containsFold(away, token) {
|
||
awayID = clubID
|
||
}
|
||
}
|
||
}
|
||
homeLogo := getLogo(home, homeID)
|
||
awayLogo := getLogo(away, awayID)
|
||
matches = append(matches, Match{
|
||
DateTime: dateText,
|
||
Home: home, HomeID: homeID, HomeLogoURL: homeLogo,
|
||
Away: away, AwayID: awayID, AwayLogoURL: awayLogo,
|
||
Score: score,
|
||
Venue: venue,
|
||
MatchID: matchID,
|
||
ReportURL: reportURL,
|
||
FACRLink: reportURL,
|
||
})
|
||
})
|
||
return matches
|
||
}
|
||
|
||
// parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback.
|
||
func parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID string) []Match {
|
||
resp, err := http.Get(detailURL)
|
||
if err != nil {
|
||
log.Printf("IS matches fetch error for %s: %v", detailURL, err)
|
||
return nil
|
||
}
|
||
defer resp.Body.Close()
|
||
if resp.StatusCode != http.StatusOK {
|
||
log.Printf("IS matches non-200 for %s: %d", detailURL, resp.StatusCode)
|
||
return nil
|
||
}
|
||
// Read body so we can optionally save and then parse from memory
|
||
body, err := io.ReadAll(resp.Body)
|
||
if err != nil {
|
||
log.Printf("IS matches read error for %s: %v", detailURL, err)
|
||
return nil
|
||
}
|
||
if os.Getenv("DEBUG_SAVE_HTML") != "" {
|
||
// name the file using the req (competition id) if present
|
||
fname := "is_detail.html"
|
||
if u, err := neturl.Parse(detailURL); err == nil {
|
||
req := u.Query().Get("req")
|
||
sport := u.Query().Get("sport")
|
||
if req != "" {
|
||
fname = fmt.Sprintf("is_comp_%s_%s.html", req, sport)
|
||
}
|
||
}
|
||
if err := os.WriteFile(fname, body, 0644); err != nil {
|
||
log.Printf("failed writing debug IS HTML %s: %v", fname, err)
|
||
} else {
|
||
log.Printf("saved debug IS HTML: %s", fname)
|
||
}
|
||
}
|
||
docDetail, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||
if err != nil {
|
||
log.Printf("IS matches parse error for %s: %v", detailURL, err)
|
||
return nil
|
||
}
|
||
var matches []Match
|
||
totalRows := 0
|
||
keptRows := 0
|
||
docDetail.Find("table.soutez-zapasy tr").Each(func(_ int, s *goquery.Selection) {
|
||
if s.Find("th").Length() > 0 {
|
||
return
|
||
}
|
||
tds := s.Find("td")
|
||
if tds.Length() < 5 {
|
||
return
|
||
}
|
||
totalRows++
|
||
getText := func(sel *goquery.Selection) string { return strings.TrimSpace(sel.Text()) }
|
||
dt := getText(tds.Eq(0))
|
||
rawHome := getText(tds.Eq(1))
|
||
if idx := strings.Index(rawHome, "("); idx >= 0 {
|
||
rawHome = strings.TrimSpace(rawHome[:idx])
|
||
}
|
||
rawAway := getText(tds.Eq(2))
|
||
if idx := strings.Index(rawAway, "("); idx >= 0 {
|
||
rawAway = strings.TrimSpace(rawAway[:idx])
|
||
}
|
||
homeID := extractUUIDFromHref(tds.Eq(1).Find("a").First().AttrOr("href", ""))
|
||
awayID := extractUUIDFromHref(tds.Eq(2).Find("a").First().AttrOr("href", ""))
|
||
rawScore := getText(tds.Eq(3))
|
||
score := ""
|
||
if re := regexp.MustCompile(`(\d+)\s*:\s*(\d+)`); re != nil {
|
||
if m := re.FindStringSubmatch(rawScore); len(m) == 3 {
|
||
score = fmt.Sprintf("%s:%s", m[1], m[2])
|
||
}
|
||
}
|
||
venue := ""
|
||
if tds.Length() > 4 {
|
||
venue = getText(tds.Eq(4))
|
||
}
|
||
var reportURL, matchID string
|
||
var isReportHref, isDelegHref string
|
||
// Use the last column for links to be robust to optional columns
|
||
tds.Eq(tds.Length() - 1).Find("a").Each(func(_ int, a *goquery.Selection) {
|
||
href := strings.TrimSpace(a.AttrOr("href", ""))
|
||
if href == "" {
|
||
return
|
||
}
|
||
if u, err := neturl.Parse(href); err == nil {
|
||
if id := u.Query().Get("zapas"); id != "" {
|
||
matchID = id
|
||
}
|
||
}
|
||
// Capture specific IS links
|
||
if strings.Contains(href, "zapis-o-utkani-report.aspx") {
|
||
isReportHref = resolveISURL(href)
|
||
}
|
||
if strings.Contains(href, "zapas-delegace-report.aspx") {
|
||
isDelegHref = resolveISURL(href)
|
||
}
|
||
})
|
||
if matchID != "" {
|
||
if strings.EqualFold(clubType, "futsal") {
|
||
reportURL = fmt.Sprintf("https://www.fotbal.cz/futsal/zapasy/futsal/%s", matchID)
|
||
} else {
|
||
reportURL = fmt.Sprintf("https://www.fotbal.cz/souteze/zapasy/zapas/%s", matchID)
|
||
}
|
||
}
|
||
// Canonical fotbal.cz link
|
||
facrLink := reportURL
|
||
// Filter by club involvement: prefer UUID match, fallback to name matching with simplified token
|
||
if clubName != "" || clubID != "" {
|
||
involved := false
|
||
if clubID != "" && (strings.EqualFold(homeID, clubID) || strings.EqualFold(awayID, clubID)) {
|
||
involved = true
|
||
} else if clubName != "" {
|
||
involved = strings.EqualFold(rawHome, clubName) || strings.EqualFold(rawAway, clubName) ||
|
||
containsFold(clubName, rawHome) || containsFold(clubName, rawAway) ||
|
||
containsFold(rawHome, clubName) || containsFold(rawAway, clubName)
|
||
if !involved {
|
||
token := simplifyClubQuery(clubName)
|
||
if token != "" && (containsFold(rawHome, token) || containsFold(rawAway, token)) {
|
||
involved = true
|
||
}
|
||
}
|
||
}
|
||
if !involved {
|
||
return
|
||
}
|
||
}
|
||
keptRows++
|
||
if homeID == "" {
|
||
if strings.EqualFold(rawHome, clubName) || containsFold(rawHome, clubName) || containsFold(clubName, rawHome) {
|
||
homeID = clubID
|
||
} else {
|
||
token := simplifyClubQuery(clubName)
|
||
if token != "" && containsFold(rawHome, token) {
|
||
homeID = clubID
|
||
}
|
||
}
|
||
}
|
||
if awayID == "" {
|
||
if strings.EqualFold(rawAway, clubName) || containsFold(rawAway, clubName) || containsFold(clubName, rawAway) {
|
||
awayID = clubID
|
||
} else {
|
||
token := simplifyClubQuery(clubName)
|
||
if token != "" && containsFold(rawAway, token) {
|
||
awayID = clubID
|
||
}
|
||
}
|
||
}
|
||
homeLogo := getLogo(rawHome, homeID)
|
||
awayLogo := getLogo(rawAway, awayID)
|
||
matches = append(matches, Match{DateTime: dt, Home: rawHome, HomeID: homeID, HomeLogoURL: homeLogo, Away: rawAway, AwayID: awayID, AwayLogoURL: awayLogo, Score: score, Venue: venue, MatchID: matchID, ReportURL: func() string {
|
||
if isReportHref != "" {
|
||
return isReportHref
|
||
}
|
||
return reportURL
|
||
}(), FACRLink: facrLink, DelegationURL: isDelegHref})
|
||
})
|
||
if os.Getenv("DEBUG_SAVE_HTML") != "" {
|
||
log.Printf("IS parse summary for %s: total rows=%d, kept=%d", detailURL, totalRows, keptRows)
|
||
}
|
||
return matches
|
||
}
|
||
|
||
// logoCache memoizes resolved logo URLs keyed by a lower-cased club name,
// including empty "no logo" results. getLogoFromLogoAPI prefixes its keys
// with "logoapi|", while getLogoBySearch uses the bare lower-cased name.
// NOTE(review): the reads and writes visible in this part of the file take no
// lock although they are reachable from HTTP handlers — confirm whether the
// map needs a mutex.
var logoCache = map[string]string{}
|
||
|
||
// logoAPISearchResult is one element of the JSON array decoded from the logo
// API "clubs/search-with-logos" response.
type logoAPISearchResult struct {
	// ID is decoded from the "id" key.
	ID string `json:"id"`
	// Name is the club name as reported by the logo API.
	Name string `json:"name"`
	// LogoURL is the logo address returned for the club.
	LogoURL string `json:"logo_url"`
	// HasLocalLogo mirrors the "has_local_logo" flag of the response.
	HasLocalLogo bool `json:"has_local_logo"`
}
|
||
|
||
// searchAPIResult is the subset of the /club/search JSON response that
// getLogoBySearch needs: the name and logo URL of every returned club.
type searchAPIResult struct {
	// Results holds the clubs in the order the endpoint returned them.
	Results []struct {
		// Name is the club name of the entry.
		Name string `json:"name"`
		// LogoURL is the logo address of the entry.
		LogoURL string `json:"logo_url"`
	} `json:"results"`
}
|
||
|
||
// clubQueryStopWords lists lower-cased legal-form tokens (for example "z.s."
// or "s.r.o.") that simplifyClubQuery skips while looking for a meaningful
// token in a club name.
var clubQueryStopWords = map[string]struct{}{
	"z.s.": {}, "z.s": {}, "zs": {}, "zapsany": {}, "zapsaný": {}, "spolek": {},
	"o.s.": {}, "o.s": {}, "os": {}, "a.s.": {}, "a.s": {}, "as": {},
	"s.r.o.": {}, "s.r.o": {}, "sro": {},
}

// clubQueryLetterRe matches a single ASCII letter or a character in the
// á-ž / Á-Ž ranges. It is compiled once instead of on every loop iteration.
var clubQueryLetterRe = regexp.MustCompile(`[a-zA-Zá-žÁ-Ž]`)

// clubQueryTrimSet is the punctuation stripped from both ends of every token.
const clubQueryTrimSet = ",.;:-()[]{}\"'`“”’"

// simplifyClubQuery reduces a full club name such as "FK Krnov z.s." to
// a simplified search token like "krnov" to improve chances of finding a logo.
//
// Tokens are examined from the end of the name. Each token is stripped of
// surrounding punctuation and lower-cased; stop words are skipped and the
// first token with at least three runes and at least one letter is returned.
// When no token qualifies, the sanitized, lower-cased last token is returned.
// A blank name yields "".
func simplifyClubQuery(name string) string {
	parts := strings.Fields(name)
	if len(parts) == 0 {
		return ""
	}
	// Walk from the end to find a meaningful token (avoid legal suffixes like "z.s.").
	for i := len(parts) - 1; i >= 0; i-- {
		lt := strings.ToLower(strings.Trim(parts[i], clubQueryTrimSet))
		if _, banned := clubQueryStopWords[lt]; banned {
			continue
		}
		// Prefer tokens with letters and a length of at least three runes.
		if len([]rune(lt)) >= 3 && clubQueryLetterRe.MatchString(lt) {
			return lt
		}
	}
	// Fallback: the last token, sanitized.
	return strings.ToLower(strings.Trim(parts[len(parts)-1], clubQueryTrimSet))
}
|
||
|
||
func getLogoFromLogoAPI(teamName string, teamID string) string {
|
||
base := strings.TrimSpace(os.Getenv("LOGOAPI_BASE_URL"))
|
||
if base == "" {
|
||
base = "https://logoapi.sportcreative.eu"
|
||
}
|
||
base = strings.TrimRight(base, "/")
|
||
name := strings.TrimSpace(teamName)
|
||
if name == "" {
|
||
return ""
|
||
}
|
||
cacheKey := "logoapi|" + strings.ToLower(name)
|
||
if v, ok := logoCache[cacheKey]; ok {
|
||
return v
|
||
}
|
||
client := &http.Client{Timeout: 5 * time.Second}
|
||
doSearch := func(q string) ([]logoAPISearchResult, bool) {
|
||
q = strings.TrimSpace(q)
|
||
if q == "" {
|
||
return nil, false
|
||
}
|
||
u := fmt.Sprintf("%s/clubs/search-with-logos?q=%s", base, neturl.QueryEscape(q))
|
||
resp, err := client.Get(u)
|
||
if err != nil {
|
||
return nil, false
|
||
}
|
||
defer resp.Body.Close()
|
||
if resp.StatusCode != http.StatusOK {
|
||
io.Copy(io.Discard, resp.Body)
|
||
return nil, false
|
||
}
|
||
var payload []logoAPISearchResult
|
||
if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
|
||
// Non-JSON or invalid response – treat as no result
|
||
return nil, false
|
||
}
|
||
return payload, true
|
||
}
|
||
// Search strictly by full club name; if it yields nothing usable, let caller fall back to FACR.
|
||
payload, ok := doSearch(name)
|
||
if !ok || len(payload) == 0 {
|
||
logoCache[cacheKey] = ""
|
||
return ""
|
||
}
|
||
// Normalize names for comparison (case-insensitive, strip common legal/sport suffixes).
|
||
normalize := func(s string) string {
|
||
s = strings.ToLower(strings.TrimSpace(s))
|
||
if s == "" {
|
||
return s
|
||
}
|
||
parts := strings.Fields(s)
|
||
stop := map[string]struct{}{
|
||
"fotbal": {}, "futsal": {},
|
||
"z.s.": {}, "z.s": {}, "zs": {},
|
||
"o.s.": {}, "o.s": {}, "os": {},
|
||
"a.s.": {}, "a.s": {}, "as": {},
|
||
"s.r.o.": {}, "s.r.o": {}, "sro": {},
|
||
}
|
||
end := len(parts)
|
||
for end > 0 {
|
||
if _, banned := stop[parts[end-1]]; banned {
|
||
end--
|
||
continue
|
||
}
|
||
break
|
||
}
|
||
if end != len(parts) {
|
||
parts = parts[:end]
|
||
}
|
||
return strings.Join(parts, " ")
|
||
}
|
||
want := normalize(name)
|
||
var withLogo []logoAPISearchResult
|
||
for _, r := range payload {
|
||
if r.HasLocalLogo {
|
||
withLogo = append(withLogo, r)
|
||
}
|
||
}
|
||
if len(withLogo) == 0 {
|
||
logoCache[cacheKey] = ""
|
||
return ""
|
||
}
|
||
// Only accept a logo when the normalized club name matches; avoid arbitrary first-result picks.
|
||
for _, r := range withLogo {
|
||
if normalize(r.Name) == want {
|
||
logoCache[cacheKey] = r.LogoURL
|
||
return r.LogoURL
|
||
}
|
||
}
|
||
// No strong match – treat as "no logo" so upstream can fall back to FACR assets.
|
||
logoCache[cacheKey] = ""
|
||
return ""
|
||
}
|
||
|
||
func getLogoBySearch(name string) string {
|
||
key := strings.ToLower(strings.TrimSpace(name))
|
||
if key == "" {
|
||
return ""
|
||
}
|
||
if v, ok := logoCache[key]; ok {
|
||
return v
|
||
}
|
||
client := &http.Client{Timeout: 5 * time.Second}
|
||
// Prefer simplified last-word token (e.g., "krnov") to improve hit rate for logos
|
||
query := simplifyClubQuery(name)
|
||
if query == "" {
|
||
query = name
|
||
}
|
||
doSearch := func(q string) (searchAPIResult, bool) {
|
||
url := fmt.Sprintf("http://localhost:8686/club/search?q=%s", neturl.QueryEscape(q))
|
||
resp, err := client.Get(url)
|
||
if err != nil {
|
||
return searchAPIResult{}, false
|
||
}
|
||
defer resp.Body.Close()
|
||
if resp.StatusCode != http.StatusOK {
|
||
io.Copy(io.Discard, resp.Body)
|
||
return searchAPIResult{}, false
|
||
}
|
||
var payload searchAPIResult
|
||
if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
|
||
return searchAPIResult{}, false
|
||
}
|
||
return payload, true
|
||
}
|
||
payload, ok := doSearch(query)
|
||
if !ok || len(payload.Results) == 0 {
|
||
// Fallback to full name if simplified token yields nothing
|
||
payload, ok = doSearch(name)
|
||
if !ok {
|
||
return ""
|
||
}
|
||
}
|
||
// pick best match: exact (case-insensitive), then contains, else first
|
||
best := ""
|
||
for _, r := range payload.Results {
|
||
if strings.EqualFold(strings.TrimSpace(r.Name), strings.TrimSpace(name)) {
|
||
best = r.LogoURL
|
||
break
|
||
}
|
||
}
|
||
if best == "" {
|
||
for _, r := range payload.Results {
|
||
rname := strings.ToLower(r.Name)
|
||
if strings.Contains(rname, key) || strings.Contains(key, rname) {
|
||
best = r.LogoURL
|
||
break
|
||
}
|
||
}
|
||
}
|
||
if best == "" && len(payload.Results) > 0 {
|
||
best = payload.Results[0].LogoURL
|
||
}
|
||
logoCache[key] = best
|
||
return best
|
||
}
|
||
|
||
func getLogo(teamName string, teamID string) string {
|
||
placeholder := "https://www.fotbal.cz/dist/img/logo-club-empty.svg"
|
||
name := strings.ToLower(strings.TrimSpace(teamName))
|
||
if name == "" || strings.Contains(name, "volno") || strings.Contains(name, "volný los") || strings.Contains(name, "volny los") || strings.Contains(name, "bye") {
|
||
return placeholder
|
||
}
|
||
if logo := getLogoFromLogoAPI(teamName, teamID); logo != "" {
|
||
return logo
|
||
}
|
||
// If we have a team ID, construct the official logo URL directly.
|
||
// This avoids wrong matches for duplicate names (e.g., multiple "Ořechov").
|
||
if tid := strings.TrimSpace(teamID); tid != "" {
|
||
return fmt.Sprintf("https://is1.fotbal.cz/media/kluby/%s/%s_crop.jpg", tid, tid)
|
||
}
|
||
// Otherwise, try the local FACR-based search endpoint by name.
|
||
if logo := getLogoBySearch(teamName); logo != "" {
|
||
return logo
|
||
}
|
||
// No ID and no search hit -> placeholder
|
||
return placeholder
|
||
}
|
||
|
||
// CompetitionTable holds standings sections; currently only Overall is used
|
||
type CompetitionTable struct {
|
||
Overall []TableRow `json:"overall"`
|
||
}
|
||
|
||
// ClubInfo is the response for club info and tables endpoints
|
||
type ClubInfo struct {
|
||
Name string `json:"name"`
|
||
ClubID string `json:"club_id"`
|
||
ClubType string `json:"club_type"`
|
||
ClubInternalID string `json:"club_internal_id,omitempty"`
|
||
URL string `json:"url,omitempty"`
|
||
LogoURL string `json:"logo_url,omitempty"`
|
||
Address string `json:"address,omitempty"`
|
||
Category string `json:"category,omitempty"`
|
||
Competitions []Competition `json:"competitions"`
|
||
}
|
||
|
||
// SearchResult describes a single club found by the fotbal.cz club search.
type SearchResult struct {
	// Name is the club name shown in the result list.
	Name string `json:"name"`
	// ClubID is the last path segment of the club link.
	ClubID string `json:"club_id"`
	// ClubType is "football" or "futsal".
	ClubType string `json:"club_type"`
	// URL is the absolute link to the club page.
	URL string `json:"url"`
	// LogoURL is the club logo address.
	LogoURL string `json:"logo_url"`
	// Category is the first category badge of the result, if any.
	Category string `json:"category,omitempty"`
	// Address is the club address of the result, if any.
	Address string `json:"address,omitempty"`
}
|
||
|
||
// getClubSearch queries fotbal.cz club search and returns results with logo
|
||
func getClubSearch(w http.ResponseWriter, r *http.Request) {
|
||
q := strings.TrimSpace(r.URL.Query().Get("q"))
|
||
if q == "" {
|
||
http.Error(w, "query parameter 'q' is required", http.StatusBadRequest)
|
||
return
|
||
}
|
||
|
||
// Build search URL
|
||
vals := neturl.Values{}
|
||
vals.Set("q", q)
|
||
searchURL := "https://www.fotbal.cz/club/hledej?" + vals.Encode()
|
||
|
||
fetchSearchPage := func(url string) ([]byte, error) {
|
||
return fetchPageWithFallbackOptions(url, fetchOptions{
|
||
Referer: "https://www.fotbal.cz/club/hledej",
|
||
})
|
||
}
|
||
|
||
// Try direct HTTP request first
|
||
body, err := fetchSearchPage(searchURL)
|
||
if err != nil {
|
||
log.Printf("Direct search request failed for %s: %v", searchURL, err)
|
||
|
||
// Retry with quoted query for short tokens
|
||
searchURL2 := searchURL
|
||
tokens := strings.Fields(q)
|
||
for _, t := range tokens {
|
||
if len([]rune(t)) <= 2 {
|
||
vals2 := neturl.Values{}
|
||
vals2.Set("q", "\""+q+"\"")
|
||
searchURL2 = "https://www.fotbal.cz/club/hledej?" + vals2.Encode()
|
||
break
|
||
}
|
||
}
|
||
|
||
body, err = fetchSearchPage(searchURL2)
|
||
if err != nil {
|
||
log.Printf("Retried search request failed for %s: %v", searchURL2, err)
|
||
// Return empty results instead of error
|
||
w.Header().Set("Content-Type", "application/json")
|
||
json.NewEncoder(w).Encode(map[string]any{
|
||
"query": q,
|
||
"count": 0,
|
||
"results": []SearchResult{},
|
||
})
|
||
return
|
||
}
|
||
}
|
||
|
||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||
if err != nil {
|
||
http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError)
|
||
return
|
||
}
|
||
|
||
var results []SearchResult
|
||
// The page lists clubs in section "Výsledky hledání" as li.ListItemSplit
|
||
doc.Find("li.ListItemSplit").Each(func(_ int, li *goquery.Selection) {
|
||
a := li.Find("a.Link--inverted").First()
|
||
href, _ := a.Attr("href")
|
||
if href == "" {
|
||
return
|
||
}
|
||
name := strings.TrimSpace(a.Find("span.H7").First().Text())
|
||
if name == "" {
|
||
// fallback to link text
|
||
name = strings.TrimSpace(a.Text())
|
||
}
|
||
img := a.Find("img").First()
|
||
logoURL, _ := img.Attr("src")
|
||
|
||
// Category
|
||
category := strings.TrimSpace(li.Find(".ClubCategories .BadgeCategory").First().Text())
|
||
// Address
|
||
address := strings.TrimSpace(li.Find(".ClubAddress p").First().Text())
|
||
|
||
// Infer club type from href
|
||
clubType := "football"
|
||
if strings.Contains(strings.ToLower(href), "/futsal/") {
|
||
clubType = "futsal"
|
||
}
|
||
|
||
// Extract club ID from last path segment
|
||
// e.g., https://www.fotbal.cz/futsal/club/club/{uuid}
|
||
parts := strings.Split(strings.TrimRight(href, "/"), "/")
|
||
clubID := ""
|
||
if len(parts) > 0 {
|
||
clubID = parts[len(parts)-1]
|
||
}
|
||
|
||
// Normalize URL (ensure absolute)
|
||
if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
|
||
href = "https://www.fotbal.cz" + href
|
||
}
|
||
|
||
// Prefer logoapi / local logo when available
|
||
if l := strings.TrimSpace(getLogo(name, clubID)); l != "" {
|
||
logoURL = l
|
||
}
|
||
|
||
results = append(results, SearchResult{
|
||
Name: name,
|
||
ClubID: clubID,
|
||
ClubType: clubType,
|
||
URL: href,
|
||
LogoURL: logoURL,
|
||
Category: category,
|
||
Address: address,
|
||
})
|
||
})
|
||
|
||
w.Header().Set("Content-Type", "application/json")
|
||
json.NewEncoder(w).Encode(map[string]any{
|
||
"query": q,
|
||
"count": len(results),
|
||
"results": results,
|
||
})
|
||
}
|
||
|
||
// getClubTables returns club info with competition standings tables (no matches)
|
||
func getClubTables(w http.ResponseWriter, r *http.Request) {
|
||
vars := mux.Vars(r)
|
||
clubID := vars["id"]
|
||
clubType := vars["type"]
|
||
|
||
if clubID == "" {
|
||
http.Error(w, "Club ID is required", http.StatusBadRequest)
|
||
return
|
||
}
|
||
|
||
// Validate club type
|
||
var baseURL string
|
||
var sportParam string
|
||
switch clubType {
|
||
case "football":
|
||
baseURL = "https://www.fotbal.cz/souteze/club/club"
|
||
sportParam = "fotbal"
|
||
case "futsal":
|
||
baseURL = "https://www.fotbal.cz/futsal/club/club"
|
||
sportParam = "futsal"
|
||
default:
|
||
http.Error(w, "Invalid club type. Use 'football' or 'futsal'.", http.StatusBadRequest)
|
||
return
|
||
}
|
||
|
||
url := fmt.Sprintf("%s/%s", baseURL, clubID)
|
||
body, err := fetchPageWithFallback(url)
|
||
if err != nil {
|
||
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
|
||
return
|
||
}
|
||
|
||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||
if err != nil {
|
||
http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError)
|
||
return
|
||
}
|
||
// Extract club internal ID
|
||
clubInternalID := ""
|
||
doc.Find("section").Each(func(i int, s *goquery.Selection) {
|
||
headerText := s.Find("h3 span").First().Text()
|
||
if strings.TrimSpace(headerText) == "ID klubu" {
|
||
clubInternalID = strings.TrimSpace(s.Find("ul li").First().Text())
|
||
}
|
||
})
|
||
|
||
// Extract competitions
|
||
var competitions []Competition
|
||
doc.Find("table.Table tbody tr").Each(func(i int, s *goquery.Selection) {
|
||
code := strings.TrimSpace(s.Find("td:first-child").Text())
|
||
nameLink := s.Find("td:nth-child(2) a")
|
||
name := strings.TrimSpace(nameLink.Text())
|
||
teamCount := strings.TrimSpace(s.Find("td:nth-child(3)").Text())
|
||
// Extract competition ID from the link
|
||
parts := strings.Split(nameLink.AttrOr("href", ""), "/")
|
||
compID := ""
|
||
if len(parts) >= 2 {
|
||
compID = parts[len(parts)-1]
|
||
}
|
||
// Build public table link depending on clubType
|
||
tableLink := ""
|
||
if strings.EqualFold(clubType, "futsal") {
|
||
tableLink = fmt.Sprintf("https://www.fotbal.cz/futsal/futsal/table/%s", compID)
|
||
} else {
|
||
tableLink = fmt.Sprintf("https://www.fotbal.cz/souteze/turnaje/table/%s", compID)
|
||
}
|
||
|
||
competitions = append(competitions, Competition{
|
||
ID: compID,
|
||
Code: code,
|
||
Name: name,
|
||
TeamCount: teamCount,
|
||
MatchesLink: tableLink,
|
||
})
|
||
})
|
||
|
||
// For each competition, fetch the standings tables from is.fotbal.cz
|
||
for i := range competitions {
|
||
comp := &competitions[i]
|
||
tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
||
resp, err := http.Get(tableURL)
|
||
if err != nil {
|
||
log.Printf("error fetching competition table for %s: %v", comp.ID, err)
|
||
continue
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
if resp.StatusCode != http.StatusOK {
|
||
log.Printf("non-200 response for %s: %d", comp.ID, resp.StatusCode)
|
||
continue
|
||
}
|
||
|
||
docTable, err := goquery.NewDocumentFromReader(resp.Body)
|
||
if err != nil {
|
||
log.Printf("error parsing table HTML for %s: %v", comp.ID, err)
|
||
continue
|
||
}
|
||
|
||
// Parse section: Tabulka celková (only overall)
|
||
var overall []TableRow
|
||
|
||
parseSection := func(headerText string) []TableRow {
|
||
var rows []TableRow
|
||
// Find the h3 with matching text, then the following .list.tabulky table
|
||
docTable.Find("h3").EachWithBreak(func(_ int, h3 *goquery.Selection) bool {
|
||
if strings.EqualFold(strings.TrimSpace(h3.Text()), headerText) {
|
||
list := h3.NextAllFiltered("div.list.tabulky").First()
|
||
if list.Length() == 0 {
|
||
return false
|
||
}
|
||
table := list.Find("table.vysledky-tabulky tbody")
|
||
table.Find("tr").Each(func(_ int, tr *goquery.Selection) {
|
||
// skip header rows containing th
|
||
if tr.Find("th").Length() > 0 {
|
||
return
|
||
}
|
||
tds := tr.Find("td")
|
||
if tds.Length() < 8 {
|
||
return
|
||
}
|
||
get := func(i int) string { return strings.TrimSpace(tds.Eq(i).Text()) }
|
||
rank := get(0)
|
||
team := get(1)
|
||
teamID := extractUUIDFromHref(tds.Eq(1).Find("a").First().AttrOr("href", ""))
|
||
played := get(2)
|
||
wins := get(3)
|
||
draws := get(4)
|
||
losses := get(5)
|
||
scoreRaw := get(6)
|
||
// normalize score like "5 : 0" -> "5:0"
|
||
score := scoreRaw
|
||
if re := regexp.MustCompile(`\s*([0-9]+)\s*:\s*([0-9]+)\s*`); re != nil {
|
||
if m := re.FindStringSubmatch(scoreRaw); len(m) == 3 {
|
||
score = fmt.Sprintf("%s:%s", m[1], m[2])
|
||
}
|
||
}
|
||
points := get(7)
|
||
rows = append(rows, TableRow{
|
||
Rank: rank, Team: team, TeamID: teamID, TeamLogoURL: getLogo(team, teamID), Played: played, Wins: wins, Draws: draws, Losses: losses, Score: score, Points: points,
|
||
})
|
||
})
|
||
return false
|
||
}
|
||
return true
|
||
})
|
||
return rows
|
||
}
|
||
|
||
overall = parseSection("Tabulka celková")
|
||
comp.Table = &CompetitionTable{Overall: overall}
|
||
}
|
||
|
||
clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text())
|
||
clubURL := strings.TrimSpace(doc.Find("h1.H4 a").First().AttrOr("href", ""))
|
||
logoURL := strings.TrimSpace(doc.Find("img.Logo").First().AttrOr("src", ""))
|
||
if l := strings.TrimSpace(getLogo(clubName, clubID)); l != "" {
|
||
logoURL = l
|
||
}
|
||
category := strings.TrimSpace(doc.Find("section").First().Find("h3 span").First().Text())
|
||
address := strings.TrimSpace(doc.Find("section").First().Find("ul li").First().Text())
|
||
|
||
clubInfo := ClubInfo{
|
||
Name: clubName,
|
||
ClubID: clubID,
|
||
ClubType: clubType,
|
||
ClubInternalID: clubInternalID,
|
||
URL: clubURL,
|
||
LogoURL: logoURL,
|
||
Address: address,
|
||
Category: category,
|
||
Competitions: competitions,
|
||
}
|
||
|
||
w.Header().Set("Content-Type", "application/json")
|
||
json.NewEncoder(w).Encode(clubInfo)
|
||
}
|
||
|
||
// getClubInfo returns club info with competitions and matches
|
||
func getClubInfo(w http.ResponseWriter, r *http.Request) {
|
||
vars := mux.Vars(r)
|
||
clubID := vars["id"]
|
||
clubType := vars["type"]
|
||
if clubID == "" {
|
||
http.Error(w, "Club ID is required", http.StatusBadRequest)
|
||
return
|
||
}
|
||
var baseURL, sportParam string
|
||
switch clubType {
|
||
case "football":
|
||
baseURL = "https://www.fotbal.cz/souteze/club/club"
|
||
sportParam = "fotbal"
|
||
case "futsal":
|
||
baseURL = "https://www.fotbal.cz/futsal/club/club"
|
||
sportParam = "futsal"
|
||
default:
|
||
http.Error(w, "Invalid club type. Use 'football' or 'futsal'.", http.StatusBadRequest)
|
||
return
|
||
}
|
||
|
||
url := fmt.Sprintf("%s/%s", baseURL, clubID)
|
||
body, err := fetchPageWithFallback(url)
|
||
if err != nil {
|
||
http.Error(w, fmt.Sprintf("Error fetching club data: %v", err), http.StatusInternalServerError)
|
||
return
|
||
}
|
||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
||
if err != nil {
|
||
http.Error(w, fmt.Sprintf("Error parsing HTML: %v", err), http.StatusInternalServerError)
|
||
return
|
||
}
|
||
|
||
clubName := strings.TrimSpace(doc.Find("h1.H4 span").First().Text())
|
||
// Basic club metadata
|
||
clubURL := fmt.Sprintf("%s/%s", baseURL, clubID)
|
||
logoURL := getLogo(clubName, clubID)
|
||
if logoURL == "" {
|
||
logoURL = fmt.Sprintf("https://is1.fotbal.cz/media/kluby/%s/%s_crop.jpg", clubID, clubID)
|
||
}
|
||
category := "Fotbal"
|
||
if strings.EqualFold(clubType, "futsal") {
|
||
category = "Futsal"
|
||
}
|
||
// Internal ID
|
||
clubInternalID := ""
|
||
doc.Find("section").Each(func(_ int, s *goquery.Selection) {
|
||
if strings.TrimSpace(s.Find("h3 span").First().Text()) == "ID klubu" {
|
||
clubInternalID = strings.TrimSpace(s.Find("ul li").First().Text())
|
||
}
|
||
})
|
||
// Address (best-effort)
|
||
address := strings.TrimSpace(doc.Find(".ClubAddress p").First().Text())
|
||
|
||
// Competitions list
|
||
var competitions []Competition
|
||
doc.Find("table.Table tbody tr").Each(func(_ int, tr *goquery.Selection) {
|
||
code := strings.TrimSpace(tr.Find("td:first-child").Text())
|
||
nameLink := tr.Find("td:nth-child(2) a")
|
||
name := strings.TrimSpace(nameLink.Text())
|
||
teamCount := strings.TrimSpace(tr.Find("td:nth-child(3)").Text())
|
||
parts := strings.Split(strings.TrimSpace(nameLink.AttrOr("href", "")), "/")
|
||
compID := ""
|
||
if len(parts) >= 2 {
|
||
compID = parts[len(parts)-1]
|
||
}
|
||
// Public table URL for convenience
|
||
tableLink := ""
|
||
if strings.EqualFold(clubType, "futsal") {
|
||
tableLink = fmt.Sprintf("https://www.fotbal.cz/futsal/futsal/table/%s", compID)
|
||
} else {
|
||
tableLink = fmt.Sprintf("https://www.fotbal.cz/souteze/turnaje/table/%s", compID)
|
||
}
|
||
competitions = append(competitions, Competition{ID: compID, Code: code, Name: name, TeamCount: teamCount, MatchesLink: tableLink})
|
||
})
|
||
|
||
// For each competition, fetch matches
|
||
for i := range competitions {
|
||
comp := &competitions[i]
|
||
matchesLink := comp.MatchesLink
|
||
// 1) Try parsing from the public fotbal.cz competition page (matches_link)
|
||
matches := parseCompetitionMatchesFromFotbal(matchesLink, clubType, clubName, clubID)
|
||
// Always try IS as well and prefer it if it provides at least as many matches
|
||
detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
||
isMatches := parseCompetitionMatchesFromIS(detailURL, clubType, clubName, clubID)
|
||
// Prefer IS whenever it yields any results, as IS often contains alias team names
|
||
if len(isMatches) > 0 {
|
||
matches = isMatches
|
||
}
|
||
comp.Matches = matches
|
||
}
|
||
|
||
clubInfo := ClubInfo{
|
||
Name: clubName,
|
||
ClubID: clubID,
|
||
ClubType: clubType,
|
||
ClubInternalID: clubInternalID,
|
||
URL: clubURL,
|
||
LogoURL: logoURL,
|
||
Address: address,
|
||
Category: category,
|
||
Competitions: competitions,
|
||
}
|
||
|
||
w.Header().Set("Content-Type", "application/json")
|
||
json.NewEncoder(w).Encode(clubInfo)
|
||
}
|
||
|
||
func main() {
|
||
r := mux.NewRouter()
|
||
r.HandleFunc("/club/{type}/{id}", getClubInfo).Methods("GET")
|
||
r.HandleFunc("/club/{type}/{id}/table", getClubTables).Methods("GET")
|
||
r.HandleFunc("/club/search", getClubSearch).Methods("GET")
|
||
r.HandleFunc("/club/{id:[0-9a-fA-F-]+}", func(w http.ResponseWriter, r *http.Request) {
|
||
vars := mux.Vars(r)
|
||
http.Redirect(w, r, "/club/football/"+vars["id"], http.StatusMovedPermanently)
|
||
}).Methods("GET")
|
||
r.HandleFunc("/", docsHandler)
|
||
port := ":8686"
|
||
fmt.Printf("Server running on http://localhost%s\n", port)
|
||
log.Fatal(http.ListenAndServe(port, r))
|
||
}
|
||
|
||
// docsHandler serves a simple HTML API documentation at the root endpoint.
//
// The page is a static document. The "<uuid>" placeholder in the sample
// response is written as &lt;uuid&gt; because a raw <uuid> inside <pre> is
// parsed by browsers as an unknown element and disappears from the page.
// A failure to write the page is logged.
func docsHandler(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	_, err := io.WriteString(w, `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>FACR Scraper API Docs</title>
<style>
:root { color-scheme: light dark; }
body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; margin: 0; padding: 24px; line-height: 1.5; }
header { margin-bottom: 24px; }
code, pre { background: rgba(127,127,127,.15); padding: .2em .4em; border-radius: 4px; }
pre { padding: 12px; overflow: auto; }
.ep { margin: 18px 0; padding: 16px; border-left: 4px solid #4f46e5; background: rgba(79,70,229,.08); border-radius: 6px; }
h1 { margin: 0 0 8px; font-size: 1.6rem; }
h2 { margin: 22px 0 8px; font-size: 1.2rem; }
a { color: #2563eb; text-decoration: none; }
a:hover { text-decoration: underline; }
ul { padding-left: 18px; }
footer { margin-top: 28px; font-size: .9rem; opacity: .8; }
</style>
<link rel="icon" href="data:," />
<meta http-equiv="Cache-Control" content="no-store" />
<meta name="robots" content="noindex" />
<script>
function ex(id, url) { const el = document.getElementById(id); el.textContent = window.location.origin + url; el.href = url; }
window.addEventListener('DOMContentLoaded', ()=>{
ex('ex-search', '/club/search?q=Sparta');
ex('ex-info', '/club/football/00000000-0000-0000-0000-000000000000');
ex('ex-table', '/club/football/00000000-0000-0000-0000-000000000000/table');
});
</script>
</head>
<body>
<header>
<h1>FACR Scraper API</h1>
<p>Status: <code>ok</code> — server is running.</p>
</header>

<section class="ep">
<h2>Search Clubs</h2>
<p><strong>GET</strong> <code>/club/search?q=QUERY</code></p>
<p>Find clubs on fotbal.cz. Supports football and futsal clubs.</p>
<p>Example: <a id="ex-search" href="/club/search?q=Sparta">/club/search?q=Sparta</a></p>
<details>
<summary>Response shape</summary>
<pre>{
"query": "Sparta",
"count": 2,
"results": [
{
"name": "AC Sparta Praha",
"club_id": "&lt;uuid&gt;",
"club_type": "football",
"url": "https://www.fotbal.cz/...",
"logo_url": "https://.../logo.png",
"category": "Muži",
"address": "..."
}
]
}</pre>
</details>
</section>

<section class="ep">
<h2>Club Info + Matches</h2>
<p><strong>GET</strong> <code>/club/{type}/{id}</code></p>
<ul>
<li><code>{type}</code>: <code>football</code> | <code>futsal</code></li>
<li><code>{id}</code>: club UUID from fotbal.cz</li>
</ul>
<p>Example: <a id="ex-info" href="/club/football/00000000-0000-0000-0000-000000000000">/club/football/{id}</a></p>
<details>
<summary>Response shape</summary>
<pre>{
"name": "AC Sparta Praha",
"club_id": "00000000-0000-0000-0000-000000000000",
"club_type": "football",
"club_internal_id": "123456",
"url": "https://www.fotbal.cz/...",
"logo_url": "https://is1.fotbal.cz/media/kluby/.../logo.jpg",
"address": "Milady Horákové 98, 160 00 Praha 6",
"category": "Muži A",
"competitions": [
{
"id": "12345",
"code": "1. LIGA",
"name": "Fortuna Liga",
"team_count": "16",
"matches_link": "https://www.fotbal.cz/...",
"matches": [
{
"date_time": "12.08.2023 18:00",
"home": "AC Sparta Praha",
"home_id": "00000000-0000-0000-0000-000000000000",
"home_logo_url": "https://.../sparta.png",
"away": "SK Slavia Praha",
"away_id": "11111111-1111-1111-1111-111111111111",
"away_logo_url": "https://.../slavia.png",
"score": "2:1",
"venue": "Stadion Letná",
"match_id": "match12345",
"report_url": "https://www.fotbal.cz/..."
}
]
}
]
}</pre>
</details>
</section>

<section class="ep">
<h2>Club Tables (Standings)</h2>
<p><strong>GET</strong> <code>/club/{type}/{id}/table</code></p>
<p>Returns standings (overall table) for each competition of the club.</p>
<p>Example: <a id="ex-table" href="/club/football/00000000-0000-0000-0000-000000000000/table">/club/football/{id}/table</a></p>
<details>
<summary>Response shape</summary>
<pre>{
"name": "AC Sparta Praha",
"club_id": "00000000-0000-0000-0000-000000000000",
"club_type": "football",
"club_internal_id": "123456",
"url": "https://www.fotbal.cz/...",
"logo_url": "https://is1.fotbal.cz/media/kluby/.../logo.jpg",
"competitions": [
{
"id": "12345",
"code": "1. LIGA",
"name": "Fortuna Liga",
"team_count": "16",
"matches_link": "https://www.fotbal.cz/...",
"table": {
"overall": [
{
"rank": "1",
"team": "AC Sparta Praha",
"team_id": "00000000-0000-0000-0000-000000000000",
"team_logo_url": "https://.../sparta.png",
"played": "10",
"wins": "8",
"draws": "2",
"losses": "0",
"score": "25:5",
"points": "26"
},
{
"rank": "2",
"team": "SK Slavia Praha",
"team_id": "11111111-1111-1111-1111-111111111111",
"team_logo_url": "https://.../slavia.png",
"played": "10",
"wins": "7",
"draws": "2",
"losses": "1",
"score": "20:8",
"points": "23"
}
]
}
}
]
}</pre>
</details>
</section>

<section class="ep">
<h2>Shortcuts</h2>
<p><strong>GET</strong> <code>/club/{id}</code> → redirects to <code>/club/football/{id}</code></p>
</section>

<footer>
<p>Tip: Use a reverse proxy in production and set proper timeouts. This API scrapes public pages and may be rate-limited upstream.</p>
</footer>
</body>
</html>`)
	if err != nil {
		log.Printf("writing docs page failed: %v", err)
	}
}
|
||
|
||
// containsFold reports whether substr occurs in s, comparing the trimmed,
// lower-cased forms of both strings. A substr that is empty after trimming
// never matches.
func containsFold(s, substr string) bool {
	needle := strings.ToLower(strings.TrimSpace(substr))
	if needle == "" {
		return false
	}
	haystack := strings.ToLower(strings.TrimSpace(s))
	return strings.Contains(haystack, needle)
}
|
||
|
||
// hrefUUIDRe matches a UUID-like token (8-4-4-4-12 hexadecimal digits).
// It is compiled once instead of on every call.
var hrefUUIDRe = regexp.MustCompile(`[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)

// extractUUIDFromHref finds the first UUID-like token in an href and returns
// it, or "" when the href is blank or contains no such token.
//
// The former "last path segment" fallback is gone: it tested the same
// pattern on a substring of href, so it could never match once the search
// over the whole href had failed.
func extractUUIDFromHref(href string) string {
	return hrefUUIDRe.FindString(strings.TrimSpace(href))
}
|
||
|
||
// Match is one fixture or result as serialized in the club info response.
type Match struct {
	// DateTime is the scraped date/time text of the match.
	DateTime string `json:"date_time"`
	// Home is the home team name.
	Home string `json:"home"`
	// HomeID is the home team identifier, when known.
	HomeID string `json:"home_id,omitempty"`
	// HomeLogoURL is the resolved home team logo.
	HomeLogoURL string `json:"home_logo_url,omitempty"`
	// Away is the away team name.
	Away string `json:"away"`
	// AwayID is the away team identifier, when known.
	AwayID string `json:"away_id,omitempty"`
	// AwayLogoURL is the resolved away team logo.
	AwayLogoURL string `json:"away_logo_url,omitempty"`
	// Score is the scraped score text.
	Score string `json:"score"`
	// Venue is the scraped venue text.
	Venue string `json:"venue"`
	// Note is an optional free-text remark.
	Note string `json:"note,omitempty"`
	// MatchID is the identifier of the match.
	MatchID string `json:"match_id"`
	// ReportURL links to the match report, when available.
	ReportURL string `json:"report_url,omitempty"`
	// FACRLink is an optional link to the match on the FACR site.
	FACRLink string `json:"facr_link,omitempty"`
	// DelegationURL is an optional link to the delegation page.
	DelegationURL string `json:"delegation_url,omitempty"`
}
|
||
|
||
// TableRow is a single team line of a standings table. All values are kept
// as the text scraped from the page.
type TableRow struct {
	// Rank is the position of the team in the table.
	Rank string `json:"rank"`
	// Team is the team name.
	Team string `json:"team"`
	// TeamID is the team identifier, when the row links to one.
	TeamID string `json:"team_id,omitempty"`
	// TeamLogoURL is the resolved team logo.
	TeamLogoURL string `json:"team_logo_url,omitempty"`
	// Played is the number of matches played.
	Played string `json:"played"`
	// Wins is the number of wins.
	Wins string `json:"wins"`
	// Draws is the number of draws.
	Draws string `json:"draws"`
	// Losses is the number of losses.
	Losses string `json:"losses"`
	// Score is the goal score, e.g. "25:5".
	Score string `json:"score"`
	// Points is the number of points.
	Points string `json:"points"`
}
|
||
|
||
// resolveISURL makes relative IS links absolute against
// https://is.fotbal.cz/public/ and normalizes absolute ones.
//
// Absolute http(s) links are rewritten to scheme https and host
// is.fotbal.cz, a path starting with "/zapasy/" gains the "/public" prefix,
// and the "discipline" query parameter is removed (the remaining query is
// re-encoded). An absolute link that cannot be parsed is returned unchanged.
//
// Relative links lose any leading "./", "../" sequences and a single leading
// "/", and are then placed under "/public/". The part after the first "?" is
// kept as the query string. Previously the whole href, "?" included, was
// stored in URL.Path, so the result contained "%3F" instead of "?".
func resolveISURL(href string) string {
	href = strings.TrimSpace(href)
	if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
		u, err := neturl.Parse(href)
		if err != nil {
			return href
		}
		u.Scheme = "https"
		u.Host = "is.fotbal.cz"
		// A path starting with "/zapasy/" cannot already start with "/public/".
		if strings.HasPrefix(u.Path, "/zapasy/") {
			u.Path = "/public" + u.Path
		}
		q := u.Query()
		q.Del("discipline")
		u.RawQuery = q.Encode()
		return u.String()
	}

	href = strings.TrimPrefix(href, "./")
	for strings.HasPrefix(href, "../") {
		href = strings.TrimPrefix(href, "../")
	}
	href = strings.TrimPrefix(href, "/")

	// Split the query off before building the URL: URL.String escapes a "?"
	// that is part of Path.
	relPath, rawQuery, _ := strings.Cut(href, "?")
	u := neturl.URL{
		Scheme:   "https",
		Host:     "is.fotbal.cz",
		Path:     "/public/" + relPath,
		RawQuery: rawQuery,
	}
	return u.String()
}
|