This commit is contained in:
Tomas Dvorak
2026-03-20 16:17:39 +01:00
parent dc3b7e22ee
commit a9a89bed7c
3 changed files with 35 additions and 5 deletions
-1
View File
@@ -2,7 +2,6 @@
.git .git
.gitignore .gitignore
README.md README.md
Dockerfile
.dockerignore .dockerignore
.env .env
.venv* .venv*
+3 -4
View File
@@ -1,4 +1,3 @@
export LOGOAPI_BASE_URL="https://logoapi.sportcreative.eu" # or your real logoapi base URL LOGOAPI_BASE_URL=https://logoapi.sportcreative.eu
# Cloudflare Browser Rendering API configuration CLOUDFLARE_ACCOUNT_ID=2154bf34f65a995f9b85aa17fee9da43
export CLOUDFLARE_ACCOUNT_ID="2154bf34f65a995f9b85aa17fee9da43" # Your Cloudflare account ID CLOUDFLARE_API_TOKEN=TdhMaQWPnxCwc-g22W9l-A26hYTdkn_9tQCUKZ0h
export CLOUDFLARE_API_TOKEN="TdhMaQWPnxCwc-g22W9l-A26hYTdkn_9tQCUKZ0h" # API token with Browser Rendering - Edit permission
+32
View File
@@ -92,8 +92,19 @@ var (
embeddedScraplingHelperOnce sync.Once embeddedScraplingHelperOnce sync.Once
embeddedScraplingHelperFile string embeddedScraplingHelperFile string
embeddedScraplingHelperErr error embeddedScraplingHelperErr error
// Simple in-memory cache for fetched pages
pageCache = make(map[string]*cacheEntry)
pageCacheMu sync.RWMutex
) )
// cacheEntry is one record in pageCache: a fetched page body together with
// the time at which it was stored.
type cacheEntry struct {
// body is the page content passed to cachePage; a cache hit returns this
// slice as-is, without copying.
body []byte
// timestamp is when the entry was stored; lookups compare its age
// against cacheTTL to decide whether the entry may still be served.
timestamp time.Time
}
// cacheTTL is how long a cached page is served from pageCache. Once an entry
// is at least this old, the lookup ignores it and the page is fetched again.
const cacheTTL = 5 * time.Minute
// NewCloudflareClient creates a new Cloudflare Browser Rendering API client // NewCloudflareClient creates a new Cloudflare Browser Rendering API client
func NewCloudflareClient() *CloudflareClient { func NewCloudflareClient() *CloudflareClient {
accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID")) accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID"))
@@ -592,8 +603,20 @@ func fetchPageWithFallback(url string) ([]byte, error) {
// fetchPageWithFallbackOptions tries Go HTTP first, then wget, then Scrapling, then Cloudflare Browser Rendering. // fetchPageWithFallbackOptions tries Go HTTP first, then wget, then Scrapling, then Cloudflare Browser Rendering.
func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) { func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) {
// Check cache first
pageCacheMu.RLock()
if entry, ok := pageCache[url]; ok {
if time.Since(entry.timestamp) < cacheTTL {
pageCacheMu.RUnlock()
log.Printf("Cache hit for %s", url)
return entry.body, nil
}
}
pageCacheMu.RUnlock()
body, err := fetchPageDirect(url, opts) body, err := fetchPageDirect(url, opts)
if err == nil { if err == nil {
cachePage(url, body)
return body, nil return body, nil
} }
log.Printf("Direct request failed for %s: %v", url, err) log.Printf("Direct request failed for %s: %v", url, err)
@@ -601,6 +624,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
body, err = fetchPageWithWget(url, opts) body, err = fetchPageWithWget(url, opts)
if err == nil { if err == nil {
log.Printf("Successfully retrieved content via wget for %s", url) log.Printf("Successfully retrieved content via wget for %s", url)
cachePage(url, body)
return body, nil return body, nil
} }
log.Printf("wget fallback failed for %s: %v", url, err) log.Printf("wget fallback failed for %s: %v", url, err)
@@ -608,6 +632,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
body, err = fetchPageWithScrapling(url, opts) body, err = fetchPageWithScrapling(url, opts)
if err == nil { if err == nil {
log.Printf("Successfully retrieved content via Scrapling for %s", url) log.Printf("Successfully retrieved content via Scrapling for %s", url)
cachePage(url, body)
return body, nil return body, nil
} }
log.Printf("Scrapling fallback failed for %s: %v", url, err) log.Printf("Scrapling fallback failed for %s: %v", url, err)
@@ -629,6 +654,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
return nil, fmt.Errorf("Cloudflare crawl returned a challenge page") return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
} }
log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url) log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
cachePage(url, body)
return body, nil return body, nil
} }
@@ -639,6 +665,12 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available") return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
} }
// cachePage stores body in the in-memory page cache under url, stamped with
// the current time. An existing entry for the same url is replaced.
//
// Before inserting, every entry whose age has reached cacheTTL is removed.
// The cache lookup already refuses to serve such entries, so dropping them
// changes no result; it only stops the map (and the page bodies it holds)
// from growing without bound in a long-running process.
//
// body is stored as-is, not copied, so the caller must not modify it after
// handing it over.
func cachePage(url string, body []byte) {
	// One clock reading is used for both the expiry sweep and the new
	// entry's timestamp so the two are consistent.
	now := time.Now()

	pageCacheMu.Lock()
	defer pageCacheMu.Unlock()

	// Deleting map entries while ranging over the map is well-defined in Go.
	for key, entry := range pageCache {
		if now.Sub(entry.timestamp) >= cacheTTL {
			delete(pageCache, key)
		}
	}

	pageCache[url] = &cacheEntry{body: body, timestamp: now}
}
// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz // parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}). // competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
// It filters to only include matches involving the given clubName if provided. // It filters to only include matches involving the given clubName if provided.