mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
update
This commit is contained in:
@@ -92,8 +92,19 @@ var (
|
||||
embeddedScraplingHelperOnce sync.Once
|
||||
embeddedScraplingHelperFile string
|
||||
embeddedScraplingHelperErr error
|
||||
|
||||
// Simple in-memory cache for fetched pages
|
||||
pageCache = make(map[string]*cacheEntry)
|
||||
pageCacheMu sync.RWMutex
|
||||
)
|
||||
|
||||
// cacheEntry is a single cached page: the fetched bytes together with the
// time they were stored. Lookups compare the stored time against cacheTTL.
type cacheEntry struct {
	body      []byte    // page content exactly as handed to the cache
	timestamp time.Time // when the entry was written
}
|
||||
|
||||
// cacheTTL is how long a cached page is considered fresh.
const cacheTTL time.Duration = 5 * time.Minute
|
||||
|
||||
// NewCloudflareClient creates a new Cloudflare Browser Rendering API client
|
||||
func NewCloudflareClient() *CloudflareClient {
|
||||
accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID"))
|
||||
@@ -592,8 +603,20 @@ func fetchPageWithFallback(url string) ([]byte, error) {
|
||||
|
||||
// fetchPageWithFallbackOptions checks the in-memory page cache first, then tries Go HTTP, wget, Scrapling, and finally Cloudflare Browser Rendering.
|
||||
func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) {
|
||||
// Check cache first
|
||||
pageCacheMu.RLock()
|
||||
if entry, ok := pageCache[url]; ok {
|
||||
if time.Since(entry.timestamp) < cacheTTL {
|
||||
pageCacheMu.RUnlock()
|
||||
log.Printf("Cache hit for %s", url)
|
||||
return entry.body, nil
|
||||
}
|
||||
}
|
||||
pageCacheMu.RUnlock()
|
||||
|
||||
body, err := fetchPageDirect(url, opts)
|
||||
if err == nil {
|
||||
cachePage(url, body)
|
||||
return body, nil
|
||||
}
|
||||
log.Printf("Direct request failed for %s: %v", url, err)
|
||||
@@ -601,6 +624,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
||||
body, err = fetchPageWithWget(url, opts)
|
||||
if err == nil {
|
||||
log.Printf("Successfully retrieved content via wget for %s", url)
|
||||
cachePage(url, body)
|
||||
return body, nil
|
||||
}
|
||||
log.Printf("wget fallback failed for %s: %v", url, err)
|
||||
@@ -608,6 +632,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
||||
body, err = fetchPageWithScrapling(url, opts)
|
||||
if err == nil {
|
||||
log.Printf("Successfully retrieved content via Scrapling for %s", url)
|
||||
cachePage(url, body)
|
||||
return body, nil
|
||||
}
|
||||
log.Printf("Scrapling fallback failed for %s: %v", url, err)
|
||||
@@ -629,6 +654,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
||||
return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
|
||||
}
|
||||
log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
|
||||
cachePage(url, body)
|
||||
return body, nil
|
||||
}
|
||||
|
||||
@@ -639,6 +665,12 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
||||
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
|
||||
}
|
||||
|
||||
func cachePage(url string, body []byte) {
|
||||
pageCacheMu.Lock()
|
||||
pageCache[url] = &cacheEntry{body: body, timestamp: time.Now()}
|
||||
pageCacheMu.Unlock()
|
||||
}
|
||||
|
||||
// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
|
||||
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
|
||||
// It filters to only include matches involving the given clubName if provided.
|
||||
|
||||
Reference in New Issue
Block a user