mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
update
This commit is contained in:
@@ -2,7 +2,6 @@
|
|||||||
.git
|
.git
|
||||||
.gitignore
|
.gitignore
|
||||||
README.md
|
README.md
|
||||||
Dockerfile
|
|
||||||
.dockerignore
|
.dockerignore
|
||||||
.env
|
.env
|
||||||
.venv*
|
.venv*
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
export LOGOAPI_BASE_URL="https://logoapi.sportcreative.eu" # or your real logoapi base URL
|
LOGOAPI_BASE_URL=https://logoapi.sportcreative.eu
|
||||||
# Cloudflare Browser Rendering API configuration
|
CLOUDFLARE_ACCOUNT_ID=2154bf34f65a995f9b85aa17fee9da43
|
||||||
export CLOUDFLARE_ACCOUNT_ID="2154bf34f65a995f9b85aa17fee9da43" # Your Cloudflare account ID
|
CLOUDFLARE_API_TOKEN=TdhMaQWPnxCwc-g22W9l-A26hYTdkn_9tQCUKZ0h
|
||||||
export CLOUDFLARE_API_TOKEN="TdhMaQWPnxCwc-g22W9l-A26hYTdkn_9tQCUKZ0h" # API token with Browser Rendering - Edit permission
|
|
||||||
@@ -92,8 +92,19 @@ var (
|
|||||||
embeddedScraplingHelperOnce sync.Once
|
embeddedScraplingHelperOnce sync.Once
|
||||||
embeddedScraplingHelperFile string
|
embeddedScraplingHelperFile string
|
||||||
embeddedScraplingHelperErr error
|
embeddedScraplingHelperErr error
|
||||||
|
|
||||||
|
// Simple in-memory cache for fetched pages
|
||||||
|
pageCache = make(map[string]*cacheEntry)
|
||||||
|
pageCacheMu sync.RWMutex
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// cacheEntry is a single record in the in-memory page cache: the bytes of a
// fetched page together with the moment they were stored.
type cacheEntry struct {
|
||||||
|
// body is the fetched page content. cachePage stores the caller's slice
// as-is and the cache lookup returns it as-is, so it is shared, not copied.
body []byte
|
||||||
|
// timestamp is when the entry was stored (time.Now() in cachePage); the
// lookup compares its age against cacheTTL.
timestamp time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
// cacheTTL is how long a cached page stays fresh: the lookup serves an entry
// only while time.Since(entry.timestamp) is below this value.
const cacheTTL = 5 * time.Minute
|
||||||
|
|
||||||
// NewCloudflareClient creates a new Cloudflare Browser Rendering API client
|
// NewCloudflareClient creates a new Cloudflare Browser Rendering API client
|
||||||
func NewCloudflareClient() *CloudflareClient {
|
func NewCloudflareClient() *CloudflareClient {
|
||||||
accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID"))
|
accountID := strings.TrimSpace(os.Getenv("CLOUDFLARE_ACCOUNT_ID"))
|
||||||
@@ -592,8 +603,20 @@ func fetchPageWithFallback(url string) ([]byte, error) {
|
|||||||
|
|
||||||
// fetchPageWithFallback tries Go HTTP first, then wget, then Scrapling, then Cloudflare Browser Rendering.
|
// fetchPageWithFallbackOptions checks the in-memory page cache first, then tries Go HTTP, then wget, then Scrapling, then Cloudflare Browser Rendering.
|
||||||
func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) {
|
func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error) {
|
||||||
|
// Check cache first
|
||||||
|
pageCacheMu.RLock()
|
||||||
|
if entry, ok := pageCache[url]; ok {
|
||||||
|
if time.Since(entry.timestamp) < cacheTTL {
|
||||||
|
pageCacheMu.RUnlock()
|
||||||
|
log.Printf("Cache hit for %s", url)
|
||||||
|
return entry.body, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pageCacheMu.RUnlock()
|
||||||
|
|
||||||
body, err := fetchPageDirect(url, opts)
|
body, err := fetchPageDirect(url, opts)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
cachePage(url, body)
|
||||||
return body, nil
|
return body, nil
|
||||||
}
|
}
|
||||||
log.Printf("Direct request failed for %s: %v", url, err)
|
log.Printf("Direct request failed for %s: %v", url, err)
|
||||||
@@ -601,6 +624,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
|||||||
body, err = fetchPageWithWget(url, opts)
|
body, err = fetchPageWithWget(url, opts)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
log.Printf("Successfully retrieved content via wget for %s", url)
|
log.Printf("Successfully retrieved content via wget for %s", url)
|
||||||
|
cachePage(url, body)
|
||||||
return body, nil
|
return body, nil
|
||||||
}
|
}
|
||||||
log.Printf("wget fallback failed for %s: %v", url, err)
|
log.Printf("wget fallback failed for %s: %v", url, err)
|
||||||
@@ -608,6 +632,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
|||||||
body, err = fetchPageWithScrapling(url, opts)
|
body, err = fetchPageWithScrapling(url, opts)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
log.Printf("Successfully retrieved content via Scrapling for %s", url)
|
log.Printf("Successfully retrieved content via Scrapling for %s", url)
|
||||||
|
cachePage(url, body)
|
||||||
return body, nil
|
return body, nil
|
||||||
}
|
}
|
||||||
log.Printf("Scrapling fallback failed for %s: %v", url, err)
|
log.Printf("Scrapling fallback failed for %s: %v", url, err)
|
||||||
@@ -629,6 +654,7 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
|||||||
return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
|
return nil, fmt.Errorf("Cloudflare crawl returned a challenge page")
|
||||||
}
|
}
|
||||||
log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
|
log.Printf("Successfully retrieved content via Cloudflare crawl for %s", url)
|
||||||
|
cachePage(url, body)
|
||||||
return body, nil
|
return body, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -639,6 +665,12 @@ func fetchPageWithFallbackOptions(url string, opts fetchOptions) ([]byte, error)
|
|||||||
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
|
return nil, fmt.Errorf("go scraping failed, wget failed, Scrapling failed, and Cloudflare client is not available")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func cachePage(url string, body []byte) {
|
||||||
|
pageCacheMu.Lock()
|
||||||
|
pageCache[url] = &cacheEntry{body: body, timestamp: time.Now()}
|
||||||
|
pageCacheMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
|
// parseCompetitionMatchesFromFotbal scrapes matches from the public fotbal.cz
|
||||||
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
|
// competition page (e.g., https://www.fotbal.cz/souteze/turnaje/table/{id}).
|
||||||
// It filters to only include matches involving the given clubName if provided.
|
// It filters to only include matches involving the given clubName if provided.
|
||||||
|
|||||||
Reference in New Issue
Block a user