mirror of
https://github.com/Dvorinka/facr-scraper.git
synced 2026-06-03 20:12:57 +00:00
feat(scraper): implement CloakBrowser support and enhance request stealth
Integrate CloakBrowser to improve success rates against Cloudflare challenges and implement more robust request handling in the Go backend. - Add CloakBrowser integration to Dockerfile and requirements - Implement domain-specific request semaphores in Go to prevent rate-limiting - Add shared HTTP client with cookie jar and header preservation for better session management - Enhance request headers in Go to include modern client hints (Sec-Ch-Ua) - Add benchmarking scripts to compare fetch methods (urllib vs Scrapling vs CloakBrowser) - Update docker-compose to support CloakBrowser environment variables - Optimize Docker image by pre-downloading patched Chromium binaries
This commit is contained in:
+13
-3
@@ -31,6 +31,10 @@ RUN pip install --no-cache-dir -r requirements-scrapling.txt
|
|||||||
# Install Playwright browsers with deps in one layer
|
# Install Playwright browsers with deps in one layer
|
||||||
RUN playwright install chromium --with-deps
|
RUN playwright install chromium --with-deps
|
||||||
|
|
||||||
|
# Pre-download CloakBrowser patched Chromium binary so it doesn't
|
||||||
|
# download at runtime (saves ~10-20s per cold-start request).
|
||||||
|
RUN python -m cloakbrowser install
|
||||||
|
|
||||||
# Fix Python symlinks
|
# Fix Python symlinks
|
||||||
RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python \
|
RUN ln -sf /usr/local/bin/python /opt/scrapling/bin/python \
|
||||||
&& ln -sf /usr/local/bin/python /opt/scrapling/bin/python3
|
&& ln -sf /usr/local/bin/python /opt/scrapling/bin/python3
|
||||||
@@ -50,6 +54,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
|
libxkbcommon0 libatspi2.0-0 libx11-6 libxcomposite1 libxdamage1 \
|
||||||
libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 \
|
libxext6 libxfixes3 libxrandr2 libgbm1 libcairo2 libpango-1.0-0 \
|
||||||
libasound2 \
|
libasound2 \
|
||||||
|
fonts-liberation fonts-noto-color-emoji fonts-noto-core \
|
||||||
|
fontconfig locales \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Create non-root user
|
# Create non-root user
|
||||||
@@ -64,10 +70,14 @@ COPY --from=python-builder /opt/scrapling /opt/scrapling
|
|||||||
# Copy Playwright browser cache
|
# Copy Playwright browser cache
|
||||||
COPY --from=python-builder /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright
|
COPY --from=python-builder /root/.cache/ms-playwright /home/scraper/.cache/ms-playwright
|
||||||
|
|
||||||
# Copy scrapling script
|
# Copy CloakBrowser patched Chromium binary cache
|
||||||
COPY scripts/scrapling_fetch.py /opt/scrapling/scripts/scrapling_fetch.py
|
COPY --from=python-builder /root/.cloakbrowser /home/scraper/.cloakbrowser
|
||||||
|
|
||||||
# Create cache directory for Playwright
|
# Copy scrapling and cloakbrowser scripts
|
||||||
|
COPY scripts/scrapling_fetch.py /opt/scrapling/scripts/scrapling_fetch.py
|
||||||
|
COPY scripts/cloakbrowser_fetch.py /opt/scrapling/scripts/cloakbrowser_fetch.py
|
||||||
|
|
||||||
|
# Create cache directory and fix permissions
|
||||||
RUN mkdir -p /home/scraper/.cache && chown -R scraper:scraper /home/scraper /opt/scrapling
|
RUN mkdir -p /home/scraper/.cache && chown -R scraper:scraper /home/scraper /opt/scrapling
|
||||||
|
|
||||||
USER scraper
|
USER scraper
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
FROM facr-scraper:cloakbrowser
|
||||||
|
USER root
|
||||||
|
COPY facr-scraper-new /usr/local/bin/facr-scraper
|
||||||
|
COPY scripts/cloakbrowser_fetch.py /opt/scrapling/scripts/cloakbrowser_fetch.py
|
||||||
|
RUN chmod +x /usr/local/bin/facr-scraper && chown -R scraper:scraper /opt/scrapling/scripts
|
||||||
|
USER scraper
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
FROM facr-scraper:cloakbrowser
|
||||||
|
COPY facr-scraper /usr/local/bin/facr-scraper
|
||||||
@@ -11,6 +11,10 @@ services:
|
|||||||
- CLOUDFLARE_API_TOKEN=${CLOUDFLARE_API_TOKEN}
|
- CLOUDFLARE_API_TOKEN=${CLOUDFLARE_API_TOKEN}
|
||||||
- SCRAPLING_PYTHON_BIN=/opt/scrapling/bin/python
|
- SCRAPLING_PYTHON_BIN=/opt/scrapling/bin/python
|
||||||
- SCRAPLING_SCRIPT=/opt/scrapling/scripts/scrapling_fetch.py
|
- SCRAPLING_SCRIPT=/opt/scrapling/scripts/scrapling_fetch.py
|
||||||
|
- CLOAKBROWSER_PYTHON_BIN=/opt/scrapling/bin/python
|
||||||
|
- CLOAKBROWSER_SCRIPT=/opt/scrapling/scripts/cloakbrowser_fetch.py
|
||||||
|
- CLOAKBROWSER_TIMEZONE=${CLOAKBROWSER_TIMEZONE:-Europe/Prague}
|
||||||
|
- CLOAKBROWSER_LOCALE=${CLOAKBROWSER_LOCALE:-cs-CZ}
|
||||||
- DEBUG_SAVE_HTML=${DEBUG_SAVE_HTML:-}
|
- DEBUG_SAVE_HTML=${DEBUG_SAVE_HTML:-}
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
deploy:
|
deploy:
|
||||||
|
|||||||
Executable
BIN
Binary file not shown.
Executable
BIN
Binary file not shown.
@@ -9,6 +9,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/http/cookiejar"
|
||||||
neturl "net/url"
|
neturl "net/url"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -22,6 +23,7 @@ import (
|
|||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/gorilla/mux"
|
"github.com/gorilla/mux"
|
||||||
|
"golang.org/x/net/publicsuffix"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Competition struct {
|
type Competition struct {
|
||||||
@@ -56,6 +58,10 @@ var (
|
|||||||
// Simple in-memory cache for fetched pages
|
// Simple in-memory cache for fetched pages
|
||||||
pageCache = make(map[string]*cacheEntry)
|
pageCache = make(map[string]*cacheEntry)
|
||||||
pageCacheMu sync.RWMutex
|
pageCacheMu sync.RWMutex
|
||||||
|
|
||||||
|
// Club response cache for expensive multi-fetch endpoints
|
||||||
|
clubCache = make(map[string]*clubCacheEntry)
|
||||||
|
clubCacheMu sync.RWMutex
|
||||||
)
|
)
|
||||||
|
|
||||||
type cacheEntry struct {
|
type cacheEntry struct {
|
||||||
@@ -63,7 +69,13 @@ type cacheEntry struct {
|
|||||||
timestamp time.Time
|
timestamp time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// clubCacheEntry is one cached handler response stored in clubCache.
type clubCacheEntry struct {
|
||||||
|
// data is the JSON-encoded response body; it is written back as-is on a cache hit.
data []byte
|
||||||
|
// timestamp is when the entry was stored; it is compared against clubCacheTTL on lookup.
timestamp time.Time
|
||||||
|
}
|
||||||
|
|
||||||
const cacheTTL = 15 * time.Minute
|
const cacheTTL = 15 * time.Minute
|
||||||
|
const clubCacheTTL = 30 * time.Minute
|
||||||
|
|
||||||
// domainBreakers is a per-domain circuit breaker map so failures on one site
|
// domainBreakers is a per-domain circuit breaker map so failures on one site
|
||||||
// don't block Scrapling for unrelated sites.
|
// don't block Scrapling for unrelated sites.
|
||||||
@@ -76,6 +88,60 @@ var domainBreakers struct {
|
|||||||
// and resource exhaustion.
|
// and resource exhaustion.
|
||||||
var scraplingSem = newSemaphore(2)
|
var scraplingSem = newSemaphore(2)
|
||||||
|
|
||||||
|
// domainReqSem limits concurrent requests to the same domain to avoid
|
||||||
|
// triggering Cloudflare rate-limiting.
|
||||||
|
var domainReqSem struct {
|
||||||
|
mu sync.RWMutex
|
||||||
|
sems map[string]*semaphore
|
||||||
|
}
|
||||||
|
|
||||||
|
func getDomainReqSem(domain string) *semaphore {
|
||||||
|
domainReqSem.mu.RLock()
|
||||||
|
if s, ok := domainReqSem.sems[domain]; ok {
|
||||||
|
domainReqSem.mu.RUnlock()
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
domainReqSem.mu.RUnlock()
|
||||||
|
|
||||||
|
domainReqSem.mu.Lock()
|
||||||
|
defer domainReqSem.mu.Unlock()
|
||||||
|
if s, ok := domainReqSem.sems[domain]; ok {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
s := newSemaphore(1)
|
||||||
|
if domainReqSem.sems == nil {
|
||||||
|
domainReqSem.sems = make(map[string]*semaphore)
|
||||||
|
}
|
||||||
|
domainReqSem.sems[domain] = s
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
// sharedHTTPClient is a reusable client with a cookie jar so that cookies
|
||||||
|
// (including any Cloudflare clearance) survive across requests.
|
||||||
|
var sharedHTTPClient = func() *http.Client {
|
||||||
|
jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List})
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("failed to create cookie jar, falling back to default client: %v", err)
|
||||||
|
return &http.Client{Timeout: 15 * time.Second}
|
||||||
|
}
|
||||||
|
return &http.Client{
|
||||||
|
Timeout: 15 * time.Second,
|
||||||
|
Jar: jar,
|
||||||
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||||
|
if len(via) >= 10 {
|
||||||
|
return fmt.Errorf("stopped after 10 redirects")
|
||||||
|
}
|
||||||
|
// Preserve headers across redirects
|
||||||
|
for _, h := range []string{"User-Agent", "Accept", "Accept-Language", "Referer", "Sec-Ch-Ua", "Sec-Ch-Ua-Mobile", "Sec-Ch-Ua-Platform", "Sec-Fetch-Dest", "Sec-Fetch-Mode", "Sec-Fetch-Site", "Upgrade-Insecure-Requests"} {
|
||||||
|
if v := via[len(via)-1].Header.Get(h); v != "" {
|
||||||
|
req.Header.Set(h, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
type circuitBreaker struct {
|
type circuitBreaker struct {
|
||||||
failures int32
|
failures int32
|
||||||
lastFail time.Time
|
lastFail time.Time
|
||||||
@@ -163,8 +229,21 @@ func newBrowserRequest(url string, opts fetchOptions) (*http.Request, error) {
|
|||||||
req.Header.Set("User-Agent", browserUserAgent)
|
req.Header.Set("User-Agent", browserUserAgent)
|
||||||
req.Header.Set("Accept", browserAccept)
|
req.Header.Set("Accept", browserAccept)
|
||||||
req.Header.Set("Accept-Language", browserAcceptLanguage)
|
req.Header.Set("Accept-Language", browserAcceptLanguage)
|
||||||
|
req.Header.Set("Accept-Encoding", "gzip, deflate, br")
|
||||||
|
req.Header.Set("Connection", "keep-alive")
|
||||||
|
req.Header.Set("Upgrade-Insecure-Requests", "1")
|
||||||
|
req.Header.Set("Sec-Ch-Ua", `"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"`)
|
||||||
|
req.Header.Set("Sec-Ch-Ua-Mobile", "?0")
|
||||||
|
req.Header.Set("Sec-Ch-Ua-Platform", `"Windows"`)
|
||||||
|
req.Header.Set("Sec-Fetch-Dest", "document")
|
||||||
|
req.Header.Set("Sec-Fetch-Mode", "navigate")
|
||||||
|
req.Header.Set("Sec-Fetch-Site", "none")
|
||||||
|
req.Header.Set("Sec-Fetch-User", "?1")
|
||||||
|
req.Header.Set("DNT", "1")
|
||||||
|
req.Header.Set("Cache-Control", "max-age=0")
|
||||||
if opts.Referer != "" {
|
if opts.Referer != "" {
|
||||||
req.Header.Set("Referer", opts.Referer)
|
req.Header.Set("Referer", opts.Referer)
|
||||||
|
req.Header.Set("Sec-Fetch-Site", "same-origin")
|
||||||
}
|
}
|
||||||
|
|
||||||
return req, nil
|
return req, nil
|
||||||
@@ -209,14 +288,25 @@ func compactErrorText(s string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func fetchPageDirectOnce(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
func fetchPageDirectOnce(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
|
parsed, err := neturl.Parse(url)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid URL: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Serialize requests per domain to avoid triggering rate limits.
|
||||||
|
sem := getDomainReqSem(parsed.Host)
|
||||||
|
if err := sem.Acquire(ctx); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer sem.Release()
|
||||||
|
|
||||||
req, err := newBrowserRequest(url, opts)
|
req, err := newBrowserRequest(url, opts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
req = req.WithContext(ctx)
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
client := &http.Client{Timeout: 15 * time.Second}
|
resp, err := sharedHTTPClient.Do(req)
|
||||||
resp, err := client.Do(req)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("direct request failed: %w", err)
|
return nil, fmt.Errorf("direct request failed: %w", err)
|
||||||
}
|
}
|
||||||
@@ -463,6 +553,129 @@ func findScraplingPython() string {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func findCloakBrowserPython() string {
|
||||||
|
cwd, _ := os.Getwd()
|
||||||
|
|
||||||
|
exePath, _ := os.Executable()
|
||||||
|
exeDir := ""
|
||||||
|
if exePath != "" {
|
||||||
|
exeDir = filepath.Dir(exePath)
|
||||||
|
}
|
||||||
|
|
||||||
|
return firstExecutable(
|
||||||
|
os.Getenv("CLOAKBROWSER_PYTHON_BIN"),
|
||||||
|
filepath.Join(cwd, ".venv-scrapling", "bin", "python3"),
|
||||||
|
filepath.Join(cwd, ".venv-scrapling", "bin", "python"),
|
||||||
|
filepath.Join(cwd, ".venv", "bin", "python3"),
|
||||||
|
filepath.Join(cwd, ".venv", "bin", "python"),
|
||||||
|
filepath.Join(exeDir, ".venv-scrapling", "bin", "python3"),
|
||||||
|
filepath.Join(exeDir, ".venv-scrapling", "bin", "python"),
|
||||||
|
filepath.Join(exeDir, ".venv", "bin", "python3"),
|
||||||
|
filepath.Join(exeDir, ".venv", "bin", "python"),
|
||||||
|
"python3",
|
||||||
|
"python",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// findCloakBrowserScript locates the cloakbrowser_fetch.py helper script.
//
// Candidates are checked in this order and the first one that exists and is
// not a directory wins:
//
//  1. the CLOAKBROWSER_SCRIPT environment variable,
//  2. scripts/cloakbrowser_fetch.py and cloakbrowser_fetch.py under the
//     working directory,
//  3. /opt/scrapling/scripts/cloakbrowser_fetch.py,
//  4. scripts/cloakbrowser_fetch.py and cloakbrowser_fetch.py next to the
//     running executable.
//
// It returns the matching path, or an error when no candidate qualifies.
func findCloakBrowserScript() (string, error) {
	// Lookup errors are ignored on purpose: an empty working directory simply
	// yields relative candidate paths.
	cwd, _ := os.Getwd()
	candidates := []string{
		os.Getenv("CLOAKBROWSER_SCRIPT"),
		filepath.Join(cwd, "scripts", "cloakbrowser_fetch.py"),
		filepath.Join(cwd, "cloakbrowser_fetch.py"),
		"/opt/scrapling/scripts/cloakbrowser_fetch.py",
	}
	if exePath, _ := os.Executable(); exePath != "" {
		exeDir := filepath.Dir(exePath)
		candidates = append(candidates,
			filepath.Join(exeDir, "scripts", "cloakbrowser_fetch.py"),
			filepath.Join(exeDir, "cloakbrowser_fetch.py"),
		)
	}

	for _, p := range candidates {
		if p == "" {
			continue
		}
		info, err := os.Stat(p)
		if err != nil {
			continue
		}
		// A directory cannot be run as a script (for example when
		// CLOAKBROWSER_SCRIPT points at the scripts folder); keep looking
		// instead of handing it to the interpreter.
		if info.IsDir() {
			continue
		}
		return p, nil
	}
	return "", fmt.Errorf("cloakbrowser_fetch.py not found")
}
|
||||||
|
|
||||||
|
// fetchPageWithCloakBrowser uses the CloakBrowser patched Chromium to fetch
|
||||||
|
// pages that are blocked by Cloudflare. It is ~3x faster than Scrapling for
|
||||||
|
// fotbal.cz because it passes bot detection without triggering challenge loops.
|
||||||
|
func fetchPageWithCloakBrowser(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
|
parsedURL, err := neturl.Parse(url)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("CloakBrowser skipped: invalid URL: %w", err)
|
||||||
|
}
|
||||||
|
domain := parsedURL.Host
|
||||||
|
|
||||||
|
if getDomainBreaker(domain).IsOpen() {
|
||||||
|
return nil, fmt.Errorf("CloakBrowser skipped: circuit breaker is open for %s", domain)
|
||||||
|
}
|
||||||
|
|
||||||
|
pythonBin := findCloakBrowserPython()
|
||||||
|
if pythonBin == "" {
|
||||||
|
return nil, fmt.Errorf("CloakBrowser skipped: no Python runtime found")
|
||||||
|
}
|
||||||
|
|
||||||
|
helperScript, err := findCloakBrowserScript()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("CloakBrowser skipped: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Acquire global Scrapling semaphore to limit concurrent Chromium launches
|
||||||
|
if err := scraplingSem.Acquire(ctx); err != nil {
|
||||||
|
return nil, fmt.Errorf("CloakBrowser skipped: %w", err)
|
||||||
|
}
|
||||||
|
defer scraplingSem.Release()
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, 45*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
args := []string{helperScript, url}
|
||||||
|
if opts.Referer != "" {
|
||||||
|
args = append(args, opts.Referer)
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd := exec.CommandContext(ctx, pythonBin, args...)
|
||||||
|
var stdout bytes.Buffer
|
||||||
|
var stderr bytes.Buffer
|
||||||
|
cmd.Stdout = &stdout
|
||||||
|
cmd.Stderr = &stderr
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
|
||||||
|
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
details := compactErrorText(stderr.String())
|
||||||
|
if details == "" {
|
||||||
|
details = compactErrorText(err.Error())
|
||||||
|
}
|
||||||
|
if ctx.Err() == nil {
|
||||||
|
getDomainBreaker(domain).RecordFailure()
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("CloakBrowser request failed: %s", details)
|
||||||
|
}
|
||||||
|
|
||||||
|
body := stdout.Bytes()
|
||||||
|
if len(body) == 0 {
|
||||||
|
getDomainBreaker(domain).RecordFailure()
|
||||||
|
return nil, fmt.Errorf("CloakBrowser returned an empty body")
|
||||||
|
}
|
||||||
|
if looksLikeCloudflareBlock(body) {
|
||||||
|
getDomainBreaker(domain).RecordFailure()
|
||||||
|
return nil, fmt.Errorf("CloakBrowser returned a Cloudflare challenge page")
|
||||||
|
}
|
||||||
|
|
||||||
|
getDomainBreaker(domain).RecordSuccess()
|
||||||
|
return body, nil
|
||||||
|
}
|
||||||
|
|
||||||
func fetchPageWithScrapling(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
func fetchPageWithScrapling(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
parsedURL, err := neturl.Parse(url)
|
parsedURL, err := neturl.Parse(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -490,10 +703,10 @@ func fetchPageWithScrapling(ctx context.Context, url string, opts fetchOptions)
|
|||||||
}
|
}
|
||||||
defer scraplingSem.Release()
|
defer scraplingSem.Release()
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(ctx, 90*time.Second)
|
ctx, cancel := context.WithTimeout(ctx, 120*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
args := []string{helperScript, "--url", url, "--timeout-ms", "60000", "--wait-ms", "500"}
|
args := []string{helperScript, "--url", url, "--timeout-ms", "90000", "--wait-ms", "500"}
|
||||||
if opts.Referer != "" {
|
if opts.Referer != "" {
|
||||||
args = append(args, "--referer", opts.Referer)
|
args = append(args, "--referer", opts.Referer)
|
||||||
}
|
}
|
||||||
@@ -540,9 +753,9 @@ func fetchPageWithFallback(ctx context.Context, url string) ([]byte, error) {
|
|||||||
return fetchPageWithFallbackOptions(ctx, url, fetchOptions{})
|
return fetchPageWithFallbackOptions(ctx, url, fetchOptions{})
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchPageWithFallback tries Go HTTP first, then curl/wget, then Scrapling.
|
// fetchPageWithFallbackOptions tries Go HTTP first, then curl/wget, then CloakBrowser,
|
||||||
// When direct HTTP returns a Cloudflare block, curl/wget are skipped since they
|
// then Scrapling. When direct HTTP returns a Cloudflare block, curl/wget are
|
||||||
// will just return the same challenge page and waste ~20 seconds.
|
// skipped since they will just return the same challenge page and waste ~20s.
|
||||||
func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOptions) ([]byte, error) {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -559,6 +772,7 @@ func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOpt
|
|||||||
}
|
}
|
||||||
pageCacheMu.RUnlock()
|
pageCacheMu.RUnlock()
|
||||||
|
|
||||||
|
// Try direct HTTP first
|
||||||
body, err := fetchPageDirect(ctx, url, opts)
|
body, err := fetchPageDirect(ctx, url, opts)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
cachePage(url, body)
|
cachePage(url, body)
|
||||||
@@ -567,10 +781,10 @@ func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOpt
|
|||||||
log.Printf("Direct request failed for %s: %v", url, err)
|
log.Printf("Direct request failed for %s: %v", url, err)
|
||||||
|
|
||||||
// If direct HTTP returned a Cloudflare block, skip curl/wget time-wasters
|
// If direct HTTP returned a Cloudflare block, skip curl/wget time-wasters
|
||||||
// and go straight to Scrapling which can solve the challenge.
|
// and go straight to CloakBrowser which can solve the challenge silently.
|
||||||
if strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "Cloudflare") {
|
if strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "Cloudflare") {
|
||||||
log.Printf("Skipping curl/wget fallbacks for %s: direct HTTP hit Cloudflare wall", url)
|
log.Printf("Skipping curl/wget fallbacks for %s: direct HTTP hit Cloudflare wall", url)
|
||||||
goto scraplingFallback
|
goto cloakBrowserFallback
|
||||||
}
|
}
|
||||||
|
|
||||||
body, err = fetchPageWithCurl(ctx, url, opts)
|
body, err = fetchPageWithCurl(ctx, url, opts)
|
||||||
@@ -589,7 +803,15 @@ func fetchPageWithFallbackOptions(ctx context.Context, url string, opts fetchOpt
|
|||||||
}
|
}
|
||||||
log.Printf("wget fallback failed for %s: %v", url, err)
|
log.Printf("wget fallback failed for %s: %v", url, err)
|
||||||
|
|
||||||
scraplingFallback:
|
cloakBrowserFallback:
|
||||||
|
body, err = fetchPageWithCloakBrowser(ctx, url, opts)
|
||||||
|
if err == nil {
|
||||||
|
log.Printf("Successfully retrieved content via CloakBrowser for %s", url)
|
||||||
|
cachePage(url, body)
|
||||||
|
return body, nil
|
||||||
|
}
|
||||||
|
log.Printf("CloakBrowser fallback failed for %s: %v", url, err)
|
||||||
|
|
||||||
body, err = fetchPageWithScrapling(ctx, url, opts)
|
body, err = fetchPageWithScrapling(ctx, url, opts)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
log.Printf("Successfully retrieved content via Scrapling for %s", url)
|
log.Printf("Successfully retrieved content via Scrapling for %s", url)
|
||||||
@@ -601,6 +823,12 @@ scraplingFallback:
|
|||||||
return nil, fmt.Errorf("all fetch methods failed for %s: %w", url, err)
|
return nil, fmt.Errorf("all fetch methods failed for %s: %w", url, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// detachedContext builds a context rooted at context.Background() that
// expires timeout from now. Because it is not derived from a request context,
// work using it keeps running when the originating request is cancelled.
// The caller must invoke the returned CancelFunc to release resources.
func detachedContext(timeout time.Duration) (context.Context, context.CancelFunc) {
	expiry := time.Now().Add(timeout)
	return context.WithDeadline(context.Background(), expiry)
}
|
||||||
|
|
||||||
func cachePage(url string, body []byte) {
|
func cachePage(url string, body []byte) {
|
||||||
pageCacheMu.Lock()
|
pageCacheMu.Lock()
|
||||||
pageCache[url] = &cacheEntry{body: body, timestamp: time.Now()}
|
pageCache[url] = &cacheEntry{body: body, timestamp: time.Now()}
|
||||||
@@ -783,12 +1011,25 @@ func parseCompetitionMatchesFromFotbal(ctx context.Context, pageURL, clubType, c
|
|||||||
|
|
||||||
// parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback.
|
// parseCompetitionMatchesFromIS scrapes matches from the IS portal as fallback.
|
||||||
func parseCompetitionMatchesFromIS(ctx context.Context, detailURL, clubType, clubName, clubID string) []Match {
|
func parseCompetitionMatchesFromIS(ctx context.Context, detailURL, clubType, clubName, clubID string) []Match {
|
||||||
req, err := http.NewRequestWithContext(ctx, "GET", detailURL, nil)
|
parsed, err := neturl.Parse(detailURL)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("IS matches invalid URL %s: %v", detailURL, err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
sem := getDomainReqSem(parsed.Host)
|
||||||
|
if err := sem.Acquire(ctx); err != nil {
|
||||||
|
log.Printf("IS matches domain semaphore error for %s: %v", detailURL, err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
defer sem.Release()
|
||||||
|
|
||||||
|
req, err := newBrowserRequest(detailURL, fetchOptions{})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("IS matches request error for %s: %v", detailURL, err)
|
log.Printf("IS matches request error for %s: %v", detailURL, err)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
resp, err := http.DefaultClient.Do(req)
|
req = req.WithContext(ctx)
|
||||||
|
resp, err := sharedHTTPClient.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("IS matches fetch error for %s: %v", detailURL, err)
|
log.Printf("IS matches fetch error for %s: %v", detailURL, err)
|
||||||
return nil
|
return nil
|
||||||
@@ -1097,7 +1338,7 @@ func getLogoBySearch(name string) string {
|
|||||||
if v, ok := logoCache[key]; ok {
|
if v, ok := logoCache[key]; ok {
|
||||||
return v
|
return v
|
||||||
}
|
}
|
||||||
client := &http.Client{Timeout: 5 * time.Second}
|
client := &http.Client{Timeout: 60 * time.Second}
|
||||||
// Prefer simplified last-word token (e.g., "krnov") to improve hit rate for logos
|
// Prefer simplified last-word token (e.g., "krnov") to improve hit rate for logos
|
||||||
query := simplifyClubQuery(name)
|
query := simplifyClubQuery(name)
|
||||||
if query == "" {
|
if query == "" {
|
||||||
@@ -1334,6 +1575,20 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check club response cache
|
||||||
|
cacheKey := "table:" + clubType + ":" + clubID
|
||||||
|
clubCacheMu.RLock()
|
||||||
|
if entry, ok := clubCache[cacheKey]; ok {
|
||||||
|
if time.Since(entry.timestamp) < clubCacheTTL {
|
||||||
|
clubCacheMu.RUnlock()
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
w.Write(entry.data)
|
||||||
|
log.Printf("Club cache hit for %s", cacheKey)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
clubCacheMu.RUnlock()
|
||||||
|
|
||||||
// Validate club type
|
// Validate club type
|
||||||
var baseURL string
|
var baseURL string
|
||||||
var sportParam string
|
var sportParam string
|
||||||
@@ -1414,9 +1669,12 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
defer sem.Release()
|
defer sem.Release()
|
||||||
|
|
||||||
|
ctx, cancel := detachedContext(30 * time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
comp := &competitions[idx]
|
comp := &competitions[idx]
|
||||||
tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
tableURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/tabulky-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
||||||
req, err := http.NewRequestWithContext(r.Context(), "GET", tableURL, nil)
|
req, err := http.NewRequestWithContext(ctx, "GET", tableURL, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("error creating request for competition table %s: %v", comp.ID, err)
|
log.Printf("error creating request for competition table %s: %v", comp.ID, err)
|
||||||
return
|
return
|
||||||
@@ -1518,8 +1776,18 @@ func getClubTables(w http.ResponseWriter, r *http.Request) {
|
|||||||
Competitions: competitions,
|
Competitions: competitions,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var buf bytes.Buffer
|
||||||
|
if err := json.NewEncoder(&buf).Encode(clubInfo); err != nil {
|
||||||
|
http.Error(w, fmt.Sprintf("JSON encode error: %v", err), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
data := buf.Bytes()
|
||||||
|
clubCacheMu.Lock()
|
||||||
|
clubCache[cacheKey] = &clubCacheEntry{data: data, timestamp: time.Now()}
|
||||||
|
clubCacheMu.Unlock()
|
||||||
|
|
||||||
w.Header().Set("Content-Type", "application/json")
|
w.Header().Set("Content-Type", "application/json")
|
||||||
json.NewEncoder(w).Encode(clubInfo)
|
w.Write(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
// getClubInfo returns club info with competitions and matches
|
// getClubInfo returns club info with competitions and matches
|
||||||
@@ -1531,6 +1799,21 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
|
|||||||
http.Error(w, "Club ID is required", http.StatusBadRequest)
|
http.Error(w, "Club ID is required", http.StatusBadRequest)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check club response cache
|
||||||
|
cacheKey := "info:" + clubType + ":" + clubID
|
||||||
|
clubCacheMu.RLock()
|
||||||
|
if entry, ok := clubCache[cacheKey]; ok {
|
||||||
|
if time.Since(entry.timestamp) < clubCacheTTL {
|
||||||
|
clubCacheMu.RUnlock()
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
w.Write(entry.data)
|
||||||
|
log.Printf("Club cache hit for %s", cacheKey)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
clubCacheMu.RUnlock()
|
||||||
|
|
||||||
var baseURL, sportParam string
|
var baseURL, sportParam string
|
||||||
switch clubType {
|
switch clubType {
|
||||||
case "football":
|
case "football":
|
||||||
@@ -1613,13 +1896,16 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
defer sem.Release()
|
defer sem.Release()
|
||||||
|
|
||||||
|
ctx, cancel := detachedContext(120 * time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
comp := &competitions[idx]
|
comp := &competitions[idx]
|
||||||
matchesLink := comp.MatchesLink
|
matchesLink := comp.MatchesLink
|
||||||
// 1) Try parsing from the public fotbal.cz competition page (matches_link)
|
// 1) Try parsing from the public fotbal.cz competition page (matches_link)
|
||||||
matches := parseCompetitionMatchesFromFotbal(r.Context(), matchesLink, clubType, clubName, clubID)
|
matches := parseCompetitionMatchesFromFotbal(ctx, matchesLink, clubType, clubName, clubID)
|
||||||
// Always try IS as well and prefer it if it provides at least as many matches
|
// Always try IS as well and prefer it if it provides at least as many matches
|
||||||
detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
detailURL := fmt.Sprintf("https://is.fotbal.cz/public/souteze/detail-souteze.aspx?req=%s&sport=%s", comp.ID, sportParam)
|
||||||
isMatches := parseCompetitionMatchesFromIS(r.Context(), detailURL, clubType, clubName, clubID)
|
isMatches := parseCompetitionMatchesFromIS(ctx, detailURL, clubType, clubName, clubID)
|
||||||
// Prefer IS whenever it yields any results, as IS often contains alias team names
|
// Prefer IS whenever it yields any results, as IS often contains alias team names
|
||||||
if len(isMatches) > 0 {
|
if len(isMatches) > 0 {
|
||||||
matches = isMatches
|
matches = isMatches
|
||||||
@@ -1643,8 +1929,18 @@ func getClubInfo(w http.ResponseWriter, r *http.Request) {
|
|||||||
Competitions: competitions,
|
Competitions: competitions,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var buf bytes.Buffer
|
||||||
|
if err := json.NewEncoder(&buf).Encode(clubInfo); err != nil {
|
||||||
|
http.Error(w, fmt.Sprintf("JSON encode error: %v", err), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
data := buf.Bytes()
|
||||||
|
clubCacheMu.Lock()
|
||||||
|
clubCache[cacheKey] = &clubCacheEntry{data: data, timestamp: time.Now()}
|
||||||
|
clubCacheMu.Unlock()
|
||||||
|
|
||||||
w.Header().Set("Content-Type", "application/json")
|
w.Header().Set("Content-Type", "application/json")
|
||||||
json.NewEncoder(w).Encode(clubInfo)
|
w.Write(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -1663,7 +1959,7 @@ func main() {
|
|||||||
Addr: addr,
|
Addr: addr,
|
||||||
Handler: r,
|
Handler: r,
|
||||||
ReadTimeout: 30 * time.Second,
|
ReadTimeout: 30 * time.Second,
|
||||||
WriteTimeout: 5 * time.Minute,
|
WriteTimeout: 10 * time.Minute,
|
||||||
IdleTimeout: 120 * time.Second,
|
IdleTimeout: 120 * time.Second,
|
||||||
MaxHeaderBytes: 1 << 20,
|
MaxHeaderBytes: 1 << 20,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1 +1,2 @@
|
|||||||
scrapling[fetchers]==0.4.1
|
scrapling[fetchers]==0.4.1
|
||||||
|
cloakbrowser==0.3.28
|
||||||
|
|||||||
@@ -0,0 +1,240 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Benchmark script comparing fetch methods:
|
||||||
|
1. Direct urllib (lightweight HTTP)
|
||||||
|
2. Scrapling (StealthyFetcher -> Chromium via patchright)
|
||||||
|
3. CloakBrowser (patched Chromium with stealth)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
.venv-scrapling/bin/python scripts/benchmark_fetch.py [--url URL] [--iterations N]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import gc
|
||||||
|
import os
|
||||||
|
import resource
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
import ssl
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Make packages installed in the project's .venv-scrapling importable when
# this script is started with an interpreter other than the venv's own.
venv = Path(__file__).parent.parent / ".venv-scrapling"
if venv.exists():
    import site

    # Prefer the site-packages directory matching the running interpreter
    # instead of a hard-coded "python3.13", which is silently missed on any
    # other Python version.
    _site_packages = (
        venv
        / "lib"
        / f"python{sys.version_info.major}.{sys.version_info.minor}"
        / "site-packages"
    )
    if not _site_packages.is_dir():
        # Fall back to whatever version directory the venv actually contains.
        _candidates = sorted(venv.glob("lib/python*/site-packages"))
        if _candidates:
            _site_packages = _candidates[0]
    site.addsitedir(str(_site_packages))
|
||||||
|
|
||||||
|
# Desktop Chrome-on-Windows User-Agent string sent by the plain urllib fetch.
BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/126.0 Safari/537.36"
)

# Lower-case byte markers associated with Cloudflare challenge/block pages.
CF_SIGNS = [
    # Challenge page title and visible texts.
    b"<title>just a moment...</title>",
    b"attention required!",
    b"enable javascript and cookies to continue",
    b"checking if the site connection is secure",
    # Challenge markup / script references.
    b"cf-browser-verification",
    b"/cdn-cgi/challenge-platform/",
]
|
||||||
|
|
||||||
|
|
||||||
|
def looks_like_cloudflare_block(body: bytes) -> bool:
    """Report whether ``body`` looks like a Cloudflare challenge page.

    The match is case-insensitive. A page counts as blocked when it contains
    one of the challenge title/text markers, or when it references the
    challenge-platform script together with a challenge option/token marker.
    A bare ``/cdn-cgi/challenge-platform/`` reference is not enough on its
    own. An empty body is never treated as a block.
    """
    if not body:
        return False

    haystack = body.lower()

    # Strong signals: the challenge title or its visible texts.
    challenge_markers = (
        b"<title>just a moment...</title>",
        b"attention required!",
        b"enable javascript and cookies to continue",
        b"checking if the site connection is secure",
    )
    if any(marker in haystack for marker in challenge_markers):
        return True

    # Weaker signal: challenge script plus one of the challenge token markers.
    has_platform_script = b"/cdn-cgi/challenge-platform/" in haystack
    has_challenge_token = (
        b"window._cf_chl_opt" in haystack or b"__cf_chl_rt_tk" in haystack
    )
    return has_platform_script and has_challenge_token
|
||||||
|
|
||||||
|
|
||||||
|
def get_memory_mb() -> float:
    """Return the peak resident set size (RSS) of this process in MB.

    ``ru_maxrss`` is a high-water mark, not the instantaneous RSS, so the
    value never decreases during a run and a before/after delta only shows
    growth of the peak. Only this process is measured (``RUSAGE_SELF``);
    memory used by any child processes is not included.
    """
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # Linux reports ru_maxrss in kilobytes, macOS reports it in bytes.
    divisor = 1024.0 * 1024.0 if sys.platform == "darwin" else 1024.0
    return peak / divisor
|
||||||
|
|
||||||
|
|
||||||
|
def direct_fetch(url: str, referer: str = "", timeout: float = 15.0) -> bytes:
    """Fetch ``url`` with plain urllib and return the raw response body.

    Args:
        url: Address to request.
        referer: Optional ``Referer`` header value; omitted when empty.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as bytes.

    Raises:
        RuntimeError: The response (including an HTTP error response) looks
            like a Cloudflare challenge page.
        urllib.error.HTTPError: Any other 4xx/5xx response.
    """
    import urllib.error

    headers = {
        "User-Agent": BROWSER_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "cs-CZ,cs;q=0.9,en;q=0.8",
        "Accept-Encoding": "identity",
        "Connection": "keep-alive",
    }
    if referer:
        headers["Referer"] = referer
    req = urllib.request.Request(url, headers=headers)

    # NOTE(review): certificate and hostname verification are switched off
    # here. That is tolerable for a local benchmark but must not be copied
    # into production fetch code.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    try:
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            body = resp.read()
    except urllib.error.HTTPError as exc:
        # urlopen raises on 4xx/5xx before the body is inspected, so a
        # challenge page served with an error status would otherwise surface
        # as a generic HTTP error. Inspect the error body as well.
        try:
            error_body = exc.read()
        except Exception:
            # Best effort only: an unreadable error body is treated as empty
            # and the original HTTPError is re-raised below.
            error_body = b""
        if looks_like_cloudflare_block(error_body):
            raise RuntimeError("Cloudflare block detected") from exc
        raise

    if looks_like_cloudflare_block(body):
        raise RuntimeError("Cloudflare block detected")
    return body
|
||||||
|
|
||||||
|
|
||||||
|
def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 90000, wait_ms: int = 500) -> bytes:
    """Fetch ``url`` through Scrapling's ``StealthyFetcher`` and return bytes.

    Args:
        url: Address to load.
        referer: Optional ``Referer`` header, passed via ``extra_headers``.
        timeout_ms: Value forwarded as the fetcher's ``timeout`` option.
        wait_ms: Value forwarded as the fetcher's ``wait`` option.

    Returns:
        The response ``body`` when it is bytes-like or a string, otherwise
        the response ``text``, otherwise ``str(response)``; strings are
        UTF-8 encoded.

    Raises:
        RuntimeError: The response carries an integer status of 400 or more.
    """
    from scrapling.fetchers import StealthyFetcher

    options = dict(
        headless=True,
        network_idle=False,
        google_search=False,
        solve_cloudflare=True,
        timeout=timeout_ms,
        wait=wait_ms,
    )
    if referer:
        options["extra_headers"] = {"Referer": referer}

    response = StealthyFetcher.fetch(url, **options)

    status = getattr(response, "status", None)
    if isinstance(status, int) and status >= 400:
        raise RuntimeError(f"HTTP {status}")

    # Normalise whatever payload attribute the response exposes to bytes.
    payload = getattr(response, "body", None)
    if isinstance(payload, (bytes, bytearray)):
        return bytes(payload)
    if isinstance(payload, str):
        return payload.encode("utf-8")

    text = getattr(response, "text", None)
    fallback = text if isinstance(text, str) else str(response)
    return fallback.encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def cloakbrowser_fetch(url: str, referer: str = "", timeout_ms: int = 90000) -> bytes:
    """Load ``url`` in a headless CloakBrowser context and return the HTML.

    Args:
        url: Address to navigate to.
        referer: Optional ``Referer`` header set on the page before loading.
        timeout_ms: Navigation timeout passed to ``page.goto``.

    Returns:
        The rendered page content, UTF-8 encoded.

    Raises:
        RuntimeError: The rendered page looks like a Cloudflare challenge.
    """
    from cloakbrowser import launch_context

    ctx = launch_context(headless=True)
    try:
        # The page is created inside the try block so the context is closed
        # even when page creation itself fails.
        page = ctx.new_page()

        if referer:
            page.set_extra_http_headers({"Referer": referer})

        page.goto(url, timeout=timeout_ms, wait_until="networkidle")
        body = page.content().encode("utf-8")

        if looks_like_cloudflare_block(body):
            raise RuntimeError("Cloudflare block detected")
        return body
    finally:
        ctx.close()
|
||||||
|
|
||||||
|
|
||||||
|
def benchmark_method(name: str, fn, url: str, referer: str, iterations: int = 1):
    """Time ``fn(url, referer)`` ``iterations`` times and collect the results.

    Exceptions raised by ``fn`` are caught and recorded as the run's error;
    a falsy return value is recorded as ``"empty body"``. Progress and a
    per-run summary are printed as each run completes.

    Returns:
        A list with one dict per run holding the iteration number, elapsed
        seconds, success flag, error text, body size and the memory readings
        taken before and after the call.
    """
    gc.collect()
    runs = []

    for index in range(iterations):
        run_no = index + 1
        print(f" [{name}] iteration {run_no}/{iterations}...", flush=True)

        mem_before = get_memory_mb()
        started = time.monotonic()
        body = b""
        error = None
        try:
            body = fn(url, referer)
        except Exception as exc:
            error = str(exc)
        else:
            if not body:
                error = "empty body"
        elapsed = time.monotonic() - started
        mem_after = get_memory_mb()
        mem_delta = mem_after - mem_before

        runs.append(
            {
                "iteration": run_no,
                "elapsed_sec": elapsed,
                "success": error is None,
                "error": error,
                "body_size": len(body),
                "mem_before_mb": mem_before,
                "mem_after_mb": mem_after,
                "mem_delta_mb": mem_delta,
            }
        )

        if not error:
            print(f" OK in {elapsed:.2f}s, {len(body)} bytes, mem +{mem_delta:.1f}MB", flush=True)
        else:
            print(f" FAILED: {error}", flush=True)

    return runs
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Parse the command line, benchmark each selected fetch method, return 0.

    Both the club page URL and the search page URL are tested. For each URL
    the selected methods run in the order direct, cloakbrowser, scrapling,
    with an empty referer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", default="https://www.fotbal.cz/souteze/club/club/7eacd9f0-bfa0-4928-a9b6-936140168f58")
    parser.add_argument("--search-url", default="https://www.fotbal.cz/club/hledej?q=fotbalovy+klub+krnov")
    parser.add_argument("--iterations", type=int, default=1)
    parser.add_argument("--methods", default="all", help="Comma-separated: direct,scrapling,cloakbrowser,all")
    args = parser.parse_args()

    requested = [part.strip().lower() for part in args.methods.split(",")]
    run_everything = "all" in requested

    banner = "=" * 70
    print(banner)
    print("FACR Scraper Fetch Benchmark")
    print(banner)
    print(f"Python: {sys.version}")
    print(f"Iterations per method: {args.iterations}")
    print()

    # (method key, heading printed before the run, fetch callable)
    fetchers = (
        ("direct", "\n--- Direct HTTP (urllib) ---", direct_fetch),
        ("cloakbrowser", "\n--- CloakBrowser ---", cloakbrowser_fetch),
        ("scrapling", "\n--- Scrapling ---", scrapling_fetch),
    )
    targets = (
        ("Club page", args.url),
        ("Search page", args.search_url),
    )

    for label, url in targets:
        print("\n" + banner)
        print(f"Testing: {label}")
        print(f"URL: {url}")
        print(banner)

        for key, heading, fetch in fetchers:
            if not (run_everything or key in requested):
                continue
            print(heading)
            benchmark_method(key, fetch, url, "", args.iterations)

    print("\n" + banner)
    print("Benchmark complete.")
    print(banner)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
import sys, os, time
|
||||||
|
from cloakbrowser import launch_context
|
||||||
|
|
||||||
|
# The target URL is the only positional argument; fail with a usage message
# instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    print(f"usage: {sys.argv[0]} URL", file=sys.stderr)
    raise SystemExit(2)
url = sys.argv[1]

# Browser timezone and locale are configurable through the environment.
tz = os.environ.get('CLOAKBROWSER_TIMEZONE', 'Europe/Prague')
lc = os.environ.get('CLOAKBROWSER_LOCALE', 'cs-CZ')

ctx = launch_context(
    headless=True,
    timezone=tz,
    locale=lc,
    args=['--no-sandbox', '--disable-dev-shm-usage']
)

# Note: we intentionally do NOT set a custom Referer here.
# A self-referring Referer (e.g. /club/hledej -> /club/hledej) triggers
# Cloudflare's bot detection even with CloakBrowser's stealth patches.

try:
    # The page is created inside the try block so the context is closed even
    # when page creation itself fails.
    page = ctx.new_page()
    page.goto(url, timeout=30000, wait_until='networkidle')
    # Emit the rendered HTML on stdout without a trailing newline.
    print(page.content(), end='')
finally:
    ctx.close()
|
||||||
@@ -82,9 +82,11 @@ def scrapling_fetch(url: str, referer: str = "", timeout_ms: int = 30000, wait_m
|
|||||||
if referer:
|
if referer:
|
||||||
extra_headers["Referer"] = referer
|
extra_headers["Referer"] = referer
|
||||||
|
|
||||||
|
# Increase challenge-solving timeout; network_idle can interfere with
|
||||||
|
# ongoing Cloudflare polling so we disable it.
|
||||||
fetch_kwargs = {
|
fetch_kwargs = {
|
||||||
"headless": True,
|
"headless": True,
|
||||||
"network_idle": True,
|
"network_idle": False,
|
||||||
"google_search": False,
|
"google_search": False,
|
||||||
"solve_cloudflare": True,
|
"solve_cloudflare": True,
|
||||||
"timeout": timeout_ms,
|
"timeout": timeout_ms,
|
||||||
|
|||||||
Reference in New Issue
Block a user