Files
Devour/internal/scraper/web_integration_test.go
T
Tomas Dvorak 898a3c303f update
2026-02-24 10:33:59 +01:00

133 lines
3.4 KiB
Go

package scraper
import (
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.NotFound(w, r)
}))
defer srv.Close()
s := NewWebScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
_, err := s.Scrape(context.Background(), &Source{
Name: "missing",
Type: SourceTypeWeb,
URL: srv.URL + "/missing",
})
if err == nil {
t.Fatal("expected error when web scrape yields no documents")
}
}
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`))
}))
defer srv.Close()
s := NewWebScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
_, err := s.Scrape(context.Background(), &Source{
Name: "empty",
Type: SourceTypeWeb,
URL: srv.URL,
})
if err == nil {
t.Fatal("expected error when page has no extractable docs")
}
if !strings.Contains(err.Error(), "extracted no documents") {
t.Fatalf("unexpected error message: %v", err)
}
}
func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
content := strings.Repeat("ruby docs content ", 30)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/core/Regexp.html":
http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound)
case "/3.4.1/Regexp.html":
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><head><title>Regexp</title></head><body><main>` + content + `</main></body></html>`))
default:
http.NotFound(w, r)
}
}))
defer srv.Close()
s := NewWebScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
docs, err := s.Scrape(context.Background(), &Source{
Name: "ruby",
Type: SourceTypeWeb,
URL: srv.URL + "/core/Regexp.html",
})
if err != nil {
t.Fatalf("unexpected scrape error: %v", err)
}
if len(docs) == 0 {
t.Fatal("expected redirected page to be scraped")
}
if !strings.Contains(docs[0].URL, "/3.4.1/Regexp.html") {
t.Fatalf("expected final redirected URL, got %q", docs[0].URL)
}
}
func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
content := strings.Repeat("docs content ", 20)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` + content + `</main></body></html>`))
}))
defer srv.Close()
s := NewScraper(SourceTypeWeb, &Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
Concurrency: 1,
MaxDepth: 1,
})
if s == nil {
t.Fatal("expected web scraper")
}
docs, err := s.Scrape(context.Background(), &Source{
Name: "test",
Type: SourceTypeWeb,
URL: srv.URL,
})
if err != nil {
t.Fatalf("unexpected scrape error: %v", err)
}
if len(docs) == 0 {
t.Fatal("expected at least one document")
}
if docs[0].Title != "Regex Guide" {
t.Fatalf("expected normalized title, got %q", docs[0].Title)
}
}