package scraper import ( "context" "net/http" "net/http/httptest" "strings" "testing" "time" ) func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { http.NotFound(w, r) })) defer srv.Close() s := NewWebScraper(&Config{ UserAgent: "DevourTest/1.0", Timeout: 2 * time.Second, Concurrency: 1, MaxDepth: 1, }) _, err := s.Scrape(context.Background(), &Source{ Name: "missing", Type: SourceTypeWeb, URL: srv.URL + "/missing", }) if err == nil { t.Fatal("expected error when web scrape yields no documents") } } func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`Empty

tiny

`)) })) defer srv.Close() s := NewWebScraper(&Config{ UserAgent: "DevourTest/1.0", Timeout: 2 * time.Second, Concurrency: 1, MaxDepth: 1, }) _, err := s.Scrape(context.Background(), &Source{ Name: "empty", Type: SourceTypeWeb, URL: srv.URL, }) if err == nil { t.Fatal("expected error when page has no extractable docs") } if !strings.Contains(err.Error(), "extracted no documents") { t.Fatalf("unexpected error message: %v", err) } } func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) { content := strings.Repeat("ruby docs content ", 30) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { case "/core/Regexp.html": http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound) case "/3.4.1/Regexp.html": w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`Regexp
` + content + `
`)) default: http.NotFound(w, r) } })) defer srv.Close() s := NewWebScraper(&Config{ UserAgent: "DevourTest/1.0", Timeout: 2 * time.Second, Concurrency: 1, MaxDepth: 1, }) docs, err := s.Scrape(context.Background(), &Source{ Name: "ruby", Type: SourceTypeWeb, URL: srv.URL + "/core/Regexp.html", }) if err != nil { t.Fatalf("unexpected scrape error: %v", err) } if len(docs) == 0 { t.Fatal("expected redirected page to be scraped") } if !strings.Contains(docs[0].URL, "/3.4.1/Regexp.html") { t.Fatalf("expected final redirected URL, got %q", docs[0].URL) } } func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) { content := strings.Repeat("docs content ", 20) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`Regex Guide ΒΆ deprecated
` + content + `
`)) })) defer srv.Close() s := NewScraper(SourceTypeWeb, &Config{ UserAgent: "DevourTest/1.0", Timeout: 2 * time.Second, Concurrency: 1, MaxDepth: 1, }) if s == nil { t.Fatal("expected web scraper") } docs, err := s.Scrape(context.Background(), &Source{ Name: "test", Type: SourceTypeWeb, URL: srv.URL, }) if err != nil { t.Fatalf("unexpected scrape error: %v", err) } if len(docs) == 0 { t.Fatal("expected at least one document") } if docs[0].Title != "Regex Guide" { t.Fatalf("expected normalized title, got %q", docs[0].Title) } }