package scraper

import (
	"context"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
	"time"
)

// TestWebScraper_ReturnsErrorWhenNothingFetched points the scraper at a test
// server that answers every request with 404 and requires Scrape to return a
// non-nil error.
func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
	// http.NotFound already has the handler signature, so it can be adapted directly.
	server := httptest.NewServer(http.HandlerFunc(http.NotFound))
	defer server.Close()

	cfg := &Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	}
	ws := NewWebScraper(cfg)

	target := &Source{
		Name: "missing",
		Type: SourceTypeWeb,
		URL:  server.URL + "/missing",
	}
	if _, err := ws.Scrape(context.Background(), target); err == nil {
		t.Fatal("expected error when web scrape yields no documents")
	}
}

// TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent serves a
// text/html response whose body is only the word "tiny" and requires Scrape
// to fail with an error mentioning "extracted no documents".
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
	tinyPage := func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(`
tiny
`))
	}
	server := httptest.NewServer(http.HandlerFunc(tinyPage))
	defer server.Close()

	cfg := &Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	}
	ws := NewWebScraper(cfg)

	target := &Source{
		Name: "empty",
		Type: SourceTypeWeb,
		URL:  server.URL,
	}
	_, err := ws.Scrape(context.Background(), target)

	// The nil check must come first: the message check dereferences err.
	switch {
	case err == nil:
		t.Fatal("expected error when page has no extractable docs")
	case !strings.Contains(err.Error(), "extracted no documents"):
		t.Fatalf("unexpected error message: %v", err)
	}
}

func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
	content := strings.Repeat("ruby docs content ", 30)
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/core/Regexp.html":
			http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound)
		case "/3.4.1/Regexp.html":
			w.Header().Set("Content-Type", "text/html")
			_, _ = w.Write([]byte(`