mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
133 lines
3.4 KiB
Go
133 lines
3.4 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
http.NotFound(w, r)
|
|
}))
|
|
defer srv.Close()
|
|
|
|
s := NewWebScraper(&Config{
|
|
UserAgent: "DevourTest/1.0",
|
|
Timeout: 2 * time.Second,
|
|
Concurrency: 1,
|
|
MaxDepth: 1,
|
|
})
|
|
|
|
_, err := s.Scrape(context.Background(), &Source{
|
|
Name: "missing",
|
|
Type: SourceTypeWeb,
|
|
URL: srv.URL + "/missing",
|
|
})
|
|
if err == nil {
|
|
t.Fatal("expected error when web scrape yields no documents")
|
|
}
|
|
}
|
|
|
|
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`))
|
|
}))
|
|
defer srv.Close()
|
|
|
|
s := NewWebScraper(&Config{
|
|
UserAgent: "DevourTest/1.0",
|
|
Timeout: 2 * time.Second,
|
|
Concurrency: 1,
|
|
MaxDepth: 1,
|
|
})
|
|
|
|
_, err := s.Scrape(context.Background(), &Source{
|
|
Name: "empty",
|
|
Type: SourceTypeWeb,
|
|
URL: srv.URL,
|
|
})
|
|
if err == nil {
|
|
t.Fatal("expected error when page has no extractable docs")
|
|
}
|
|
if !strings.Contains(err.Error(), "extracted no documents") {
|
|
t.Fatalf("unexpected error message: %v", err)
|
|
}
|
|
}
|
|
|
|
func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
|
|
content := strings.Repeat("ruby docs content ", 30)
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
switch r.URL.Path {
|
|
case "/core/Regexp.html":
|
|
http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound)
|
|
case "/3.4.1/Regexp.html":
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><head><title>Regexp</title></head><body><main>` + content + `</main></body></html>`))
|
|
default:
|
|
http.NotFound(w, r)
|
|
}
|
|
}))
|
|
defer srv.Close()
|
|
|
|
s := NewWebScraper(&Config{
|
|
UserAgent: "DevourTest/1.0",
|
|
Timeout: 2 * time.Second,
|
|
Concurrency: 1,
|
|
MaxDepth: 1,
|
|
})
|
|
|
|
docs, err := s.Scrape(context.Background(), &Source{
|
|
Name: "ruby",
|
|
Type: SourceTypeWeb,
|
|
URL: srv.URL + "/core/Regexp.html",
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("unexpected scrape error: %v", err)
|
|
}
|
|
if len(docs) == 0 {
|
|
t.Fatal("expected redirected page to be scraped")
|
|
}
|
|
if !strings.Contains(docs[0].URL, "/3.4.1/Regexp.html") {
|
|
t.Fatalf("expected final redirected URL, got %q", docs[0].URL)
|
|
}
|
|
}
|
|
|
|
func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
|
|
content := strings.Repeat("docs content ", 20)
|
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` + content + `</main></body></html>`))
|
|
}))
|
|
defer srv.Close()
|
|
|
|
s := NewScraper(SourceTypeWeb, &Config{
|
|
UserAgent: "DevourTest/1.0",
|
|
Timeout: 2 * time.Second,
|
|
Concurrency: 1,
|
|
MaxDepth: 1,
|
|
})
|
|
if s == nil {
|
|
t.Fatal("expected web scraper")
|
|
}
|
|
|
|
docs, err := s.Scrape(context.Background(), &Source{
|
|
Name: "test",
|
|
Type: SourceTypeWeb,
|
|
URL: srv.URL,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("unexpected scrape error: %v", err)
|
|
}
|
|
if len(docs) == 0 {
|
|
t.Fatal("expected at least one document")
|
|
}
|
|
if docs[0].Title != "Regex Guide" {
|
|
t.Fatalf("expected normalized title, got %q", docs[0].Title)
|
|
}
|
|
}
|