mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
@@ -0,0 +1,132 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.NotFound(w, r)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "missing",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL + "/missing",
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when web scrape yields no documents")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "empty",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when page has no extractable docs")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "extracted no documents") {
|
||||
t.Fatalf("unexpected error message: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
|
||||
content := strings.Repeat("ruby docs content ", 30)
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch r.URL.Path {
|
||||
case "/core/Regexp.html":
|
||||
http.Redirect(w, r, "/3.4.1/Regexp.html", http.StatusFound)
|
||||
case "/3.4.1/Regexp.html":
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Regexp</title></head><body><main>` + content + `</main></body></html>`))
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewWebScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "ruby",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL + "/core/Regexp.html",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected redirected page to be scraped")
|
||||
}
|
||||
if !strings.Contains(docs[0].URL, "/3.4.1/Regexp.html") {
|
||||
t.Fatalf("expected final redirected URL, got %q", docs[0].URL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
|
||||
content := strings.Repeat("docs content ", 20)
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` + content + `</main></body></html>`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := NewScraper(SourceTypeWeb, &Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
Concurrency: 1,
|
||||
MaxDepth: 1,
|
||||
})
|
||||
if s == nil {
|
||||
t.Fatal("expected web scraper")
|
||||
}
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "test",
|
||||
Type: SourceTypeWeb,
|
||||
URL: srv.URL,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one document")
|
||||
}
|
||||
if docs[0].Title != "Regex Guide" {
|
||||
t.Fatalf("expected normalized title, got %q", docs[0].Title)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user