Files
Devour/internal/scraper/localsearch_test.go
T
Tomas Dvorak 898a3c303f update
2026-02-24 10:33:59 +01:00

227 lines
5.8 KiB
Go

package scraper
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"testing"
"time"
)
// TestLocalSearchScraperScrape exercises the happy path of Scrape against an
// in-process HTTP server. The /search endpoint returns a single result that
// links to a document page served by the same server; the test then checks
// the first returned document's URL and its search_query / search_engine
// metadata.
func TestLocalSearchScraperScrape(t *testing.T) {
	mux := http.NewServeMux()
	// baseURL is assigned once the server is listening. The handler reads it
	// per request so the result link points back at this test server.
	baseURL := ""
	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		// Handlers run on the server's goroutines. t.Fatalf must only be
		// called from the goroutine running the test, so parameter mismatches
		// are reported with t.Errorf and answered with a 400 instead.
		query := r.URL.Query()
		if got := query.Get("q"); got != "go http client" {
			t.Errorf("expected query go http client, got %q", got)
			http.Error(w, "unexpected q parameter", http.StatusBadRequest)
			return
		}
		if got := query.Get("format"); got != "json" {
			t.Errorf("expected format=json, got %q", got)
			http.Error(w, "unexpected format parameter", http.StatusBadRequest)
			return
		}
		// Best-effort write: a failure here would surface as a scrape error
		// or a missing document in the assertions below.
		_ = json.NewEncoder(w).Encode(map[string]interface{}{
			"results": []map[string]interface{}{
				{
					"url":     baseURL + "/docs/http-client",
					"title":   "HTTP Client Guide",
					"content": "How to build an HTTP client in Go",
					"engine":  "searxng",
					"score":   0.99,
				},
			},
		})
	})
	mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		// Best-effort write, same reasoning as the search handler.
		_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
	})

	srv := httptest.NewServer(mux)
	defer srv.Close()
	baseURL = srv.URL

	s := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})
	docs, err := s.Scrape(context.Background(), &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         srv.URL + "/search",
		Query:       "go http client",
		ResultLimit: 5,
	})
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one document")
	}

	doc := docs[0]
	if doc.URL != srv.URL+"/docs/http-client" {
		t.Fatalf("unexpected document URL: %q", doc.URL)
	}
	if doc.Metadata["search_query"] != "go http client" {
		t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
	}
	if doc.Metadata["search_engine"] != "searxng" {
		t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
	}
}
// TestLocalSearchScraperDomainFilter checks that, when Source.Domains is set
// to the test server's hostname, every document returned by Scrape has that
// hostname. The fake search endpoint returns one link on the test server and
// one on example.com.
func TestLocalSearchScraperDomainFilter(t *testing.T) {
	const inScopePath = "/docs/in-scope"

	// serverURL is filled in after the server starts; the search handler
	// reads it on each request.
	serverURL := ""

	searchHandler := func(w http.ResponseWriter, r *http.Request) {
		inScope := map[string]interface{}{
			"url":   serverURL + inScopePath,
			"title": "In Scope",
		}
		outOfScope := map[string]interface{}{
			"url":   "https://example.com/out-of-scope",
			"title": "Out Scope",
		}
		payload := map[string]interface{}{
			"results": []map[string]interface{}{inScope, outOfScope},
		}
		_ = json.NewEncoder(w).Encode(payload)
	}
	pageHandler := func(w http.ResponseWriter, r *http.Request) {
		page := `<html><head><title>In Scope</title></head><body><main>` +
			strings.Repeat("scoped docs ", 30) +
			`</main></body></html>`
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(page))
	}

	router := http.NewServeMux()
	router.HandleFunc("/search", searchHandler)
	router.HandleFunc(inScopePath, pageHandler)

	server := httptest.NewServer(router)
	defer server.Close()
	serverURL = server.URL

	serverAddr, err := url.Parse(server.URL)
	if err != nil {
		t.Fatalf("failed to parse server URL: %v", err)
	}
	wantHost := serverAddr.Hostname()

	scraper := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})
	source := &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         server.URL + "/search",
		Query:       "scope test",
		ResultLimit: 10,
		Domains:     []string{wantHost},
	}

	results, err := scraper.Scrape(context.Background(), source)
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(results) == 0 {
		t.Fatal("expected at least one in-scope document")
	}

	// Every returned document must live on the allowed host.
	for _, result := range results {
		resultURL, err := url.Parse(result.URL)
		if err != nil {
			t.Fatalf("invalid doc URL %q: %v", result.URL, err)
		}
		if gotHost := resultURL.Hostname(); gotHost != wantHost {
			t.Fatalf("expected only in-scope domain, got %q", result.URL)
		}
	}
}
func TestLocalSearchScraperRequiresQuery(t *testing.T) {
s := NewLocalSearchScraper(&Config{
UserAgent: "DevourTest/1.0",
Timeout: 2 * time.Second,
})
_, err := s.Scrape(context.Background(), &Source{
Name: "local-search",
Type: SourceTypeLocalSearch,
URL: "http://127.0.0.1:8080/search",
})
if err == nil {
t.Fatal("expected error when query is missing")
}
if !strings.Contains(err.Error(), "query") {
t.Fatalf("unexpected error: %v", err)
}
}
// TestLocalSearchScraperDetectChanges drives DetectChanges three times
// against a fake search endpoint:
//
//  1. with an empty previous hash — expects changed=true and a non-empty hash;
//  2. with the hash from step 1 and identical results — expects changed=false
//     and the same hash;
//  3. with the hash from step 1 after the result URL has been switched —
//     expects changed=true and a different hash.
func TestLocalSearchScraperDetectChanges(t *testing.T) {
	var (
		// serverURL is filled in after the server starts.
		serverURL string
		// currentPath is the path of the single search hit; it is swapped
		// before the third DetectChanges call to alter the search output.
		currentPath = "/docs/one"
	)

	router := http.NewServeMux()
	router.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		hit := map[string]interface{}{
			"url":   serverURL + currentPath,
			"title": "Versioned",
			"score": 1.0,
		}
		payload := map[string]interface{}{
			"results": []map[string]interface{}{hit},
		}
		_ = json.NewEncoder(w).Encode(payload)
	})

	server := httptest.NewServer(router)
	defer server.Close()
	serverURL = server.URL

	scraper := NewLocalSearchScraper(&Config{
		UserAgent: "DevourTest/1.0",
		Timeout:   2 * time.Second,
	})
	src := &Source{
		Name:        "local-search",
		Type:        SourceTypeLocalSearch,
		URL:         server.URL + "/search",
		Query:       "version test",
		ResultLimit: 3,
	}

	// Step 1: no previous hash.
	changed, firstHash, err := scraper.DetectChanges(context.Background(), src, "")
	switch {
	case err != nil:
		t.Fatalf("unexpected detect changes error: %v", err)
	case !changed:
		t.Fatal("expected first detect changes call to report changed")
	case firstHash == "":
		t.Fatal("expected non-empty hash")
	}

	// Step 2: same search output, previous hash supplied.
	changed, secondHash, err := scraper.DetectChanges(context.Background(), src, firstHash)
	switch {
	case err != nil:
		t.Fatalf("unexpected detect changes error: %v", err)
	case changed:
		t.Fatal("expected unchanged results with identical hash")
	case secondHash != firstHash:
		t.Fatalf("expected identical hash, got %q and %q", firstHash, secondHash)
	}

	// Step 3: the search hit now points somewhere else.
	currentPath = "/docs/two"
	changed, thirdHash, err := scraper.DetectChanges(context.Background(), src, firstHash)
	switch {
	case err != nil:
		t.Fatalf("unexpected detect changes error: %v", err)
	case !changed:
		t.Fatal("expected changed results after search output changed")
	case thirdHash == firstHash:
		t.Fatal("expected hash to change")
	}
}