mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
227 lines
5.8 KiB
Go
227 lines
5.8 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"net/url"
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
func TestLocalSearchScraperScrape(t *testing.T) {
|
|
mux := http.NewServeMux()
|
|
baseURL := ""
|
|
|
|
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
|
if got := r.URL.Query().Get("q"); got != "go http client" {
|
|
t.Fatalf("expected query go http client, got %q", got)
|
|
}
|
|
if got := r.URL.Query().Get("format"); got != "json" {
|
|
t.Fatalf("expected format=json, got %q", got)
|
|
}
|
|
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
|
"results": []map[string]interface{}{
|
|
{
|
|
"url": baseURL + "/docs/http-client",
|
|
"title": "HTTP Client Guide",
|
|
"content": "How to build an HTTP client in Go",
|
|
"engine": "searxng",
|
|
"score": 0.99,
|
|
},
|
|
},
|
|
})
|
|
})
|
|
|
|
mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
|
|
})
|
|
|
|
srv := httptest.NewServer(mux)
|
|
defer srv.Close()
|
|
baseURL = srv.URL
|
|
|
|
s := NewLocalSearchScraper(&Config{
|
|
UserAgent: "DevourTest/1.0",
|
|
Timeout: 2 * time.Second,
|
|
})
|
|
|
|
docs, err := s.Scrape(context.Background(), &Source{
|
|
Name: "local-search",
|
|
Type: SourceTypeLocalSearch,
|
|
URL: srv.URL + "/search",
|
|
Query: "go http client",
|
|
ResultLimit: 5,
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("unexpected scrape error: %v", err)
|
|
}
|
|
if len(docs) == 0 {
|
|
t.Fatal("expected at least one document")
|
|
}
|
|
|
|
doc := docs[0]
|
|
if doc.URL != srv.URL+"/docs/http-client" {
|
|
t.Fatalf("unexpected document URL: %q", doc.URL)
|
|
}
|
|
if doc.Metadata["search_query"] != "go http client" {
|
|
t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
|
|
}
|
|
if doc.Metadata["search_engine"] != "searxng" {
|
|
t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
|
|
}
|
|
}
|
|
|
|
func TestLocalSearchScraperDomainFilter(t *testing.T) {
|
|
mux := http.NewServeMux()
|
|
baseURL := ""
|
|
|
|
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
|
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
|
"results": []map[string]interface{}{
|
|
{
|
|
"url": baseURL + "/docs/in-scope",
|
|
"title": "In Scope",
|
|
},
|
|
{
|
|
"url": "https://example.com/out-of-scope",
|
|
"title": "Out Scope",
|
|
},
|
|
},
|
|
})
|
|
})
|
|
|
|
mux.HandleFunc("/docs/in-scope", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><head><title>In Scope</title></head><body><main>` + strings.Repeat("scoped docs ", 30) + `</main></body></html>`))
|
|
})
|
|
|
|
srv := httptest.NewServer(mux)
|
|
defer srv.Close()
|
|
baseURL = srv.URL
|
|
|
|
parsed, err := url.Parse(srv.URL)
|
|
if err != nil {
|
|
t.Fatalf("failed to parse server URL: %v", err)
|
|
}
|
|
|
|
s := NewLocalSearchScraper(&Config{
|
|
UserAgent: "DevourTest/1.0",
|
|
Timeout: 2 * time.Second,
|
|
})
|
|
|
|
docs, err := s.Scrape(context.Background(), &Source{
|
|
Name: "local-search",
|
|
Type: SourceTypeLocalSearch,
|
|
URL: srv.URL + "/search",
|
|
Query: "scope test",
|
|
ResultLimit: 10,
|
|
Domains: []string{parsed.Hostname()},
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("unexpected scrape error: %v", err)
|
|
}
|
|
if len(docs) == 0 {
|
|
t.Fatal("expected at least one in-scope document")
|
|
}
|
|
for _, doc := range docs {
|
|
docURL, parseErr := url.Parse(doc.URL)
|
|
if parseErr != nil {
|
|
t.Fatalf("invalid doc URL %q: %v", doc.URL, parseErr)
|
|
}
|
|
if docURL.Hostname() != parsed.Hostname() {
|
|
t.Fatalf("expected only in-scope domain, got %q", doc.URL)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestLocalSearchScraperRequiresQuery(t *testing.T) {
|
|
s := NewLocalSearchScraper(&Config{
|
|
UserAgent: "DevourTest/1.0",
|
|
Timeout: 2 * time.Second,
|
|
})
|
|
|
|
_, err := s.Scrape(context.Background(), &Source{
|
|
Name: "local-search",
|
|
Type: SourceTypeLocalSearch,
|
|
URL: "http://127.0.0.1:8080/search",
|
|
})
|
|
if err == nil {
|
|
t.Fatal("expected error when query is missing")
|
|
}
|
|
if !strings.Contains(err.Error(), "query") {
|
|
t.Fatalf("unexpected error: %v", err)
|
|
}
|
|
}
|
|
|
|
func TestLocalSearchScraperDetectChanges(t *testing.T) {
|
|
mux := http.NewServeMux()
|
|
baseURL := ""
|
|
resultPath := "/docs/one"
|
|
|
|
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
|
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
|
"results": []map[string]interface{}{
|
|
{
|
|
"url": baseURL + resultPath,
|
|
"title": "Versioned",
|
|
"score": 1.0,
|
|
},
|
|
},
|
|
})
|
|
})
|
|
|
|
srv := httptest.NewServer(mux)
|
|
defer srv.Close()
|
|
baseURL = srv.URL
|
|
|
|
s := NewLocalSearchScraper(&Config{
|
|
UserAgent: "DevourTest/1.0",
|
|
Timeout: 2 * time.Second,
|
|
})
|
|
source := &Source{
|
|
Name: "local-search",
|
|
Type: SourceTypeLocalSearch,
|
|
URL: srv.URL + "/search",
|
|
Query: "version test",
|
|
ResultLimit: 3,
|
|
}
|
|
|
|
changed, hash1, err := s.DetectChanges(context.Background(), source, "")
|
|
if err != nil {
|
|
t.Fatalf("unexpected detect changes error: %v", err)
|
|
}
|
|
if !changed {
|
|
t.Fatal("expected first detect changes call to report changed")
|
|
}
|
|
if hash1 == "" {
|
|
t.Fatal("expected non-empty hash")
|
|
}
|
|
|
|
changed, hash2, err := s.DetectChanges(context.Background(), source, hash1)
|
|
if err != nil {
|
|
t.Fatalf("unexpected detect changes error: %v", err)
|
|
}
|
|
if changed {
|
|
t.Fatal("expected unchanged results with identical hash")
|
|
}
|
|
if hash2 != hash1 {
|
|
t.Fatalf("expected identical hash, got %q and %q", hash1, hash2)
|
|
}
|
|
|
|
resultPath = "/docs/two"
|
|
changed, hash3, err := s.DetectChanges(context.Background(), source, hash1)
|
|
if err != nil {
|
|
t.Fatalf("unexpected detect changes error: %v", err)
|
|
}
|
|
if !changed {
|
|
t.Fatal("expected changed results after search output changed")
|
|
}
|
|
if hash3 == hash1 {
|
|
t.Fatal("expected hash to change")
|
|
}
|
|
}
|