mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
@@ -0,0 +1,226 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLocalSearchScraperScrape(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
if got := r.URL.Query().Get("q"); got != "go http client" {
|
||||
t.Fatalf("expected query go http client, got %q", got)
|
||||
}
|
||||
if got := r.URL.Query().Get("format"); got != "json" {
|
||||
t.Fatalf("expected format=json, got %q", got)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + "/docs/http-client",
|
||||
"title": "HTTP Client Guide",
|
||||
"content": "How to build an HTTP client in Go",
|
||||
"engine": "searxng",
|
||||
"score": 0.99,
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
mux.HandleFunc("/docs/http-client", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>HTTP Client Guide</title></head><body><main>` + strings.Repeat("http client docs ", 30) + `</main></body></html>`))
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "go http client",
|
||||
ResultLimit: 5,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one document")
|
||||
}
|
||||
|
||||
doc := docs[0]
|
||||
if doc.URL != srv.URL+"/docs/http-client" {
|
||||
t.Fatalf("unexpected document URL: %q", doc.URL)
|
||||
}
|
||||
if doc.Metadata["search_query"] != "go http client" {
|
||||
t.Fatalf("expected metadata search_query, got %v", doc.Metadata["search_query"])
|
||||
}
|
||||
if doc.Metadata["search_engine"] != "searxng" {
|
||||
t.Fatalf("expected metadata search_engine=searxng, got %v", doc.Metadata["search_engine"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperDomainFilter(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + "/docs/in-scope",
|
||||
"title": "In Scope",
|
||||
},
|
||||
{
|
||||
"url": "https://example.com/out-of-scope",
|
||||
"title": "Out Scope",
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
mux.HandleFunc("/docs/in-scope", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(`<html><head><title>In Scope</title></head><body><main>` + strings.Repeat("scoped docs ", 30) + `</main></body></html>`))
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
parsed, err := url.Parse(srv.URL)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse server URL: %v", err)
|
||||
}
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
docs, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "scope test",
|
||||
ResultLimit: 10,
|
||||
Domains: []string{parsed.Hostname()},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected scrape error: %v", err)
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
t.Fatal("expected at least one in-scope document")
|
||||
}
|
||||
for _, doc := range docs {
|
||||
docURL, parseErr := url.Parse(doc.URL)
|
||||
if parseErr != nil {
|
||||
t.Fatalf("invalid doc URL %q: %v", doc.URL, parseErr)
|
||||
}
|
||||
if docURL.Hostname() != parsed.Hostname() {
|
||||
t.Fatalf("expected only in-scope domain, got %q", doc.URL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperRequiresQuery(t *testing.T) {
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
|
||||
_, err := s.Scrape(context.Background(), &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: "http://127.0.0.1:8080/search",
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when query is missing")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "query") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalSearchScraperDetectChanges(t *testing.T) {
|
||||
mux := http.NewServeMux()
|
||||
baseURL := ""
|
||||
resultPath := "/docs/one"
|
||||
|
||||
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"results": []map[string]interface{}{
|
||||
{
|
||||
"url": baseURL + resultPath,
|
||||
"title": "Versioned",
|
||||
"score": 1.0,
|
||||
},
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
baseURL = srv.URL
|
||||
|
||||
s := NewLocalSearchScraper(&Config{
|
||||
UserAgent: "DevourTest/1.0",
|
||||
Timeout: 2 * time.Second,
|
||||
})
|
||||
source := &Source{
|
||||
Name: "local-search",
|
||||
Type: SourceTypeLocalSearch,
|
||||
URL: srv.URL + "/search",
|
||||
Query: "version test",
|
||||
ResultLimit: 3,
|
||||
}
|
||||
|
||||
changed, hash1, err := s.DetectChanges(context.Background(), source, "")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected first detect changes call to report changed")
|
||||
}
|
||||
if hash1 == "" {
|
||||
t.Fatal("expected non-empty hash")
|
||||
}
|
||||
|
||||
changed, hash2, err := s.DetectChanges(context.Background(), source, hash1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if changed {
|
||||
t.Fatal("expected unchanged results with identical hash")
|
||||
}
|
||||
if hash2 != hash1 {
|
||||
t.Fatalf("expected identical hash, got %q and %q", hash1, hash2)
|
||||
}
|
||||
|
||||
resultPath = "/docs/two"
|
||||
changed, hash3, err := s.DetectChanges(context.Background(), source, hash1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected detect changes error: %v", err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected changed results after search output changed")
|
||||
}
|
||||
if hash3 == hash1 {
|
||||
t.Fatal("expected hash to change")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user