Files
Devour/internal/scraper/localsearch.go
2026-02-24 12:10:13 +01:00

404 lines
9.6 KiB
Go

package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"sort"
"strconv"
"strings"
"time"
)
// Bounds applied to local search requests and responses.
const (
// defaultLocalSearchLimit is the result limit used when the requested limit is zero or negative.
defaultLocalSearchLimit = 8
// maxLocalSearchLimit is the largest result limit a request may use; larger values are clamped to it.
maxLocalSearchLimit = 50
// maxSearchResponseBytes caps how many bytes of a search API response body are read.
maxSearchResponseBytes = 2 << 20 // 2MB
)
// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
//
// It queries the API over HTTP and hands each returned URL to a WebScraper
// for the actual page fetch.
type LocalSearchScraper struct {
// config is the scraper's own copy of the caller's configuration with defaults filled in.
config *Config
// client performs the search API requests; its timeout is taken from config.Timeout.
client *http.Client
// web scrapes the individual result pages.
web *WebScraper
}
// localSearchResult is one entry decoded from the "results" array of a search
// API response. All string fields are stored with surrounding whitespace removed.
type localSearchResult struct {
// URL is the result link, read from the "url", "link" or "href" field.
URL string
// Title is read from the "title" or "name" field.
Title string
// Snippet is read from the "content", "snippet", "description" or "text" field.
Snippet string
// Engine is read from the "engine" or "source" field.
Engine string
// Score is read from the "score" or "relevance" field; 0 when absent or unparseable.
Score float64
}
// NewLocalSearchScraper builds a scraper that resolves queries through a
// self-hosted search API and fetches the resulting pages with a WebScraper.
//
// config may be nil. The scraper works on its own copy of the configuration,
// so later changes to the caller's value are not observed. An empty UserAgent
// and a non-positive Timeout are replaced with defaults. The page scraper gets
// a second copy with Concurrency and MaxDepth both forced to 1.
func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
	var own Config
	if config != nil {
		own = *config
	}

	if own.Timeout <= 0 {
		own.Timeout = 30 * time.Second
	}
	if own.UserAgent == "" {
		own.UserAgent = "Devour/1.0 (Local Search Scraper)"
	}

	// The page scraper only ever fetches the single result URL it is given.
	pageCfg := own
	pageCfg.MaxDepth = 1
	pageCfg.Concurrency = 1

	scraper := &LocalSearchScraper{config: &own}
	scraper.client = &http.Client{Timeout: own.Timeout}
	scraper.web = NewWebScraper(&pageCfg)
	return scraper
}
// Scrape runs source.Query against the search API at source.URL and scrapes
// each result page that survives filtering.
//
// Results are visited in the order the API returned them. A result is skipped
// when its fragment-stripped URL is empty, was already visited, is outside
// source.Domains, or is rejected by the include/exclude patterns. Every
// document produced for a page is tagged with search metadata (API URL, query,
// 1-based rank and, when present, engine, snippet and score) and takes over
// the result title if the page produced none.
//
// Page-level scrape failures are collected (at most 20) and are only reported
// when no document at all could be produced. A cancelled context aborts the
// loop and returns ctx.Err().
func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	switch {
	case source == nil:
		return nil, fmt.Errorf("source is required")
	case strings.TrimSpace(source.URL) == "":
		return nil, fmt.Errorf("search API URL is required")
	}

	query := strings.TrimSpace(source.Query)
	if query == "" {
		return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
	}

	limit := clampLocalSearchLimit(source.ResultLimit)
	hits, err := s.search(ctx, source, query, limit)
	if err != nil {
		return nil, err
	}

	// Keep the aggregated error message bounded.
	const maxRecordedFailures = 20

	var (
		collected = make([]*Document, 0, limit)
		visited   = make(map[string]bool)
		failures  []error
	)

	for idx, hit := range hits {
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, ctxErr
		}

		pageURL := stripURLFragment(hit.URL)
		skip := pageURL == "" ||
			visited[pageURL] ||
			!domainAllowed(pageURL, source.Domains) ||
			!s.web.shouldInclude(pageURL, source.Include, source.Exclude)
		if skip {
			continue
		}
		visited[pageURL] = true

		pageSource := &Source{
			Name:    source.Name,
			Type:    SourceTypeWeb,
			URL:     pageURL,
			Include: source.Include,
			Exclude: source.Exclude,
		}
		pages, scrapeErr := s.web.Scrape(ctx, pageSource)
		if scrapeErr != nil {
			if len(failures) < maxRecordedFailures {
				failures = append(failures, fmt.Errorf("%s: %w", pageURL, scrapeErr))
			}
			continue
		}

		fallbackTitle := strings.TrimSpace(hit.Title)
		for _, page := range pages {
			if page.Metadata == nil {
				page.Metadata = make(map[string]interface{})
			}
			page.Metadata["search_api"] = source.URL
			page.Metadata["search_query"] = query
			// Rank is the position in the API response, counted from 1.
			page.Metadata["search_rank"] = idx + 1
			if hit.Engine != "" {
				page.Metadata["search_engine"] = hit.Engine
			}
			if hit.Snippet != "" {
				page.Metadata["search_snippet"] = hit.Snippet
			}
			if hit.Score != 0 {
				page.Metadata["search_score"] = hit.Score
			}

			if fallbackTitle != "" && strings.TrimSpace(page.Title) == "" {
				page.Title = fallbackTitle
			}
			collected = append(collected, page)
		}
	}

	if len(collected) > 0 {
		return collected, nil
	}
	if len(failures) > 0 {
		return nil, fmt.Errorf("local search returned results but page scraping failed: %w", errors.Join(failures...))
	}
	return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
}
// DetectChanges reports whether the set of top search results differs from
// the one summarized by lastHash.
//
// It re-runs the search, keeps the results that pass the same domain and
// include/exclude filters used for scraping, and hashes one line per result
// built from its fragment-stripped URL, title, engine and score (six decimal
// places). The lines are sorted before hashing, so a pure reordering of the
// same results does not count as a change.
//
// It returns whether the hash changed, the new hex-encoded SHA-256 hash, and
// any error from validation or the search request.
func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	query := strings.TrimSpace(source.Query)
	if query == "" {
		return false, "", fmt.Errorf("search query is required for localsearch sources")
	}

	hits, err := s.search(ctx, source, query, clampLocalSearchLimit(source.ResultLimit))
	if err != nil {
		return false, "", err
	}

	fingerprints := make([]string, 0, len(hits))
	for _, hit := range hits {
		pageURL := stripURLFragment(hit.URL)
		keep := pageURL != "" &&
			domainAllowed(pageURL, source.Domains) &&
			s.web.shouldInclude(pageURL, source.Include, source.Exclude)
		if !keep {
			continue
		}
		line := fmt.Sprintf("%s|%s|%s|%.6f", pageURL, hit.Title, hit.Engine, hit.Score)
		fingerprints = append(fingerprints, line)
	}

	// Sorting makes the digest independent of the order of the results.
	sort.Strings(fingerprints)
	joined := strings.Join(fingerprints, "\n")
	digest := sha256.Sum256([]byte(joined))
	current := hex.EncodeToString(digest[:])

	changed := current != lastHash
	return changed, current, nil
}
// search calls the local search API for query and returns at most limit
// decoded results.
//
// The request is a GET against the URL produced by buildLocalSearchURL from
// source.URL, sent with the configured User-Agent and an "application/json"
// Accept header. It fails when:
//   - the URL cannot be built or the request cannot be sent,
//   - the API answers with a non-2xx status (the error carries up to 200 bytes
//     of the response body),
//   - the body is larger than maxSearchResponseBytes,
//   - the body cannot be decoded, or
//   - the decoded result list is empty.
func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
	searchURL, err := buildLocalSearchURL(source.URL, query, limit)
	if err != nil {
		return nil, err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to build search request: %w", err)
	}
	req.Header.Set("User-Agent", s.config.UserAgent)
	req.Header.Set("Accept", "application/json")

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("search API request failed: %w", err)
	}
	defer resp.Body.Close()

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit, instead of being silently truncated
	// and later misreported as invalid JSON.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes+1))
	if err != nil {
		return nil, fmt.Errorf("failed reading search API response: %w", err)
	}

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		msg := strings.TrimSpace(string(body))
		if len(msg) > 200 {
			// The byte cut may land inside a multi-byte rune; drop the partial
			// rune so the error text stays valid UTF-8.
			msg = strings.ToValidUTF8(msg[:200], "")
		}
		return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg)
	}

	if len(body) > maxSearchResponseBytes {
		return nil, fmt.Errorf("search API response exceeds %d bytes", maxSearchResponseBytes)
	}

	results, err := decodeLocalSearchResults(body)
	if err != nil {
		return nil, err
	}
	if len(results) == 0 {
		return nil, fmt.Errorf("search API returned no results")
	}
	// The API is not trusted to honor the limit parameter.
	if len(results) > limit {
		results = results[:limit]
	}
	return results, nil
}
// buildLocalSearchURL turns the configured API endpoint into the full request
// URL for query.
//
// rawURL must be an absolute URL (scheme and host present). The "q" parameter
// is always overwritten with query. "format" (default "json") and "limit"
// (default: the clamped limit) are only set when the endpoint does not already
// carry a non-empty value for them.
func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
	endpoint, err := url.Parse(strings.TrimSpace(rawURL))
	switch {
	case err != nil:
		return "", fmt.Errorf("invalid search API URL: %w", err)
	case endpoint.Scheme == "" || endpoint.Host == "":
		return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
	}

	values := endpoint.Query()
	values.Set("q", query)

	// Parameters that are filled in only when the endpoint leaves them unset.
	fallbacks := [...][2]string{
		{"format", "json"},
		{"limit", strconv.Itoa(clampLocalSearchLimit(limit))},
	}
	for _, pair := range fallbacks {
		if values.Get(pair[0]) == "" {
			values.Set(pair[0], pair[1])
		}
	}

	endpoint.RawQuery = values.Encode()
	return endpoint.String(), nil
}
// decodeLocalSearchResults parses a search API response body.
//
// The body must be a JSON object with a "results" array. Array entries that
// are not objects, or that have no usable URL under "url", "link" or "href",
// are dropped. The remaining fields are read from the first matching alias
// and stored with surrounding whitespace removed.
func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
	var envelope map[string]interface{}
	if err := json.Unmarshal(body, &envelope); err != nil {
		return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
	}

	raw, present := envelope["results"]
	if !present {
		return nil, fmt.Errorf("search API response missing results field")
	}
	entries, isList := raw.([]interface{})
	if !isList {
		return nil, fmt.Errorf("search API results field is not an array")
	}

	decoded := make([]localSearchResult, 0, len(entries))
	for _, entry := range entries {
		fields, isObject := entry.(map[string]interface{})
		if !isObject {
			continue
		}

		link := strings.TrimSpace(pickString(fields, "url", "link", "href"))
		if link == "" {
			continue
		}

		hit := localSearchResult{URL: link}
		hit.Title = strings.TrimSpace(pickString(fields, "title", "name"))
		hit.Snippet = strings.TrimSpace(pickString(fields, "content", "snippet", "description", "text"))
		hit.Engine = strings.TrimSpace(pickString(fields, "engine", "source"))
		hit.Score = pickFloat(fields, "score", "relevance")
		decoded = append(decoded, hit)
	}
	return decoded, nil
}
// pickString returns the first usable string found in record under keys,
// tried in order.
//
// String values are returned unchanged; numeric values (json.Number, float64,
// int) are formatted as decimal text. A key is passed over when it is absent,
// holds a value of any other type, or holds an empty or whitespace-only
// string, so that a blank primary field (for example "url": "") does not hide
// a populated alias (for example "link"). It returns "" when no key yields a
// usable value.
func pickString(record map[string]interface{}, keys ...string) string {
	for _, key := range keys {
		value, ok := record[key]
		if !ok {
			continue
		}

		var text string
		switch v := value.(type) {
		case string:
			text = v
		case json.Number:
			text = v.String()
		case float64:
			text = strconv.FormatFloat(v, 'f', -1, 64)
		case int:
			text = strconv.Itoa(v)
		default:
			// Unsupported type (null, bool, nested value): try the next alias.
			continue
		}

		// Callers treat blank text as "missing", so keep looking.
		if strings.TrimSpace(text) == "" {
			continue
		}
		return text
	}
	return ""
}
// pickFloat returns the first value in record, looked up under keys in order,
// that can be read as a float64.
//
// Numeric values are converted directly. Strings are trimmed and parsed, and a
// json.Number is converted through its Float64 method; either one is passed
// over if the conversion fails. Absent keys and values of other types are
// passed over as well. It returns 0 when nothing usable is found.
func pickFloat(record map[string]interface{}, keys ...string) float64 {
	// convert reports the numeric reading of one value and whether it is usable.
	convert := func(value interface{}) (float64, bool) {
		switch n := value.(type) {
		case float64:
			return n, true
		case float32:
			return float64(n), true
		case int:
			return float64(n), true
		case int64:
			return float64(n), true
		case json.Number:
			parsed, err := n.Float64()
			return parsed, err == nil
		case string:
			parsed, err := strconv.ParseFloat(strings.TrimSpace(n), 64)
			return parsed, err == nil
		}
		return 0, false
	}

	for _, key := range keys {
		raw, present := record[key]
		if !present {
			continue
		}
		if number, usable := convert(raw); usable {
			return number
		}
	}
	return 0
}
// clampLocalSearchLimit normalizes a requested result limit: zero or negative
// values become defaultLocalSearchLimit, values above maxLocalSearchLimit are
// capped to it, and anything in between is returned unchanged.
func clampLocalSearchLimit(limit int) int {
	switch {
	case limit <= 0:
		return defaultLocalSearchLimit
	case limit > maxLocalSearchLimit:
		return maxLocalSearchLimit
	default:
		return limit
	}
}
// stripURLFragment returns raw with surrounding whitespace and any "#fragment"
// removed.
//
// Parseable input is re-serialized through net/url. Input that net/url rejects
// is returned as trimmed text, cut at the first '#', so the fragment is
// removed on that path as well and two spellings of the same broken URL that
// differ only in their fragment still compare equal.
func stripURLFragment(raw string) string {
	trimmed := strings.TrimSpace(raw)

	u, err := url.Parse(trimmed)
	if err != nil {
		base, _, _ := strings.Cut(trimmed, "#")
		return base
	}

	u.Fragment = ""
	u.RawFragment = ""
	return u.String()
}
// domainAllowed reports whether the host of raw is covered by allowedDomains.
//
// An empty allow-list permits everything. Otherwise raw must parse and have a
// non-empty host, and that host (compared in lower case) must equal one of the
// normalized entries or be a subdomain of it. Entries that normalize to the
// empty string are ignored.
func domainAllowed(raw string, allowedDomains []string) bool {
	if len(allowedDomains) == 0 {
		return true
	}

	parsed, err := url.Parse(raw)
	if err != nil {
		return false
	}
	host := strings.ToLower(strings.TrimSpace(parsed.Hostname()))
	if host == "" {
		return false
	}

	for _, entry := range allowedDomains {
		switch allowed := normalizeDomain(entry); {
		case allowed == "":
			// Nothing to compare against.
		case host == allowed, strings.HasSuffix(host, "."+allowed):
			return true
		}
	}
	return false
}
// normalizeDomain reduces an allow-list entry to a bare lower-case domain.
//
// The entry is trimmed and lower-cased. If it contains "://" and parses as a
// URL, the URL's host name is returned. Otherwise a single leading "." is
// removed. Blank input yields "".
func normalizeDomain(raw string) string {
	cleaned := strings.ToLower(strings.TrimSpace(raw))

	switch {
	case cleaned == "":
		return ""
	case strings.Contains(cleaned, "://"):
		if parsed, err := url.Parse(cleaned); err == nil {
			return strings.ToLower(parsed.Hostname())
		}
		// An unparseable URL falls through and is handled as a plain entry.
	}

	return strings.TrimPrefix(cleaned, ".")
}