package scraper import ( "context" "crypto/sha256" "encoding/hex" "encoding/json" "fmt" "io" "net/http" "net/url" "sort" "strconv" "strings" "time" ) const ( defaultLocalSearchLimit = 8 maxLocalSearchLimit = 50 maxSearchResponseBytes = 2 << 20 // 2MB ) // LocalSearchScraper scrapes docs from result URLs returned by a local search API. type LocalSearchScraper struct { config *Config client *http.Client web *WebScraper } type localSearchResult struct { URL string Title string Snippet string Engine string Score float64 } // NewLocalSearchScraper creates a scraper backed by a self-hosted search API. func NewLocalSearchScraper(config *Config) *LocalSearchScraper { baseConfig := &Config{} if config != nil { *baseConfig = *config } if baseConfig.UserAgent == "" { baseConfig.UserAgent = "Devour/1.0 (Local Search Scraper)" } if baseConfig.Timeout <= 0 { baseConfig.Timeout = 30 * time.Second } webConfig := *baseConfig webConfig.Concurrency = 1 webConfig.MaxDepth = 1 return &LocalSearchScraper{ config: baseConfig, client: &http.Client{Timeout: baseConfig.Timeout}, web: NewWebScraper(&webConfig), } } // Scrape queries a local search API and scrapes the returned URLs. func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) { if source == nil { return nil, fmt.Errorf("source is required") } if strings.TrimSpace(source.URL) == "" { return nil, fmt.Errorf("search API URL is required") } query := strings.TrimSpace(source.Query) if query == "" { return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)") } limit := clampLocalSearchLimit(source.ResultLimit) results, err := s.search(ctx, source, query, limit) if err != nil { return nil, err } docs := make([]*Document, 0, limit) seen := make(map[string]bool) var scrapeErrors []string for i, result := range results { if ctx.Err() != nil { return nil, ctx.Err() } resultURL := stripURLFragment(result.URL) if resultURL == "" || seen[resultURL] { continue } if !domainAllowed(resultURL, source.Domains) { continue } if !s.web.shouldInclude(resultURL, source.Include, source.Exclude) { continue } seen[resultURL] = true pageDocs, err := s.web.Scrape(ctx, &Source{ Name: source.Name, Type: SourceTypeWeb, URL: resultURL, Include: source.Include, Exclude: source.Exclude, }) if err != nil { if len(scrapeErrors) < 20 { scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", resultURL, err)) } continue } for _, doc := range pageDocs { if doc.Metadata == nil { doc.Metadata = make(map[string]interface{}) } doc.Metadata["search_api"] = source.URL doc.Metadata["search_query"] = query doc.Metadata["search_rank"] = i + 1 if result.Engine != "" { doc.Metadata["search_engine"] = result.Engine } if result.Snippet != "" { doc.Metadata["search_snippet"] = result.Snippet } if result.Score != 0 { doc.Metadata["search_score"] = result.Score } if strings.TrimSpace(doc.Title) == "" && strings.TrimSpace(result.Title) != "" { doc.Title = strings.TrimSpace(result.Title) } docs = append(docs, doc) } } if len(docs) == 0 { if len(scrapeErrors) > 0 { return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(scrapeErrors, "; ")) } return nil, fmt.Errorf("local search yielded no usable results for query %q", query) } return docs, nil } // DetectChanges checks if top search results changed. func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) { if source == nil { return false, "", fmt.Errorf("source is required") } query := strings.TrimSpace(source.Query) if query == "" { return false, "", fmt.Errorf("search query is required for localsearch sources") } limit := clampLocalSearchLimit(source.ResultLimit) results, err := s.search(ctx, source, query, limit) if err != nil { return false, "", err } signatures := make([]string, 0, len(results)) for _, result := range results { u := stripURLFragment(result.URL) if u == "" { continue } if !domainAllowed(u, source.Domains) { continue } if !s.web.shouldInclude(u, source.Include, source.Exclude) { continue } signatures = append(signatures, fmt.Sprintf("%s|%s|%s|%.6f", u, result.Title, result.Engine, result.Score)) } sort.Strings(signatures) hash := sha256.Sum256([]byte(strings.Join(signatures, "\n"))) currentHash := hex.EncodeToString(hash[:]) return currentHash != lastHash, currentHash, nil } func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) { searchURL, err := buildLocalSearchURL(source.URL, query, limit) if err != nil { return nil, err } req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) if err != nil { return nil, fmt.Errorf("failed to build search request: %w", err) } req.Header.Set("User-Agent", s.config.UserAgent) req.Header.Set("Accept", "application/json") resp, err := s.client.Do(req) if err != nil { return nil, fmt.Errorf("search API request failed: %w", err) } defer resp.Body.Close() body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes)) if err != nil { return nil, fmt.Errorf("failed reading search API response: %w", err) } if resp.StatusCode < 200 || resp.StatusCode >= 300 { msg := strings.TrimSpace(string(body)) if len(msg) > 200 { msg = msg[:200] } return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg) } results, err := decodeLocalSearchResults(body) if err != nil { return nil, err } if len(results) == 0 { return nil, fmt.Errorf("search API returned no results") } if len(results) > limit { results = results[:limit] } return results, nil } func buildLocalSearchURL(rawURL, query string, limit int) (string, error) { u, err := url.Parse(strings.TrimSpace(rawURL)) if err != nil { return "", fmt.Errorf("invalid search API URL: %w", err) } if u.Scheme == "" || u.Host == "" { return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL) } params := u.Query() params.Set("q", query) if params.Get("format") == "" { params.Set("format", "json") } if params.Get("limit") == "" { params.Set("limit", strconv.Itoa(clampLocalSearchLimit(limit))) } u.RawQuery = params.Encode() return u.String(), nil } func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) { var payload map[string]interface{} if err := json.Unmarshal(body, &payload); err != nil { return nil, fmt.Errorf("search API returned invalid JSON: %w", err) } rawResults, ok := payload["results"] if !ok { return nil, fmt.Errorf("search API response missing results field") } items, ok := rawResults.([]interface{}) if !ok { return nil, fmt.Errorf("search API results field is not an array") } results := make([]localSearchResult, 0, len(items)) for _, item := range items { record, ok := item.(map[string]interface{}) if !ok { continue } resultURL := pickString(record, "url", "link", "href") if strings.TrimSpace(resultURL) == "" { continue } results = append(results, localSearchResult{ URL: strings.TrimSpace(resultURL), Title: strings.TrimSpace(pickString(record, "title", "name")), Snippet: strings.TrimSpace(pickString(record, "content", "snippet", "description", "text")), Engine: strings.TrimSpace(pickString(record, "engine", "source")), Score: pickFloat(record, "score", "relevance"), }) } return results, nil } func pickString(record map[string]interface{}, keys ...string) string { for _, key := range keys { value, ok := record[key] if !ok { continue } switch v := value.(type) { case string: return v case json.Number: return v.String() case float64: return strconv.FormatFloat(v, 'f', -1, 64) case int: return strconv.Itoa(v) } } return "" } func pickFloat(record map[string]interface{}, keys ...string) float64 { for _, key := range keys { value, ok := record[key] if !ok { continue } switch v := value.(type) { case float64: return v case float32: return float64(v) case int: return float64(v) case int64: return float64(v) case json.Number: f, err := v.Float64() if err == nil { return f } case string: f, err := strconv.ParseFloat(strings.TrimSpace(v), 64) if err == nil { return f } } } return 0 } func clampLocalSearchLimit(limit int) int { if limit <= 0 { return defaultLocalSearchLimit } if limit > maxLocalSearchLimit { return maxLocalSearchLimit } return limit } func stripURLFragment(raw string) string { u, err := url.Parse(strings.TrimSpace(raw)) if err != nil { return strings.TrimSpace(raw) } u.Fragment = "" return u.String() } func domainAllowed(raw string, allowedDomains []string) bool { if len(allowedDomains) == 0 { return true } u, err := url.Parse(raw) if err != nil { return false } host := strings.ToLower(strings.TrimSpace(u.Hostname())) if host == "" { return false } for _, candidate := range allowedDomains { domain := normalizeDomain(candidate) if domain == "" { continue } if host == domain || strings.HasSuffix(host, "."+domain) { return true } } return false } func normalizeDomain(raw string) string { raw = strings.ToLower(strings.TrimSpace(raw)) if raw == "" { return "" } if strings.Contains(raw, "://") { parsed, err := url.Parse(raw) if err == nil { return strings.ToLower(parsed.Hostname()) } } return strings.TrimPrefix(raw, ".") }