mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
404 lines
9.6 KiB
Go
404 lines
9.6 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Limits applied to local search requests and responses.
const (
	// defaultLocalSearchLimit is the result count used when the caller asks
	// for a non-positive number of results.
	defaultLocalSearchLimit = 8

	// maxLocalSearchLimit is the largest result count a caller may request.
	maxLocalSearchLimit = 50

	// maxSearchResponseBytes caps how much of a search API response body is
	// read into memory (2 MiB).
	maxSearchResponseBytes = 2 * 1024 * 1024
)
|
|
|
|
// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
|
|
type LocalSearchScraper struct {
|
|
config *Config
|
|
client *http.Client
|
|
web *WebScraper
|
|
}
|
|
|
|
// localSearchResult is one normalised entry of a search API response.
type localSearchResult struct {
	// URL is the result link (taken from the "url", "link" or "href" key).
	URL string
	// Title is the result title as reported by the search API, if any.
	Title string
	// Snippet is the short text excerpt reported for the result, if any.
	Snippet string
	// Engine names the backend that produced the result, if reported.
	Engine string
	// Score is the reported relevance score; 0 when none was provided.
	Score float64
}
|
|
|
|
// NewLocalSearchScraper creates a scraper backed by a self-hosted search API.
|
|
func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
|
|
baseConfig := &Config{}
|
|
if config != nil {
|
|
*baseConfig = *config
|
|
}
|
|
if baseConfig.UserAgent == "" {
|
|
baseConfig.UserAgent = "Devour/1.0 (Local Search Scraper)"
|
|
}
|
|
if baseConfig.Timeout <= 0 {
|
|
baseConfig.Timeout = 30 * time.Second
|
|
}
|
|
|
|
webConfig := *baseConfig
|
|
webConfig.Concurrency = 1
|
|
webConfig.MaxDepth = 1
|
|
|
|
return &LocalSearchScraper{
|
|
config: baseConfig,
|
|
client: &http.Client{Timeout: baseConfig.Timeout},
|
|
web: NewWebScraper(&webConfig),
|
|
}
|
|
}
|
|
|
|
// Scrape queries a local search API and scrapes the returned URLs.
|
|
func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
if source == nil {
|
|
return nil, fmt.Errorf("source is required")
|
|
}
|
|
if strings.TrimSpace(source.URL) == "" {
|
|
return nil, fmt.Errorf("search API URL is required")
|
|
}
|
|
query := strings.TrimSpace(source.Query)
|
|
if query == "" {
|
|
return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
|
|
}
|
|
|
|
limit := clampLocalSearchLimit(source.ResultLimit)
|
|
results, err := s.search(ctx, source, query, limit)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
docs := make([]*Document, 0, limit)
|
|
seen := make(map[string]bool)
|
|
var scrapeErrors []error
|
|
|
|
for i, result := range results {
|
|
if ctx.Err() != nil {
|
|
return nil, ctx.Err()
|
|
}
|
|
|
|
resultURL := stripURLFragment(result.URL)
|
|
if resultURL == "" || seen[resultURL] {
|
|
continue
|
|
}
|
|
if !domainAllowed(resultURL, source.Domains) {
|
|
continue
|
|
}
|
|
if !s.web.shouldInclude(resultURL, source.Include, source.Exclude) {
|
|
continue
|
|
}
|
|
seen[resultURL] = true
|
|
|
|
pageDocs, err := s.web.Scrape(ctx, &Source{
|
|
Name: source.Name,
|
|
Type: SourceTypeWeb,
|
|
URL: resultURL,
|
|
Include: source.Include,
|
|
Exclude: source.Exclude,
|
|
})
|
|
if err != nil {
|
|
if len(scrapeErrors) < 20 {
|
|
scrapeErrors = append(scrapeErrors, fmt.Errorf("%s: %w", resultURL, err))
|
|
}
|
|
continue
|
|
}
|
|
|
|
for _, doc := range pageDocs {
|
|
if doc.Metadata == nil {
|
|
doc.Metadata = make(map[string]interface{})
|
|
}
|
|
doc.Metadata["search_api"] = source.URL
|
|
doc.Metadata["search_query"] = query
|
|
doc.Metadata["search_rank"] = i + 1
|
|
if result.Engine != "" {
|
|
doc.Metadata["search_engine"] = result.Engine
|
|
}
|
|
if result.Snippet != "" {
|
|
doc.Metadata["search_snippet"] = result.Snippet
|
|
}
|
|
if result.Score != 0 {
|
|
doc.Metadata["search_score"] = result.Score
|
|
}
|
|
if strings.TrimSpace(doc.Title) == "" && strings.TrimSpace(result.Title) != "" {
|
|
doc.Title = strings.TrimSpace(result.Title)
|
|
}
|
|
|
|
docs = append(docs, doc)
|
|
}
|
|
}
|
|
|
|
if len(docs) == 0 {
|
|
if len(scrapeErrors) > 0 {
|
|
return nil, fmt.Errorf("local search returned results but page scraping failed: %w", errors.Join(scrapeErrors...))
|
|
}
|
|
return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
|
|
}
|
|
|
|
return docs, nil
|
|
}
|
|
|
|
// DetectChanges checks if top search results changed.
|
|
func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
if source == nil {
|
|
return false, "", fmt.Errorf("source is required")
|
|
}
|
|
query := strings.TrimSpace(source.Query)
|
|
if query == "" {
|
|
return false, "", fmt.Errorf("search query is required for localsearch sources")
|
|
}
|
|
|
|
limit := clampLocalSearchLimit(source.ResultLimit)
|
|
results, err := s.search(ctx, source, query, limit)
|
|
if err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
signatures := make([]string, 0, len(results))
|
|
for _, result := range results {
|
|
u := stripURLFragment(result.URL)
|
|
if u == "" {
|
|
continue
|
|
}
|
|
if !domainAllowed(u, source.Domains) {
|
|
continue
|
|
}
|
|
if !s.web.shouldInclude(u, source.Include, source.Exclude) {
|
|
continue
|
|
}
|
|
signatures = append(signatures, fmt.Sprintf("%s|%s|%s|%.6f", u, result.Title, result.Engine, result.Score))
|
|
}
|
|
sort.Strings(signatures)
|
|
|
|
hash := sha256.Sum256([]byte(strings.Join(signatures, "\n")))
|
|
currentHash := hex.EncodeToString(hash[:])
|
|
return currentHash != lastHash, currentHash, nil
|
|
}
|
|
|
|
func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
|
|
searchURL, err := buildLocalSearchURL(source.URL, query, limit)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to build search request: %w", err)
|
|
}
|
|
req.Header.Set("User-Agent", s.config.UserAgent)
|
|
req.Header.Set("Accept", "application/json")
|
|
|
|
resp, err := s.client.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("search API request failed: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed reading search API response: %w", err)
|
|
}
|
|
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
msg := strings.TrimSpace(string(body))
|
|
if len(msg) > 200 {
|
|
msg = msg[:200]
|
|
}
|
|
return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg)
|
|
}
|
|
|
|
results, err := decodeLocalSearchResults(body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(results) == 0 {
|
|
return nil, fmt.Errorf("search API returned no results")
|
|
}
|
|
if len(results) > limit {
|
|
results = results[:limit]
|
|
}
|
|
return results, nil
|
|
}
|
|
|
|
func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
|
|
u, err := url.Parse(strings.TrimSpace(rawURL))
|
|
if err != nil {
|
|
return "", fmt.Errorf("invalid search API URL: %w", err)
|
|
}
|
|
if u.Scheme == "" || u.Host == "" {
|
|
return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
|
|
}
|
|
|
|
params := u.Query()
|
|
params.Set("q", query)
|
|
if params.Get("format") == "" {
|
|
params.Set("format", "json")
|
|
}
|
|
if params.Get("limit") == "" {
|
|
params.Set("limit", strconv.Itoa(clampLocalSearchLimit(limit)))
|
|
}
|
|
u.RawQuery = params.Encode()
|
|
|
|
return u.String(), nil
|
|
}
|
|
|
|
func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
|
|
var payload map[string]interface{}
|
|
if err := json.Unmarshal(body, &payload); err != nil {
|
|
return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
|
|
}
|
|
|
|
rawResults, ok := payload["results"]
|
|
if !ok {
|
|
return nil, fmt.Errorf("search API response missing results field")
|
|
}
|
|
|
|
items, ok := rawResults.([]interface{})
|
|
if !ok {
|
|
return nil, fmt.Errorf("search API results field is not an array")
|
|
}
|
|
|
|
results := make([]localSearchResult, 0, len(items))
|
|
for _, item := range items {
|
|
record, ok := item.(map[string]interface{})
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
resultURL := pickString(record, "url", "link", "href")
|
|
if strings.TrimSpace(resultURL) == "" {
|
|
continue
|
|
}
|
|
|
|
results = append(results, localSearchResult{
|
|
URL: strings.TrimSpace(resultURL),
|
|
Title: strings.TrimSpace(pickString(record, "title", "name")),
|
|
Snippet: strings.TrimSpace(pickString(record, "content", "snippet", "description", "text")),
|
|
Engine: strings.TrimSpace(pickString(record, "engine", "source")),
|
|
Score: pickFloat(record, "score", "relevance"),
|
|
})
|
|
}
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// pickString returns the value of the first key in keys that holds a
// non-blank, string-like value in record, or "" when no key does.
//
// Strings are returned unchanged (callers trim as needed); json.Number,
// float64 and int values are rendered in plain decimal form. Keys that are
// missing, hold another type (bool, nil, object, array, ...) or hold a value
// that is empty or whitespace-only are skipped, so a blank primary field such
// as "url": "" does not hide a usable fallback such as "link".
func pickString(record map[string]interface{}, keys ...string) string {
	for _, key := range keys {
		value, ok := record[key]
		if !ok {
			continue
		}

		var text string
		switch v := value.(type) {
		case string:
			text = v
		case json.Number:
			text = v.String()
		case float64:
			text = strconv.FormatFloat(v, 'f', -1, 64)
		case int:
			// Not produced by encoding/json into interface{}; presumably kept
			// for records that are built by hand.
			text = strconv.Itoa(v)
		default:
			continue
		}

		// A blank value carries no information; try the next alias instead.
		if strings.TrimSpace(text) != "" {
			return text
		}
	}
	return ""
}
|
|
|
|
// pickFloat returns the numeric value of the first key in keys that can be
// read as a float64 from record, or 0 when no key can.
//
// float64, float32, int and int64 values are converted directly; json.Number
// and string values are parsed (strings after trimming whitespace). A key
// whose value is missing, unparsable or of another type is skipped and the
// next key is tried.
func pickFloat(record map[string]interface{}, keys ...string) float64 {
	for _, name := range keys {
		// A missing key yields a nil interface, which matches no case below
		// and therefore moves on to the next key.
		switch num := record[name].(type) {
		case float64:
			return num
		case float32:
			return float64(num)
		case int:
			return float64(num)
		case int64:
			return float64(num)
		case json.Number:
			if parsed, err := num.Float64(); err == nil {
				return parsed
			}
		case string:
			if parsed, err := strconv.ParseFloat(strings.TrimSpace(num), 64); err == nil {
				return parsed
			}
		}
	}
	return 0
}
|
|
|
|
func clampLocalSearchLimit(limit int) int {
|
|
if limit <= 0 {
|
|
return defaultLocalSearchLimit
|
|
}
|
|
if limit > maxLocalSearchLimit {
|
|
return maxLocalSearchLimit
|
|
}
|
|
return limit
|
|
}
|
|
|
|
// stripURLFragment returns raw without surrounding whitespace and without
// its "#fragment" part. When raw cannot be parsed as a URL, the trimmed input
// is returned unchanged.
func stripURLFragment(raw string) string {
	trimmed := strings.TrimSpace(raw)

	parsed, err := url.Parse(trimmed)
	if err != nil {
		return trimmed
	}

	// With an empty Fragment, String() emits neither "#" nor the fragment.
	parsed.Fragment = ""
	return parsed.String()
}
|
|
|
|
func domainAllowed(raw string, allowedDomains []string) bool {
|
|
if len(allowedDomains) == 0 {
|
|
return true
|
|
}
|
|
|
|
u, err := url.Parse(raw)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
host := strings.ToLower(strings.TrimSpace(u.Hostname()))
|
|
if host == "" {
|
|
return false
|
|
}
|
|
|
|
for _, candidate := range allowedDomains {
|
|
domain := normalizeDomain(candidate)
|
|
if domain == "" {
|
|
continue
|
|
}
|
|
if host == domain || strings.HasSuffix(host, "."+domain) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// normalizeDomain reduces an allow-list entry to a bare, lower-case host name
// that can be compared against url.URL.Hostname().
//
// Accepted forms include a plain domain ("example.com"), a full URL
// ("https://example.com/docs"), a host with port or path
// ("localhost:8080", "example.com/docs"), a bracketed IPv6 literal
// ("[::1]:8080"), a wildcard or dot-prefixed domain ("*.example.com",
// ".example.com") and a fully-qualified name with trailing dot
// ("example.com."). Blank input yields "".
func normalizeDomain(raw string) string {
	raw = strings.ToLower(strings.TrimSpace(raw))
	if raw == "" {
		return ""
	}

	host := raw
	if strings.Contains(raw, "://") {
		// Full URL: let net/url extract the host. On a parse error the raw
		// text is kept, as before.
		if parsed, err := url.Parse(raw); err == nil {
			host = parsed.Hostname()
		}
	} else {
		// Scheme-less entry: drop any path, query or fragment first.
		if cut := strings.IndexAny(host, "/?#"); cut >= 0 {
			host = host[:cut]
		}
		switch {
		case strings.HasPrefix(host, "["):
			// Bracketed IPv6 literal, optionally followed by ":port".
			if end := strings.IndexByte(host, ']'); end > 0 {
				host = host[1:end]
			}
		case strings.Count(host, ":") == 1:
			// Exactly one colon means host:port. More than one is a bare
			// IPv6 address, which must be left intact.
			host, _, _ = strings.Cut(host, ":")
		}
	}

	// Wildcard and leading-dot notations both mean "this domain and its
	// subdomains", which is what the suffix match in domainAllowed implements.
	host = strings.TrimPrefix(host, "*.")
	host = strings.TrimPrefix(host, ".")
	// A trailing dot only marks the name as fully qualified.
	return strings.TrimSuffix(host, ".")
}
|