mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
@@ -0,0 +1,402 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Limits that govern requests to the local search API.
const (
	// defaultLocalSearchLimit is the result count used when the requested
	// limit is zero or negative.
	defaultLocalSearchLimit = 8

	// maxLocalSearchLimit is the largest result count a caller may request.
	maxLocalSearchLimit = 50

	// maxSearchResponseBytes caps how many bytes of a search API response
	// body are read (2 MiB).
	maxSearchResponseBytes = 2 * 1024 * 1024
)
|
||||
|
||||
// LocalSearchScraper scrapes docs from result URLs returned by a local search API.
|
||||
type LocalSearchScraper struct {
|
||||
config *Config
|
||||
client *http.Client
|
||||
web *WebScraper
|
||||
}
|
||||
|
||||
// localSearchResult is a single normalized hit decoded from a search API
// response.
type localSearchResult struct {
	URL     string  // address of the result page
	Title   string  // result title; may be empty
	Snippet string  // excerpt or description of the result; may be empty
	Engine  string  // engine or source that produced the hit; may be empty
	Score   float64 // relevance score; 0 when none could be decoded
}
|
||||
|
||||
// NewLocalSearchScraper creates a scraper backed by a self-hosted search API.
|
||||
func NewLocalSearchScraper(config *Config) *LocalSearchScraper {
|
||||
baseConfig := &Config{}
|
||||
if config != nil {
|
||||
*baseConfig = *config
|
||||
}
|
||||
if baseConfig.UserAgent == "" {
|
||||
baseConfig.UserAgent = "Devour/1.0 (Local Search Scraper)"
|
||||
}
|
||||
if baseConfig.Timeout <= 0 {
|
||||
baseConfig.Timeout = 30 * time.Second
|
||||
}
|
||||
|
||||
webConfig := *baseConfig
|
||||
webConfig.Concurrency = 1
|
||||
webConfig.MaxDepth = 1
|
||||
|
||||
return &LocalSearchScraper{
|
||||
config: baseConfig,
|
||||
client: &http.Client{Timeout: baseConfig.Timeout},
|
||||
web: NewWebScraper(&webConfig),
|
||||
}
|
||||
}
|
||||
|
||||
// Scrape queries a local search API and scrapes the returned URLs.
|
||||
func (s *LocalSearchScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
if source == nil {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
if strings.TrimSpace(source.URL) == "" {
|
||||
return nil, fmt.Errorf("search API URL is required")
|
||||
}
|
||||
query := strings.TrimSpace(source.Query)
|
||||
if query == "" {
|
||||
return nil, fmt.Errorf("search query is required for localsearch sources (set source.query or --search-query)")
|
||||
}
|
||||
|
||||
limit := clampLocalSearchLimit(source.ResultLimit)
|
||||
results, err := s.search(ctx, source, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]*Document, 0, limit)
|
||||
seen := make(map[string]bool)
|
||||
var scrapeErrors []string
|
||||
|
||||
for i, result := range results {
|
||||
if ctx.Err() != nil {
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
|
||||
resultURL := stripURLFragment(result.URL)
|
||||
if resultURL == "" || seen[resultURL] {
|
||||
continue
|
||||
}
|
||||
if !domainAllowed(resultURL, source.Domains) {
|
||||
continue
|
||||
}
|
||||
if !s.web.shouldInclude(resultURL, source.Include, source.Exclude) {
|
||||
continue
|
||||
}
|
||||
seen[resultURL] = true
|
||||
|
||||
pageDocs, err := s.web.Scrape(ctx, &Source{
|
||||
Name: source.Name,
|
||||
Type: SourceTypeWeb,
|
||||
URL: resultURL,
|
||||
Include: source.Include,
|
||||
Exclude: source.Exclude,
|
||||
})
|
||||
if err != nil {
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", resultURL, err))
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
for _, doc := range pageDocs {
|
||||
if doc.Metadata == nil {
|
||||
doc.Metadata = make(map[string]interface{})
|
||||
}
|
||||
doc.Metadata["search_api"] = source.URL
|
||||
doc.Metadata["search_query"] = query
|
||||
doc.Metadata["search_rank"] = i + 1
|
||||
if result.Engine != "" {
|
||||
doc.Metadata["search_engine"] = result.Engine
|
||||
}
|
||||
if result.Snippet != "" {
|
||||
doc.Metadata["search_snippet"] = result.Snippet
|
||||
}
|
||||
if result.Score != 0 {
|
||||
doc.Metadata["search_score"] = result.Score
|
||||
}
|
||||
if strings.TrimSpace(doc.Title) == "" && strings.TrimSpace(result.Title) != "" {
|
||||
doc.Title = strings.TrimSpace(result.Title)
|
||||
}
|
||||
|
||||
docs = append(docs, doc)
|
||||
}
|
||||
}
|
||||
|
||||
if len(docs) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
return nil, fmt.Errorf("local search returned results but page scraping failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
}
|
||||
return nil, fmt.Errorf("local search yielded no usable results for query %q", query)
|
||||
}
|
||||
|
||||
return docs, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if top search results changed.
|
||||
func (s *LocalSearchScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
if source == nil {
|
||||
return false, "", fmt.Errorf("source is required")
|
||||
}
|
||||
query := strings.TrimSpace(source.Query)
|
||||
if query == "" {
|
||||
return false, "", fmt.Errorf("search query is required for localsearch sources")
|
||||
}
|
||||
|
||||
limit := clampLocalSearchLimit(source.ResultLimit)
|
||||
results, err := s.search(ctx, source, query, limit)
|
||||
if err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
signatures := make([]string, 0, len(results))
|
||||
for _, result := range results {
|
||||
u := stripURLFragment(result.URL)
|
||||
if u == "" {
|
||||
continue
|
||||
}
|
||||
if !domainAllowed(u, source.Domains) {
|
||||
continue
|
||||
}
|
||||
if !s.web.shouldInclude(u, source.Include, source.Exclude) {
|
||||
continue
|
||||
}
|
||||
signatures = append(signatures, fmt.Sprintf("%s|%s|%s|%.6f", u, result.Title, result.Engine, result.Score))
|
||||
}
|
||||
sort.Strings(signatures)
|
||||
|
||||
hash := sha256.Sum256([]byte(strings.Join(signatures, "\n")))
|
||||
currentHash := hex.EncodeToString(hash[:])
|
||||
return currentHash != lastHash, currentHash, nil
|
||||
}
|
||||
|
||||
func (s *LocalSearchScraper) search(ctx context.Context, source *Source, query string, limit int) ([]localSearchResult, error) {
|
||||
searchURL, err := buildLocalSearchURL(source.URL, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to build search request: %w", err)
|
||||
}
|
||||
req.Header.Set("User-Agent", s.config.UserAgent)
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("search API request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, maxSearchResponseBytes))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed reading search API response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
msg := strings.TrimSpace(string(body))
|
||||
if len(msg) > 200 {
|
||||
msg = msg[:200]
|
||||
}
|
||||
return nil, fmt.Errorf("search API returned HTTP %d: %s", resp.StatusCode, msg)
|
||||
}
|
||||
|
||||
results, err := decodeLocalSearchResults(body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(results) == 0 {
|
||||
return nil, fmt.Errorf("search API returned no results")
|
||||
}
|
||||
if len(results) > limit {
|
||||
results = results[:limit]
|
||||
}
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func buildLocalSearchURL(rawURL, query string, limit int) (string, error) {
|
||||
u, err := url.Parse(strings.TrimSpace(rawURL))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("invalid search API URL: %w", err)
|
||||
}
|
||||
if u.Scheme == "" || u.Host == "" {
|
||||
return "", fmt.Errorf("invalid search API URL: expected absolute URL, got %q", rawURL)
|
||||
}
|
||||
|
||||
params := u.Query()
|
||||
params.Set("q", query)
|
||||
if params.Get("format") == "" {
|
||||
params.Set("format", "json")
|
||||
}
|
||||
if params.Get("limit") == "" {
|
||||
params.Set("limit", strconv.Itoa(clampLocalSearchLimit(limit)))
|
||||
}
|
||||
u.RawQuery = params.Encode()
|
||||
|
||||
return u.String(), nil
|
||||
}
|
||||
|
||||
func decodeLocalSearchResults(body []byte) ([]localSearchResult, error) {
|
||||
var payload map[string]interface{}
|
||||
if err := json.Unmarshal(body, &payload); err != nil {
|
||||
return nil, fmt.Errorf("search API returned invalid JSON: %w", err)
|
||||
}
|
||||
|
||||
rawResults, ok := payload["results"]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("search API response missing results field")
|
||||
}
|
||||
|
||||
items, ok := rawResults.([]interface{})
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("search API results field is not an array")
|
||||
}
|
||||
|
||||
results := make([]localSearchResult, 0, len(items))
|
||||
for _, item := range items {
|
||||
record, ok := item.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
resultURL := pickString(record, "url", "link", "href")
|
||||
if strings.TrimSpace(resultURL) == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
results = append(results, localSearchResult{
|
||||
URL: strings.TrimSpace(resultURL),
|
||||
Title: strings.TrimSpace(pickString(record, "title", "name")),
|
||||
Snippet: strings.TrimSpace(pickString(record, "content", "snippet", "description", "text")),
|
||||
Engine: strings.TrimSpace(pickString(record, "engine", "source")),
|
||||
Score: pickFloat(record, "score", "relevance"),
|
||||
})
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// pickString returns the value of the first key in keys whose entry in record
// can be rendered as non-blank text.
//
// Strings are returned unchanged (not trimmed); json.Number, float64 and int
// values are formatted in decimal. A key is skipped when it is missing, holds
// any other type (including JSON null), or holds an empty or whitespace-only
// string, so that a later alias key can still supply the value. The result is
// "" when no key qualifies.
func pickString(record map[string]interface{}, keys ...string) string {
	for _, key := range keys {
		value, ok := record[key]
		if !ok {
			continue
		}
		switch v := value.(type) {
		case string:
			// A blank value carries no information; keep looking so that
			// e.g. {"url": "", "link": "https://…"} still yields the link.
			if strings.TrimSpace(v) != "" {
				return v
			}
		case json.Number:
			return v.String()
		case float64:
			return strconv.FormatFloat(v, 'f', -1, 64)
		case int:
			return strconv.Itoa(v)
		}
	}
	return ""
}
|
||||
|
||||
// pickFloat returns the first value among keys in record that can be read as
// a finite number.
//
// float64, float32, int and int64 values are converted directly. json.Number
// and string values are parsed (strings after trimming whitespace); a value
// that fails to parse, or that parses to NaN or an infinity, is skipped and
// the next key is tried. The result is 0 when no key yields a number.
func pickFloat(record map[string]interface{}, keys ...string) float64 {
	for _, key := range keys {
		value, ok := record[key]
		if !ok {
			continue
		}
		switch v := value.(type) {
		case float64:
			return v
		case float32:
			return float64(v)
		case int:
			return float64(v)
		case int64:
			return float64(v)
		case json.Number:
			f, err := v.Float64()
			// f*0 is 0 only for finite f (it is NaN for NaN and ±Inf).
			if err == nil && f*0 == 0 {
				return f
			}
		case string:
			f, err := strconv.ParseFloat(strings.TrimSpace(v), 64)
			// ParseFloat accepts spellings such as "NaN" and "Inf" without
			// error; such scores cannot be JSON-encoded later, so treat them
			// like unparseable text. f*0 is 0 only for finite f.
			if err == nil && f*0 == 0 {
				return f
			}
		}
	}
	return 0
}
|
||||
|
||||
func clampLocalSearchLimit(limit int) int {
|
||||
if limit <= 0 {
|
||||
return defaultLocalSearchLimit
|
||||
}
|
||||
if limit > maxLocalSearchLimit {
|
||||
return maxLocalSearchLimit
|
||||
}
|
||||
return limit
|
||||
}
|
||||
|
||||
// stripURLFragment returns raw, trimmed of surrounding whitespace, with any
// "#fragment" removed. When raw cannot be parsed as a URL the trimmed input is
// returned as is.
func stripURLFragment(raw string) string {
	trimmed := strings.TrimSpace(raw)
	parsed, err := url.Parse(trimmed)
	if err == nil {
		parsed.Fragment = ""
		return parsed.String()
	}
	return trimmed
}
|
||||
|
||||
func domainAllowed(raw string, allowedDomains []string) bool {
|
||||
if len(allowedDomains) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
u, err := url.Parse(raw)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
host := strings.ToLower(strings.TrimSpace(u.Hostname()))
|
||||
if host == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, candidate := range allowedDomains {
|
||||
domain := normalizeDomain(candidate)
|
||||
if domain == "" {
|
||||
continue
|
||||
}
|
||||
if host == domain || strings.HasSuffix(host, "."+domain) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// normalizeDomain reduces an allow-list entry to a bare lower-case hostname.
//
// Accepted forms:
//   - a full URL ("https://docs.example.com:8443/x"): its hostname is used;
//   - a plain domain, optionally with a leading "." or "*." ("example.com",
//     ".example.com", "*.example.com");
//   - a scheme-less host followed by a numeric port and/or a path
//     ("example.com:8080", "example.com/docs"): the port and path are dropped,
//     matching what the URL form already does.
//
// Entries containing more than one colon (for example IPv6 literals) are not
// treated as host:port and are returned without port stripping. Blank input
// yields "".
func normalizeDomain(raw string) string {
	domain := strings.ToLower(strings.TrimSpace(raw))
	if domain == "" {
		return ""
	}
	if strings.Contains(domain, "://") {
		if parsed, err := url.Parse(domain); err == nil {
			return strings.ToLower(parsed.Hostname())
		}
	}

	// Wildcard and leading-dot notations both mean "this domain and below".
	domain = strings.TrimPrefix(domain, "*.")
	domain = strings.TrimPrefix(domain, ".")

	// Drop anything after the host part of a scheme-less entry.
	if i := strings.IndexAny(domain, "/?#"); i >= 0 {
		domain = domain[:i]
	}

	// Strip a trailing ":port" only when there is exactly one colon and the
	// suffix is all digits, so IPv6 literals such as "::1" are left alone.
	if i := strings.LastIndex(domain, ":"); i >= 0 && i == strings.Index(domain, ":") {
		port := domain[i+1:]
		if port != "" && strings.Trim(port, "0123456789") == "" {
			domain = domain[:i]
		}
	}
	return domain
}
|
||||
Reference in New Issue
Block a user