mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
+191
-7
@@ -6,8 +6,10 @@ import (
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly/v2"
|
||||
@@ -27,17 +29,42 @@ func NewWebScraper(config *Config) *WebScraper {
|
||||
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
visited := make(map[string]bool)
|
||||
scheduled := make(map[string]bool)
|
||||
contentHashes := make(map[string]bool)
|
||||
var mu sync.Mutex
|
||||
var scrapeErrors []string
|
||||
|
||||
// Parse base URL for domain restrictions
|
||||
baseURL, err := url.Parse(source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
allowedDomain := baseURL.Hostname()
|
||||
if allowedDomain == "" {
|
||||
allowedDomain = baseURL.Host
|
||||
}
|
||||
|
||||
maxDepth := s.config.MaxDepth
|
||||
if maxDepth <= 0 {
|
||||
maxDepth = 2
|
||||
}
|
||||
maxPages := s.config.Concurrency * 40
|
||||
if maxPages < 20 {
|
||||
maxPages = 20
|
||||
}
|
||||
if maxDepth <= 1 && maxPages > 30 {
|
||||
maxPages = 30
|
||||
}
|
||||
if maxPages > 300 {
|
||||
maxPages = 300
|
||||
}
|
||||
scopePrefix := pathScopePrefix(baseURL.Path)
|
||||
scopeLeaf := pathScopeLeaf(baseURL.Path)
|
||||
|
||||
// Create Colly collector
|
||||
c := colly.NewCollector(
|
||||
colly.AllowedDomains(baseURL.Host),
|
||||
colly.MaxDepth(s.config.MaxDepth),
|
||||
colly.AllowedDomains(allowedDomain),
|
||||
colly.MaxDepth(maxDepth),
|
||||
colly.Async(true),
|
||||
colly.UserAgent(s.config.UserAgent),
|
||||
)
|
||||
@@ -65,18 +92,40 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
// Handle errors
|
||||
c.OnError(func(r *colly.Response, err error) {
|
||||
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
|
||||
errText := strings.ToLower(err.Error())
|
||||
if strings.Contains(errText, "already visited") {
|
||||
return
|
||||
}
|
||||
reqURL := source.URL
|
||||
if r != nil && r.Request != nil && r.Request.URL != nil {
|
||||
reqURL = r.Request.URL.String()
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
})
|
||||
|
||||
// Extract content from pages
|
||||
c.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
pageURL := e.Request.URL.String()
|
||||
if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
|
||||
return
|
||||
}
|
||||
|
||||
// Skip if already visited
|
||||
mu.Lock()
|
||||
if visited[pageURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(visited) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
visited[pageURL] = true
|
||||
mu.Unlock()
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
|
||||
@@ -99,6 +148,13 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
|
||||
// Generate hash for change detection
|
||||
hash := s.generateHash(content)
|
||||
mu.Lock()
|
||||
if contentHashes[hash] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
contentHashes[hash] = true
|
||||
mu.Unlock()
|
||||
|
||||
// Extract metadata
|
||||
metadata := map[string]interface{}{
|
||||
@@ -120,30 +176,74 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
documents = append(documents, doc)
|
||||
mu.Unlock()
|
||||
})
|
||||
|
||||
// Follow links
|
||||
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
absoluteURL := e.Request.AbsoluteURL(link)
|
||||
|
||||
// Skip if already visited
|
||||
if visited[absoluteURL] {
|
||||
if absoluteURL == "" {
|
||||
return
|
||||
}
|
||||
|
||||
linkURL, err := url.Parse(absoluteURL)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
|
||||
return
|
||||
}
|
||||
if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
|
||||
return
|
||||
}
|
||||
|
||||
// Skip if already visited
|
||||
mu.Lock()
|
||||
if visited[absoluteURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(visited) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
mu.Unlock()
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
|
||||
return
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
if scheduled[absoluteURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
if len(scheduled) >= maxPages {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
scheduled[absoluteURL] = true
|
||||
mu.Unlock()
|
||||
|
||||
if err := c.Visit(absoluteURL); err != nil {
|
||||
fmt.Printf("Error visiting %s: %v\n", absoluteURL, err)
|
||||
errText := strings.ToLower(err.Error())
|
||||
if strings.Contains(errText, "already visited") {
|
||||
return
|
||||
}
|
||||
mu.Lock()
|
||||
if len(scrapeErrors) < 20 {
|
||||
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
})
|
||||
|
||||
// Start scraping
|
||||
scheduled[source.URL] = true
|
||||
if err := c.Visit(source.URL); err != nil {
|
||||
return nil, fmt.Errorf("failed to start scraping: %w", err)
|
||||
}
|
||||
@@ -151,6 +251,16 @@ func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, e
|
||||
// Wait for async scraping to complete
|
||||
c.Wait()
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
if len(documents) == 0 {
|
||||
if len(scrapeErrors) > 0 {
|
||||
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
|
||||
}
|
||||
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
|
||||
}
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
@@ -283,6 +393,25 @@ func (s *WebScraper) generateHash(content string) string {
|
||||
|
||||
// cleanText removes extra whitespace and normalizes text.
|
||||
func cleanText(text string) string {
|
||||
noisePhrases := []string{
|
||||
"table of contents",
|
||||
"in this article",
|
||||
"additional resources",
|
||||
"feedback",
|
||||
"collaborate with us on github",
|
||||
"copyright",
|
||||
"all rights reserved",
|
||||
"privacy policy",
|
||||
"terms of service",
|
||||
"sign in",
|
||||
"skip to main content",
|
||||
"ask learn",
|
||||
}
|
||||
for _, phrase := range noisePhrases {
|
||||
re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase))
|
||||
text = re.ReplaceAllString(text, " ")
|
||||
}
|
||||
|
||||
// Replace multiple whitespace with single space
|
||||
re := regexp.MustCompile(`\s+`)
|
||||
text = re.ReplaceAllString(text, " ")
|
||||
@@ -292,3 +421,58 @@ func cleanText(text string) string {
|
||||
|
||||
return text
|
||||
}
|
||||
|
||||
// pathScopePrefix derives the URL path prefix that a crawl starting at
// rawPath is confined to.
//
// rawPath is normalized with path.Clean first. The result is:
//   - "" when the cleaned path is empty or the root ("." or "/"); withinScope
//     treats an empty prefix as "no path restriction";
//   - the cleaned path itself when it sits directly under the root
//     (e.g. "/index.html" or "/docs"), so a root-level page or section does
//     not widen the scope to the whole site;
//   - otherwise the parent directory of the cleaned path
//     (e.g. "/docs/guide/intro" -> "/docs/guide").
//
// The rule is the same whether or not the last segment looks like a file
// name (contains a "."), so no such distinction is made here.
func pathScopePrefix(rawPath string) string {
	clean := path.Clean(rawPath)
	// path.Clean never returns "", it returns "." for an empty path.
	if clean == "." || clean == "/" {
		return ""
	}

	parent := path.Dir(clean)
	if parent == "/" {
		// Root-level page or section: keep the crawler scoped to this path.
		return clean
	}
	return parent
}
|
||||
|
||||
// pathScopeLeaf returns the final segment of rawPath when that segment looks
// like a file name (it contains a "."), and "" otherwise.
//
// rawPath is normalized with path.Clean before inspection, so trailing
// slashes and "." / ".." segments are resolved first. An empty or root path
// yields "".
func pathScopeLeaf(rawPath string) string {
	normalized := path.Clean(rawPath)
	switch normalized {
	case ".", "/", "":
		return ""
	}

	if segment := path.Base(normalized); strings.IndexByte(segment, '.') >= 0 {
		return segment
	}
	return ""
}
|
||||
|
||||
// withinScope reports whether target may be crawled given the starting URL
// base and the path scope derived from it.
//
// Parameters:
//   - target: the candidate URL; nil is never in scope.
//   - base: the starting URL; nil means nothing is in scope.
//   - prefix: the path prefix to stay under; "" disables the path check.
//   - leaf: optional file name; when non-empty, any path on the same host
//     whose last segment equals leaf is also accepted.
//
// The host comparison is case-insensitive and ignores the port. The prefix
// is matched on whole path segments: "/docs" admits "/docs" and "/docs/x"
// but not "/docs-old/x" or "/docsx".
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		// A URL with no path refers to the site root.
		targetPath = "/"
	}

	if targetPath == prefix {
		return true
	}
	// Require a segment boundary after the prefix so that sibling paths
	// sharing only a textual prefix are not pulled into the crawl.
	dirPrefix := prefix
	if !strings.HasSuffix(dirPrefix, "/") {
		dirPrefix += "/"
	}
	if strings.HasPrefix(targetPath, dirPrefix) {
		return true
	}

	return leaf != "" && path.Base(targetPath) == leaf
}
|
||||
|
||||
Reference in New Issue
Block a user