// Mirror of https://github.com/Dvorinka/Devour.git
// Synced 2026-06-04 04:23:02 +00:00
// 479 lines, 10 KiB, Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"net/url"
|
|
"path"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/gocolly/colly/v2"
|
|
)
|
|
|
|
// WebScraper scrapes documentation from web URLs.
|
|
type WebScraper struct {
|
|
config *Config
|
|
}
|
|
|
|
// NewWebScraper creates a new web scraper.
|
|
func NewWebScraper(config *Config) *WebScraper {
|
|
return &WebScraper{config: config}
|
|
}
|
|
|
|
// Scrape fetches and parses documents from a web source.
|
|
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
var documents []*Document
|
|
visited := make(map[string]bool)
|
|
scheduled := make(map[string]bool)
|
|
contentHashes := make(map[string]bool)
|
|
var mu sync.Mutex
|
|
var scrapeErrors []string
|
|
|
|
// Parse base URL for domain restrictions
|
|
baseURL, err := url.Parse(source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("invalid URL: %w", err)
|
|
}
|
|
allowedDomain := baseURL.Hostname()
|
|
if allowedDomain == "" {
|
|
allowedDomain = baseURL.Host
|
|
}
|
|
|
|
maxDepth := s.config.MaxDepth
|
|
if maxDepth <= 0 {
|
|
maxDepth = 2
|
|
}
|
|
maxPages := s.config.Concurrency * 40
|
|
if maxPages < 20 {
|
|
maxPages = 20
|
|
}
|
|
if maxDepth <= 1 && maxPages > 30 {
|
|
maxPages = 30
|
|
}
|
|
if maxPages > 300 {
|
|
maxPages = 300
|
|
}
|
|
scopePrefix := pathScopePrefix(baseURL.Path)
|
|
scopeLeaf := pathScopeLeaf(baseURL.Path)
|
|
|
|
// Create Colly collector
|
|
c := colly.NewCollector(
|
|
colly.AllowedDomains(allowedDomain),
|
|
colly.MaxDepth(maxDepth),
|
|
colly.Async(true),
|
|
colly.UserAgent(s.config.UserAgent),
|
|
)
|
|
|
|
// Set rate limiting
|
|
if s.config.RateLimit > 0 {
|
|
if err := c.Limit(&colly.LimitRule{
|
|
DomainGlob: "*",
|
|
Parallelism: s.config.Concurrency,
|
|
Delay: s.config.RateLimit,
|
|
}); err != nil {
|
|
return nil, fmt.Errorf("failed to set rate limiting: %w", err)
|
|
}
|
|
}
|
|
|
|
// Set timeout
|
|
if s.config.Timeout > 0 {
|
|
c.SetRequestTimeout(s.config.Timeout)
|
|
}
|
|
|
|
// Enable caching if cache directory is set
|
|
if s.config.CacheDir != "" {
|
|
c.CacheDir = s.config.CacheDir
|
|
}
|
|
|
|
// Handle errors
|
|
c.OnError(func(r *colly.Response, err error) {
|
|
errText := strings.ToLower(err.Error())
|
|
if strings.Contains(errText, "already visited") {
|
|
return
|
|
}
|
|
reqURL := source.URL
|
|
if r != nil && r.Request != nil && r.Request.URL != nil {
|
|
reqURL = r.Request.URL.String()
|
|
}
|
|
mu.Lock()
|
|
if len(scrapeErrors) < 20 {
|
|
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err))
|
|
}
|
|
mu.Unlock()
|
|
})
|
|
|
|
// Extract content from pages
|
|
c.OnHTML("html", func(e *colly.HTMLElement) {
|
|
pageURL := e.Request.URL.String()
|
|
if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) {
|
|
return
|
|
}
|
|
|
|
// Skip if already visited
|
|
mu.Lock()
|
|
if visited[pageURL] {
|
|
mu.Unlock()
|
|
return
|
|
}
|
|
if len(visited) >= maxPages {
|
|
mu.Unlock()
|
|
return
|
|
}
|
|
visited[pageURL] = true
|
|
mu.Unlock()
|
|
|
|
// Check include/exclude patterns
|
|
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
|
|
return
|
|
}
|
|
|
|
// Extract title
|
|
title := e.ChildText("title")
|
|
if title == "" {
|
|
title = e.ChildText("h1")
|
|
}
|
|
|
|
// Extract main content
|
|
content := s.extractContent(e)
|
|
|
|
// Skip if content is too short
|
|
if len(content) < 100 {
|
|
return
|
|
}
|
|
|
|
// Generate hash for change detection
|
|
hash := s.generateHash(content)
|
|
mu.Lock()
|
|
if contentHashes[hash] {
|
|
mu.Unlock()
|
|
return
|
|
}
|
|
contentHashes[hash] = true
|
|
mu.Unlock()
|
|
|
|
// Extract metadata
|
|
metadata := map[string]interface{}{
|
|
"headings": s.extractHeadings(e),
|
|
"links": s.extractLinks(e),
|
|
"images": s.extractImages(e),
|
|
"description": e.ChildAttr(`meta[name="description"]`, "content"),
|
|
}
|
|
|
|
doc := &Document{
|
|
ID: generateDocID(pageURL),
|
|
Source: source.Name,
|
|
Type: "html",
|
|
Title: strings.TrimSpace(title),
|
|
Content: content,
|
|
URL: pageURL,
|
|
Metadata: metadata,
|
|
Hash: hash,
|
|
Timestamp: time.Now(),
|
|
}
|
|
|
|
mu.Lock()
|
|
documents = append(documents, doc)
|
|
mu.Unlock()
|
|
})
|
|
|
|
// Follow links
|
|
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
|
link := e.Attr("href")
|
|
absoluteURL := e.Request.AbsoluteURL(link)
|
|
if absoluteURL == "" {
|
|
return
|
|
}
|
|
|
|
linkURL, err := url.Parse(absoluteURL)
|
|
if err != nil {
|
|
return
|
|
}
|
|
if linkURL.Scheme != "http" && linkURL.Scheme != "https" {
|
|
return
|
|
}
|
|
if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) {
|
|
return
|
|
}
|
|
|
|
// Skip if already visited
|
|
mu.Lock()
|
|
if visited[absoluteURL] {
|
|
mu.Unlock()
|
|
return
|
|
}
|
|
if len(visited) >= maxPages {
|
|
mu.Unlock()
|
|
return
|
|
}
|
|
mu.Unlock()
|
|
|
|
// Check include/exclude patterns
|
|
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
|
|
return
|
|
}
|
|
|
|
mu.Lock()
|
|
if scheduled[absoluteURL] {
|
|
mu.Unlock()
|
|
return
|
|
}
|
|
if len(scheduled) >= maxPages {
|
|
mu.Unlock()
|
|
return
|
|
}
|
|
scheduled[absoluteURL] = true
|
|
mu.Unlock()
|
|
|
|
if err := c.Visit(absoluteURL); err != nil {
|
|
errText := strings.ToLower(err.Error())
|
|
if strings.Contains(errText, "already visited") {
|
|
return
|
|
}
|
|
mu.Lock()
|
|
if len(scrapeErrors) < 20 {
|
|
scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err))
|
|
}
|
|
mu.Unlock()
|
|
}
|
|
})
|
|
|
|
// Start scraping
|
|
scheduled[source.URL] = true
|
|
if err := c.Visit(source.URL); err != nil {
|
|
return nil, fmt.Errorf("failed to start scraping: %w", err)
|
|
}
|
|
|
|
// Wait for async scraping to complete
|
|
c.Wait()
|
|
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
|
|
if len(documents) == 0 {
|
|
if len(scrapeErrors) > 0 {
|
|
return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; "))
|
|
}
|
|
return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL)
|
|
}
|
|
|
|
return documents, nil
|
|
}
|
|
|
|
// DetectChanges checks if the web source has changed.
|
|
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
// Quick check by fetching just the main page
|
|
c := colly.NewCollector(
|
|
colly.UserAgent(s.config.UserAgent),
|
|
)
|
|
c.SetRequestTimeout(s.config.Timeout)
|
|
|
|
var content string
|
|
c.OnHTML("html", func(e *colly.HTMLElement) {
|
|
content = s.extractContent(e)
|
|
})
|
|
|
|
if err := c.Visit(source.URL); err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
currentHash := s.generateHash(content)
|
|
changed := currentHash != lastHash
|
|
|
|
return changed, currentHash, nil
|
|
}
|
|
|
|
// extractContent extracts the main text content from a page.
|
|
func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
|
|
// Try common content selectors
|
|
selectors := []string{
|
|
"article",
|
|
"main",
|
|
".content",
|
|
".documentation",
|
|
".docs",
|
|
".markdown-body",
|
|
"[role='main']",
|
|
"#content",
|
|
"#main",
|
|
}
|
|
|
|
var content string
|
|
for _, selector := range selectors {
|
|
content = e.ChildText(selector)
|
|
if len(content) > 200 {
|
|
break
|
|
}
|
|
}
|
|
|
|
// Fallback to body if no content found
|
|
if content == "" {
|
|
content = e.ChildText("body")
|
|
}
|
|
|
|
// Clean up content
|
|
content = cleanText(content)
|
|
|
|
return content
|
|
}
|
|
|
|
// extractHeadings extracts heading structure.
|
|
func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
|
|
var headings []string
|
|
e.ForEach("h1, h2, h3, h4, h5, h6", func(_ int, h *colly.HTMLElement) {
|
|
text := strings.TrimSpace(h.Text)
|
|
if text != "" {
|
|
headings = append(headings, text)
|
|
}
|
|
})
|
|
return headings
|
|
}
|
|
|
|
// extractLinks extracts internal links.
|
|
func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
|
|
var links []string
|
|
seen := make(map[string]bool)
|
|
e.ForEach("a[href]", func(_ int, a *colly.HTMLElement) {
|
|
href := a.Attr("href")
|
|
if href != "" && !seen[href] && !strings.HasPrefix(href, "#") {
|
|
links = append(links, href)
|
|
seen[href] = true
|
|
}
|
|
})
|
|
return links
|
|
}
|
|
|
|
// extractImages extracts image URLs.
|
|
func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
|
|
var images []string
|
|
e.ForEach("img[src]", func(_ int, img *colly.HTMLElement) {
|
|
src := img.Attr("src")
|
|
if src != "" {
|
|
images = append(images, src)
|
|
}
|
|
})
|
|
return images
|
|
}
|
|
|
|
// shouldInclude checks if a URL should be included based on patterns.
|
|
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
|
|
// Check exclude patterns first
|
|
for _, pattern := range exclude {
|
|
matched, _ := regexp.MatchString(pattern, urlStr)
|
|
if matched {
|
|
return false
|
|
}
|
|
}
|
|
|
|
// If no include patterns, include all
|
|
if len(include) == 0 {
|
|
return true
|
|
}
|
|
|
|
// Check include patterns
|
|
for _, pattern := range include {
|
|
matched, _ := regexp.MatchString(pattern, urlStr)
|
|
if matched {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// generateHash generates a SHA256 hash of content.
|
|
func (s *WebScraper) generateHash(content string) string {
|
|
hash := sha256.Sum256([]byte(content))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
// cleanTextNoisePatterns holds one precompiled, case-insensitive matcher per
// boilerplate phrase that cleanText strips. The word boundaries keep a phrase
// from matching inside a longer word ("design in", "copyrighted").
var cleanTextNoisePatterns = func() []*regexp.Regexp {
	phrases := []string{
		"table of contents",
		"in this article",
		"additional resources",
		"feedback",
		"collaborate with us on github",
		"copyright",
		"all rights reserved",
		"privacy policy",
		"terms of service",
		"sign in",
		"skip to main content",
		"ask learn",
	}
	patterns := make([]*regexp.Regexp, 0, len(phrases))
	for _, phrase := range phrases {
		patterns = append(patterns, regexp.MustCompile(`(?i)\b`+regexp.QuoteMeta(phrase)+`\b`))
	}
	return patterns
}()

// cleanTextWhitespace matches any run of whitespace.
var cleanTextWhitespace = regexp.MustCompile(`\s+`)

// cleanText strips known boilerplate phrases from text and normalizes its
// whitespace.
//
// Each phrase is replaced by a single space, in list order, matching whole
// words only and ignoring case. Afterwards every whitespace run is collapsed
// to one space and the result is trimmed.
func cleanText(text string) string {
	for _, noise := range cleanTextNoisePatterns {
		text = noise.ReplaceAllString(text, " ")
	}

	// Replace multiple whitespace with single space
	text = cleanTextWhitespace.ReplaceAllString(text, " ")

	return strings.TrimSpace(text)
}
|
|
|
|
// pathScopePrefix derives the path prefix a crawl starting at rawPath is
// confined to.
//
// The path is cleaned first. A root or empty path yields "" (no path
// restriction). Otherwise the parent directory of the cleaned path is
// returned, except when that parent is the root: then the cleaned path itself
// is returned so that a top-level page or section does not widen the scope to
// the whole site.
func pathScopePrefix(rawPath string) string {
	clean := path.Clean(rawPath)
	switch clean {
	case ".", "/", "":
		return ""
	}

	if dir := path.Dir(clean); dir != "/" {
		return dir
	}
	// Root-level page or section: keep the crawler scoped to this path.
	return clean
}
|
|
|
|
// pathScopeLeaf returns the last element of the cleaned rawPath when that
// element contains a dot (i.e. looks like a file name), and "" otherwise.
// Root and empty paths also yield "".
func pathScopeLeaf(rawPath string) string {
	switch cleaned := path.Clean(rawPath); cleaned {
	case ".", "/", "":
		return ""
	default:
		if name := path.Base(cleaned); strings.IndexByte(name, '.') >= 0 {
			return name
		}
		return ""
	}
}
|
|
|
|
// withinScope reports whether target belongs to the crawl rooted at base.
//
// target must have the same host name as base (compared case-insensitively).
// With an empty prefix every path on that host is in scope. Otherwise the
// target path must equal prefix or lie below it on a path-segment boundary,
// so prefix "/docs" admits "/docs" and "/docs/api" but not "/docs-old".
// As an alternative, a non-empty leaf admits any path whose last element
// equals leaf. A nil target or base is never in scope.
func withinScope(target, base *url.URL, prefix, leaf string) bool {
	if target == nil || base == nil {
		return false
	}
	if !strings.EqualFold(target.Hostname(), base.Hostname()) {
		return false
	}
	if prefix == "" {
		return true
	}

	targetPath := target.Path
	if targetPath == "" {
		targetPath = "/"
	}

	// Compare on segment boundaries; a raw HasPrefix would let "/docs"
	// match unrelated siblings such as "/docsearch".
	scope := strings.TrimSuffix(prefix, "/")
	if targetPath == scope || strings.HasPrefix(targetPath, scope+"/") {
		return true
	}

	return leaf != "" && path.Base(targetPath) == leaf
}
|