mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,296 @@
|
||||
package scraper
|
||||
|
||||
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)
|
||||
|
||||
// WebScraper scrapes documentation from web URLs.
|
||||
type WebScraper struct {
|
||||
config *Config
|
||||
}
|
||||
|
||||
// NewWebScraper creates a new web scraper.
|
||||
func NewWebScraper(config *Config) *WebScraper {
|
||||
return &WebScraper{config: config}
|
||||
}
|
||||
|
||||
// Scrape fetches and parses documents from a web source.
|
||||
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
||||
var documents []*Document
|
||||
visited := make(map[string]bool)
|
||||
|
||||
// Parse base URL for domain restrictions
|
||||
baseURL, err := url.Parse(source.URL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
|
||||
// Create Colly collector
|
||||
c := colly.NewCollector(
|
||||
colly.AllowedDomains(baseURL.Host),
|
||||
colly.MaxDepth(s.config.MaxDepth),
|
||||
colly.Async(true),
|
||||
colly.UserAgent(s.config.UserAgent),
|
||||
)
|
||||
|
||||
// Set rate limiting
|
||||
if s.config.RateLimit > 0 {
|
||||
c.Limit(&colly.LimitRule{
|
||||
DomainGlob: "*",
|
||||
Parallelism: s.config.Concurrency,
|
||||
Delay: s.config.RateLimit,
|
||||
})
|
||||
}
|
||||
|
||||
// Set timeout
|
||||
if s.config.Timeout > 0 {
|
||||
c.SetRequestTimeout(s.config.Timeout)
|
||||
}
|
||||
|
||||
// Enable caching if cache directory is set
|
||||
if s.config.CacheDir != "" {
|
||||
c.CacheDir = s.config.CacheDir
|
||||
}
|
||||
|
||||
// Handle errors
|
||||
c.OnError(func(r *colly.Response, err error) {
|
||||
fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
|
||||
})
|
||||
|
||||
// Extract content from pages
|
||||
c.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
pageURL := e.Request.URL.String()
|
||||
|
||||
// Skip if already visited
|
||||
if visited[pageURL] {
|
||||
return
|
||||
}
|
||||
visited[pageURL] = true
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract title
|
||||
title := e.ChildText("title")
|
||||
if title == "" {
|
||||
title = e.ChildText("h1")
|
||||
}
|
||||
|
||||
// Extract main content
|
||||
content := s.extractContent(e)
|
||||
|
||||
// Skip if content is too short
|
||||
if len(content) < 100 {
|
||||
return
|
||||
}
|
||||
|
||||
// Generate hash for change detection
|
||||
hash := s.generateHash(content)
|
||||
|
||||
// Extract metadata
|
||||
metadata := map[string]interface{}{
|
||||
"headings": s.extractHeadings(e),
|
||||
"links": s.extractLinks(e),
|
||||
"images": s.extractImages(e),
|
||||
"description": e.ChildAttr(`meta[name="description"]`, "content"),
|
||||
}
|
||||
|
||||
doc := &Document{
|
||||
ID: generateDocID(pageURL),
|
||||
Source: source.Name,
|
||||
Type: "html",
|
||||
Title: strings.TrimSpace(title),
|
||||
Content: content,
|
||||
URL: pageURL,
|
||||
Metadata: metadata,
|
||||
Hash: hash,
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
|
||||
documents = append(documents, doc)
|
||||
})
|
||||
|
||||
// Follow links
|
||||
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
link := e.Attr("href")
|
||||
absoluteURL := e.Request.AbsoluteURL(link)
|
||||
|
||||
// Skip if already visited
|
||||
if visited[absoluteURL] {
|
||||
return
|
||||
}
|
||||
|
||||
// Check include/exclude patterns
|
||||
if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
|
||||
return
|
||||
}
|
||||
|
||||
c.Visit(absoluteURL)
|
||||
})
|
||||
|
||||
// Start scraping
|
||||
if err := c.Visit(source.URL); err != nil {
|
||||
return nil, fmt.Errorf("failed to start scraping: %w", err)
|
||||
}
|
||||
|
||||
// Wait for async scraping to complete
|
||||
c.Wait()
|
||||
|
||||
return documents, nil
|
||||
}
|
||||
|
||||
// DetectChanges checks if the web source has changed.
|
||||
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
||||
// Quick check by fetching just the main page
|
||||
c := colly.NewCollector(
|
||||
colly.UserAgent(s.config.UserAgent),
|
||||
)
|
||||
c.SetRequestTimeout(s.config.Timeout)
|
||||
|
||||
var content string
|
||||
c.OnHTML("html", func(e *colly.HTMLElement) {
|
||||
content = s.extractContent(e)
|
||||
})
|
||||
|
||||
if err := c.Visit(source.URL); err != nil {
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
currentHash := s.generateHash(content)
|
||||
changed := currentHash != lastHash
|
||||
|
||||
return changed, currentHash, nil
|
||||
}
|
||||
|
||||
// extractContent extracts the main text content from a page.
|
||||
func (s *WebScraper) extractContent(e *colly.HTMLElement) string {
|
||||
// Try common content selectors
|
||||
selectors := []string{
|
||||
"article",
|
||||
"main",
|
||||
".content",
|
||||
".documentation",
|
||||
".docs",
|
||||
".markdown-body",
|
||||
"[role='main']",
|
||||
"#content",
|
||||
"#main",
|
||||
}
|
||||
|
||||
var content string
|
||||
for _, selector := range selectors {
|
||||
content = e.ChildText(selector)
|
||||
if len(content) > 200 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to body if no content found
|
||||
if content == "" {
|
||||
content = e.ChildText("body")
|
||||
}
|
||||
|
||||
// Clean up content
|
||||
content = cleanText(content)
|
||||
|
||||
return content
|
||||
}
|
||||
|
||||
// extractHeadings extracts heading structure.
|
||||
func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string {
|
||||
var headings []string
|
||||
e.ForEach("h1, h2, h3, h4, h5, h6", func(_ int, h *colly.HTMLElement) {
|
||||
text := strings.TrimSpace(h.Text)
|
||||
if text != "" {
|
||||
headings = append(headings, text)
|
||||
}
|
||||
})
|
||||
return headings
|
||||
}
|
||||
|
||||
// extractLinks extracts internal links.
|
||||
func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string {
|
||||
var links []string
|
||||
seen := make(map[string]bool)
|
||||
e.ForEach("a[href]", func(_ int, a *colly.HTMLElement) {
|
||||
href := a.Attr("href")
|
||||
if href != "" && !seen[href] && !strings.HasPrefix(href, "#") {
|
||||
links = append(links, href)
|
||||
seen[href] = true
|
||||
}
|
||||
})
|
||||
return links
|
||||
}
|
||||
|
||||
// extractImages extracts image URLs.
|
||||
func (s *WebScraper) extractImages(e *colly.HTMLElement) []string {
|
||||
var images []string
|
||||
e.ForEach("img[src]", func(_ int, img *colly.HTMLElement) {
|
||||
src := img.Attr("src")
|
||||
if src != "" {
|
||||
images = append(images, src)
|
||||
}
|
||||
})
|
||||
return images
|
||||
}
|
||||
|
||||
// shouldInclude checks if a URL should be included based on patterns.
|
||||
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool {
|
||||
// Check exclude patterns first
|
||||
for _, pattern := range exclude {
|
||||
matched, _ := regexp.MatchString(pattern, urlStr)
|
||||
if matched {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// If no include patterns, include all
|
||||
if len(include) == 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check include patterns
|
||||
for _, pattern := range include {
|
||||
matched, _ := regexp.MatchString(pattern, urlStr)
|
||||
if matched {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// generateHash generates a SHA256 hash of content.
|
||||
func (s *WebScraper) generateHash(content string) string {
|
||||
hash := sha256.Sum256([]byte(content))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
// whitespaceRunRe matches one or more consecutive whitespace characters.
// It is compiled once at package initialization so that cleanText does not
// recompile the pattern on every call.
var whitespaceRunRe = regexp.MustCompile(`\s+`)

// cleanText normalizes text by collapsing every run of whitespace into a
// single space and trimming leading and trailing whitespace.
func cleanText(text string) string {
	collapsed := whitespaceRunRe.ReplaceAllString(text, " ")
	return strings.TrimSpace(collapsed)
}
|
||||
|
||||
// generateDocID derives a deterministic document identifier from a URL:
// the first 12 bytes of the URL's SHA-256 digest, hex-encoded
// (24 lowercase hexadecimal characters).
func generateDocID(urlStr string) string {
	const idBytes = 12

	digest := sha256.Sum256([]byte(urlStr))
	prefix := digest[:idBytes]
	return hex.EncodeToString(prefix)
}
|
||||
Reference in New Issue
Block a user