package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)

// WebScraper scrapes documentation from web URLs.
type WebScraper struct {
	config *Config
}

// NewWebScraper creates a new web scraper that reads its crawl settings
// (depth, user agent, rate limit, timeout, cache directory) from config.
func NewWebScraper(config *Config) *WebScraper {
	return &WebScraper{config: config}
}

// Scrape crawls source.URL and returns one Document per HTML page that passes
// the source's include/exclude patterns and has at least 100 bytes of
// extracted content.
//
// The crawl is restricted to the host of source.URL and runs asynchronously;
// Scrape blocks until every queued request has finished. Once ctx is
// cancelled, requests that have not started yet are aborted and Scrape returns
// the context error instead of a partial result.
//
// It returns an error if source.URL cannot be parsed, the rate limit rule is
// rejected, the initial request cannot be queued, or ctx is cancelled.
func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	// Tolerate callers that pass a nil context.
	if ctx == nil {
		ctx = context.Background()
	}

	// Parse base URL for domain restrictions.
	baseURL, err := url.Parse(source.URL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}

	c := colly.NewCollector(
		colly.AllowedDomains(baseURL.Host),
		colly.MaxDepth(s.config.MaxDepth),
		colly.Async(true),
		colly.UserAgent(s.config.UserAgent),
	)

	if s.config.RateLimit > 0 {
		rule := &colly.LimitRule{
			DomainGlob:  "*",
			Parallelism: s.config.Concurrency,
			Delay:       s.config.RateLimit,
		}
		if err := c.Limit(rule); err != nil {
			return nil, fmt.Errorf("setting rate limit: %w", err)
		}
	}

	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	// Enable caching if a cache directory is set.
	if s.config.CacheDir != "" {
		c.CacheDir = s.config.CacheDir
	}

	// The collector is asynchronous, so the callbacks below may run on
	// several goroutines at once; mu guards the state they share.
	var (
		mu        sync.Mutex
		documents []*Document
		visited   = make(map[string]bool)
	)

	// Stop issuing new requests once the caller has cancelled.
	c.OnRequest(func(r *colly.Request) {
		if ctx.Err() != nil {
			r.Abort()
		}
	})

	c.OnError(func(r *colly.Response, err error) {
		fmt.Printf("Error scraping %s: %v\n", r.Request.URL, err)
	})

	// Extract content from pages.
	c.OnHTML("html", func(e *colly.HTMLElement) {
		pageURL := e.Request.URL.String()

		// Check and mark in a single critical section so that two
		// responses for the same URL cannot both produce a document.
		mu.Lock()
		alreadySeen := visited[pageURL]
		visited[pageURL] = true
		mu.Unlock()
		if alreadySeen {
			return
		}

		if !s.shouldInclude(pageURL, source.Include, source.Exclude) {
			return
		}

		title := e.ChildText("title")
		if title == "" {
			title = e.ChildText("h1")
		}

		content := s.extractContent(e)

		// Skip pages whose extracted content is too short to be useful.
		if len(content) < 100 {
			return
		}

		metadata := map[string]interface{}{
			"headings":    s.extractHeadings(e),
			"links":       s.extractLinks(e),
			"images":      s.extractImages(e),
			"description": e.ChildAttr(`meta[name="description"]`, "content"),
		}

		doc := &Document{
			ID:        generateDocID(pageURL),
			Source:    source.Name,
			Type:      "html",
			Title:     strings.TrimSpace(title),
			Content:   content,
			URL:       pageURL,
			Metadata:  metadata,
			Hash:      s.generateHash(content), // used for change detection
			Timestamp: time.Now(),
		}

		mu.Lock()
		documents = append(documents, doc)
		mu.Unlock()
	})

	// Follow links.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		absoluteURL := e.Request.AbsoluteURL(e.Attr("href"))
		// AbsoluteURL yields "" for fragment-only or unparseable hrefs.
		if absoluteURL == "" {
			return
		}

		mu.Lock()
		alreadySeen := visited[absoluteURL]
		mu.Unlock()
		if alreadySeen {
			return
		}

		if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) {
			return
		}

		// Visit through the originating request rather than the collector:
		// Collector.Visit restarts at depth 1, which would make MaxDepth
		// ineffective. The error is deliberately ignored because
		// already-visited, off-domain and too-deep links are expected here.
		_ = e.Request.Visit(absoluteURL)
	})

	if err := c.Visit(source.URL); err != nil {
		return nil, fmt.Errorf("failed to start scraping: %w", err)
	}

	// Wait for async scraping to complete.
	c.Wait()

	if err := ctx.Err(); err != nil {
		return nil, fmt.Errorf("scraping %s: %w", source.URL, err)
	}

	return documents, nil
}

// DetectChanges reports whether the content of source.URL differs from the
// content that produced lastHash.
//
// Only the main page is fetched. Its extracted content is hashed and compared
// with lastHash; the new hash is returned alongside the result so the caller
// can store it. If the response triggers no "html" callback, the hash of the
// empty string is returned.
//
// It returns an error if ctx is already cancelled or the page cannot be
// fetched.
func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if ctx != nil {
		if err := ctx.Err(); err != nil {
			return false, "", err
		}
	}

	c := colly.NewCollector(
		colly.UserAgent(s.config.UserAgent),
	)

	// A zero timeout would disable the HTTP client timeout altogether, so
	// only override the collector's default when one is configured.
	if s.config.Timeout > 0 {
		c.SetRequestTimeout(s.config.Timeout)
	}

	var content string
	c.OnHTML("html", func(e *colly.HTMLElement) {
		content = s.extractContent(e)
	})

	if err := c.Visit(source.URL); err != nil {
		return false, "", err
	}

	currentHash := s.generateHash(content)
	return currentHash != lastHash, currentHash, nil
}

// extractContent extracts the main text content from a page.
func (s *WebScraper) extractContent(e *colly.HTMLElement) string { // Try common content selectors selectors := []string{ "article", "main", ".content", ".documentation", ".docs", ".markdown-body", "[role='main']", "#content", "#main", } var content string for _, selector := range selectors { content = e.ChildText(selector) if len(content) > 200 { break } } // Fallback to body if no content found if content == "" { content = e.ChildText("body") } // Clean up content content = cleanText(content) return content } // extractHeadings extracts heading structure. func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string { var headings []string e.ForEach("h1, h2, h3, h4, h5, h6", func(_ int, h *colly.HTMLElement) { text := strings.TrimSpace(h.Text) if text != "" { headings = append(headings, text) } }) return headings } // extractLinks extracts internal links. func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string { var links []string seen := make(map[string]bool) e.ForEach("a[href]", func(_ int, a *colly.HTMLElement) { href := a.Attr("href") if href != "" && !seen[href] && !strings.HasPrefix(href, "#") { links = append(links, href) seen[href] = true } }) return links } // extractImages extracts image URLs. func (s *WebScraper) extractImages(e *colly.HTMLElement) []string { var images []string e.ForEach("img[src]", func(_ int, img *colly.HTMLElement) { src := img.Attr("src") if src != "" { images = append(images, src) } }) return images } // shouldInclude checks if a URL should be included based on patterns. 
func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool { // Check exclude patterns first for _, pattern := range exclude { matched, _ := regexp.MatchString(pattern, urlStr) if matched { return false } } // If no include patterns, include all if len(include) == 0 { return true } // Check include patterns for _, pattern := range include { matched, _ := regexp.MatchString(pattern, urlStr) if matched { return true } } return false } // generateHash generates a SHA256 hash of content. func (s *WebScraper) generateHash(content string) string { hash := sha256.Sum256([]byte(content)) return hex.EncodeToString(hash[:]) } // cleanText removes extra whitespace and normalizes text. func cleanText(text string) string { // Replace multiple whitespace with single space re := regexp.MustCompile(`\s+`) text = re.ReplaceAllString(text, " ") // Trim spaces text = strings.TrimSpace(text) return text } // generateDocID generates a unique ID for a document. func generateDocID(urlStr string) string { hash := sha256.Sum256([]byte(urlStr)) return hex.EncodeToString(hash[:12]) }