package scraper import ( "context" "crypto/sha256" "encoding/hex" "fmt" "net/url" "path" "regexp" "strings" "sync" "time" "github.com/gocolly/colly/v2" ) // WebScraper scrapes documentation from web URLs. type WebScraper struct { config *Config } // NewWebScraper creates a new web scraper. func NewWebScraper(config *Config) *WebScraper { return &WebScraper{config: config} } // Scrape fetches and parses documents from a web source. func (s *WebScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) { var documents []*Document visited := make(map[string]bool) scheduled := make(map[string]bool) contentHashes := make(map[string]bool) var mu sync.Mutex var scrapeErrors []string // Parse base URL for domain restrictions baseURL, err := url.Parse(source.URL) if err != nil { return nil, fmt.Errorf("invalid URL: %w", err) } allowedDomain := baseURL.Hostname() if allowedDomain == "" { allowedDomain = baseURL.Host } maxDepth := s.config.MaxDepth if maxDepth <= 0 { maxDepth = 2 } maxPages := s.config.Concurrency * 40 if maxPages < 20 { maxPages = 20 } if maxDepth <= 1 && maxPages > 30 { maxPages = 30 } if maxPages > 300 { maxPages = 300 } scopePrefix := pathScopePrefix(baseURL.Path) scopeLeaf := pathScopeLeaf(baseURL.Path) // Create Colly collector c := colly.NewCollector( colly.AllowedDomains(allowedDomain), colly.MaxDepth(maxDepth), colly.Async(true), colly.UserAgent(s.config.UserAgent), ) // Set rate limiting if s.config.RateLimit > 0 { if err := c.Limit(&colly.LimitRule{ DomainGlob: "*", Parallelism: s.config.Concurrency, Delay: s.config.RateLimit, }); err != nil { return nil, fmt.Errorf("failed to set rate limiting: %w", err) } } // Set timeout if s.config.Timeout > 0 { c.SetRequestTimeout(s.config.Timeout) } // Enable caching if cache directory is set if s.config.CacheDir != "" { c.CacheDir = s.config.CacheDir } // Handle errors c.OnError(func(r *colly.Response, err error) { errText := strings.ToLower(err.Error()) if strings.Contains(errText, "already visited") { return } reqURL := source.URL if r != nil && r.Request != nil && r.Request.URL != nil { reqURL = r.Request.URL.String() } mu.Lock() if len(scrapeErrors) < 20 { scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", reqURL, err)) } mu.Unlock() }) // Extract content from pages c.OnHTML("html", func(e *colly.HTMLElement) { pageURL := e.Request.URL.String() if !withinScope(e.Request.URL, baseURL, scopePrefix, scopeLeaf) { return } // Skip if already visited mu.Lock() if visited[pageURL] { mu.Unlock() return } if len(visited) >= maxPages { mu.Unlock() return } visited[pageURL] = true mu.Unlock() // Check include/exclude patterns if !s.shouldInclude(pageURL, source.Include, source.Exclude) { return } // Extract title title := e.ChildText("title") if title == "" { title = e.ChildText("h1") } // Extract main content content := s.extractContent(e) // Skip if content is too short if len(content) < 100 { return } // Generate hash for change detection hash := s.generateHash(content) mu.Lock() if contentHashes[hash] { mu.Unlock() return } contentHashes[hash] = true mu.Unlock() // Extract metadata metadata := map[string]interface{}{ "headings": s.extractHeadings(e), "links": s.extractLinks(e), "images": s.extractImages(e), "description": e.ChildAttr(`meta[name="description"]`, "content"), } doc := &Document{ ID: generateDocID(pageURL), Source: source.Name, Type: "html", Title: strings.TrimSpace(title), Content: content, URL: pageURL, Metadata: metadata, Hash: hash, Timestamp: time.Now(), } mu.Lock() documents = append(documents, doc) mu.Unlock() }) // Follow links c.OnHTML("a[href]", func(e *colly.HTMLElement) { link := e.Attr("href") absoluteURL := e.Request.AbsoluteURL(link) if absoluteURL == "" { return } linkURL, err := url.Parse(absoluteURL) if err != nil { return } if linkURL.Scheme != "http" && linkURL.Scheme != "https" { return } if !withinScope(linkURL, baseURL, scopePrefix, scopeLeaf) { return } // Skip if already visited mu.Lock() if visited[absoluteURL] { mu.Unlock() return } if len(visited) >= maxPages { mu.Unlock() return } mu.Unlock() // Check include/exclude patterns if !s.shouldInclude(absoluteURL, source.Include, source.Exclude) { return } mu.Lock() if scheduled[absoluteURL] { mu.Unlock() return } if len(scheduled) >= maxPages { mu.Unlock() return } scheduled[absoluteURL] = true mu.Unlock() if err := c.Visit(absoluteURL); err != nil { errText := strings.ToLower(err.Error()) if strings.Contains(errText, "already visited") { return } mu.Lock() if len(scrapeErrors) < 20 { scrapeErrors = append(scrapeErrors, fmt.Sprintf("%s: %v", absoluteURL, err)) } mu.Unlock() } }) // Start scraping scheduled[source.URL] = true if err := c.Visit(source.URL); err != nil { return nil, fmt.Errorf("failed to start scraping: %w", err) } // Wait for async scraping to complete c.Wait() mu.Lock() defer mu.Unlock() if len(documents) == 0 { if len(scrapeErrors) > 0 { return nil, fmt.Errorf("web scrape failed: %s", strings.Join(scrapeErrors, "; ")) } return nil, fmt.Errorf("web scrape extracted no documents from %s", source.URL) } return documents, nil } // DetectChanges checks if the web source has changed. func (s *WebScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) { // Quick check by fetching just the main page c := colly.NewCollector( colly.UserAgent(s.config.UserAgent), ) c.SetRequestTimeout(s.config.Timeout) var content string c.OnHTML("html", func(e *colly.HTMLElement) { content = s.extractContent(e) }) if err := c.Visit(source.URL); err != nil { return false, "", err } currentHash := s.generateHash(content) changed := currentHash != lastHash return changed, currentHash, nil } // extractContent extracts the main text content from a page. func (s *WebScraper) extractContent(e *colly.HTMLElement) string { // Try common content selectors selectors := []string{ "article", "main", ".content", ".documentation", ".docs", ".markdown-body", "[role='main']", "#content", "#main", } var content string for _, selector := range selectors { content = e.ChildText(selector) if len(content) > 200 { break } } // Fallback to body if no content found if content == "" { content = e.ChildText("body") } // Clean up content content = cleanText(content) return content } // extractHeadings extracts heading structure. func (s *WebScraper) extractHeadings(e *colly.HTMLElement) []string { var headings []string e.ForEach("h1, h2, h3, h4, h5, h6", func(_ int, h *colly.HTMLElement) { text := strings.TrimSpace(h.Text) if text != "" { headings = append(headings, text) } }) return headings } // extractLinks extracts internal links. func (s *WebScraper) extractLinks(e *colly.HTMLElement) []string { var links []string seen := make(map[string]bool) e.ForEach("a[href]", func(_ int, a *colly.HTMLElement) { href := a.Attr("href") if href != "" && !seen[href] && !strings.HasPrefix(href, "#") { links = append(links, href) seen[href] = true } }) return links } // extractImages extracts image URLs. func (s *WebScraper) extractImages(e *colly.HTMLElement) []string { var images []string e.ForEach("img[src]", func(_ int, img *colly.HTMLElement) { src := img.Attr("src") if src != "" { images = append(images, src) } }) return images } // shouldInclude checks if a URL should be included based on patterns. func (s *WebScraper) shouldInclude(urlStr string, include, exclude []string) bool { // Check exclude patterns first for _, pattern := range exclude { matched, _ := regexp.MatchString(pattern, urlStr) if matched { return false } } // If no include patterns, include all if len(include) == 0 { return true } // Check include patterns for _, pattern := range include { matched, _ := regexp.MatchString(pattern, urlStr) if matched { return true } } return false } // generateHash generates a SHA256 hash of content. func (s *WebScraper) generateHash(content string) string { hash := sha256.Sum256([]byte(content)) return hex.EncodeToString(hash[:]) } // cleanText removes extra whitespace and normalizes text. func cleanText(text string) string { noisePhrases := []string{ "table of contents", "in this article", "additional resources", "feedback", "collaborate with us on github", "copyright", "all rights reserved", "privacy policy", "terms of service", "sign in", "skip to main content", "ask learn", } for _, phrase := range noisePhrases { re := regexp.MustCompile(`(?i)` + regexp.QuoteMeta(phrase)) text = re.ReplaceAllString(text, " ") } // Replace multiple whitespace with single space re := regexp.MustCompile(`\s+`) text = re.ReplaceAllString(text, " ") // Trim spaces text = strings.TrimSpace(text) return text } func pathScopePrefix(rawPath string) string { clean := path.Clean(rawPath) if clean == "." || clean == "/" || clean == "" { return "" } last := path.Base(clean) if strings.Contains(last, ".") { dir := path.Dir(clean) if dir == "/" { // Root-level document page: keep crawler scoped to this page path. return clean } return dir } dir := path.Dir(clean) if dir == "/" { return clean } return dir } func pathScopeLeaf(rawPath string) string { clean := path.Clean(rawPath) if clean == "." || clean == "/" || clean == "" { return "" } last := path.Base(clean) if strings.Contains(last, ".") { return last } return "" } func withinScope(target, base *url.URL, prefix, leaf string) bool { if target == nil || base == nil { return false } if !strings.EqualFold(target.Hostname(), base.Hostname()) { return false } if prefix == "" { return true } targetPath := target.Path if targetPath == "" { targetPath = path.Clean("/") } if strings.HasPrefix(targetPath, prefix) { return true } return leaf != "" && path.Base(targetPath) == leaf }