// Devour/internal/scraper/web_integration_test.go

package scraper

import (
	"context"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
	"time"
)

func TestWebScraper_ReturnsErrorWhenNothingFetched(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.NotFound(w, r)
	}))
	defer srv.Close()

	s := NewWebScraper(&Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	})

	_, err := s.Scrape(context.Background(), &Source{
		Name: "missing",
		Type: SourceTypeWeb,
		URL:  srv.URL + "/missing",
	})
	if err == nil {
		t.Fatal("expected error when web scrape yields no documents")
	}
}

// TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent serves an HTML
// page whose body is a single short paragraph and requires Scrape to fail with
// an error whose message contains "extracted no documents".
func TestWebScraper_ReturnsErrorWhenPageHasNoExtractableContent(t *testing.T) {
	const page = `<html><head><title>Empty</title></head><body><p>tiny</p></body></html>`

	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		// A failed write cannot be reported from here; the assertions below
		// would surface any resulting problem.
		_, _ = w.Write([]byte(page))
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	scraper := NewWebScraper(&Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	})

	target := &Source{
		Name: "empty",
		Type: SourceTypeWeb,
		URL:  server.URL,
	}

	_, err := scraper.Scrape(context.Background(), target)
	switch {
	case err == nil:
		t.Fatal("expected error when page has no extractable docs")
	case !strings.Contains(err.Error(), "extracted no documents"):
		t.Fatalf("unexpected error message: %v", err)
	}
}

// TestWebScraper_AllowsRedirectedDocumentPath requests a URL that the test
// server answers with a 302 redirect to a different path, and requires that
// Scrape succeeds, returns at least one document, and that the first
// document's URL contains the redirect target path.
func TestWebScraper_AllowsRedirectedDocumentPath(t *testing.T) {
	const (
		requestedPath = "/core/Regexp.html"
		finalPath     = "/3.4.1/Regexp.html"
	)

	// Repeated filler text gives the page a non-trivial <main> body.
	page := `<html><head><title>Regexp</title></head><body><main>` +
		strings.Repeat("ruby docs content ", 30) +
		`</main></body></html>`

	handler := func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == requestedPath {
			http.Redirect(w, r, finalPath, http.StatusFound)
			return
		}
		if r.URL.Path != finalPath {
			http.NotFound(w, r)
			return
		}
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(page))
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	scraper := NewWebScraper(&Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	})

	target := &Source{
		Name: "ruby",
		Type: SourceTypeWeb,
		URL:  server.URL + requestedPath,
	}

	docs, err := scraper.Scrape(context.Background(), target)
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected redirected page to be scraped")
	}
	if got := docs[0].URL; !strings.Contains(got, finalPath) {
		t.Fatalf("expected final redirected URL, got %q", got)
	}
}

func TestWebScraper_GlobalWrapperNormalizesOutput(t *testing.T) {
	content := strings.Repeat("docs content ", 20)
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(`<html><head><title>Regex Guide ¶ deprecated</title></head><body><main>` + content + `</main></body></html>`))
	}))
	defer srv.Close()

	s := NewScraper(SourceTypeWeb, &Config{
		UserAgent:   "DevourTest/1.0",
		Timeout:     2 * time.Second,
		Concurrency: 1,
		MaxDepth:    1,
	})
	if s == nil {
		t.Fatal("expected web scraper")
	}

	docs, err := s.Scrape(context.Background(), &Source{
		Name: "test",
		Type: SourceTypeWeb,
		URL:  srv.URL,
	})
	if err != nil {
		t.Fatalf("unexpected scrape error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one document")
	}
	if docs[0].Title != "Regex Guide" {
		t.Fatalf("expected normalized title, got %q", docs[0].Title)
	}
}