package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/http"
	"strings"
	"time"

	"github.com/yourorg/devour/pkg/tsdocs"
)

// TSDocsScraper fetches a TypeScript documentation page, parses it with a
// tsdocs.Parser and converts the parsed module and its members into Documents.
type TSDocsScraper struct {
	config *Config
	parser *tsdocs.Parser
	client *http.Client
}

// NewTSDocsScraper returns a scraper whose HTTP client uses config.Timeout.
//
// config must be non-nil: it is dereferenced here and again on every fetch
// (for config.UserAgent).
func NewTSDocsScraper(config *Config) *TSDocsScraper {
	return &TSDocsScraper{
		config: config,
		parser: tsdocs.NewParser(),
		client: &http.Client{
			Timeout: config.Timeout,
		},
	}
}

// Scrape downloads source.URL, parses it as a module page and returns one
// Document for the module followed by one Document per interface, function,
// class and type alias, in that order.
//
// It returns an error when source is nil, when source.URL is empty, when the
// page cannot be fetched, or when parsing fails or yields no module.
func (s *TSDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	pageURL, err := tsDocsSourceURL(source)
	if err != nil {
		return nil, err
	}

	html, err := s.fetchPage(ctx, pageURL)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch page: %w", err)
	}

	module, err := s.parser.ParseModulePage(html, pageURL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse module: %w", err)
	}
	if module == nil {
		// Defensive: a nil module with a nil error would otherwise panic below.
		return nil, fmt.Errorf("failed to parse module: parser returned no module for %q", pageURL)
	}

	// The final length is known up front: the module itself plus every member.
	total := 1 + len(module.Interfaces) + len(module.Functions) + len(module.Classes) + len(module.Types)
	documents := make([]*Document, 0, total)

	documents = append(documents, s.moduleToDocument(module, source.Name))
	for _, iface := range module.Interfaces {
		documents = append(documents, s.interfaceToDocument(iface, module, source.Name))
	}
	for _, fn := range module.Functions {
		documents = append(documents, s.functionToDocument(fn, module, source.Name))
	}
	for _, class := range module.Classes {
		documents = append(documents, s.classToDocument(class, module, source.Name))
	}
	for _, ta := range module.Types {
		documents = append(documents, s.typeAliasToDocument(ta, module, source.Name))
	}

	return documents, nil
}

// DetectChanges fetches source.URL and compares the SHA-256 of the page with
// lastHash. It returns whether the hashes differ together with the new hash.
//
// The source is validated the same way Scrape validates it, so a nil source
// or an empty URL is reported as an error before any request is made.
func (s *TSDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	pageURL, err := tsDocsSourceURL(source)
	if err != nil {
		return false, "", err
	}

	html, err := s.fetchPage(ctx, pageURL)
	if err != nil {
		return false, "", err
	}

	hash := s.generateHash(html)
	return hash != lastHash, hash, nil
}

// tsDocsSourceURL returns the URL to scrape, or an error when source is nil
// or carries an empty URL. Shared by Scrape and DetectChanges so both entry
// points reject the same inputs.
func tsDocsSourceURL(source *Source) (string, error) {
	if source == nil {
		return "", fmt.Errorf("source is required for TypeScript docs scraper")
	}
	if source.URL == "" {
		return "", fmt.Errorf("URL is required for TypeScript docs scraper")
	}
	return source.URL, nil
}

// fetchPage retrieves url by delegating to fetchExternalPage with the
// scraper's HTTP client and configured user agent.
func (s *TSDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
	return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
}

// generateHash returns the hex-encoded SHA-256 digest of content.
func (s *TSDocsScraper) generateHash(content string) string {
	sum := sha256.Sum256([]byte(content))
	return hex.EncodeToString(sum[:])
}

// tsMemberDocRef resolves the URL a module member should link to and the
// string its document ID is derived from.
//
// docURL is the member's own URL when it has one, otherwise the module URL.
// When docURL is the module URL itself, using it alone as the ID seed would
// give the member the same ID as the module document (and as every other
// member in the same situation), so the seed is extended with the document
// type and member name. Members with a distinct URL keep docURL as the seed.
//
// NOTE(review): this assumes generateDocID (defined elsewhere in the package)
// derives the ID purely from the string it is given — confirm.
func tsMemberDocRef(docType, name, ownURL, moduleURL string) (docURL, idSeed string) {
	docURL = coalesceDocURL(ownURL, moduleURL)
	if docURL != moduleURL {
		return docURL, docURL
	}
	return docURL, docURL + "#" + docType + ":" + name
}

// moduleToDocument renders the module heading and doc text as the content of
// a "ts-module" Document attributed to sourceName.
func (s *TSDocsScraper) moduleToDocument(module *tsdocs.Module, sourceName string) *Document {
	var content strings.Builder
	fmt.Fprintf(&content, "# %s\n\n", module.Name)
	fmt.Fprintf(&content, "%s\n", module.Doc)
	body := content.String()

	metadata := map[string]interface{}{
		"module":   module.Name,
		"version":  module.Version,
		"doc_url":  module.DocURL,
		"doc_type": "ts-module",
	}

	return &Document{
		ID:        generateDocID(module.DocURL),
		Source:    sourceName,
		Type:      "ts-module",
		Title:     module.Name,
		Content:   body,
		URL:       module.DocURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// interfaceToDocument renders an interface (heading, optional doc text and a
// bullet list of its properties) as a "ts-interface" Document.
func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsdocs.Module, sourceName string) *Document {
	const docType = "ts-interface"

	var content strings.Builder
	fmt.Fprintf(&content, "# %s (interface)\n\n", iface.Name)
	if iface.Doc != "" {
		fmt.Fprintf(&content, "%s\n\n", iface.Doc)
	}
	if len(iface.Properties) > 0 {
		content.WriteString("## Properties\n")
		for _, p := range iface.Properties {
			fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
		}
	}
	body := content.String()

	docURL, idSeed := tsMemberDocRef(docType, iface.Name, iface.DocURL, module.DocURL)
	metadata := map[string]interface{}{
		"module":  module.Name,
		"name":    iface.Name,
		"doc_url": docURL,
	}

	return &Document{
		ID:        generateDocID(idSeed),
		Source:    sourceName,
		Type:      docType,
		Title:     iface.Name,
		Content:   body,
		URL:       docURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// functionToDocument renders a function (heading, optional fenced signature
// and optional doc text) as a "ts-function" Document.
func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.Module, sourceName string) *Document {
	const docType = "ts-function"

	var content strings.Builder
	fmt.Fprintf(&content, "# %s()\n\n", fn.Name)
	if fn.Signature != "" {
		fmt.Fprintf(&content, "```typescript\n%s\n```\n\n", fn.Signature)
	}
	if fn.Doc != "" {
		fmt.Fprintf(&content, "%s\n", fn.Doc)
	}
	body := content.String()

	docURL, idSeed := tsMemberDocRef(docType, fn.Name, fn.DocURL, module.DocURL)
	metadata := map[string]interface{}{
		"module":      module.Name,
		"name":        fn.Name,
		"return_type": fn.ReturnType,
		"doc_url":     docURL,
	}

	return &Document{
		ID:        generateDocID(idSeed),
		Source:    sourceName,
		Type:      docType,
		Title:     fn.Name,
		Content:   body,
		URL:       docURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// classToDocument renders a class (heading, optional doc text and a bullet
// list of its method names) as a "ts-class" Document.
func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Module, sourceName string) *Document {
	const docType = "ts-class"

	var content strings.Builder
	fmt.Fprintf(&content, "# %s (class)\n\n", class.Name)
	if class.Doc != "" {
		fmt.Fprintf(&content, "%s\n\n", class.Doc)
	}
	if len(class.Methods) > 0 {
		content.WriteString("## Methods\n")
		for _, m := range class.Methods {
			fmt.Fprintf(&content, "- `%s()`\n", m.Name)
		}
	}
	body := content.String()

	docURL, idSeed := tsMemberDocRef(docType, class.Name, class.DocURL, module.DocURL)
	metadata := map[string]interface{}{
		"module":  module.Name,
		"name":    class.Name,
		"doc_url": docURL,
	}

	return &Document{
		ID:        generateDocID(idSeed),
		Source:    sourceName,
		Type:      docType,
		Title:     class.Name,
		Content:   body,
		URL:       docURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// typeAliasToDocument renders a type alias (heading, fenced declaration and
// optional doc text) as a "ts-type" Document.
func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs.Module, sourceName string) *Document {
	const docType = "ts-type"

	var content strings.Builder
	fmt.Fprintf(&content, "# %s (type)\n\n", ta.Name)
	fmt.Fprintf(&content, "```typescript\ntype %s = %s\n```\n\n", ta.Name, ta.Type)
	if ta.Doc != "" {
		fmt.Fprintf(&content, "%s\n", ta.Doc)
	}
	body := content.String()

	docURL, idSeed := tsMemberDocRef(docType, ta.Name, ta.DocURL, module.DocURL)
	metadata := map[string]interface{}{
		"module":  module.Name,
		"name":    ta.Name,
		"doc_url": docURL,
	}

	return &Document{
		ID:        generateDocID(idSeed),
		Source:    sourceName,
		Type:      docType,
		Title:     ta.Name,
		Content:   body,
		URL:       docURL,
		Metadata:  metadata,
		Hash:      s.generateHash(body),
		Timestamp: time.Now(),
	}
}

// coalesceDocURL returns primary with surrounding whitespace removed, or
// fallback when primary is empty or whitespace-only.
//
// The trimmed value is returned (not the raw input) so that stray padding
// never ends up in a document's URL, metadata or ID.
func coalesceDocURL(primary, fallback string) string {
	if trimmed := strings.TrimSpace(primary); trimmed != "" {
		return trimmed
	}
	return fallback
}