mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
255 lines
6.4 KiB
Go
255 lines
6.4 KiB
Go
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/yourorg/devour/pkg/tsdocs"
|
|
)
|
|
|
|
type TSDocsScraper struct {
|
|
config *Config
|
|
parser *tsdocs.Parser
|
|
client *http.Client
|
|
}
|
|
|
|
func NewTSDocsScraper(config *Config) *TSDocsScraper {
|
|
return &TSDocsScraper{
|
|
config: config,
|
|
parser: tsdocs.NewParser(),
|
|
client: &http.Client{
|
|
Timeout: config.Timeout,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (s *TSDocsScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
|
|
var documents []*Document
|
|
|
|
if source.URL == "" {
|
|
return nil, fmt.Errorf("URL is required for TypeScript docs scraper")
|
|
}
|
|
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch page: %w", err)
|
|
}
|
|
|
|
module, err := s.parser.ParseModulePage(html, source.URL)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse module: %w", err)
|
|
}
|
|
|
|
mainDoc := s.moduleToDocument(module, source.Name)
|
|
documents = append(documents, mainDoc)
|
|
|
|
for _, iface := range module.Interfaces {
|
|
doc := s.interfaceToDocument(iface, module, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, fn := range module.Functions {
|
|
doc := s.functionToDocument(fn, module, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, class := range module.Classes {
|
|
doc := s.classToDocument(class, module, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
for _, ta := range module.Types {
|
|
doc := s.typeAliasToDocument(ta, module, source.Name)
|
|
documents = append(documents, doc)
|
|
}
|
|
|
|
return documents, nil
|
|
}
|
|
|
|
func (s *TSDocsScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
|
|
html, err := s.fetchPage(ctx, source.URL)
|
|
if err != nil {
|
|
return false, "", err
|
|
}
|
|
|
|
hash := s.generateHash(html)
|
|
changed := hash != lastHash
|
|
|
|
return changed, hash, nil
|
|
}
|
|
|
|
func (s *TSDocsScraper) fetchPage(ctx context.Context, url string) (string, error) {
|
|
return fetchExternalPage(ctx, s.client, s.config.UserAgent, url)
|
|
}
|
|
|
|
func (s *TSDocsScraper) generateHash(content string) string {
|
|
hash := sha256.Sum256([]byte(content))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
func (s *TSDocsScraper) moduleToDocument(module *tsdocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s\n\n", module.Name)
|
|
fmt.Fprintf(&content, "%s\n", module.Doc)
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"version": module.Version,
|
|
"doc_url": module.DocURL,
|
|
"doc_type": "ts-module",
|
|
}
|
|
|
|
return &Document{
|
|
ID: generateDocID(module.DocURL),
|
|
Source: sourceName,
|
|
Type: "ts-module",
|
|
Title: module.Name,
|
|
Content: content.String(),
|
|
URL: module.DocURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *TSDocsScraper) interfaceToDocument(iface *tsdocs.Interface, module *tsdocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s (interface)\n\n", iface.Name)
|
|
if iface.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n\n", iface.Doc)
|
|
}
|
|
if len(iface.Properties) > 0 {
|
|
fmt.Fprintf(&content, "## Properties\n")
|
|
for _, p := range iface.Properties {
|
|
fmt.Fprintf(&content, "- `%s: %s`\n", p.Name, p.Type)
|
|
}
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"name": iface.Name,
|
|
"doc_url": coalesceDocURL(iface.DocURL, module.DocURL),
|
|
}
|
|
|
|
docURL := coalesceDocURL(iface.DocURL, module.DocURL)
|
|
|
|
return &Document{
|
|
ID: generateDocID(docURL),
|
|
Source: sourceName,
|
|
Type: "ts-interface",
|
|
Title: iface.Name,
|
|
Content: content.String(),
|
|
URL: docURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *TSDocsScraper) functionToDocument(fn *tsdocs.Function, module *tsdocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s()\n\n", fn.Name)
|
|
if fn.Signature != "" {
|
|
fmt.Fprintf(&content, "```typescript\n%s\n```\n\n", fn.Signature)
|
|
}
|
|
if fn.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", fn.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"name": fn.Name,
|
|
"return_type": fn.ReturnType,
|
|
"doc_url": coalesceDocURL(fn.DocURL, module.DocURL),
|
|
}
|
|
|
|
docURL := coalesceDocURL(fn.DocURL, module.DocURL)
|
|
|
|
return &Document{
|
|
ID: generateDocID(docURL),
|
|
Source: sourceName,
|
|
Type: "ts-function",
|
|
Title: fn.Name,
|
|
Content: content.String(),
|
|
URL: docURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *TSDocsScraper) classToDocument(class *tsdocs.Class, module *tsdocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s (class)\n\n", class.Name)
|
|
if class.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n\n", class.Doc)
|
|
}
|
|
if len(class.Methods) > 0 {
|
|
fmt.Fprintf(&content, "## Methods\n")
|
|
for _, m := range class.Methods {
|
|
fmt.Fprintf(&content, "- `%s()`\n", m.Name)
|
|
}
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"name": class.Name,
|
|
"doc_url": coalesceDocURL(class.DocURL, module.DocURL),
|
|
}
|
|
|
|
docURL := coalesceDocURL(class.DocURL, module.DocURL)
|
|
|
|
return &Document{
|
|
ID: generateDocID(docURL),
|
|
Source: sourceName,
|
|
Type: "ts-class",
|
|
Title: class.Name,
|
|
Content: content.String(),
|
|
URL: docURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
func (s *TSDocsScraper) typeAliasToDocument(ta *tsdocs.TypeAlias, module *tsdocs.Module, sourceName string) *Document {
|
|
var content strings.Builder
|
|
fmt.Fprintf(&content, "# %s (type)\n\n", ta.Name)
|
|
fmt.Fprintf(&content, "```typescript\ntype %s = %s\n```\n\n", ta.Name, ta.Type)
|
|
if ta.Doc != "" {
|
|
fmt.Fprintf(&content, "%s\n", ta.Doc)
|
|
}
|
|
|
|
metadata := map[string]interface{}{
|
|
"module": module.Name,
|
|
"name": ta.Name,
|
|
"doc_url": coalesceDocURL(ta.DocURL, module.DocURL),
|
|
}
|
|
|
|
docURL := coalesceDocURL(ta.DocURL, module.DocURL)
|
|
|
|
return &Document{
|
|
ID: generateDocID(docURL),
|
|
Source: sourceName,
|
|
Type: "ts-type",
|
|
Title: ta.Name,
|
|
Content: content.String(),
|
|
URL: docURL,
|
|
Metadata: metadata,
|
|
Hash: s.generateHash(content.String()),
|
|
Timestamp: time.Now(),
|
|
}
|
|
}
|
|
|
|
// coalesceDocURL returns primary unless it is empty or consists only of
// whitespace, in which case it returns fallback. primary is returned
// unmodified (not trimmed) when it is used.
func coalesceDocURL(primary, fallback string) string {
	if strings.TrimSpace(primary) == "" {
		return fallback
	}
	return primary
}
|