// File: Devour/internal/scraper/openapi.go
// Last commit: 898a3c303f "update" by Tomas Dvorak, 2026-02-24 10:33:59 +01:00
// Size: 339 lines, 8.8 KiB, Go

package scraper
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"sort"
"strings"
"time"
"gopkg.in/yaml.v3"
)
// OpenAPIScraper parses OpenAPI/Swagger specifications.
//
// A specification is read either over HTTP(S) or from a local file and is
// turned into one summary document plus one document per operation.
type OpenAPIScraper struct {
// config is the scraper configuration; it may be nil. When set, its Timeout
// and UserAgent values are consulted.
config *Config
// client performs the HTTP requests for remote specifications.
client *http.Client
}
// NewOpenAPIScraper creates a new OpenAPI scraper.
//
// The HTTP client uses a 30 second timeout unless config is non-nil and
// carries a positive Timeout, in which case that value is used instead.
// config itself is stored as given and may be nil.
func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
	const defaultTimeout = 30 * time.Second

	scraper := &OpenAPIScraper{
		config: config,
		client: &http.Client{Timeout: defaultTimeout},
	}
	if config != nil && config.Timeout > 0 {
		scraper.client.Timeout = config.Timeout
	}
	return scraper
}
// Scrape fetches and parses an OpenAPI specification.
//
// The result starts with one "openapi-spec" document summarising the whole
// specification (hashed over the raw bytes), followed by one
// "openapi-operation" document per path/method pair (hashed over its rendered
// content). Paths and methods are emitted in sorted order so that repeated
// runs over the same specification yield the same document sequence.
//
// An error is returned when source is nil, when the specification cannot be
// read, or when it cannot be parsed.
func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}
	raw, specURL, err := s.readSpec(ctx, source)
	if err != nil {
		return nil, err
	}
	spec, err := parseOpenAPISpec(raw)
	if err != nil {
		return nil, err
	}

	sourceName := coalesceSourceName(source.Name, "openapi")
	// A single timestamp marks every document produced by this run.
	scrapedAt := time.Now()

	docs := make([]*Document, 0, 1+len(spec.Paths))
	docs = append(docs, &Document{
		ID:      generateDocID(specURL + "#openapi"),
		Source:  sourceName,
		Type:    "openapi-spec",
		Title:   spec.Info.Title,
		Content: buildMainSpecContent(spec),
		URL:     specURL,
		Metadata: map[string]interface{}{
			"openapi": spec.Version,
			"servers": spec.Servers,
		},
		Hash:      hashBytes(raw),
		Timestamp: scrapedAt,
	})

	paths := make([]string, 0, len(spec.Paths))
	for path := range spec.Paths {
		paths = append(paths, path)
	}
	sort.Strings(paths)

	for _, p := range paths {
		opMap := spec.Paths[p]

		// Keep the keys exactly as they appear in the map. Upper-casing them
		// and then looking them up lower-cased loses every key that was not
		// already lower-case, silently dropping those operations.
		keys := make([]string, 0, len(opMap))
		for key := range opMap {
			keys = append(keys, key)
		}
		// Order by the upper-cased method name (the displayed form); fall
		// back to the raw key so the order stays deterministic when two keys
		// differ only in case.
		sort.Slice(keys, func(i, j int) bool {
			ui, uj := strings.ToUpper(keys[i]), strings.ToUpper(keys[j])
			if ui != uj {
				return ui < uj
			}
			return keys[i] < keys[j]
		})

		for _, key := range keys {
			op := opMap[key]
			if op == nil {
				continue
			}
			method := strings.ToUpper(key)
			title := strings.TrimSpace(op.Summary)
			if title == "" {
				title = fmt.Sprintf("%s %s", method, p)
			}
			content := buildOperationContent(method, p, op)
			docURL := fmt.Sprintf("%s#%s-%s", specURL, strings.ToLower(method), sanitizeFragment(p))
			docs = append(docs, &Document{
				ID:      generateDocID(docURL),
				Source:  sourceName,
				Type:    "openapi-operation",
				Title:   title,
				Content: content,
				URL:     docURL,
				Metadata: map[string]interface{}{
					"method":       method,
					"path":         p,
					"operation_id": op.OperationID,
				},
				Hash:      hashString(content),
				Timestamp: scrapedAt,
			})
		}
	}
	return docs, nil
}
// DetectChanges checks if the spec has been updated.
//
// It re-reads the specification, hashes the raw bytes and compares the result
// with lastHash. It returns whether the two differ together with the freshly
// computed hash. A nil source or a read failure yields an error.
func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}

	content, _, readErr := s.readSpec(ctx, source)
	if readErr != nil {
		return false, "", readErr
	}

	current := hashBytes(content)
	changed := current != lastHash
	return changed, current, nil
}
// readSpec loads the raw specification bytes for source.
//
// source.URL is preferred; source.Path is used when URL is blank. A location
// starting with "http://" or "https://" is fetched with the scraper's HTTP
// client, sending the configured User-Agent when one is set. Any other
// location is read as a local file.
//
// It returns the raw bytes and the URL that identifies the specification: the
// fetched URL for remote specs, or "file://" followed by the path for local
// ones. Remote responses with a non-2xx status, or with a body larger than
// 10 MiB, are rejected with an error.
func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte, string, error) {
	// Upper bound on the size of a remote specification body.
	const maxSpecBytes = 10 << 20

	rawPath := strings.TrimSpace(source.URL)
	if rawPath == "" {
		rawPath = strings.TrimSpace(source.Path)
	}
	if rawPath == "" {
		return nil, "", fmt.Errorf("openapi source requires url or path")
	}

	if !strings.HasPrefix(rawPath, "http://") && !strings.HasPrefix(rawPath, "https://") {
		b, err := os.ReadFile(rawPath)
		if err != nil {
			return nil, "", fmt.Errorf("openapi read: %w", err)
		}
		return b, "file://" + rawPath, nil
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawPath, nil)
	if err != nil {
		return nil, "", fmt.Errorf("openapi request: %w", err)
	}
	if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
		req.Header.Set("User-Agent", s.config.UserAgent)
	}
	resp, err := s.client.Do(req)
	if err != nil {
		return nil, "", fmt.Errorf("openapi fetch: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, "", fmt.Errorf("openapi fetch failed: HTTP %d", resp.StatusCode)
	}

	// Read one byte past the limit so an oversized body can be told apart from
	// one that is exactly at the limit. Silently truncating would hand a
	// partial document to the parser and to the change-detection hash.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxSpecBytes+1))
	if err != nil {
		return nil, "", fmt.Errorf("openapi fetch: reading body: %w", err)
	}
	if len(body) > maxSpecBytes {
		return nil, "", fmt.Errorf("openapi fetch failed: response exceeds %d bytes", maxSpecBytes)
	}
	return body, rawPath, nil
}
// openAPISpec is the subset of an OpenAPI/Swagger document that the scraper
// decodes; the same struct is the target for both JSON and YAML input.
type openAPISpec struct {
// Version is read from the "openapi" field. parseOpenAPISpec copies Swagger
// into it when it is blank.
Version string `json:"openapi" yaml:"openapi"`
// Swagger is read from the "swagger" field.
Swagger string `json:"swagger" yaml:"swagger"`
// Info carries the title, version and description of the described API.
Info openAPIInfo `json:"info" yaml:"info"`
// Servers lists the declared servers; they are rendered in the summary
// document and stored in its metadata.
Servers []openAPIServer `json:"servers" yaml:"servers"`
// Paths maps each path to its operations. parseOpenAPISpec replaces a nil
// map with an empty one.
Paths map[string]pathItems `json:"paths" yaml:"paths"`
}
// openAPIInfo mirrors the "info" section of a specification.
type openAPIInfo struct {
// Title is used as the summary document's title; parseOpenAPISpec substitutes
// "OpenAPI Specification" when it is blank.
Title string `json:"title" yaml:"title"`
// Version is the version of the described API (rendered as "API Version").
Version string `json:"version" yaml:"version"`
// Description is free text rendered below the summary bullet list.
Description string `json:"description" yaml:"description"`
}
// openAPIServer is one entry of a specification's "servers" list.
type openAPIServer struct {
// URL is the server address as written in the specification.
URL string `json:"url" yaml:"url"`
// Description is optional text appended after the URL when rendering.
Description string `json:"description" yaml:"description"`
}
// pathItems maps a lower-case HTTP method ("get", "post", ...) to the
// operation declared for it under a single path.
//
// A path item in a specification may also carry members that are not
// operations, such as a "parameters" array or "summary", "description" and
// "$ref" strings. Decoding those into *openAPIOperation is a type error that
// makes the whole specification fail to parse, so the custom decoders below
// pick out the HTTP-method members only and ignore everything else.
type pathItems map[string]*openAPIOperation

// pathItemOperations is the decoding target for one path item. It names only
// the HTTP-method members, so any other member is skipped by the decoder as an
// unknown field.
type pathItemOperations struct {
	Get     *openAPIOperation `json:"get" yaml:"get"`
	Put     *openAPIOperation `json:"put" yaml:"put"`
	Post    *openAPIOperation `json:"post" yaml:"post"`
	Delete  *openAPIOperation `json:"delete" yaml:"delete"`
	Options *openAPIOperation `json:"options" yaml:"options"`
	Head    *openAPIOperation `json:"head" yaml:"head"`
	Patch   *openAPIOperation `json:"patch" yaml:"patch"`
	Trace   *openAPIOperation `json:"trace" yaml:"trace"`
}

// items converts the decoded members into a pathItems map, leaving out
// methods that were absent or null.
func (o *pathItemOperations) items() pathItems {
	result := pathItems{}
	add := func(method string, op *openAPIOperation) {
		if op != nil {
			result[method] = op
		}
	}
	add("get", o.Get)
	add("put", o.Put)
	add("post", o.Post)
	add("delete", o.Delete)
	add("options", o.Options)
	add("head", o.Head)
	add("patch", o.Patch)
	add("trace", o.Trace)
	return result
}

// UnmarshalJSON decodes a JSON path item, keeping only its HTTP-method
// members.
func (p *pathItems) UnmarshalJSON(data []byte) error {
	var ops pathItemOperations
	if err := json.Unmarshal(data, &ops); err != nil {
		return err
	}
	*p = ops.items()
	return nil
}

// UnmarshalYAML decodes a YAML path item, keeping only its HTTP-method
// members. It uses the callback form of the YAML unmarshaler hook; the error
// from unmarshal is returned unchanged so the YAML decoder can report it in
// its usual form.
func (p *pathItems) UnmarshalYAML(unmarshal func(interface{}) error) error {
	var ops pathItemOperations
	if err := unmarshal(&ops); err != nil {
		return err
	}
	*p = ops.items()
	return nil
}
// openAPIOperation is the decoded form of a single operation (one HTTP method
// on one path). In the code of this file, RequestBody and Security are decoded
// but not rendered by buildOperationContent.
type openAPIOperation struct {
// Summary is used as the document title when non-blank.
Summary string `json:"summary" yaml:"summary"`
// Description is free text rendered after the summary.
Description string `json:"description" yaml:"description"`
// OperationID is rendered in the content and stored in document metadata.
OperationID string `json:"operationId" yaml:"operationId"`
// Parameters are rendered under a "Parameters" heading.
Parameters []openAPIParameter `json:"parameters" yaml:"parameters"`
// Responses is keyed by the response code as written in the specification.
Responses map[string]response `json:"responses" yaml:"responses"`
// RequestBody is kept as an untyped map.
RequestBody map[string]any `json:"requestBody" yaml:"requestBody"`
// Tags are rendered as a comma-separated list.
Tags []string `json:"tags" yaml:"tags"`
// Deprecated adds a "Deprecated: true" line when set.
Deprecated bool `json:"deprecated" yaml:"deprecated"`
// Security holds the operation's security requirement entries.
Security []map[string][]string `json:"security" yaml:"security"`
}
// openAPIParameter describes one parameter of an operation.
type openAPIParameter struct {
// Name is the parameter name, rendered in backticks.
Name string `json:"name" yaml:"name"`
// In is the parameter location exactly as written in the specification.
In string `json:"in" yaml:"in"`
// Description is optional text rendered after the parameter.
Description string `json:"description" yaml:"description"`
// Required selects between the "required" and "optional" labels.
Required bool `json:"required" yaml:"required"`
}
// response is the decoded form of one entry in an operation's "responses"
// map; only its description is kept.
type response struct {
// Description is optional text rendered after the response code.
Description string `json:"description" yaml:"description"`
}
// parseOpenAPISpec decodes raw as an OpenAPI/Swagger document.
//
// JSON is tried first and YAML second. When both fail, the returned error
// wraps the JSON error and also quotes the YAML error, so a malformed YAML
// file is not reported only through an unrelated JSON syntax message. Blank
// input is rejected.
//
// After decoding, defaults are applied: a blank title becomes
// "OpenAPI Specification", a blank "openapi" version falls back to the
// "swagger" field, and a missing paths section becomes an empty map.
func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
	// Blank input is not valid JSON, but the YAML decoder accepts it and would
	// produce an empty specification; reject it explicitly instead.
	blank := true
	for _, c := range raw {
		if c != ' ' && c != '\t' && c != '\n' && c != '\r' {
			blank = false
			break
		}
	}
	if blank {
		return nil, fmt.Errorf("invalid openapi content: empty document")
	}

	var spec openAPISpec
	if jsonErr := json.Unmarshal(raw, &spec); jsonErr != nil {
		// A failed JSON decode may have filled some fields already; start the
		// YAML attempt from a clean value.
		spec = openAPISpec{}
		if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
			return nil, fmt.Errorf("invalid openapi content: %w (yaml: %v)", jsonErr, yamlErr)
		}
	}

	if strings.TrimSpace(spec.Info.Title) == "" {
		spec.Info.Title = "OpenAPI Specification"
	}
	if strings.TrimSpace(spec.Version) == "" {
		spec.Version = spec.Swagger
	}
	if spec.Paths == nil {
		spec.Paths = map[string]pathItems{}
	}
	return &spec, nil
}
// buildMainSpecContent renders the Markdown summary of a specification: a
// title heading, a bullet list (API version and OpenAPI version when set,
// plus the number of paths), the trimmed description when present, and a
// "Servers" section when at least one server is declared.
func buildMainSpecContent(spec *openAPISpec) string {
	var out strings.Builder

	out.WriteString("# " + spec.Info.Title + "\n\n")
	if apiVersion := spec.Info.Version; apiVersion != "" {
		out.WriteString("- API Version: " + apiVersion + "\n")
	}
	if specVersion := spec.Version; specVersion != "" {
		out.WriteString("- OpenAPI: " + specVersion + "\n")
	}
	fmt.Fprintf(&out, "- Paths: %d\n", len(spec.Paths))

	if desc := spec.Info.Description; desc != "" {
		out.WriteString("\n" + strings.TrimSpace(desc) + "\n")
	}

	if len(spec.Servers) == 0 {
		return out.String()
	}

	out.WriteString("\n## Servers\n")
	for i := range spec.Servers {
		server := &spec.Servers[i]
		line := "- " + server.URL
		if server.Description != "" {
			line += " - " + server.Description
		}
		out.WriteString(line + "\n")
	}
	return out.String()
}
// buildOperationContent renders one operation as Markdown: a "METHOD path"
// heading, the trimmed summary and description when set, a bullet list with
// the operation ID, tags and deprecation flag, then "Parameters" and
// "Responses" sections when the operation declares any. Response codes are
// listed in sorted order.
func buildOperationContent(method, path string, op *openAPIOperation) string {
	var out strings.Builder

	fmt.Fprintf(&out, "# %s %s\n\n", method, path)
	for _, paragraph := range []string{op.Summary, op.Description} {
		if paragraph != "" {
			out.WriteString(strings.TrimSpace(paragraph) + "\n\n")
		}
	}

	if op.OperationID != "" {
		out.WriteString("- Operation ID: `" + op.OperationID + "`\n")
	}
	if len(op.Tags) > 0 {
		out.WriteString("- Tags: " + strings.Join(op.Tags, ", ") + "\n")
	}
	if op.Deprecated {
		out.WriteString("- Deprecated: true\n")
	}

	if len(op.Parameters) > 0 {
		out.WriteString("\n## Parameters\n")
		for _, param := range op.Parameters {
			presence := "optional"
			if param.Required {
				presence = "required"
			}
			out.WriteString("- `" + param.Name + "` (" + param.In + ", " + presence + ")")
			if param.Description != "" {
				out.WriteString(": " + strings.TrimSpace(param.Description))
			}
			out.WriteByte('\n')
		}
	}

	if len(op.Responses) > 0 {
		statuses := make([]string, 0, len(op.Responses))
		for status := range op.Responses {
			statuses = append(statuses, status)
		}
		sort.Strings(statuses)

		out.WriteString("\n## Responses\n")
		for _, status := range statuses {
			out.WriteString("- `" + status + "`")
			if desc := op.Responses[status].Description; desc != "" {
				out.WriteString(": " + strings.TrimSpace(desc))
			}
			out.WriteByte('\n')
		}
	}
	return out.String()
}
// sanitizeFragment turns an API path into a URL-fragment slug: the path is
// lower-cased, "/" becomes "-", curly braces are removed, and leading and
// trailing "-" are stripped. A path that reduces to nothing yields "root".
func sanitizeFragment(path string) string {
	replacer := strings.NewReplacer("/", "-", "{", "", "}", "")
	slug := strings.Trim(replacer.Replace(strings.ToLower(path)), "-")
	if slug != "" {
		return slug
	}
	return "root"
}
// hashBytes returns the SHA-256 digest of b as a lower-case hex string.
func hashBytes(b []byte) string {
	hasher := sha256.New()
	hasher.Write(b) // hash.Hash.Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}
// hashString returns the SHA-256 digest of s as a lower-case hex string.
func hashString(s string) string {
	digest := sha256.Sum256([]byte(s))
	encoded := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(encoded, digest[:])
	return string(encoded)
}
// coalesceSourceName returns name unless it is empty or whitespace-only, in
// which case fallback is returned. A usable name is returned as given, without
// trimming.
func coalesceSourceName(name, fallback string) string {
	if strings.TrimSpace(name) == "" {
		return fallback
	}
	return name
}