package scraper

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"sort"
	"strings"
	"time"

	"gopkg.in/yaml.v3"
)

// openAPIMaxSpecBytes is the largest HTTP response body readSpec accepts.
// Larger bodies are rejected instead of being silently truncated.
const openAPIMaxSpecBytes = 10 << 20

// OpenAPIScraper parses OpenAPI/Swagger specifications.
type OpenAPIScraper struct {
	config *Config
	client *http.Client
}

// NewOpenAPIScraper creates a new OpenAPI scraper.
//
// The HTTP client timeout is config.Timeout when config is non-nil and the
// timeout is positive; otherwise it defaults to 30 seconds.
func NewOpenAPIScraper(config *Config) *OpenAPIScraper {
	timeout := 30 * time.Second
	if config != nil && config.Timeout > 0 {
		timeout = config.Timeout
	}
	return &OpenAPIScraper{
		config: config,
		client: &http.Client{Timeout: timeout},
	}
}

// Scrape fetches and parses an OpenAPI specification.
//
// It returns one "openapi-spec" document describing the specification as a
// whole, followed by one "openapi-operation" document per operation, ordered
// by path and then by HTTP method so the output is deterministic. All
// documents produced by a single call share the same Timestamp.
//
// An error is returned when source is nil, when the specification cannot be
// read, or when it cannot be decoded as JSON or YAML.
func (s *OpenAPIScraper) Scrape(ctx context.Context, source *Source) ([]*Document, error) {
	if source == nil {
		return nil, fmt.Errorf("source is required")
	}

	raw, specURL, err := s.readSpec(ctx, source)
	if err != nil {
		return nil, err
	}
	spec, err := parseOpenAPISpec(raw)
	if err != nil {
		return nil, err
	}

	// Computed once so every document of this scrape agrees on them.
	sourceName := coalesceSourceName(source.Name, "openapi")
	now := time.Now()

	docs := make([]*Document, 0, 1+len(spec.Paths))
	docs = append(docs, &Document{
		ID:      generateDocID(specURL + "#openapi"),
		Source:  sourceName,
		Type:    "openapi-spec",
		Title:   spec.Info.Title,
		Content: buildMainSpecContent(spec),
		URL:     specURL,
		Metadata: map[string]any{
			"openapi": spec.Version,
			"servers": spec.Servers,
		},
		Hash:      hashBytes(raw),
		Timestamp: now,
	})

	// Map iteration order is random; sort for stable output.
	paths := make([]string, 0, len(spec.Paths))
	for path := range spec.Paths {
		paths = append(paths, path)
	}
	sort.Strings(paths)

	for _, p := range paths {
		opMap := spec.Paths[p]

		keys := make([]string, 0, len(opMap))
		for k := range opMap {
			keys = append(keys, k)
		}
		sort.Strings(keys)

		for _, key := range keys {
			// Look up by the real map key; only the display form is upper-cased.
			op := opMap[key]
			if op == nil {
				continue
			}
			method := strings.ToUpper(key)

			title := strings.TrimSpace(op.Summary)
			if title == "" {
				title = fmt.Sprintf("%s %s", method, p)
			}

			content := buildOperationContent(method, p, op)
			docURL := fmt.Sprintf("%s#%s-%s", specURL, strings.ToLower(method), sanitizeFragment(p))
			docs = append(docs, &Document{
				ID:      generateDocID(docURL),
				Source:  sourceName,
				Type:    "openapi-operation",
				Title:   title,
				Content: content,
				URL:     docURL,
				Metadata: map[string]any{
					"method":       method,
					"path":         p,
					"operation_id": op.OperationID,
				},
				Hash:      hashString(content),
				Timestamp: now,
			})
		}
	}

	return docs, nil
}

// DetectChanges checks if the spec has been updated.
//
// It re-reads the specification and compares the SHA-256 of its raw bytes
// with lastHash. It returns whether the hashes differ, the new hash, and any
// error encountered while reading the specification.
func (s *OpenAPIScraper) DetectChanges(ctx context.Context, source *Source, lastHash string) (bool, string, error) {
	if source == nil {
		return false, "", fmt.Errorf("source is required")
	}
	raw, _, err := s.readSpec(ctx, source)
	if err != nil {
		return false, "", err
	}
	hash := hashBytes(raw)
	return hash != lastHash, hash, nil
}

// readSpec loads the raw specification bytes for source.
//
// source.URL is preferred and source.Path is used when URL is blank. A
// location starting with "http://" or "https://" is fetched with the
// scraper's HTTP client; anything else is read from the local filesystem.
// The second return value is the canonical location of the specification:
// the URL itself, or "file://" followed by the path.
func (s *OpenAPIScraper) readSpec(ctx context.Context, source *Source) ([]byte, string, error) {
	rawPath := strings.TrimSpace(source.URL)
	if rawPath == "" {
		rawPath = strings.TrimSpace(source.Path)
	}
	if rawPath == "" {
		return nil, "", fmt.Errorf("openapi source requires url or path")
	}

	if !strings.HasPrefix(rawPath, "http://") && !strings.HasPrefix(rawPath, "https://") {
		b, err := os.ReadFile(rawPath)
		if err != nil {
			return nil, "", fmt.Errorf("reading openapi spec: %w", err)
		}
		return b, "file://" + rawPath, nil
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawPath, nil)
	if err != nil {
		return nil, "", fmt.Errorf("building openapi request: %w", err)
	}
	if s.config != nil && strings.TrimSpace(s.config.UserAgent) != "" {
		req.Header.Set("User-Agent", s.config.UserAgent)
	}

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, "", fmt.Errorf("fetching openapi spec %q: %w", rawPath, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, "", fmt.Errorf("openapi fetch failed: HTTP %d", resp.StatusCode)
	}

	// Read one byte past the limit so an oversized body can be told apart
	// from one that is exactly at the limit.
	body, err := io.ReadAll(io.LimitReader(resp.Body, openAPIMaxSpecBytes+1))
	if err != nil {
		return nil, "", fmt.Errorf("reading openapi response: %w", err)
	}
	if len(body) > openAPIMaxSpecBytes {
		return nil, "", fmt.Errorf("openapi fetch failed: response exceeds %d bytes", openAPIMaxSpecBytes)
	}
	return body, rawPath, nil
}

// openAPISpec is the subset of an OpenAPI/Swagger document this scraper reads.
type openAPISpec struct {
	Version string               `json:"openapi" yaml:"openapi"`
	Swagger string               `json:"swagger" yaml:"swagger"`
	Info    openAPIInfo          `json:"info" yaml:"info"`
	Servers []openAPIServer      `json:"servers" yaml:"servers"`
	Paths   map[string]pathItems `json:"paths" yaml:"paths"`
}

// openAPIInfo holds the fields read from the document's "info" object.
type openAPIInfo struct {
	Title       string `json:"title" yaml:"title"`
	Version     string `json:"version" yaml:"version"`
	Description string `json:"description" yaml:"description"`
}

// openAPIServer is one entry of the document's "servers" list.
type openAPIServer struct {
	URL         string `json:"url" yaml:"url"`
	Description string `json:"description" yaml:"description"`
}

// pathItems maps a lowercase HTTP method name to the operation declared for
// it under a single path.
//
// A path item may also carry fields that are not operations (for example
// "parameters", "summary", "servers", "$ref" or "x-*" extensions). The custom
// decoders below skip those, so they neither break decoding nor show up as
// operations.
type pathItems map[string]*openAPIOperation

// isOpenAPIMethod reports whether key is one of the operation field names of
// a path item. The comparison is case-sensitive.
func isOpenAPIMethod(key string) bool {
	switch key {
	case "get", "put", "post", "delete", "options", "head", "patch", "trace":
		return true
	}
	return false
}

// UnmarshalJSON decodes a JSON path item, keeping only its operation fields.
func (p *pathItems) UnmarshalJSON(data []byte) error {
	var fields map[string]json.RawMessage
	if err := json.Unmarshal(data, &fields); err != nil {
		return err
	}

	items := make(pathItems, len(fields))
	for key, rawOp := range fields {
		if !isOpenAPIMethod(key) {
			continue
		}
		// A pointer target keeps an explicit null as a nil operation.
		var op *openAPIOperation
		if err := json.Unmarshal(rawOp, &op); err != nil {
			return fmt.Errorf("operation %q: %w", key, err)
		}
		items[key] = op
	}
	*p = items
	return nil
}

// UnmarshalYAML decodes a YAML path item, keeping only its operation fields.
func (p *pathItems) UnmarshalYAML(value *yaml.Node) error {
	if value.Kind != yaml.MappingNode {
		return fmt.Errorf("line %d: path item must be a mapping", value.Line)
	}

	// A mapping node stores its entries as alternating key and value nodes.
	items := make(pathItems, len(value.Content)/2)
	for i := 0; i+1 < len(value.Content); i += 2 {
		key := value.Content[i].Value
		if !isOpenAPIMethod(key) {
			continue
		}
		var op *openAPIOperation
		if err := value.Content[i+1].Decode(&op); err != nil {
			return err
		}
		items[key] = op
	}
	*p = items
	return nil
}

// openAPIOperation is the subset of an operation object this scraper reads.
type openAPIOperation struct {
	Summary     string                `json:"summary" yaml:"summary"`
	Description string                `json:"description" yaml:"description"`
	OperationID string                `json:"operationId" yaml:"operationId"`
	Parameters  []openAPIParameter    `json:"parameters" yaml:"parameters"`
	Responses   map[string]response   `json:"responses" yaml:"responses"`
	RequestBody map[string]any        `json:"requestBody" yaml:"requestBody"`
	Tags        []string              `json:"tags" yaml:"tags"`
	Deprecated  bool                  `json:"deprecated" yaml:"deprecated"`
	Security    []map[string][]string `json:"security" yaml:"security"`
}

// openAPIParameter describes a single operation parameter.
type openAPIParameter struct {
	Name        string `json:"name" yaml:"name"`
	In          string `json:"in" yaml:"in"`
	Description string `json:"description" yaml:"description"`
	Required    bool   `json:"required" yaml:"required"`
}

// response holds the description of one entry in an operation's "responses".
type response struct {
	Description string `json:"description" yaml:"description"`
}

// parseOpenAPISpec decodes raw as JSON and, if that fails, as YAML.
//
// After decoding, a blank title is replaced with "OpenAPI Specification", a
// blank "openapi" version falls back to the "swagger" field, and Paths is
// guaranteed to be non-nil. When both decoders fail, the returned error wraps
// the JSON error and includes the YAML error text.
func parseOpenAPISpec(raw []byte) (*openAPISpec, error) {
	var spec openAPISpec
	if err := json.Unmarshal(raw, &spec); err != nil {
		// Drop anything the failed JSON pass filled in so the YAML result is
		// not a mix of the two attempts.
		spec = openAPISpec{}
		if yamlErr := yaml.Unmarshal(raw, &spec); yamlErr != nil {
			return nil, fmt.Errorf("invalid openapi content: %w (yaml: %v)", err, yamlErr)
		}
	}

	if strings.TrimSpace(spec.Info.Title) == "" {
		spec.Info.Title = "OpenAPI Specification"
	}
	if strings.TrimSpace(spec.Version) == "" {
		spec.Version = spec.Swagger
	}
	if spec.Paths == nil {
		spec.Paths = map[string]pathItems{}
	}
	return &spec, nil
}

// buildMainSpecContent renders the Markdown overview of a specification:
// title, API and OpenAPI versions, path count, description and server list.
func buildMainSpecContent(spec *openAPISpec) string {
	var b strings.Builder

	fmt.Fprintf(&b, "# %s\n\n", spec.Info.Title)
	if spec.Info.Version != "" {
		fmt.Fprintf(&b, "- API Version: %s\n", spec.Info.Version)
	}
	if spec.Version != "" {
		fmt.Fprintf(&b, "- OpenAPI: %s\n", spec.Version)
	}
	fmt.Fprintf(&b, "- Paths: %d\n", len(spec.Paths))

	if spec.Info.Description != "" {
		fmt.Fprintf(&b, "\n%s\n", strings.TrimSpace(spec.Info.Description))
	}

	if len(spec.Servers) > 0 {
		fmt.Fprintf(&b, "\n## Servers\n")
		for _, s := range spec.Servers {
			fmt.Fprintf(&b, "- %s", s.URL)
			if s.Description != "" {
				fmt.Fprintf(&b, " - %s", s.Description)
			}
			fmt.Fprintln(&b)
		}
	}

	return b.String()
}

// buildOperationContent renders the Markdown description of one operation:
// a "METHOD path" heading, summary, description, operation ID, tags,
// deprecation flag, parameters and responses. Response codes are sorted so
// the output, and therefore its hash, is deterministic.
func buildOperationContent(method, path string, op *openAPIOperation) string {
	var b strings.Builder

	fmt.Fprintf(&b, "# %s %s\n\n", method, path)
	if op.Summary != "" {
		fmt.Fprintf(&b, "%s\n\n", strings.TrimSpace(op.Summary))
	}
	if op.Description != "" {
		fmt.Fprintf(&b, "%s\n\n", strings.TrimSpace(op.Description))
	}
	if op.OperationID != "" {
		fmt.Fprintf(&b, "- Operation ID: `%s`\n", op.OperationID)
	}
	if len(op.Tags) > 0 {
		fmt.Fprintf(&b, "- Tags: %s\n", strings.Join(op.Tags, ", "))
	}
	if op.Deprecated {
		fmt.Fprintln(&b, "- Deprecated: true")
	}

	if len(op.Parameters) > 0 {
		fmt.Fprintln(&b, "\n## Parameters")
		for _, p := range op.Parameters {
			req := "optional"
			if p.Required {
				req = "required"
			}
			fmt.Fprintf(&b, "- `%s` (%s, %s)", p.Name, p.In, req)
			if p.Description != "" {
				fmt.Fprintf(&b, ": %s", strings.TrimSpace(p.Description))
			}
			fmt.Fprintln(&b)
		}
	}

	if len(op.Responses) > 0 {
		codes := make([]string, 0, len(op.Responses))
		for code := range op.Responses {
			codes = append(codes, code)
		}
		sort.Strings(codes)

		fmt.Fprintln(&b, "\n## Responses")
		for _, code := range codes {
			resp := op.Responses[code]
			fmt.Fprintf(&b, "- `%s`", code)
			if resp.Description != "" {
				fmt.Fprintf(&b, ": %s", strings.TrimSpace(resp.Description))
			}
			fmt.Fprintln(&b)
		}
	}

	return b.String()
}

// sanitizeFragment turns an API path into a URL fragment: it lower-cases the
// path, replaces "/" with "-", removes "{" and "}", and trims leading and
// trailing "-". An empty result becomes "root".
//
// NOTE: because braces are removed, distinct paths such as "/users/{id}" and
// "/users/id" map to the same fragment.
func sanitizeFragment(path string) string {
	path = strings.ToLower(path)
	path = strings.ReplaceAll(path, "/", "-")
	path = strings.ReplaceAll(path, "{", "")
	path = strings.ReplaceAll(path, "}", "")
	path = strings.Trim(path, "-")
	if path == "" {
		return "root"
	}
	return path
}

// hashBytes returns the hex-encoded SHA-256 digest of b.
func hashBytes(b []byte) string {
	h := sha256.Sum256(b)
	return hex.EncodeToString(h[:])
}

// hashString returns the hex-encoded SHA-256 digest of s.
func hashString(s string) string {
	h := sha256.Sum256([]byte(s))
	return hex.EncodeToString(h[:])
}

// coalesceSourceName returns name unless it is blank (empty or whitespace
// only), in which case it returns fallback.
func coalesceSourceName(name, fallback string) string {
	if strings.TrimSpace(name) != "" {
		return name
	}
	return fallback
}