mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-03 20:13:03 +00:00
1049 lines
27 KiB
Go
1049 lines
27 KiB
Go
package cmd
|
|
|
|
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"net/url"
	"path"
	"regexp"
	"sort"
	"strings"
	"time"
	"unicode"
	"unicode/utf8"

	"github.com/spf13/cobra"
	appconfig "github.com/yourorg/devour/internal/config"
	"github.com/yourorg/devour/internal/scraper"
	"github.com/yourorg/devour/internal/search"
	"github.com/yourorg/devour/internal/storage"
)
|
|
|
|
var askCmd = &cobra.Command{
|
|
Use: "ask <question>",
|
|
Short: "Ask docs directly and get a structured answer",
|
|
Long: `Fetch official documentation for a language, rank the most relevant sections,
|
|
and return a structured answer with sources.
|
|
|
|
Examples:
|
|
devour ask --lang go "how to regex match digits" --format json
|
|
devour ask --lang python "async timeout example"
|
|
devour ask --lang rust "tokio spawn best practice" --max-sources 5`,
|
|
Args: cobra.MinimumNArgs(1),
|
|
RunE: runAsk,
|
|
}
|
|
|
|
// Values bound to the `ask` command's flags; init registers each flag and
// its default.
var (
	// askLanguage is the value of --lang; askFormat is the value of --format.
	askLanguage, askFormat string

	// askMaxSources is the value of --max-sources; askTimeoutSec is the value
	// of --timeout, in seconds.
	askMaxSources, askTimeoutSec int
)
|
|
|
|
type askResponse struct {
|
|
Query string `json:"query"`
|
|
Language string `json:"language"`
|
|
SearchedTerms []string `json:"searched_terms"`
|
|
Retrieval askRetrieval `json:"retrieval"`
|
|
Answer askAnswer `json:"answer"`
|
|
Sources []askSource `json:"sources"`
|
|
Confidence float64 `json:"confidence"`
|
|
FetchedAt time.Time `json:"fetched_at"`
|
|
}
|
|
|
|
// askRetrieval reports which retrieval path produced the documents and how
// many documents each path contributed.
type askRetrieval struct {
	// Mode is the retrieval mode label set by runAsk (for example "local-first").
	Mode string `json:"mode"`
	// LocalHits is the number of documents returned by the local index lookup.
	LocalHits int `json:"local_hits"`
	// FallbackFetched is the number of documents fetched by the live fallback.
	FallbackFetched int `json:"fallback_fetch_count"`
}
|
|
|
|
// askAnswer is the answer section of an askResponse.
type askAnswer struct {
	// Summary is the text chosen by summarizeTopDocs.
	Summary string `json:"summary"`
	// RecommendedAPI lists API names extracted from the top documents.
	RecommendedAPI []string `json:"recommended_api"`
	// Example is a code example extracted from the top documents; it is
	// omitted from JSON output when empty.
	Example string `json:"example,omitempty"`
	// Notes carries fixed retrieval notes plus any persistence warnings.
	Notes []string `json:"notes"`
}
|
|
|
|
// askSource describes one ranked document cited in an askResponse.
type askSource struct {
	// DocID is the document's identifier.
	DocID string `json:"doc_id"`
	// Title is the document's title.
	Title string `json:"title"`
	// URL is the document's address.
	URL string `json:"url"`
	// Type is the document's type label.
	Type string `json:"type"`
	// SearchTerm is the term (or "local-index") that produced the document.
	SearchTerm string `json:"search_term"`
	// Relevance is the document's ranking score.
	Relevance float64 `json:"relevance"`
	// Snippet is a short excerpt produced by extractSnippet.
	Snippet string `json:"snippet"`
}
|
|
|
|
type rankedDoc struct {
|
|
doc *scraper.Document
|
|
score float64
|
|
searchTerm string
|
|
}
|
|
|
|
// askPersistenceWarning is an error describing a failed persistence step
// (saving fetched docs or rebuilding the index). runAsk detects it with
// errors.As and appends its message to the answer notes.
type askPersistenceWarning struct {
	operation string // short description of the step that failed
	cause     error  // underlying error
}

// Error renders the warning as "persistence warning: <operation>: <cause>".
func (w *askPersistenceWarning) Error() string {
	const layout = "persistence warning: %s: %v"
	return fmt.Sprintf(layout, w.operation, w.cause)
}

// Unwrap returns the underlying cause so errors.Is and errors.As can see
// through the warning.
func (w *askPersistenceWarning) Unwrap() error {
	return w.cause
}
|
|
|
|
func init() {
|
|
askCmd.Flags().StringVar(&askLanguage, "lang", "", "language/framework (required)")
|
|
askCmd.Flags().StringVarP(&askFormat, "format", "f", "json", "output format (json, text)")
|
|
askCmd.Flags().IntVar(&askMaxSources, "max-sources", 5, "maximum number of source snippets to return")
|
|
askCmd.Flags().IntVar(&askTimeoutSec, "timeout", 60, "request timeout in seconds")
|
|
_ = askCmd.MarkFlagRequired("lang")
|
|
}
|
|
|
|
func runAsk(cmd *cobra.Command, args []string) error {
|
|
question := strings.TrimSpace(strings.Join(args, " "))
|
|
if question == "" {
|
|
return fmt.Errorf("question is required")
|
|
}
|
|
|
|
languageIn := strings.ToLower(strings.TrimSpace(askLanguage))
|
|
language, ok := normalizeLanguage(languageIn)
|
|
if !ok {
|
|
return fmt.Errorf("unsupported language: %s. Supported: %s", languageIn, strings.Join(supportedLanguages(), ", "))
|
|
}
|
|
|
|
terms := deriveSearchTerms(language, question)
|
|
if len(terms) == 0 {
|
|
return fmt.Errorf("could not derive a search term from the question")
|
|
}
|
|
|
|
if askMaxSources <= 0 {
|
|
askMaxSources = 5
|
|
}
|
|
if askTimeoutSec <= 0 {
|
|
askTimeoutSec = 60
|
|
}
|
|
|
|
cfg, err := loadAppConfig()
|
|
if err != nil {
|
|
return fmt.Errorf("load app config for ask command: %w", err)
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(askTimeoutSec)*time.Second)
|
|
defer cancel()
|
|
|
|
localRanked, localErr := gatherLocalAskDocs(ctx, cfg, language, question, terms, askMaxSources*4)
|
|
if localErr != nil {
|
|
localRanked = nil
|
|
}
|
|
ranked := append([]rankedDoc{}, localRanked...)
|
|
retrievalMode := "local-first"
|
|
|
|
fallbackNeeded := shouldFallbackToLive(localRanked, terms)
|
|
|
|
fallbackCount := 0
|
|
fetchErrors := []error{}
|
|
if localErr != nil {
|
|
fetchErrors = append(fetchErrors, fmt.Errorf("local retrieval failed: %w", localErr))
|
|
}
|
|
if fallbackNeeded {
|
|
fallbackDocs, fetched, errs := fetchAskDocsFromLive(ctx, cfg, language, question, terms)
|
|
fallbackCount = fetched
|
|
fetchErrors = append(fetchErrors, errs...)
|
|
ranked = mergeRankedDocs(ranked, fallbackDocs)
|
|
if fetched > 0 {
|
|
retrievalMode = "local+live"
|
|
} else if len(localRanked) == 0 {
|
|
retrievalMode = "live"
|
|
}
|
|
}
|
|
|
|
if len(ranked) == 0 {
|
|
if len(fetchErrors) == 0 {
|
|
return fmt.Errorf("no docs found for %q", language)
|
|
}
|
|
return fmt.Errorf("no docs found for %q: %w", language, errors.Join(fetchErrors...))
|
|
}
|
|
|
|
sort.Slice(ranked, func(i, j int) bool {
|
|
if ranked[i].score == ranked[j].score {
|
|
return ranked[i].doc.Title < ranked[j].doc.Title
|
|
}
|
|
return ranked[i].score > ranked[j].score
|
|
})
|
|
|
|
if askMaxSources > len(ranked) {
|
|
askMaxSources = len(ranked)
|
|
}
|
|
top := ranked[:askMaxSources]
|
|
|
|
response := askResponse{
|
|
Query: question,
|
|
Language: language,
|
|
SearchedTerms: terms,
|
|
Retrieval: askRetrieval{
|
|
Mode: retrievalMode,
|
|
LocalHits: len(localRanked),
|
|
FallbackFetched: fallbackCount,
|
|
},
|
|
Answer: askAnswer{
|
|
Summary: summarizeTopDocs(question, top),
|
|
RecommendedAPI: ensureAPISlice(extractRecommendedAPI(top)),
|
|
Example: extractExample(top),
|
|
Notes: []string{
|
|
"Retrieval is hybrid local-first: local index first, then targeted live fetch fallback.",
|
|
"Relevance uses lexical ranking across local snippets and fetched docs.",
|
|
},
|
|
},
|
|
Sources: buildAskSources(question, top),
|
|
Confidence: computeConfidence(question, top),
|
|
FetchedAt: time.Now(),
|
|
}
|
|
for _, fetchErr := range fetchErrors {
|
|
var persistenceWarning *askPersistenceWarning
|
|
if errors.As(fetchErr, &persistenceWarning) {
|
|
response.Answer.Notes = append(response.Answer.Notes, persistenceWarning.Error())
|
|
}
|
|
}
|
|
|
|
switch strings.ToLower(askFormat) {
|
|
case "text":
|
|
printAskText(response)
|
|
return nil
|
|
case "json":
|
|
enc := json.NewEncoder(cmd.OutOrStdout())
|
|
enc.SetIndent("", " ")
|
|
return enc.Encode(response)
|
|
default:
|
|
return fmt.Errorf("unsupported format: %s", askFormat)
|
|
}
|
|
}
|
|
|
|
func gatherLocalAskDocs(ctx context.Context, cfg *appconfig.Config, language, question string, terms []string, limit int) ([]rankedDoc, error) {
|
|
engine := search.NewEngine(cfg)
|
|
query := strings.TrimSpace(question + " " + language + " " + strings.Join(terms, " "))
|
|
results, _, err := engine.Search(ctx, query, search.SearchOptions{Limit: limit * 2})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
out := make([]rankedDoc, 0, len(results))
|
|
for _, result := range results {
|
|
if !resultMatchesLanguage(result, language) {
|
|
continue
|
|
}
|
|
out = append(out, rankedDoc{
|
|
doc: &scraper.Document{
|
|
ID: result.DocID,
|
|
Source: result.Source,
|
|
Type: result.Type,
|
|
Title: result.Title,
|
|
Content: result.Snippet,
|
|
URL: result.URL,
|
|
},
|
|
score: result.Score,
|
|
searchTerm: "local-index",
|
|
})
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func resultMatchesLanguage(result search.Result, language string) bool {
|
|
urlLower := strings.ToLower(result.URL)
|
|
typeLower := strings.ToLower(result.Type)
|
|
sourceLower := strings.ToLower(result.Source)
|
|
titleLower := strings.ToLower(result.Title)
|
|
|
|
matchAny := func(parts ...string) bool {
|
|
for _, p := range parts {
|
|
if strings.Contains(urlLower, p) || strings.Contains(typeLower, p) || strings.Contains(sourceLower, p) || strings.Contains(titleLower, p) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
switch language {
|
|
case "go":
|
|
return matchAny("pkg.go.dev", "go-")
|
|
case "rust":
|
|
return matchAny("docs.rs", "rust-")
|
|
case "python":
|
|
return matchAny("docs.python.org", "python")
|
|
case "java":
|
|
return matchAny("docs.oracle.com", "java")
|
|
case "spring":
|
|
return matchAny("docs.spring.io", "spring")
|
|
case "typescript":
|
|
return matchAny("typescriptlang.org", "ts-")
|
|
case "react":
|
|
return matchAny("react.dev", "react")
|
|
case "vue":
|
|
return matchAny("vuejs.org", "vue")
|
|
case "nuxt":
|
|
return matchAny("nuxt.com", "nuxt")
|
|
case "docker":
|
|
return matchAny("docs.docker.com", "docker")
|
|
case "cloudflare":
|
|
return matchAny("developers.cloudflare.com", "cloudflare")
|
|
case "astro":
|
|
return matchAny("docs.astro.build", "astro")
|
|
case "csharp":
|
|
return matchAny("learn.microsoft.com", "c#")
|
|
case "kotlin":
|
|
return matchAny("kotlinlang.org", "kotlin")
|
|
case "php":
|
|
return matchAny("php.net", "php")
|
|
case "ruby":
|
|
return matchAny("ruby-doc.org", "ruby")
|
|
case "elixir":
|
|
return matchAny("hexdocs.pm", "elixir")
|
|
case "nextjs":
|
|
return matchAny("nextjs.org", "next")
|
|
case "svelte":
|
|
return matchAny("svelte.dev", "svelte")
|
|
case "angular":
|
|
return matchAny("angular.dev", "angular")
|
|
case "remix":
|
|
return matchAny("remix.run", "remix")
|
|
case "solid":
|
|
return matchAny("solidjs.com", "solid")
|
|
case "express":
|
|
return matchAny("expressjs.com", "express")
|
|
default:
|
|
return true
|
|
}
|
|
}
|
|
|
|
func fetchAskDocsFromLive(ctx context.Context, cfg *appconfig.Config, language, question string, terms []string) ([]rankedDoc, int, []error) {
|
|
sourceType := scraper.SourceType(mapLanguageToType(language))
|
|
if sourceType == "" {
|
|
return nil, 0, []error{fmt.Errorf("unsupported language: %s", language)}
|
|
}
|
|
sc := toScraperConfig(cfg, 2)
|
|
sc.MaxDepth = 1
|
|
s := scraper.NewScraper(sourceType, sc)
|
|
if s == nil {
|
|
return nil, 0, []error{fmt.Errorf("no scraper for %s (%s)", language, sourceType)}
|
|
}
|
|
|
|
var ranked []rankedDoc
|
|
var fetchErrors []error
|
|
seenURL := make(map[string]bool)
|
|
totalFetched := 0
|
|
fetchedDocs := make([]*scraper.Document, 0)
|
|
|
|
for _, term := range terms {
|
|
docURLs, err := candidateDocURLs(language, term)
|
|
if err != nil {
|
|
fetchErrors = append(fetchErrors, fmt.Errorf("%s: %w", term, err))
|
|
continue
|
|
}
|
|
termFetched := false
|
|
termErrors := make([]error, 0, len(docURLs))
|
|
for _, docURL := range docURLs {
|
|
if seenURL[docURL] {
|
|
continue
|
|
}
|
|
seenURL[docURL] = true
|
|
|
|
source := &scraper.Source{
|
|
Name: fmt.Sprintf("%s:%s", language, term),
|
|
Type: sourceType,
|
|
URL: docURL,
|
|
}
|
|
applySourceProfile(source)
|
|
|
|
docs, err := s.Scrape(ctx, source)
|
|
if err != nil {
|
|
termErrors = append(termErrors, fmt.Errorf("%s: %w", docURL, err))
|
|
continue
|
|
}
|
|
if len(docs) == 0 {
|
|
termErrors = append(termErrors, fmt.Errorf("%s: no documents extracted", docURL))
|
|
continue
|
|
}
|
|
|
|
termFetched = true
|
|
totalFetched += len(docs)
|
|
fetchedDocs = append(fetchedDocs, docs...)
|
|
for _, doc := range docs {
|
|
ranked = append(ranked, rankedDoc{
|
|
doc: doc,
|
|
score: scoreDocument(question, doc),
|
|
searchTerm: term,
|
|
})
|
|
}
|
|
// Stop after first successful candidate for this term.
|
|
break
|
|
}
|
|
if !termFetched && len(termErrors) > 0 {
|
|
fetchErrors = append(fetchErrors, errors.Join(termErrors...))
|
|
}
|
|
}
|
|
|
|
// Persist fallback docs for future local-first queries.
|
|
if len(fetchedDocs) > 0 {
|
|
if _, err := storage.SaveDocuments(fetchedDocs, storage.SaveOptions{
|
|
Format: "json",
|
|
OutputDir: cfg.Storage.DocsDir,
|
|
AllowEmpty: true,
|
|
}); err != nil {
|
|
fetchErrors = append(fetchErrors, &askPersistenceWarning{
|
|
operation: "save fallback docs",
|
|
cause: err,
|
|
})
|
|
}
|
|
if cfg.Indexing.Enabled {
|
|
engine := search.NewEngine(cfg)
|
|
if _, err := engine.Rebuild(context.Background()); err != nil {
|
|
fetchErrors = append(fetchErrors, &askPersistenceWarning{
|
|
operation: "rebuild index after fallback",
|
|
cause: err,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
return ranked, totalFetched, fetchErrors
|
|
}
|
|
|
|
func candidateDocURLs(language, term string) ([]string, error) {
|
|
primary, err := constructDocURL(language, term)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
seen := map[string]bool{}
|
|
out := make([]string, 0, 5)
|
|
add := func(raw string) {
|
|
raw = strings.TrimSpace(raw)
|
|
if raw == "" || seen[raw] {
|
|
return
|
|
}
|
|
seen[raw] = true
|
|
out = append(out, raw)
|
|
}
|
|
add(primary)
|
|
|
|
switch language {
|
|
case "nextjs":
|
|
add("https://nextjs.org/docs/app/building-your-application/routing")
|
|
add("https://nextjs.org/docs/app/building-your-application/data-fetching")
|
|
add("https://nextjs.org/docs")
|
|
case "svelte":
|
|
add("https://svelte.dev/docs/kit")
|
|
add("https://svelte.dev/docs/svelte/overview")
|
|
case "angular":
|
|
add("https://angular.dev/guide/http")
|
|
add("https://angular.dev/guide/components")
|
|
case "remix":
|
|
add("https://v2.remix.run/docs/file-conventions/routes")
|
|
add("https://v2.remix.run/docs")
|
|
case "solid":
|
|
add("https://github.com/solidjs/solid-docs")
|
|
case "express":
|
|
add("https://expressjs.com/en/guide/routing.html")
|
|
add("https://expressjs.com/en/guide/using-middleware.html")
|
|
}
|
|
|
|
return out, nil
|
|
}
|
|
|
|
func mergeRankedDocs(primary, secondary []rankedDoc) []rankedDoc {
|
|
merged := make([]rankedDoc, 0, len(primary)+len(secondary))
|
|
seen := map[string]bool{}
|
|
|
|
add := func(item rankedDoc) {
|
|
if item.doc == nil {
|
|
return
|
|
}
|
|
key := strings.TrimSpace(item.doc.URL)
|
|
if key == "" {
|
|
key = strings.TrimSpace(item.doc.ID)
|
|
}
|
|
if key == "" {
|
|
key = strings.TrimSpace(item.doc.Title) + ":" + item.searchTerm
|
|
}
|
|
if seen[key] {
|
|
return
|
|
}
|
|
seen[key] = true
|
|
merged = append(merged, item)
|
|
}
|
|
|
|
for _, p := range primary {
|
|
add(p)
|
|
}
|
|
for _, s := range secondary {
|
|
add(s)
|
|
}
|
|
return merged
|
|
}
|
|
|
|
func topLocalMatchesTerms(ranked []rankedDoc, terms []string) bool {
|
|
if len(ranked) == 0 || len(terms) == 0 {
|
|
return false
|
|
}
|
|
maxCheck := len(ranked)
|
|
if maxCheck > 5 {
|
|
maxCheck = 5
|
|
}
|
|
for i := 0; i < maxCheck; i++ {
|
|
if ranked[i].doc == nil {
|
|
continue
|
|
}
|
|
hay := strings.ToLower(ranked[i].doc.Title + " " + ranked[i].doc.URL + " " + ranked[i].doc.Content)
|
|
for _, term := range terms {
|
|
if strings.Contains(hay, strings.ToLower(term)) {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func shouldFallbackToLive(localRanked []rankedDoc, terms []string) bool {
|
|
if len(localRanked) == 0 {
|
|
return true
|
|
}
|
|
|
|
// Very low confidence local ranking should trigger a live fetch.
|
|
if localRanked[0].score < 0.2 {
|
|
return true
|
|
}
|
|
|
|
if !topLocalMatchesTerms(localRanked, terms) {
|
|
return true
|
|
}
|
|
|
|
// If we only have one weak match, try live fallback to improve recall.
|
|
if len(localRanked) < 2 && localRanked[0].score < 1.5 {
|
|
return true
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func deriveSearchTerms(language, question string) []string {
|
|
q := strings.ToLower(question)
|
|
var terms []string
|
|
|
|
has := func(parts ...string) bool {
|
|
for _, p := range parts {
|
|
if strings.Contains(q, p) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
switch language {
|
|
case "go":
|
|
if has("regex", "regexp", "regular expression") {
|
|
terms = append(terms, "regexp")
|
|
}
|
|
if has("http", "request", "response", "server", "client") {
|
|
terms = append(terms, "net/http")
|
|
}
|
|
if has("json") {
|
|
terms = append(terms, "encoding/json")
|
|
}
|
|
case "rust":
|
|
if has("regex", "regexp") {
|
|
terms = append(terms, "regex")
|
|
}
|
|
if has("async", "await", "task") {
|
|
terms = append(terms, "tokio")
|
|
}
|
|
case "python":
|
|
if has("regex", "regexp", "regular expression") {
|
|
terms = append(terms, "re")
|
|
}
|
|
if has("async", "await", "task") {
|
|
terms = append(terms, "asyncio")
|
|
}
|
|
case "java":
|
|
if has("regex", "regexp") {
|
|
terms = append(terms, "java/util/regex/package-summary")
|
|
}
|
|
if has("http", "client") {
|
|
terms = append(terms, "java/net/http/package-summary")
|
|
}
|
|
case "spring":
|
|
if has("mcp") {
|
|
terms = append(terms, "mcp-overview")
|
|
}
|
|
terms = append(terms, "features")
|
|
case "typescript":
|
|
if has("regex", "regexp") {
|
|
terms = append(terms, "2/template-literal-types")
|
|
}
|
|
terms = append(terms, "2/basic-types")
|
|
case "react":
|
|
if has("hook", "state", "effect", "memo") {
|
|
terms = append(terms, "hooks")
|
|
}
|
|
terms = append(terms, "hooks")
|
|
case "vue":
|
|
if has("reactivity", "ref", "computed", "watch") {
|
|
terms = append(terms, "essentials/reactivity-fundamentals")
|
|
}
|
|
terms = append(terms, "essentials/reactivity-fundamentals")
|
|
case "nuxt":
|
|
terms = append(terms, "directory-structure")
|
|
case "docker":
|
|
terms = append(terms, "compose")
|
|
case "cloudflare":
|
|
terms = append(terms, "workers")
|
|
case "astro":
|
|
terms = append(terms, "components")
|
|
case "csharp":
|
|
if has("regex", "regexp", "regular expression") {
|
|
terms = append(terms, "regular-expressions")
|
|
}
|
|
terms = append(terms, "operators")
|
|
case "kotlin":
|
|
if has("regex", "regexp", "regular expression") {
|
|
terms = append(terms, "regex")
|
|
}
|
|
terms = append(terms, "collections-overview")
|
|
case "php":
|
|
if has("regex", "regexp", "regular expression") {
|
|
terms = append(terms, "function.preg-match.php")
|
|
}
|
|
terms = append(terms, "pcre")
|
|
case "ruby":
|
|
if has("regex", "regexp", "regular expression") {
|
|
terms = append(terms, "Regexp")
|
|
}
|
|
terms = append(terms, "String")
|
|
case "elixir":
|
|
if has("regex", "regexp", "regular expression") {
|
|
terms = append(terms, "Regex")
|
|
}
|
|
terms = append(terms, "String")
|
|
case "nextjs":
|
|
if has("routing", "route", "router") {
|
|
terms = append(terms, "routing")
|
|
}
|
|
if has("data", "fetch", "server") {
|
|
terms = append(terms, "data-fetching")
|
|
}
|
|
terms = append(terms, "routing")
|
|
case "svelte":
|
|
if has("store", "state") {
|
|
terms = append(terms, "stores")
|
|
}
|
|
if has("kit", "routing", "load") {
|
|
terms = append(terms, "kit")
|
|
}
|
|
terms = append(terms, "overview")
|
|
case "angular":
|
|
if has("http", "client", "request") {
|
|
terms = append(terms, "http")
|
|
}
|
|
if has("routing", "route", "router") {
|
|
terms = append(terms, "routing")
|
|
}
|
|
terms = append(terms, "components")
|
|
case "remix":
|
|
if has("route", "routing") {
|
|
terms = append(terms, "routes")
|
|
}
|
|
if has("loader", "action", "data") {
|
|
terms = append(terms, "loaders")
|
|
}
|
|
terms = append(terms, "routes")
|
|
case "solid":
|
|
added := false
|
|
if has("router", "route", "routing") {
|
|
terms = append(terms, "solid-router")
|
|
added = true
|
|
}
|
|
if has("signal", "state") {
|
|
terms = append(terms, "signals")
|
|
added = true
|
|
}
|
|
if has("server", "ssr", "start") {
|
|
terms = append(terms, "solid-start")
|
|
added = true
|
|
}
|
|
if !added {
|
|
terms = append(terms, "signals")
|
|
}
|
|
case "express":
|
|
if has("middleware", "next", "request") {
|
|
terms = append(terms, "middleware")
|
|
}
|
|
if has("routing", "route", "router") {
|
|
terms = append(terms, "routing")
|
|
}
|
|
terms = append(terms, "middleware")
|
|
}
|
|
|
|
// Use a primary token from the query as a candidate if it looks useful.
|
|
if primary := primaryQueryToken(question); primary != "" {
|
|
terms = append(terms, primary)
|
|
}
|
|
|
|
return dedupeTerms(terms)
|
|
}
|
|
|
|
func primaryQueryToken(query string) string {
|
|
stop := map[string]bool{
|
|
"how": true, "to": true, "do": true, "i": true, "in": true,
|
|
"the": true, "a": true, "an": true, "best": true, "way": true,
|
|
"for": true, "with": true, "and": true, "using": true,
|
|
"what": true, "why": true, "when": true, "where": true, "which": true,
|
|
"does": true, "is": true, "are": true, "can": true, "could": true,
|
|
"should": true, "would": true, "need": true, "help": true, "please": true,
|
|
"show": true, "example": true, "examples": true, "docs": true, "documentation": true,
|
|
}
|
|
for _, tok := range tokenize(query) {
|
|
if len(tok) < 3 || stop[tok] {
|
|
continue
|
|
}
|
|
if _, ok := normalizeLanguage(tok); ok {
|
|
continue
|
|
}
|
|
return tok
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// dedupeTerms trims each term, drops blanks, and removes case-insensitive
// duplicates. The first spelling of each term is kept and input order is
// preserved. The result is never nil.
func dedupeTerms(terms []string) []string {
	unique := make([]string, 0, len(terms))
	folded := make(map[string]bool, len(terms))
	for _, raw := range terms {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		key := strings.ToLower(trimmed)
		if folded[key] {
			continue
		}
		folded[key] = true
		unique = append(unique, trimmed)
	}
	return unique
}
|
|
|
|
func scoreDocument(query string, doc *scraper.Document) float64 {
|
|
tokens := tokenize(query)
|
|
title := strings.ToLower(doc.Title)
|
|
content := strings.ToLower(doc.Content)
|
|
docType := strings.ToLower(doc.Type)
|
|
|
|
var score float64 = 0.1
|
|
for _, tok := range tokens {
|
|
if strings.Contains(title, tok) {
|
|
score += 3.0
|
|
}
|
|
if strings.Contains(content, tok) {
|
|
score += 1.0
|
|
}
|
|
if urlContains(doc.URL, tok) {
|
|
score += 0.8
|
|
}
|
|
}
|
|
|
|
if strings.Contains(docType, "function") || strings.Contains(docType, "method") {
|
|
score += 0.3
|
|
}
|
|
if strings.Contains(docType, "section") {
|
|
score += 0.2
|
|
}
|
|
|
|
return score
|
|
}
|
|
|
|
// urlContains reports whether token occurs in the lower-cased path or
// fragment of rawURL. Host and query string are not searched. When rawURL
// cannot be parsed, the whole lower-cased string is searched instead.
// token is compared as given, so callers pass it already lower-cased.
func urlContains(rawURL, token string) bool {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return strings.Contains(strings.ToLower(rawURL), token)
	}
	for _, part := range [...]string{parsed.Path, parsed.Fragment} {
		if strings.Contains(strings.ToLower(part), token) {
			return true
		}
	}
	return false
}
|
|
|
|
func summarizeTopDocs(question string, docs []rankedDoc) string {
|
|
if len(docs) == 0 {
|
|
return ""
|
|
}
|
|
|
|
snippet := extractSnippet(docs[0].doc.Content, tokenize(question))
|
|
if snippet == "" || len(snippet) < 30 {
|
|
return docs[0].doc.Title
|
|
}
|
|
return snippet
|
|
}
|
|
|
|
// ensureAPISlice returns apis unchanged unless it is nil, in which case it
// returns an empty non-nil slice so JSON output shows [] instead of null.
func ensureAPISlice(apis []string) []string {
	if apis != nil {
		return apis
	}
	return []string{}
}
|
|
|
|
func extractRecommendedAPI(docs []rankedDoc) []string {
|
|
titlePattern := regexp.MustCompile(`\.(?:func|type|method)\s+([A-Za-z_][A-Za-z0-9_]*)`)
|
|
titleDashPattern := regexp.MustCompile(`^([A-Za-z_][A-Za-z0-9_.]+)\s+-`)
|
|
callPattern := regexp.MustCompile(`\b([A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)?)\(`)
|
|
disallowed := map[string]bool{
|
|
"main": true, "len": true, "make": true, "new": true, "append": true,
|
|
"copy": true, "print": true, "println": true, "panic": true,
|
|
}
|
|
seen := make(map[string]bool)
|
|
var out []string
|
|
pkgHints := make(map[string]bool)
|
|
|
|
for _, rd := range docs {
|
|
if u, err := url.Parse(rd.doc.URL); err == nil {
|
|
base := strings.ToLower(path.Base(strings.Trim(u.Path, "/")))
|
|
if base != "" && base != "." && base != "/" {
|
|
pkgHints[base] = true
|
|
}
|
|
}
|
|
|
|
if m := titlePattern.FindStringSubmatch(rd.doc.Title); len(m) > 1 {
|
|
name := m[1]
|
|
if !seen[name] {
|
|
seen[name] = true
|
|
out = append(out, name)
|
|
}
|
|
}
|
|
if m := titleDashPattern.FindStringSubmatch(rd.doc.Title); len(m) > 1 {
|
|
name := m[1]
|
|
if !seen[name] {
|
|
seen[name] = true
|
|
out = append(out, name)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Titles are usually higher quality than free-form content call extraction.
|
|
if len(out) >= 6 {
|
|
return out[:6]
|
|
}
|
|
|
|
for _, rd := range docs {
|
|
for _, m := range callPattern.FindAllStringSubmatch(rd.doc.Content, -1) {
|
|
if len(m) < 2 {
|
|
continue
|
|
}
|
|
name := m[1]
|
|
if len(name) < 3 || seen[name] || disallowed[strings.ToLower(name)] {
|
|
continue
|
|
}
|
|
if !strings.Contains(name, ".") {
|
|
r := rune(name[0])
|
|
if r >= 'a' && r <= 'z' {
|
|
// Skip local/internal-looking identifiers like validID.
|
|
continue
|
|
}
|
|
} else {
|
|
parts := strings.SplitN(name, ".", 2)
|
|
if len(parts) != 2 {
|
|
continue
|
|
}
|
|
if !pkgHints[strings.ToLower(parts[0])] {
|
|
// Keep calls scoped to the fetched package/module docs.
|
|
continue
|
|
}
|
|
}
|
|
if strings.Count(name, ".") > 2 {
|
|
continue
|
|
}
|
|
seen[name] = true
|
|
out = append(out, name)
|
|
if len(out) >= 8 {
|
|
return out
|
|
}
|
|
}
|
|
}
|
|
|
|
return out
|
|
}
|
|
|
|
func extractExample(docs []rankedDoc) string {
|
|
blockPattern := regexp.MustCompile("(?s)```[a-zA-Z0-9]*\\n(.*?)\\n```")
|
|
var fallback string
|
|
for _, rd := range docs {
|
|
matches := blockPattern.FindAllStringSubmatch(rd.doc.Content, -1)
|
|
for _, m := range matches {
|
|
if len(m) < 2 {
|
|
continue
|
|
}
|
|
example := strings.TrimSpace(m[1])
|
|
if example == "" {
|
|
continue
|
|
}
|
|
if fallback == "" {
|
|
fallback = example
|
|
}
|
|
// Prefer executable-looking snippets over plain signatures.
|
|
if strings.Contains(example, "\n") || strings.Contains(example, "=") || strings.Contains(example, "{") {
|
|
if len(example) > 500 {
|
|
example = example[:500]
|
|
}
|
|
return sanitizeSnippet(example)
|
|
}
|
|
}
|
|
}
|
|
if fallback != "" {
|
|
if len(fallback) > 500 {
|
|
fallback = fallback[:500]
|
|
}
|
|
return sanitizeSnippet(fallback)
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func buildAskSources(question string, docs []rankedDoc) []askSource {
|
|
tokens := tokenize(question)
|
|
out := make([]askSource, 0, len(docs))
|
|
for _, rd := range docs {
|
|
out = append(out, askSource{
|
|
DocID: rd.doc.ID,
|
|
Title: rd.doc.Title,
|
|
URL: rd.doc.URL,
|
|
Type: rd.doc.Type,
|
|
SearchTerm: rd.searchTerm,
|
|
Relevance: rd.score,
|
|
Snippet: extractSnippet(rd.doc.Content, tokens),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func computeConfidence(question string, docs []rankedDoc) float64 {
|
|
if len(docs) == 0 {
|
|
return 0
|
|
}
|
|
|
|
tokCount := len(tokenize(question))
|
|
if tokCount == 0 {
|
|
tokCount = 1
|
|
}
|
|
|
|
maxPossible := float64(tokCount) * 4.0
|
|
if maxPossible < 1 {
|
|
maxPossible = 1
|
|
}
|
|
|
|
conf := docs[0].score / maxPossible
|
|
if len(docs) > 1 && docs[0].score-docs[1].score < 0.5 {
|
|
conf *= 0.92
|
|
}
|
|
if conf < 0.1 {
|
|
conf = 0.1
|
|
}
|
|
if conf > 0.99 {
|
|
conf = 0.99
|
|
}
|
|
return conf
|
|
}
|
|
|
|
// askNonWordRun matches runs of characters that are not ASCII letters,
// digits, "_", "/", "." or "-". It is compiled once at package scope because
// tokenize is called repeatedly, including once per scored document.
var askNonWordRun = regexp.MustCompile(`[^a-zA-Z0-9_/.-]+`)

// tokenize splits text into lower-cased tokens. Every run of characters
// matched by askNonWordRun acts as a separator, and tokens shorter than two
// bytes are dropped. The result is never nil.
func tokenize(text string) []string {
	fields := strings.Fields(strings.ToLower(askNonWordRun.ReplaceAllString(text, " ")))
	tokens := make([]string, 0, len(fields))
	for _, field := range fields {
		if len(field) < 2 {
			continue
		}
		tokens = append(tokens, field)
	}
	return tokens
}
|
|
|
|
// extractSnippet returns a short excerpt of content around the first token
// that occurs in it (tried in the order given, matched case-insensitively).
//
// Whitespace in content is collapsed and only the first 7000 bytes are
// searched. When a token is found, the excerpt runs from just after the
// preceding ".", "!" or "?" (or up to 90 bytes before the match) to the next
// ".", "!" or "?" at or after the match (or up to 180 bytes after it), and is
// limited to 260 bytes. When no token is found, the first 220 bytes are
// returned. It returns "" for empty content.
//
// All cuts fall on UTF-8 character boundaries, and the search uses a
// lower-cased copy whose byte offsets match the original text, so offsets
// found in one are always valid in the other.
func extractSnippet(content string, tokens []string) string {
	if content == "" {
		return ""
	}
	flat := askClipAtRune(strings.Join(strings.Fields(content), " "), 7000)

	lower := askLowerKeepOffsets(flat)
	pos := -1
	for _, tok := range tokens {
		if idx := strings.Index(lower, tok); idx >= 0 {
			pos = idx
			break
		}
	}
	if pos < 0 {
		return sanitizeSnippet(askClipAtRune(flat, 220))
	}

	// Prefer sentence-ish boundaries around the match for cleaner snippets.
	start := strings.LastIndexAny(flat[:pos], ".!?")
	if start >= 0 {
		start++
	} else {
		start = pos - 90
		if start < 0 {
			start = 0
		}
		// Move forward to a character boundary.
		for start < pos && !utf8.RuneStart(flat[start]) {
			start++
		}
	}

	end := len(flat)
	if next := strings.IndexAny(flat[pos:], ".!?"); next >= 0 {
		end = pos + next + 1
	} else if pos+180 < len(flat) {
		end = pos + 180
		// Move back to a character boundary.
		for end > pos && !utf8.RuneStart(flat[end]) {
			end--
		}
	}

	snippet := strings.TrimSpace(flat[start:end])
	return sanitizeSnippet(askClipAtRune(snippet, 260))
}

// askClipAtRune shortens s to at most limit bytes, backing up to the start
// of a character when the limit falls inside a multi-byte sequence.
func askClipAtRune(s string, limit int) string {
	if len(s) <= limit {
		return s
	}
	for limit > 0 && !utf8.RuneStart(s[limit]) {
		limit--
	}
	return s[:limit]
}

// askLowerKeepOffsets lower-cases s while keeping every character at its
// original byte offset. A character whose lower-case form has a different
// encoded length, and any invalid byte, is copied through unchanged.
func askLowerKeepOffsets(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		lowered := unicode.ToLower(r)
		invalid := r == utf8.RuneError && size == 1
		if invalid || utf8.RuneLen(lowered) != size {
			b.WriteString(s[i : i+size])
		} else {
			b.WriteRune(lowered)
		}
		i += size
	}
	return b.String()
}

// sanitizeSnippet removes code-fence and backtick markers, replaces pilcrow
// signs and non-breaking spaces with spaces, and collapses all whitespace
// runs into single spaces with no leading or trailing space.
func sanitizeSnippet(s string) string {
	replacer := strings.NewReplacer("```", " ", "`", "", "¶", " ", "\u00a0", " ")
	s = replacer.Replace(s)
	s = strings.Join(strings.Fields(s), " ")
	return strings.TrimSpace(s)
}
|
|
|
|
func printAskText(resp askResponse) {
|
|
fmt.Fprintf(rootCmd.OutOrStdout(), "Query: %s\n", resp.Query)
|
|
fmt.Fprintf(rootCmd.OutOrStdout(), "Language: %s\n", resp.Language)
|
|
fmt.Fprintf(rootCmd.OutOrStdout(), "Searched terms: %s\n", strings.Join(resp.SearchedTerms, ", "))
|
|
fmt.Fprintf(rootCmd.OutOrStdout(), "Retrieval: %s (local hits: %d, fallback fetched: %d)\n", resp.Retrieval.Mode, resp.Retrieval.LocalHits, resp.Retrieval.FallbackFetched)
|
|
fmt.Fprintf(rootCmd.OutOrStdout(), "Confidence: %.2f\n\n", resp.Confidence)
|
|
fmt.Fprintln(rootCmd.OutOrStdout(), "Summary:")
|
|
fmt.Fprintln(rootCmd.OutOrStdout(), resp.Answer.Summary)
|
|
fmt.Fprintln(rootCmd.OutOrStdout())
|
|
|
|
if len(resp.Answer.RecommendedAPI) > 0 {
|
|
fmt.Fprintln(rootCmd.OutOrStdout(), "Recommended API:")
|
|
for _, api := range resp.Answer.RecommendedAPI {
|
|
fmt.Fprintf(rootCmd.OutOrStdout(), " - %s\n", api)
|
|
}
|
|
fmt.Fprintln(rootCmd.OutOrStdout())
|
|
}
|
|
|
|
if resp.Answer.Example != "" {
|
|
fmt.Fprintln(rootCmd.OutOrStdout(), "Example:")
|
|
fmt.Fprintln(rootCmd.OutOrStdout(), resp.Answer.Example)
|
|
fmt.Fprintln(rootCmd.OutOrStdout())
|
|
}
|
|
|
|
fmt.Fprintln(rootCmd.OutOrStdout(), "Sources:")
|
|
for _, src := range resp.Sources {
|
|
fmt.Fprintf(rootCmd.OutOrStdout(), "- [%s] %s (%s)\n %s\n", src.Type, src.Title, src.SearchTerm, src.URL)
|
|
}
|
|
}
|