package search import ( "context" "crypto/sha256" "encoding/hex" "encoding/json" "fmt" "io/fs" "os" "path/filepath" "sort" "strings" "time" "github.com/yourorg/devour/internal/config" ) type Engine struct { DocsDir string IndexDir string MetadataDir string SnippetLength int } type SearchOptions struct { Limit int Threshold float64 } type Result struct { ID string `json:"id"` DocID string `json:"doc_id"` Title string `json:"title"` URL string `json:"url,omitempty"` Type string `json:"type"` Source string `json:"source,omitempty"` Path string `json:"path"` Score float64 `json:"score"` Snippet string `json:"snippet"` Meta map[string]any `json:"metadata,omitempty"` } type IndexStats struct { Documents int `json:"documents"` Tokens int `json:"tokens"` LastIndexedAt time.Time `json:"last_indexed_at"` IndexPath string `json:"index_path"` MetadataPath string `json:"metadata_path"` SourceFileHash string `json:"source_file_hash"` } type indexedDoc struct { ID string `json:"id"` DocID string `json:"doc_id"` Title string `json:"title"` URL string `json:"url,omitempty"` Type string `json:"type"` Source string `json:"source,omitempty"` Path string `json:"path"` Content string `json:"content"` TermFreq map[string]int `json:"term_freq"` Length int `json:"length"` } type persistedIndex struct { Version string `json:"version"` BuiltAt time.Time `json:"built_at"` Docs []indexedDoc `json:"docs"` } type persistedMeta struct { Version string `json:"version"` BuiltAt time.Time `json:"built_at"` DocsDir string `json:"docs_dir"` SourceFileHash string `json:"source_file_hash"` DocCount int `json:"doc_count"` } type rawDoc struct { ID string `json:"id"` Source string `json:"source"` Type string `json:"type"` Title string `json:"title"` Content string `json:"content"` URL string `json:"url,omitempty"` Metadata map[string]any `json:"metadata,omitempty"` } const ( indexFileName = "lexical_index.json" metaFileName = "lexical_index_meta.json" indexVersion = "1" ) func NewEngine(cfg *config.Config) *Engine { snippetLength := cfg.Indexing.SnippetLength if snippetLength <= 0 { snippetLength = 220 } return &Engine{ DocsDir: cfg.Storage.DocsDir, IndexDir: cfg.Storage.IndexDir, MetadataDir: cfg.Storage.MetadataDir, SnippetLength: snippetLength, } } func (e *Engine) Rebuild(ctx context.Context) (*IndexStats, error) { if strings.TrimSpace(e.DocsDir) == "" { return nil, fmt.Errorf("docs directory is required") } if err := os.MkdirAll(e.IndexDir, 0o755); err != nil { return nil, err } if err := os.MkdirAll(e.MetadataDir, 0o755); err != nil { return nil, err } docFiles, sourceHash, err := e.listDocFiles() if err != nil { return nil, err } docs := make([]indexedDoc, 0, len(docFiles)) tokenCount := 0 for _, file := range docFiles { select { case <-ctx.Done(): return nil, ctx.Err() default: } rd, err := parseDocFile(file) if err != nil { continue } if strings.TrimSpace(rd.Content) == "" { continue } id := rd.ID if id == "" { id = hashString(file + ":" + rd.Title) } termFreq := frequency(tokenize(strings.Join([]string{rd.Title, rd.Content, rd.URL, rd.Type}, " "))) length := 0 for _, v := range termFreq { length += v } tokenCount += length docs = append(docs, indexedDoc{ ID: hashString(file), DocID: id, Title: bestTitle(rd.Title, file), URL: strings.TrimSpace(rd.URL), Type: defaultString(strings.TrimSpace(rd.Type), "document"), Source: strings.TrimSpace(rd.Source), Path: file, Content: collapseWhitespace(rd.Content), TermFreq: termFreq, Length: length, }) } index := persistedIndex{Version: indexVersion, BuiltAt: time.Now(), Docs: docs} indexPath := filepath.Join(e.IndexDir, indexFileName) if err := writeJSON(indexPath, index); err != nil { return nil, err } meta := persistedMeta{ Version: indexVersion, BuiltAt: index.BuiltAt, DocsDir: e.DocsDir, SourceFileHash: sourceHash, DocCount: len(docs), } metaPath := filepath.Join(e.MetadataDir, metaFileName) if err := writeJSON(metaPath, meta); err != nil { return nil, err } return &IndexStats{ Documents: len(docs), Tokens: tokenCount, LastIndexedAt: index.BuiltAt, IndexPath: indexPath, MetadataPath: metaPath, SourceFileHash: sourceHash, }, nil } func (e *Engine) EnsureIndexed(ctx context.Context) (*IndexStats, error) { metaPath := filepath.Join(e.MetadataDir, metaFileName) b, err := os.ReadFile(metaPath) if err != nil { if os.IsNotExist(err) { return e.Rebuild(ctx) } return nil, err } var meta persistedMeta if err := json.Unmarshal(b, &meta); err != nil { return e.Rebuild(ctx) } _, sourceHash, err := e.listDocFiles() if err != nil { return nil, err } if sourceHash != meta.SourceFileHash { return e.Rebuild(ctx) } return &IndexStats{ Documents: meta.DocCount, LastIndexedAt: meta.BuiltAt, IndexPath: filepath.Join(e.IndexDir, indexFileName), MetadataPath: metaPath, SourceFileHash: meta.SourceFileHash, }, nil } func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) ([]Result, *IndexStats, error) { query = strings.TrimSpace(query) if query == "" { return nil, nil, fmt.Errorf("query is required") } stats, err := e.EnsureIndexed(ctx) if err != nil { return nil, nil, err } indexPath := filepath.Join(e.IndexDir, indexFileName) b, err := os.ReadFile(indexPath) if err != nil { return nil, nil, err } var idx persistedIndex if err := json.Unmarshal(b, &idx); err != nil { return nil, nil, err } limit := opts.Limit if limit <= 0 { limit = 5 } threshold := opts.Threshold if threshold < 0 { threshold = 0 } queryTokens := tokenize(query) if len(queryTokens) == 0 { return nil, stats, nil } qFreq := frequency(queryTokens) type scored struct { doc indexedDoc score float64 } matches := make([]scored, 0) for _, doc := range idx.Docs { select { case <-ctx.Done(): return nil, nil, ctx.Err() default: } score := lexicalScore(qFreq, queryTokens, doc) if score <= 0 { continue } if threshold > 0 && score < threshold { continue } matches = append(matches, scored{doc: doc, score: score}) } sort.Slice(matches, func(i, j int) bool { if matches[i].score == matches[j].score { return matches[i].doc.Title < matches[j].doc.Title } return matches[i].score > matches[j].score }) if limit > len(matches) { limit = len(matches) } results := make([]Result, 0, limit) for i := 0; i < limit; i++ { d := matches[i].doc results = append(results, Result{ ID: d.ID, DocID: d.DocID, Title: d.Title, URL: d.URL, Type: d.Type, Source: d.Source, Path: d.Path, Score: matches[i].score, Snippet: bestSnippet(d.Content, queryTokens, e.SnippetLength), Meta: map[string]any{ "length": d.Length, }, }) } return results, stats, nil } func (e *Engine) listDocFiles() ([]string, string, error) { files := make([]string, 0) h := sha256.New() err := filepath.WalkDir(e.DocsDir, func(path string, d fs.DirEntry, err error) error { if err != nil { return err } if d.IsDir() { return nil } ext := strings.ToLower(filepath.Ext(path)) switch ext { case ".json", ".md", ".txt": default: return nil } info, statErr := d.Info() if statErr != nil { return statErr } files = append(files, path) fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano()) return nil }) if err != nil { if os.IsNotExist(err) { return []string{}, hashString("empty"), nil } return nil, "", err } sort.Strings(files) return files, hex.EncodeToString(h.Sum(nil)), nil } func parseDocFile(path string) (*rawDoc, error) { b, err := os.ReadFile(path) if err != nil { return nil, err } ext := strings.ToLower(filepath.Ext(path)) switch ext { case ".json": var d rawDoc if err := json.Unmarshal(b, &d); err == nil && (d.Title != "" || d.Content != "") { return &d, nil } // Not a structured doc JSON, index as raw text fallback. return &rawDoc{Title: filepath.Base(path), Content: string(b), Type: "json", Source: "local"}, nil case ".md": content := string(b) title := markdownTitle(content) return &rawDoc{Title: title, Content: content, Type: "markdown", Source: "local"}, nil default: return &rawDoc{Title: filepath.Base(path), Content: string(b), Type: "text", Source: "local"}, nil } } func markdownTitle(content string) string { for _, line := range strings.Split(content, "\n") { trim := strings.TrimSpace(line) if strings.HasPrefix(trim, "#") { trim = strings.TrimLeft(trim, "#") trim = strings.TrimSpace(trim) if trim != "" { return trim } } } return "" } func writeJSON(path string, v any) error { b, err := json.MarshalIndent(v, "", " ") if err != nil { return err } return os.WriteFile(path, b, 0o644) } func tokenize(input string) []string { replacer := strings.NewReplacer(",", " ", ".", " ", "(", " ", ")", " ", "[", " ", "]", " ", "{", " ", "}", " ", ":", " ", ";", " ", "!", " ", "?", " ", "\n", " ", "\r", " ", "\t", " ", "\"", " ", "'", " ", "`", " ", ) clean := strings.ToLower(replacer.Replace(input)) parts := strings.Fields(clean) out := make([]string, 0, len(parts)) for _, p := range parts { p = strings.TrimSpace(p) if len(p) < 2 { continue } out = append(out, p) } return out } func frequency(tokens []string) map[string]int { m := make(map[string]int, len(tokens)) for _, t := range tokens { m[t]++ } return m } func lexicalScore(qFreq map[string]int, queryTokens []string, doc indexedDoc) float64 { if len(doc.TermFreq) == 0 { return 0 } titleLower := strings.ToLower(doc.Title) urlLower := strings.ToLower(doc.URL) contentLower := strings.ToLower(doc.Content) score := 0.0 for token, qCount := range qFreq { dCount := doc.TermFreq[token] if dCount == 0 { continue } part := float64(dCount*qCount) / float64(max(1, doc.Length)) score += part * 8.0 if strings.Contains(titleLower, token) { score += 2.5 } if strings.Contains(urlLower, token) { score += 1.2 } } phrase := strings.Join(queryTokens, " ") if len(queryTokens) > 1 && strings.Contains(contentLower, phrase) { score += 1.5 } return score } func bestSnippet(content string, queryTokens []string, maxLen int) string { if maxLen <= 0 { maxLen = 220 } flat := collapseWhitespace(content) if flat == "" { return "" } if len(flat) <= maxLen { return flat } lower := strings.ToLower(flat) start := 0 for _, tok := range queryTokens { if idx := strings.Index(lower, tok); idx >= 0 { start = idx - (maxLen / 4) if start < 0 { start = 0 } break } } end := start + maxLen if end > len(flat) { end = len(flat) } snippet := strings.TrimSpace(flat[start:end]) if end < len(flat) { snippet += "..." } return snippet } func collapseWhitespace(s string) string { return strings.Join(strings.Fields(strings.TrimSpace(s)), " ") } func bestTitle(title, path string) string { title = strings.TrimSpace(title) if title != "" { return title } base := filepath.Base(path) base = strings.TrimSuffix(base, filepath.Ext(base)) base = strings.ReplaceAll(base, "_", " ") base = strings.TrimSpace(base) if base == "" { return "Documentation" } return base } func defaultString(v, fallback string) string { if strings.TrimSpace(v) == "" { return fallback } return v } func hashString(s string) string { sum := sha256.Sum256([]byte(s)) return hex.EncodeToString(sum[:12]) } func max(a, b int) int { if a > b { return a } return b }