// Mirror of https://github.com/Dvorinka/Devour.git
// Synced 2026-06-04 04:23:02 +00:00 - 552 lines, 13 KiB, Go.

package search
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io/fs"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/yourorg/devour/internal/config"
|
|
)
|
|
|
|
// Engine is a file-backed lexical search engine. It indexes the documents
// found under DocsDir and persists the index and its metadata as JSON files.
type Engine struct {
	// DocsDir is the root directory walked for indexable documents.
	DocsDir string
	// IndexDir is the directory that holds the persisted lexical index file.
	IndexDir string
	// MetadataDir is the directory that holds the index metadata file.
	MetadataDir string
	// SnippetLength is the snippet size limit handed to bestSnippet for each result.
	SnippetLength int
}
|
|
|
|
// SearchOptions tunes a single Search call.
type SearchOptions struct {
	// Limit caps the number of returned results; values <= 0 make Search use 5.
	Limit int
	// Threshold is the minimum score a document needs to be returned.
	// Values <= 0 disable the filter.
	Threshold float64
}
|
|
|
|
// Result is one search hit as returned by Engine.Search.
type Result struct {
	// ID is the identifier of the index entry (Rebuild derives it from the file path).
	ID string `json:"id"`
	// DocID is the document's own identifier, or a derived hash when it has none.
	DocID string `json:"doc_id"`
	// Title is the display title of the document.
	Title string `json:"title"`
	// URL is the document's URL, omitted from JSON when empty.
	URL string `json:"url,omitempty"`
	// Type is the document kind, e.g. "markdown", "json", "text" or "document".
	Type string `json:"type"`
	// Source names where the document came from, omitted from JSON when empty.
	Source string `json:"source,omitempty"`
	// Path is the file the document was indexed from.
	Path string `json:"path"`
	// Score is the lexical relevance score; higher is better.
	Score float64 `json:"score"`
	// Snippet is an excerpt of the document content.
	Snippet string `json:"snippet"`
	// Meta carries additional per-result data; Search stores the token length under "length".
	Meta map[string]any `json:"metadata,omitempty"`
}
|
|
|
|
// IndexStats summarises the state of the persisted lexical index.
type IndexStats struct {
	// Documents is the number of indexed documents.
	Documents int `json:"documents"`
	// Tokens is the total token count across indexed documents. Only Rebuild
	// fills it in; EnsureIndexed leaves it zero when no rebuild was needed.
	Tokens int `json:"tokens"`
	// LastIndexedAt is the time the index was built.
	LastIndexedAt time.Time `json:"last_indexed_at"`
	// IndexPath is the location of the persisted index file.
	IndexPath string `json:"index_path"`
	// MetadataPath is the location of the persisted metadata file.
	MetadataPath string `json:"metadata_path"`
	// SourceFileHash is the fingerprint of the document set the index was built from.
	SourceFileHash string `json:"source_file_hash"`
}
|
|
|
|
// indexedDoc is one document as stored in the persisted lexical index.
type indexedDoc struct {
	// ID identifies the index entry.
	ID string `json:"id"`
	// DocID is the document's own identifier, or a derived hash when it has none.
	DocID string `json:"doc_id"`
	// Title is the display title.
	Title string `json:"title"`
	// URL is the document's URL, omitted from JSON when empty.
	URL string `json:"url,omitempty"`
	// Type is the document kind.
	Type string `json:"type"`
	// Source names where the document came from, omitted from JSON when empty.
	Source string `json:"source,omitempty"`
	// Path is the file the document was read from.
	Path string `json:"path"`
	// Content is the document text used for snippets and phrase matching.
	Content string `json:"content"`
	// TermFreq maps each token to its number of occurrences in the document.
	TermFreq map[string]int `json:"term_freq"`
	// Length is the total number of tokens, i.e. the sum of TermFreq values.
	Length int `json:"length"`
}
|
|
|
|
type persistedIndex struct {
|
|
Version string `json:"version"`
|
|
BuiltAt time.Time `json:"built_at"`
|
|
Docs []indexedDoc `json:"docs"`
|
|
}
|
|
|
|
// persistedMeta is the on-disk JSON layout of the index metadata file. It is
// small enough to be read on every search to decide whether the index is stale.
type persistedMeta struct {
	// Version is the index format version the metadata was written with.
	Version string `json:"version"`
	// BuiltAt is the time the index was built.
	BuiltAt time.Time `json:"built_at"`
	// DocsDir is the documents directory the index was built from.
	DocsDir string `json:"docs_dir"`
	// SourceFileHash is the fingerprint of the document set at build time.
	SourceFileHash string `json:"source_file_hash"`
	// DocCount is the number of documents in the index.
	DocCount int `json:"doc_count"`
}
|
|
|
|
// rawDoc is a document as read from disk, before tokenisation. Structured
// JSON documents are decoded straight into it; other files are wrapped in it.
type rawDoc struct {
	// ID is the document's own identifier; it may be empty.
	ID string `json:"id"`
	// Source names where the document came from.
	Source string `json:"source"`
	// Type is the document kind.
	Type string `json:"type"`
	// Title is the document title; it may be empty.
	Title string `json:"title"`
	// Content is the document body.
	Content string `json:"content"`
	// URL is the document's URL, omitted from JSON when empty.
	URL string `json:"url,omitempty"`
	// Metadata carries arbitrary extra fields from a structured document.
	Metadata map[string]any `json:"metadata,omitempty"`
}
|
|
|
|
const (
	// indexFileName is the name of the persisted lexical index inside the index directory.
	indexFileName = "lexical_index.json"
	// metaFileName is the name of the index metadata file inside the metadata directory.
	metaFileName = "lexical_index_meta.json"
	// indexVersion is the format version stamped into both persisted files.
	indexVersion = "1"
)
|
|
|
|
func NewEngine(cfg *config.Config) *Engine {
|
|
snippetLength := cfg.Indexing.SnippetLength
|
|
if snippetLength <= 0 {
|
|
snippetLength = 220
|
|
}
|
|
return &Engine{
|
|
DocsDir: cfg.Storage.DocsDir,
|
|
IndexDir: cfg.Storage.IndexDir,
|
|
MetadataDir: cfg.Storage.MetadataDir,
|
|
SnippetLength: snippetLength,
|
|
}
|
|
}
|
|
|
|
func (e *Engine) Rebuild(ctx context.Context) (*IndexStats, error) {
|
|
if strings.TrimSpace(e.DocsDir) == "" {
|
|
return nil, fmt.Errorf("docs directory is required")
|
|
}
|
|
if err := os.MkdirAll(e.IndexDir, 0o755); err != nil {
|
|
return nil, fmt.Errorf("create index dir %q: %w", e.IndexDir, err)
|
|
}
|
|
if err := os.MkdirAll(e.MetadataDir, 0o755); err != nil {
|
|
return nil, fmt.Errorf("create metadata dir %q: %w", e.MetadataDir, err)
|
|
}
|
|
|
|
docFiles, sourceHash, err := e.listDocFiles()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("list docs for rebuild: %w", err)
|
|
}
|
|
|
|
docs := make([]indexedDoc, 0, len(docFiles))
|
|
parseErrors := make([]error, 0)
|
|
tokenCount := 0
|
|
for _, file := range docFiles {
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, ctx.Err()
|
|
default:
|
|
}
|
|
|
|
rd, err := parseDocFile(file)
|
|
if err != nil {
|
|
if len(parseErrors) < 20 {
|
|
parseErrors = append(parseErrors, fmt.Errorf("%s: %w", file, err))
|
|
}
|
|
continue
|
|
}
|
|
if strings.TrimSpace(rd.Content) == "" {
|
|
continue
|
|
}
|
|
|
|
id := rd.ID
|
|
if id == "" {
|
|
id = hashString(file + ":" + rd.Title)
|
|
}
|
|
termFreq := frequency(tokenize(strings.Join([]string{rd.Title, rd.Content, rd.URL, rd.Type}, " ")))
|
|
length := 0
|
|
for _, v := range termFreq {
|
|
length += v
|
|
}
|
|
tokenCount += length
|
|
|
|
docs = append(docs, indexedDoc{
|
|
ID: hashString(file),
|
|
DocID: id,
|
|
Title: bestTitle(rd.Title, file),
|
|
URL: strings.TrimSpace(rd.URL),
|
|
Type: defaultString(strings.TrimSpace(rd.Type), "document"),
|
|
Source: strings.TrimSpace(rd.Source),
|
|
Path: file,
|
|
Content: collapseWhitespace(rd.Content),
|
|
TermFreq: termFreq,
|
|
Length: length,
|
|
})
|
|
}
|
|
if len(parseErrors) > 0 {
|
|
log.Printf("search rebuild skipped %d files due to parse/read errors (sample: %v)", len(parseErrors), parseErrors[0])
|
|
if len(docFiles) > 0 && len(docs) == 0 {
|
|
return nil, fmt.Errorf("rebuild produced no indexable docs after parse failures: %w", errors.Join(parseErrors...))
|
|
}
|
|
}
|
|
|
|
index := persistedIndex{Version: indexVersion, BuiltAt: time.Now(), Docs: docs}
|
|
indexPath := filepath.Join(e.IndexDir, indexFileName)
|
|
if err := writeJSON(indexPath, index); err != nil {
|
|
return nil, fmt.Errorf("write lexical index: %w", err)
|
|
}
|
|
|
|
meta := persistedMeta{
|
|
Version: indexVersion,
|
|
BuiltAt: index.BuiltAt,
|
|
DocsDir: e.DocsDir,
|
|
SourceFileHash: sourceHash,
|
|
DocCount: len(docs),
|
|
}
|
|
metaPath := filepath.Join(e.MetadataDir, metaFileName)
|
|
if err := writeJSON(metaPath, meta); err != nil {
|
|
return nil, fmt.Errorf("write lexical metadata: %w", err)
|
|
}
|
|
|
|
return &IndexStats{
|
|
Documents: len(docs),
|
|
Tokens: tokenCount,
|
|
LastIndexedAt: index.BuiltAt,
|
|
IndexPath: indexPath,
|
|
MetadataPath: metaPath,
|
|
SourceFileHash: sourceHash,
|
|
}, nil
|
|
}
|
|
|
|
func (e *Engine) EnsureIndexed(ctx context.Context) (*IndexStats, error) {
|
|
metaPath := filepath.Join(e.MetadataDir, metaFileName)
|
|
b, err := os.ReadFile(metaPath)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return e.Rebuild(ctx)
|
|
}
|
|
return nil, fmt.Errorf("read index metadata %q: %w", metaPath, err)
|
|
}
|
|
|
|
var meta persistedMeta
|
|
if err := json.Unmarshal(b, &meta); err != nil {
|
|
stats, rebuildErr := e.Rebuild(ctx)
|
|
if rebuildErr != nil {
|
|
return nil, fmt.Errorf("rebuild after invalid metadata %q: %w", metaPath, rebuildErr)
|
|
}
|
|
return stats, nil
|
|
}
|
|
|
|
_, sourceHash, err := e.listDocFiles()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("list docs for metadata check: %w", err)
|
|
}
|
|
if sourceHash != meta.SourceFileHash {
|
|
stats, rebuildErr := e.Rebuild(ctx)
|
|
if rebuildErr != nil {
|
|
return nil, fmt.Errorf("rebuild after source hash change: %w", rebuildErr)
|
|
}
|
|
return stats, nil
|
|
}
|
|
|
|
return &IndexStats{
|
|
Documents: meta.DocCount,
|
|
LastIndexedAt: meta.BuiltAt,
|
|
IndexPath: filepath.Join(e.IndexDir, indexFileName),
|
|
MetadataPath: metaPath,
|
|
SourceFileHash: meta.SourceFileHash,
|
|
}, nil
|
|
}
|
|
|
|
func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) ([]Result, *IndexStats, error) {
|
|
query = strings.TrimSpace(query)
|
|
if query == "" {
|
|
return nil, nil, fmt.Errorf("query is required")
|
|
}
|
|
|
|
stats, err := e.EnsureIndexed(ctx)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("ensure lexical index: %w", err)
|
|
}
|
|
|
|
indexPath := filepath.Join(e.IndexDir, indexFileName)
|
|
b, err := os.ReadFile(indexPath)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("read lexical index %q: %w", indexPath, err)
|
|
}
|
|
|
|
var idx persistedIndex
|
|
if err := json.Unmarshal(b, &idx); err != nil {
|
|
return nil, nil, fmt.Errorf("decode lexical index %q: %w", indexPath, err)
|
|
}
|
|
|
|
limit := opts.Limit
|
|
if limit <= 0 {
|
|
limit = 5
|
|
}
|
|
threshold := opts.Threshold
|
|
if threshold < 0 {
|
|
threshold = 0
|
|
}
|
|
|
|
queryTokens := tokenize(query)
|
|
if len(queryTokens) == 0 {
|
|
return nil, stats, nil
|
|
}
|
|
qFreq := frequency(queryTokens)
|
|
|
|
type scored struct {
|
|
doc indexedDoc
|
|
score float64
|
|
}
|
|
matches := make([]scored, 0)
|
|
|
|
for _, doc := range idx.Docs {
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, nil, fmt.Errorf("search canceled: %w", ctx.Err())
|
|
default:
|
|
}
|
|
score := lexicalScore(qFreq, queryTokens, doc)
|
|
if score <= 0 {
|
|
continue
|
|
}
|
|
if threshold > 0 && score < threshold {
|
|
continue
|
|
}
|
|
matches = append(matches, scored{doc: doc, score: score})
|
|
}
|
|
|
|
sort.Slice(matches, func(i, j int) bool {
|
|
if matches[i].score == matches[j].score {
|
|
return matches[i].doc.Title < matches[j].doc.Title
|
|
}
|
|
return matches[i].score > matches[j].score
|
|
})
|
|
|
|
if limit > len(matches) {
|
|
limit = len(matches)
|
|
}
|
|
|
|
results := make([]Result, 0, limit)
|
|
for i := 0; i < limit; i++ {
|
|
d := matches[i].doc
|
|
results = append(results, Result{
|
|
ID: d.ID,
|
|
DocID: d.DocID,
|
|
Title: d.Title,
|
|
URL: d.URL,
|
|
Type: d.Type,
|
|
Source: d.Source,
|
|
Path: d.Path,
|
|
Score: matches[i].score,
|
|
Snippet: bestSnippet(d.Content, queryTokens, e.SnippetLength),
|
|
Meta: map[string]any{
|
|
"length": d.Length,
|
|
},
|
|
})
|
|
}
|
|
|
|
return results, stats, nil
|
|
}
|
|
|
|
func (e *Engine) listDocFiles() ([]string, string, error) {
|
|
files := make([]string, 0)
|
|
h := sha256.New()
|
|
|
|
err := filepath.WalkDir(e.DocsDir, func(path string, d fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return fmt.Errorf("walk docs entry %q: %w", path, err)
|
|
}
|
|
if d.IsDir() {
|
|
return nil
|
|
}
|
|
ext := strings.ToLower(filepath.Ext(path))
|
|
switch ext {
|
|
case ".json", ".md", ".txt":
|
|
default:
|
|
return nil
|
|
}
|
|
|
|
info, statErr := d.Info()
|
|
if statErr != nil {
|
|
return fmt.Errorf("stat docs file %q: %w", path, statErr)
|
|
}
|
|
files = append(files, path)
|
|
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return []string{}, hashString("empty"), nil
|
|
}
|
|
return nil, "", fmt.Errorf("walk docs dir %q: %w", e.DocsDir, err)
|
|
}
|
|
|
|
sort.Strings(files)
|
|
return files, hex.EncodeToString(h.Sum(nil)), nil
|
|
}
|
|
|
|
func parseDocFile(path string) (*rawDoc, error) {
|
|
b, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read doc file %q: %w", path, err)
|
|
}
|
|
ext := strings.ToLower(filepath.Ext(path))
|
|
switch ext {
|
|
case ".json":
|
|
var d rawDoc
|
|
if err := json.Unmarshal(b, &d); err == nil && (d.Title != "" || d.Content != "") {
|
|
return &d, nil
|
|
}
|
|
// Not a structured doc JSON, index as raw text fallback.
|
|
return &rawDoc{Title: filepath.Base(path), Content: string(b), Type: "json", Source: "local"}, nil
|
|
case ".md":
|
|
content := string(b)
|
|
title := markdownTitle(content)
|
|
return &rawDoc{Title: title, Content: content, Type: "markdown", Source: "local"}, nil
|
|
default:
|
|
return &rawDoc{Title: filepath.Base(path), Content: string(b), Type: "text", Source: "local"}, nil
|
|
}
|
|
}
|
|
|
|
// markdownTitle returns the text of the first non-empty heading line (a line
// starting with '#') in content, or "" when there is none.
//
// Lines inside fenced code blocks (``` or ~~~) are ignored, so that a
// "# comment" in a shell or Python example is not mistaken for the title.
func markdownTitle(content string) string {
	fence := "" // opening marker of the fenced block we are inside, if any
	for _, line := range strings.Split(content, "\n") {
		trim := strings.TrimSpace(line)

		if strings.HasPrefix(trim, "```") || strings.HasPrefix(trim, "~~~") {
			marker := trim[:3]
			switch {
			case fence == "":
				fence = marker
			case fence == marker:
				// Only the same kind of marker closes the block.
				fence = ""
			}
			continue
		}
		if fence != "" || !strings.HasPrefix(trim, "#") {
			continue
		}

		if heading := strings.TrimSpace(strings.TrimLeft(trim, "#")); heading != "" {
			return heading
		}
	}
	return ""
}
|
|
|
|
// writeJSON marshals v as indented JSON and stores it at path.
//
// The payload is written to a temporary file in the same directory and then
// renamed over path, so a reader never observes a partially written file and
// a failed write leaves any previous file untouched. The resulting file has
// mode 0644.
func writeJSON(path string, v any) error {
	b, err := json.MarshalIndent(v, "", " ")
	if err != nil {
		return fmt.Errorf("marshal json payload: %w", err)
	}

	// The temp file must live next to the target: rename is only atomic
	// within a single filesystem.
	tmp, err := os.CreateTemp(filepath.Dir(path), filepath.Base(path)+".tmp-*")
	if err != nil {
		return fmt.Errorf("write json file %q: %w", path, err)
	}
	tmpName := tmp.Name()
	committed := false
	defer func() {
		if !committed {
			// Best-effort cleanup; the original failure is what gets reported.
			_ = os.Remove(tmpName)
		}
	}()

	if _, err := tmp.Write(b); err != nil {
		_ = tmp.Close() // already failing; the write error is the useful one
		return fmt.Errorf("write json file %q: %w", path, err)
	}
	// CreateTemp creates the file with mode 0600; restore the intended mode.
	if err := tmp.Chmod(0o644); err != nil {
		_ = tmp.Close() // already failing; the chmod error is the useful one
		return fmt.Errorf("write json file %q: %w", path, err)
	}
	if err := tmp.Close(); err != nil {
		return fmt.Errorf("write json file %q: %w", path, err)
	}
	if err := os.Rename(tmpName, path); err != nil {
		return fmt.Errorf("write json file %q: %w", path, err)
	}
	committed = true
	return nil
}
|
|
|
|
// tokenSeparators turns the punctuation and control characters that delimit
// tokens into spaces. It is built once; a Replacer is safe for concurrent use.
var tokenSeparators = strings.NewReplacer(
	",", " ", ".", " ", "(", " ", ")", " ", "[", " ", "]", " ", "{", " ", "}", " ",
	":", " ", ";", " ", "!", " ", "?", " ", "\n", " ", "\r", " ", "\t", " ", "\"", " ", "'", " ", "`", " ",
)

// tokenize splits input into lower-cased, whitespace-separated tokens after
// replacing common punctuation with spaces. Tokens shorter than two bytes are
// dropped. The result is never nil.
func tokenize(input string) []string {
	parts := strings.Fields(strings.ToLower(tokenSeparators.Replace(input)))
	out := make([]string, 0, len(parts))
	for _, p := range parts {
		// Length is measured in bytes, so a single multi-byte character is kept.
		if len(p) < 2 {
			continue
		}
		out = append(out, p)
	}
	return out
}
|
|
|
|
// frequency counts how many times each token occurs in tokens. The returned
// map is never nil, even for an empty input.
func frequency(tokens []string) map[string]int {
	counts := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		counts[tok]++
	}
	return counts
}
|
|
|
|
func lexicalScore(qFreq map[string]int, queryTokens []string, doc indexedDoc) float64 {
|
|
if len(doc.TermFreq) == 0 {
|
|
return 0
|
|
}
|
|
|
|
titleLower := strings.ToLower(doc.Title)
|
|
urlLower := strings.ToLower(doc.URL)
|
|
contentLower := strings.ToLower(doc.Content)
|
|
|
|
score := 0.0
|
|
for token, qCount := range qFreq {
|
|
dCount := doc.TermFreq[token]
|
|
if dCount == 0 {
|
|
continue
|
|
}
|
|
part := float64(dCount*qCount) / float64(max(1, doc.Length))
|
|
score += part * 8.0
|
|
if strings.Contains(titleLower, token) {
|
|
score += 2.5
|
|
}
|
|
if strings.Contains(urlLower, token) {
|
|
score += 1.2
|
|
}
|
|
}
|
|
|
|
phrase := strings.Join(queryTokens, " ")
|
|
if len(queryTokens) > 1 && strings.Contains(contentLower, phrase) {
|
|
score += 1.5
|
|
}
|
|
return score
|
|
}
|
|
|
|
// bestSnippet returns an excerpt of content of roughly maxLen bytes, placed
// so that the first query token found in the text sits about a quarter of the
// way in. Whitespace runs are collapsed to single spaces first. Content that
// already fits is returned whole; a truncated excerpt ends in "...". A
// non-positive maxLen selects the default of 220.
//
// The excerpt always starts and ends on UTF-8 character boundaries, so it may
// differ from maxLen by a few bytes for non-ASCII text.
func bestSnippet(content string, queryTokens []string, maxLen int) string {
	if maxLen <= 0 {
		maxLen = 220
	}
	flat := strings.Join(strings.Fields(content), " ")
	if flat == "" {
		return ""
	}
	if len(flat) <= maxLen {
		return flat
	}

	lower := strings.ToLower(flat)
	start := 0
	for _, tok := range queryTokens {
		idx := strings.Index(lower, tok)
		if idx < 0 {
			continue
		}
		// Lower-casing can change the encoded size of some characters, in
		// which case byte offsets in lower do not line up with flat. The
		// number of characters is preserved, so translate through that.
		if len(lower) != len(flat) {
			idx = snippetRuneOffset(flat, snippetRuneCount(lower[:idx]))
		}
		start = idx - maxLen/4
		if start < 0 {
			start = 0
		}
		break
	}

	// Never begin in the middle of a multi-byte character.
	for start > 0 && start < len(flat) && !snippetRuneStart(flat[start]) {
		start--
	}

	end := start + maxLen
	if end >= len(flat) {
		end = len(flat)
	} else {
		// Never end in the middle of a multi-byte character either.
		for end > start && !snippetRuneStart(flat[end]) {
			end--
		}
		if end == start {
			// maxLen is smaller than the first character; keep it whole.
			end = start + 1
			for end < len(flat) && !snippetRuneStart(flat[end]) {
				end++
			}
		}
	}

	snippet := strings.TrimSpace(flat[start:end])
	if end < len(flat) {
		snippet += "..."
	}
	return snippet
}

// snippetRuneStart reports whether b can be the first byte of a UTF-8
// encoded character, i.e. it is not a continuation byte.
func snippetRuneStart(b byte) bool {
	return b&0xC0 != 0x80
}

// snippetRuneCount returns the number of characters in s.
func snippetRuneCount(s string) int {
	n := 0
	for range s {
		n++
	}
	return n
}

// snippetRuneOffset returns the byte offset at which the n-th character
// (0-based) of s begins, or len(s) when s has fewer characters.
func snippetRuneOffset(s string, n int) int {
	for i := range s {
		if n == 0 {
			return i
		}
		n--
	}
	return len(s)
}
|
|
|
|
// collapseWhitespace replaces every run of whitespace in s with a single
// space and removes leading and trailing whitespace.
func collapseWhitespace(s string) string {
	// Fields already discards leading and trailing whitespace.
	return strings.Join(strings.Fields(s), " ")
}
|
|
|
|
// bestTitle returns the trimmed title when it is non-blank. Otherwise it
// derives a title from the file name in path by dropping the extension and
// turning underscores into spaces, falling back to "Documentation" when
// nothing usable remains.
func bestTitle(title, path string) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}

	name := filepath.Base(path)
	stem := strings.TrimSuffix(name, filepath.Ext(name))
	derived := strings.TrimSpace(strings.ReplaceAll(stem, "_", " "))
	if derived == "" {
		return "Documentation"
	}
	return derived
}
|
|
|
|
// defaultString returns v unchanged unless it is empty or only whitespace,
// in which case it returns fallback.
func defaultString(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
|
|
|
|
// hashString returns a short identifier for s: the first 12 bytes of its
// SHA-256 digest, hex-encoded (24 characters).
func hashString(s string) string {
	digest := sha256.Sum256([]byte(s))
	return hex.EncodeToString(digest[:12])
}
|
|
|
|
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|