This commit is contained in:
Tomas Dvorak
2026-02-24 10:33:59 +01:00
parent 409acd2e08
commit 898a3c303f
1374 changed files with 290409 additions and 29187 deletions
+528
View File
@@ -0,0 +1,528 @@
package search
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io/fs"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/yourorg/devour/internal/config"
)
// Engine is a file-backed lexical search engine. It indexes the documents
// found under DocsDir into a JSON index file and answers queries from it.
type Engine struct {
// DocsDir is the root directory that is walked for .json, .md and .txt files.
DocsDir string
// IndexDir is the directory that holds the persisted index (lexical_index.json).
IndexDir string
// MetadataDir is the directory that holds the index metadata (lexical_index_meta.json).
MetadataDir string
// SnippetLength is the maximum snippet length handed to bestSnippet;
// NewEngine substitutes 220 when the configured value is not positive.
SnippetLength int
}
// SearchOptions tunes a single Search call.
type SearchOptions struct {
// Limit caps the number of returned results; values <= 0 fall back to 5.
Limit int
// Threshold drops results scoring below it. It only takes effect when
// positive; negative values are treated as 0 (no filtering).
Threshold float64
}
// Result is one ranked hit returned by Search. All descriptive fields are
// copied from the matching index entry.
type Result struct {
// ID is the index entry's ID (a hash of the document's file path).
ID string `json:"id"`
// DocID is the document's own ID, or a derived hash when it had none.
DocID string `json:"doc_id"`
// Title is the display title of the document.
Title string `json:"title"`
// URL is the document's source URL, if any.
URL string `json:"url,omitempty"`
// Type is the document type (for example "markdown", "text", "json").
Type string `json:"type"`
// Source names where the document came from, if known.
Source string `json:"source,omitempty"`
// Path is the file path the document was indexed from.
Path string `json:"path"`
// Score is the lexical relevance score; higher is better.
Score float64 `json:"score"`
// Snippet is an excerpt of the content produced by bestSnippet.
Snippet string `json:"snippet"`
// Meta carries extra per-result data; Search currently sets "length"
// (the indexed token count of the document).
Meta map[string]any `json:"metadata,omitempty"`
}
// IndexStats summarises the state of the persisted index.
type IndexStats struct {
// Documents is the number of documents in the index.
Documents int `json:"documents"`
// Tokens is the total token count across all indexed documents. It is
// filled in by Rebuild; stats derived from stored metadata leave it at 0.
Tokens int `json:"tokens"`
// LastIndexedAt is the time the index was built.
LastIndexedAt time.Time `json:"last_indexed_at"`
// IndexPath is the location of the index file.
IndexPath string `json:"index_path"`
// MetadataPath is the location of the metadata file.
MetadataPath string `json:"metadata_path"`
// SourceFileHash is the fingerprint of the document files (path, size and
// modification time of each) the index was built from.
SourceFileHash string `json:"source_file_hash"`
}
// indexedDoc is one document as stored in the persisted index.
type indexedDoc struct {
// ID is a hash of the document's file path.
ID string `json:"id"`
// DocID is the ID declared by the document, or a hash of path and title
// when the document declares none.
DocID string `json:"doc_id"`
// Title is the document title, falling back to a name derived from the file.
Title string `json:"title"`
// URL is the trimmed source URL, if any.
URL string `json:"url,omitempty"`
// Type is the document type; "document" when the source gave none.
Type string `json:"type"`
// Source is the trimmed source label, if any.
Source string `json:"source,omitempty"`
// Path is the file the document was read from.
Path string `json:"path"`
// Content is the document text with whitespace runs collapsed to single spaces.
Content string `json:"content"`
// TermFreq maps each token of title, content, URL and type to its count.
TermFreq map[string]int `json:"term_freq"`
// Length is the sum of all counts in TermFreq.
Length int `json:"length"`
}
// persistedIndex is the on-disk layout of the index file.
type persistedIndex struct {
// Version is the index format version it was written with.
Version string `json:"version"`
// BuiltAt is the time the index was built.
BuiltAt time.Time `json:"built_at"`
// Docs holds every indexed document.
Docs []indexedDoc `json:"docs"`
}
// persistedMeta is the on-disk layout of the metadata file that sits next to
// the index and is used to decide whether the index is still current.
type persistedMeta struct {
// Version is the index format version it was written with.
Version string `json:"version"`
// BuiltAt is the build time of the matching index.
BuiltAt time.Time `json:"built_at"`
// DocsDir is the documents directory the index was built from.
DocsDir string `json:"docs_dir"`
// SourceFileHash is the fingerprint of the document files at build time.
SourceFileHash string `json:"source_file_hash"`
// DocCount is the number of documents in the index.
DocCount int `json:"doc_count"`
}
// rawDoc is a document as read from disk, before indexing. It is both the
// expected shape of a structured .json document and the in-memory form that
// parseDocFile produces for markdown and plain-text files.
type rawDoc struct {
// ID is the document's own identifier; may be empty.
ID string `json:"id"`
// Source names where the document came from ("local" for raw files).
Source string `json:"source"`
// Type is the document type ("json", "markdown", "text" for raw files).
Type string `json:"type"`
// Title is the document title; may be empty.
Title string `json:"title"`
// Content is the document text.
Content string `json:"content"`
// URL is the document's source URL, if any.
URL string `json:"url,omitempty"`
// Metadata is free-form extra data from a structured document. It is
// decoded but not consumed by the indexing code in this file.
Metadata map[string]any `json:"metadata,omitempty"`
}
// File names and format version of the persisted lexical index.
const (
// indexFileName is the index file created inside Engine.IndexDir.
indexFileName = "lexical_index.json"
// metaFileName is the metadata file created inside Engine.MetadataDir.
metaFileName = "lexical_index_meta.json"
// indexVersion is the format version stamped into both persisted files.
indexVersion = "1"
)
// NewEngine creates an Engine from the storage and indexing sections of cfg.
// A non-positive configured snippet length is replaced by the default of 220.
func NewEngine(cfg *config.Config) *Engine {
	engine := &Engine{
		DocsDir:       cfg.Storage.DocsDir,
		IndexDir:      cfg.Storage.IndexDir,
		MetadataDir:   cfg.Storage.MetadataDir,
		SnippetLength: cfg.Indexing.SnippetLength,
	}
	if engine.SnippetLength <= 0 {
		engine.SnippetLength = 220
	}
	return engine
}
// Rebuild re-indexes every document under DocsDir and writes a fresh index
// file and metadata file, creating IndexDir and MetadataDir if needed.
//
// Files that cannot be parsed or whose content is blank are skipped. The
// context is checked before each file; on cancellation its error is returned
// and nothing is written. On success the stats of the new index are returned.
func (e *Engine) Rebuild(ctx context.Context) (*IndexStats, error) {
	if strings.TrimSpace(e.DocsDir) == "" {
		return nil, fmt.Errorf("docs directory is required")
	}
	for _, dir := range []string{e.IndexDir, e.MetadataDir} {
		if err := os.MkdirAll(dir, 0o755); err != nil {
			return nil, err
		}
	}

	paths, sourceHash, err := e.listDocFiles()
	if err != nil {
		return nil, err
	}

	var totalTokens int
	indexed := make([]indexedDoc, 0, len(paths))
	for _, p := range paths {
		if err := ctx.Err(); err != nil {
			return nil, err
		}

		raw, parseErr := parseDocFile(p)
		if parseErr != nil || strings.TrimSpace(raw.Content) == "" {
			continue
		}

		// Prefer the document's own ID; otherwise derive a stable one.
		docID := raw.ID
		if docID == "" {
			docID = hashString(p + ":" + raw.Title)
		}

		// Every token is counted exactly once in the frequency table, so the
		// document length equals the number of tokens.
		tokens := tokenize(raw.Title + " " + raw.Content + " " + raw.URL + " " + raw.Type)
		totalTokens += len(tokens)

		indexed = append(indexed, indexedDoc{
			ID:       hashString(p),
			DocID:    docID,
			Title:    bestTitle(raw.Title, p),
			URL:      strings.TrimSpace(raw.URL),
			Type:     defaultString(strings.TrimSpace(raw.Type), "document"),
			Source:   strings.TrimSpace(raw.Source),
			Path:     p,
			Content:  collapseWhitespace(raw.Content),
			TermFreq: frequency(tokens),
			Length:   len(tokens),
		})
	}

	builtAt := time.Now()

	indexPath := filepath.Join(e.IndexDir, indexFileName)
	if err := writeJSON(indexPath, persistedIndex{Version: indexVersion, BuiltAt: builtAt, Docs: indexed}); err != nil {
		return nil, err
	}

	metaPath := filepath.Join(e.MetadataDir, metaFileName)
	if err := writeJSON(metaPath, persistedMeta{
		Version:        indexVersion,
		BuiltAt:        builtAt,
		DocsDir:        e.DocsDir,
		SourceFileHash: sourceHash,
		DocCount:       len(indexed),
	}); err != nil {
		return nil, err
	}

	stats := &IndexStats{
		Documents:      len(indexed),
		Tokens:         totalTokens,
		LastIndexedAt:  builtAt,
		IndexPath:      indexPath,
		MetadataPath:   metaPath,
		SourceFileHash: sourceHash,
	}
	return stats, nil
}
// EnsureIndexed makes sure a usable, up-to-date index exists and returns its
// stats, rebuilding the index when necessary.
//
// A rebuild is triggered when the metadata file is missing or unreadable as
// JSON, when it was written with a different index format version, when the
// index file itself is missing, or when the fingerprint of the document files
// no longer matches the one recorded at build time. Other I/O errors are
// returned as-is. When the existing index is reused, the returned stats come
// from the stored metadata and do not include a token count.
func (e *Engine) EnsureIndexed(ctx context.Context) (*IndexStats, error) {
	metaPath := filepath.Join(e.MetadataDir, metaFileName)
	indexPath := filepath.Join(e.IndexDir, indexFileName)

	b, err := os.ReadFile(metaPath)
	if err != nil {
		if os.IsNotExist(err) {
			return e.Rebuild(ctx)
		}
		return nil, err
	}
	var meta persistedMeta
	if err := json.Unmarshal(b, &meta); err != nil {
		// Corrupt metadata: the safest recovery is a full rebuild.
		return e.Rebuild(ctx)
	}
	if meta.Version != indexVersion {
		// Index was written in another format; do not trust its layout.
		return e.Rebuild(ctx)
	}
	// The metadata alone is not enough: without the index file every search
	// would fail until the documents happen to change.
	if _, err := os.Stat(indexPath); err != nil {
		if os.IsNotExist(err) {
			return e.Rebuild(ctx)
		}
		return nil, err
	}

	_, sourceHash, err := e.listDocFiles()
	if err != nil {
		return nil, err
	}
	if sourceHash != meta.SourceFileHash {
		return e.Rebuild(ctx)
	}
	return &IndexStats{
		Documents:      meta.DocCount,
		LastIndexedAt:  meta.BuiltAt,
		IndexPath:      indexPath,
		MetadataPath:   metaPath,
		SourceFileHash: meta.SourceFileHash,
	}, nil
}
// Search scores every indexed document against query and returns the best
// matches together with the stats of the index that was searched.
//
// The index is brought up to date first (see EnsureIndexed) and then loaded
// from disk. Documents with a non-positive score, or a score below a positive
// opts.Threshold, are dropped. Results are ordered by descending score, ties
// by ascending title, and truncated to opts.Limit (5 when not positive).
// A blank query is an error; a query that yields no tokens returns no results.
func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) ([]Result, *IndexStats, error) {
	query = strings.TrimSpace(query)
	if query == "" {
		return nil, nil, fmt.Errorf("query is required")
	}

	stats, err := e.EnsureIndexed(ctx)
	if err != nil {
		return nil, nil, err
	}

	raw, err := os.ReadFile(filepath.Join(e.IndexDir, indexFileName))
	if err != nil {
		return nil, nil, err
	}
	var index persistedIndex
	if err := json.Unmarshal(raw, &index); err != nil {
		return nil, nil, err
	}

	maxResults := opts.Limit
	if maxResults <= 0 {
		maxResults = 5
	}
	minScore := opts.Threshold
	if minScore < 0 {
		minScore = 0
	}

	terms := tokenize(query)
	if len(terms) == 0 {
		return nil, stats, nil
	}
	termCounts := frequency(terms)

	type hit struct {
		doc   indexedDoc
		score float64
	}
	hits := make([]hit, 0)
	for i := range index.Docs {
		if err := ctx.Err(); err != nil {
			return nil, nil, err
		}
		s := lexicalScore(termCounts, terms, index.Docs[i])
		if s <= 0 || (minScore > 0 && s < minScore) {
			continue
		}
		hits = append(hits, hit{doc: index.Docs[i], score: s})
	}

	// Best score first; equal scores fall back to alphabetical title order.
	sort.Slice(hits, func(a, b int) bool {
		if hits[a].score != hits[b].score {
			return hits[a].score > hits[b].score
		}
		return hits[a].doc.Title < hits[b].doc.Title
	})

	if maxResults > len(hits) {
		maxResults = len(hits)
	}
	results := make([]Result, 0, maxResults)
	for _, h := range hits[:maxResults] {
		results = append(results, Result{
			ID:      h.doc.ID,
			DocID:   h.doc.DocID,
			Title:   h.doc.Title,
			URL:     h.doc.URL,
			Type:    h.doc.Type,
			Source:  h.doc.Source,
			Path:    h.doc.Path,
			Score:   h.score,
			Snippet: bestSnippet(h.doc.Content, terms, e.SnippetLength),
			Meta: map[string]any{
				"length": h.doc.Length,
			},
		})
	}
	return results, stats, nil
}
// listDocFiles walks DocsDir and returns the sorted paths of all .json, .md
// and .txt files (extension match is case-insensitive) together with a
// fingerprint of that file set.
//
// The fingerprint is a SHA-256 over each file's path, size and modification
// time in walk order, so it changes whenever a document is added, removed or
// touched. A missing DocsDir is not an error: it yields an empty list and a
// fixed placeholder fingerprint.
func (e *Engine) listDocFiles() ([]string, string, error) {
	var (
		found  = make([]string, 0)
		digest = sha256.New()
	)

	visit := func(path string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			return walkErr
		case entry.IsDir():
			return nil
		}

		ext := strings.ToLower(filepath.Ext(path))
		if ext != ".json" && ext != ".md" && ext != ".txt" {
			return nil
		}

		info, err := entry.Info()
		if err != nil {
			return err
		}
		found = append(found, path)
		fmt.Fprintf(digest, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
		return nil
	}

	if err := filepath.WalkDir(e.DocsDir, visit); err != nil {
		if os.IsNotExist(err) {
			return []string{}, hashString("empty"), nil
		}
		return nil, "", err
	}

	sort.Strings(found)
	return found, hex.EncodeToString(digest.Sum(nil)), nil
}
// parseDocFile reads the file at path and converts it into a rawDoc based on
// its (case-insensitive) extension:
//
//   - .json: decoded as a structured document when it parses and carries a
//     title or content; otherwise the raw bytes are indexed as text.
//   - .md: the whole file is the content, the title comes from its first heading.
//   - anything else: the whole file is the content, the file name is the title.
//
// Only a failure to read the file is reported as an error.
func parseDocFile(path string) (*rawDoc, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	text := string(data)
	ext := strings.ToLower(filepath.Ext(path))

	if ext == ".md" {
		return &rawDoc{Title: markdownTitle(text), Content: text, Type: "markdown", Source: "local"}, nil
	}

	if ext == ".json" {
		var parsed rawDoc
		decodeErr := json.Unmarshal(data, &parsed)
		if decodeErr == nil && !(parsed.Title == "" && parsed.Content == "") {
			return &parsed, nil
		}
		// Not a structured document: fall back to indexing the raw JSON text.
		return &rawDoc{Title: filepath.Base(path), Content: text, Type: "json", Source: "local"}, nil
	}

	return &rawDoc{Title: filepath.Base(path), Content: text, Type: "text", Source: "local"}, nil
}
// markdownTitle returns the text of the first heading line ("#", "##", ...)
// in content, with the leading "#" characters and surrounding whitespace
// removed. Heading markers with no text are skipped. It returns "" when no
// heading is found.
//
// Lines inside fenced code blocks (delimited by ``` or ~~~) are ignored so
// that comment lines in code samples, such as "# install deps" in a shell
// snippet, are not mistaken for the document title. A fence is only closed by
// the same marker that opened it; an unclosed fence extends to the end.
func markdownTitle(content string) string {
	fence := "" // marker of the currently open code fence, "" when outside
	for _, line := range strings.Split(content, "\n") {
		trim := strings.TrimSpace(line)

		if fence != "" {
			if strings.HasPrefix(trim, fence) {
				fence = ""
			}
			continue
		}
		if strings.HasPrefix(trim, "```") || strings.HasPrefix(trim, "~~~") {
			fence = trim[:3]
			continue
		}

		if !strings.HasPrefix(trim, "#") {
			continue
		}
		if title := strings.TrimSpace(strings.TrimLeft(trim, "#")); title != "" {
			return title
		}
	}
	return ""
}
// writeJSON encodes v as indented JSON and stores it at path with mode 0644.
//
// The data is first written to a temporary file in the same directory and
// then renamed over path, so readers never observe a partially written file
// and a failed write leaves any previous file at path untouched. The
// temporary file is removed on every failure path.
func writeJSON(path string, v any) error {
	data, err := json.MarshalIndent(v, "", " ")
	if err != nil {
		return err
	}

	// Same directory as the target so the final rename stays on one filesystem.
	tmp, err := os.CreateTemp(filepath.Dir(path), filepath.Base(path)+".tmp-*")
	if err != nil {
		return err
	}
	tmpPath := tmp.Name()
	committed := false
	defer func() {
		if !committed {
			// Best-effort cleanup; the original error is what matters.
			_ = os.Remove(tmpPath)
		}
	}()

	if _, err := tmp.Write(data); err != nil {
		_ = tmp.Close()
		return err
	}
	// CreateTemp uses mode 0600; restore the mode the index files are expected to have.
	if err := tmp.Chmod(0o644); err != nil {
		_ = tmp.Close()
		return err
	}
	if err := tmp.Close(); err != nil {
		return err
	}
	if err := os.Rename(tmpPath, path); err != nil {
		return err
	}
	committed = true
	return nil
}
// tokenize splits input into lower-cased search tokens.
//
// Common punctuation, brackets and quotes are treated as separators in
// addition to whitespace; characters such as "/", "-" and "_" are kept, so
// "net/http" stays one token. Tokens shorter than two bytes are discarded.
// The result is never nil.
func tokenize(input string) []string {
	const minTokenBytes = 2

	separators := strings.NewReplacer(
		",", " ", ".", " ", "(", " ", ")", " ", "[", " ", "]", " ", "{", " ", "}", " ",
		":", " ", ";", " ", "!", " ", "?", " ", "\n", " ", "\r", " ", "\t", " ", "\"", " ", "'", " ", "`", " ",
	)

	// Fields already strips all surrounding whitespace from each word.
	words := strings.Fields(strings.ToLower(separators.Replace(input)))

	tokens := make([]string, 0, len(words))
	for i := range words {
		if len(words[i]) >= minTokenBytes {
			tokens = append(tokens, words[i])
		}
	}
	return tokens
}
// frequency counts how many times each token occurs in tokens.
// The returned map is never nil.
func frequency(tokens []string) map[string]int {
	counts := make(map[string]int, len(tokens))
	for i := 0; i < len(tokens); i++ {
		counts[tokens[i]] += 1
	}
	return counts
}
// lexicalScore rates how well doc matches a query; 0 means no match.
//
// qFreq holds the query's token counts and queryTokens the same tokens in
// their original order. For every query token present in the document the
// score gains 8 * (docCount * queryCount / docLength), plus 2.5 when the
// token occurs in the lower-cased title and 1.2 when it occurs in the
// lower-cased URL (both substring matches). Multi-token queries gain a
// further 1.5 when the tokens, joined by single spaces, appear verbatim in
// the lower-cased content.
//
// Tokens are visited in sorted order: floating-point addition is not
// associative, so summing in Go's randomised map order could produce scores
// that differ in the last bits between runs and reorder tied results.
func lexicalScore(qFreq map[string]int, queryTokens []string, doc indexedDoc) float64 {
	if len(doc.TermFreq) == 0 {
		return 0
	}

	tokens := make([]string, 0, len(qFreq))
	for token := range qFreq {
		tokens = append(tokens, token)
	}
	sort.Strings(tokens)

	titleLower := strings.ToLower(doc.Title)
	urlLower := strings.ToLower(doc.URL)
	docLength := float64(max(1, doc.Length)) // guard against division by zero

	score := 0.0
	for _, token := range tokens {
		dCount := doc.TermFreq[token]
		if dCount == 0 {
			continue
		}
		part := float64(dCount*qFreq[token]) / docLength
		score += part * 8.0
		if strings.Contains(titleLower, token) {
			score += 2.5
		}
		if strings.Contains(urlLower, token) {
			score += 1.2
		}
	}

	// Lower-casing the full content is the expensive part, so only do it when
	// a phrase bonus is actually possible.
	if len(queryTokens) > 1 {
		phrase := strings.Join(queryTokens, " ")
		if strings.Contains(strings.ToLower(doc.Content), phrase) {
			score += 1.5
		}
	}
	return score
}
// bestSnippet returns an excerpt of content of at most maxLen bytes (220 when
// maxLen is not positive), followed by "..." when text after the excerpt was
// cut off.
//
// Whitespace runs in content are collapsed to single spaces first. Content
// that fits is returned whole. Otherwise the excerpt starts a quarter of
// maxLen before the first occurrence (case-insensitive) of the first query
// token that occurs at all, or at the beginning when none does.
//
// The excerpt is always cut on UTF-8 rune boundaries, so it may be a few
// bytes off the nominal window but never contains half a character. The match
// position is translated from the lower-cased copy back to the original by
// rune count, because lower-casing can change a character's byte width.
func bestSnippet(content string, queryTokens []string, maxLen int) string {
	if maxLen <= 0 {
		maxLen = 220
	}
	// Same normalisation as collapseWhitespace: single spaces, no padding.
	flat := strings.Join(strings.Fields(content), " ")
	if flat == "" {
		return ""
	}
	if len(flat) <= maxLen {
		return flat
	}

	lower := strings.ToLower(flat)
	start := 0
	for _, tok := range queryTokens {
		idx := strings.Index(lower, tok)
		if idx < 0 {
			continue
		}
		// Lower-casing maps rune for rune, so the rune index of the match is
		// the same in both strings even when the byte offsets differ.
		runesBefore := 0
		for range lower[:idx] {
			runesBefore++
		}
		hit := len(flat)
		for off := range flat {
			if runesBefore == 0 {
				hit = off
				break
			}
			runesBefore--
		}
		start = hit - (maxLen / 4)
		if start < 0 {
			start = 0
		}
		break
	}

	// Move the start back to the first byte of its rune (0b10xxxxxx marks a
	// UTF-8 continuation byte).
	for start > 0 && start < len(flat) && flat[start]&0xC0 == 0x80 {
		start--
	}
	end := start + maxLen
	if end > len(flat) {
		end = len(flat)
	}
	// Do not cut the last rune in half either.
	for end > start && end < len(flat) && flat[end]&0xC0 == 0x80 {
		end--
	}

	snippet := strings.TrimSpace(flat[start:end])
	if end < len(flat) {
		snippet += "..."
	}
	return snippet
}
// collapseWhitespace replaces every run of whitespace in s with a single
// space and removes leading and trailing whitespace.
func collapseWhitespace(s string) string {
	// Fields ignores leading and trailing whitespace, so no separate trim is needed.
	words := strings.Fields(s)
	return strings.Join(words, " ")
}
// bestTitle picks a display title for a document. A non-blank title wins
// (trimmed). Otherwise the title is derived from the file name of path:
// the extension is dropped and underscores become spaces. If that leaves
// nothing, "Documentation" is returned.
func bestTitle(title, path string) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}

	name := filepath.Base(path)
	// Ext is always a suffix of name, so slicing it off is safe.
	stem := name[:len(name)-len(filepath.Ext(name))]
	stem = strings.TrimSpace(strings.ReplaceAll(stem, "_", " "))
	if stem != "" {
		return stem
	}
	return "Documentation"
}
// defaultString returns v unchanged unless it is empty or whitespace-only,
// in which case fallback is returned.
func defaultString(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
// hashString returns a short, stable identifier for s: the first 12 bytes of
// its SHA-256 digest, hex-encoded (24 characters).
func hashString(s string) string {
	const prefixBytes = 12

	hasher := sha256.New()
	hasher.Write([]byte(s)) // hash.Hash writes never return an error
	digest := hasher.Sum(nil)
	return hex.EncodeToString(digest[:prefixBytes])
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
+56
View File
@@ -0,0 +1,56 @@
package search
import (
"context"
"encoding/json"
"os"
"path/filepath"
"testing"
"github.com/yourorg/devour/internal/config"
)
// TestEngineRebuildAndSearch indexes a single structured JSON document from a
// temporary docs directory and checks that a query matching its title and
// content finds exactly that document.
func TestEngineRebuildAndSearch(t *testing.T) {
	tmp := t.TempDir()
	docsDir := filepath.Join(tmp, "docs")
	indexDir := filepath.Join(tmp, "index")
	metaDir := filepath.Join(tmp, "metadata")
	if err := os.MkdirAll(docsDir, 0o755); err != nil {
		t.Fatal(err)
	}

	doc := map[string]any{
		"id":      "1",
		"title":   "HTTP Client",
		"content": "Use net/http client with timeout",
		"type":    "go-doc",
		"source":  "go",
		"url":     "https://pkg.go.dev/net/http",
	}
	b, err := json.Marshal(doc)
	if err != nil {
		t.Fatalf("marshal fixture: %v", err)
	}
	if err := os.WriteFile(filepath.Join(docsDir, "doc.json"), b, 0o644); err != nil {
		t.Fatal(err)
	}

	// Index and metadata directories do not exist yet; Rebuild must create them.
	cfg := config.Default()
	cfg.Storage.DocsDir = docsDir
	cfg.Storage.IndexDir = indexDir
	cfg.Storage.MetadataDir = metaDir
	e := NewEngine(cfg)

	stats, err := e.Rebuild(context.Background())
	if err != nil {
		t.Fatalf("rebuild failed: %v", err)
	}
	if stats.Documents != 1 {
		t.Fatalf("expected 1 indexed document, got %d", stats.Documents)
	}

	results, _, err := e.Search(context.Background(), "http timeout", SearchOptions{Limit: 5})
	if err != nil {
		t.Fatalf("search failed: %v", err)
	}
	if len(results) == 0 {
		t.Fatal("expected at least one search result")
	}
	if results[0].DocID != "1" {
		t.Fatalf("expected top result to be document %q, got %q", "1", results[0].DocID)
	}
}