mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
update
This commit is contained in:
@@ -0,0 +1,528 @@
|
||||
package search
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/yourorg/devour/internal/config"
|
||||
)
|
||||
|
||||
// Engine is a file-backed lexical search engine. It indexes the documents
// found under DocsDir and persists the index and its metadata as JSON files
// inside IndexDir and MetadataDir.
type Engine struct {
	// DocsDir is the root directory that is walked for source documents.
	DocsDir string
	// IndexDir is the directory that holds the persisted index file.
	IndexDir string
	// MetadataDir is the directory that holds the persisted index metadata.
	MetadataDir string
	// SnippetLength is the maximum snippet length requested for each
	// search result.
	SnippetLength int
}
|
||||
|
||||
// SearchOptions tunes a single Search call.
type SearchOptions struct {
	// Limit caps the number of returned results. Values <= 0 select the
	// default of 5.
	Limit int
	// Threshold is the minimum score a document needs to be returned.
	// Values <= 0 disable the filter (documents scoring 0 or less are
	// never returned regardless).
	Threshold float64
}
|
||||
|
||||
// Result is a single search hit as returned by Engine.Search.
type Result struct {
	// ID is the index-level identifier of the document.
	ID string `json:"id"`
	// DocID is the document's own identifier as stored in the index.
	DocID string `json:"doc_id"`
	// Title is the display title of the document.
	Title string `json:"title"`
	// URL is the document's URL, if it has one.
	URL string `json:"url,omitempty"`
	// Type is the document type recorded at index time.
	Type string `json:"type"`
	// Source is the document's origin label, if any.
	Source string `json:"source,omitempty"`
	// Path is the file path the document was indexed from.
	Path string `json:"path"`
	// Score is the lexical relevance score; higher is better.
	Score float64 `json:"score"`
	// Snippet is an excerpt of the document content.
	Snippet string `json:"snippet"`
	// Meta carries extra per-result data; Search sets "length" to the
	// indexed token count of the document.
	Meta map[string]any `json:"metadata,omitempty"`
}
|
||||
|
||||
// IndexStats describes the state of the persisted index.
type IndexStats struct {
	// Documents is the number of indexed documents.
	Documents int `json:"documents"`
	// Tokens is the total number of tokens indexed. It is filled in by
	// Rebuild; EnsureIndexed leaves it at zero when it reuses an existing
	// index.
	Tokens int `json:"tokens"`
	// LastIndexedAt is the time the index was built.
	LastIndexedAt time.Time `json:"last_indexed_at"`
	// IndexPath is the location of the persisted index file.
	IndexPath string `json:"index_path"`
	// MetadataPath is the location of the persisted metadata file.
	MetadataPath string `json:"metadata_path"`
	// SourceFileHash is the fingerprint of the source document set the
	// index was built from.
	SourceFileHash string `json:"source_file_hash"`
}
|
||||
|
||||
// indexedDoc is one document as stored in the persisted index.
type indexedDoc struct {
	// ID is derived from the document's file path by Rebuild.
	ID string `json:"id"`
	// DocID is the ID declared by the document itself, or a hash of its
	// path and title when it declares none.
	DocID string `json:"doc_id"`
	// Title is the display title.
	Title string `json:"title"`
	// URL is the trimmed document URL, if any.
	URL string `json:"url,omitempty"`
	// Type is the document type; Rebuild defaults it to "document".
	Type string `json:"type"`
	// Source is the trimmed origin label, if any.
	Source string `json:"source,omitempty"`
	// Path is the file the document was read from.
	Path string `json:"path"`
	// Content is the document text with whitespace collapsed.
	Content string `json:"content"`
	// TermFreq counts token occurrences over title, content, URL and type.
	TermFreq map[string]int `json:"term_freq"`
	// Length is the sum of all counts in TermFreq.
	Length int `json:"length"`
}
|
||||
|
||||
type persistedIndex struct {
|
||||
Version string `json:"version"`
|
||||
BuiltAt time.Time `json:"built_at"`
|
||||
Docs []indexedDoc `json:"docs"`
|
||||
}
|
||||
|
||||
// persistedMeta is the on-disk JSON layout of the index metadata file. It is
// small enough to be read on every EnsureIndexed call to decide whether the
// index is still current.
type persistedMeta struct {
	// Version is the index format version the index was written with.
	Version string `json:"version"`
	// BuiltAt is the time the index was built.
	BuiltAt time.Time `json:"built_at"`
	// DocsDir is the docs directory the index was built from.
	DocsDir string `json:"docs_dir"`
	// SourceFileHash is the fingerprint of the source document set at
	// build time.
	SourceFileHash string `json:"source_file_hash"`
	// DocCount is the number of documents in the index.
	DocCount int `json:"doc_count"`
}
|
||||
|
||||
// rawDoc is a source document as parsed from disk, before indexing. It is
// also the JSON shape expected of structured ".json" documents.
type rawDoc struct {
	// ID is the document's own identifier; may be empty.
	ID string `json:"id"`
	// Source is the origin label of the document.
	Source string `json:"source"`
	// Type is the document type.
	Type string `json:"type"`
	// Title is the document title; may be empty.
	Title string `json:"title"`
	// Content is the document text.
	Content string `json:"content"`
	// URL is the document URL, if any.
	URL string `json:"url,omitempty"`
	// Metadata holds arbitrary extra fields decoded from the document.
	Metadata map[string]any `json:"metadata,omitempty"`
}
|
||||
|
||||
const (
	// indexFileName is the name of the index file inside Engine.IndexDir.
	indexFileName = "lexical_index.json"
	// metaFileName is the name of the metadata file inside
	// Engine.MetadataDir.
	metaFileName = "lexical_index_meta.json"
	// indexVersion is the format version stamped into both files.
	indexVersion = "1"
)
|
||||
|
||||
func NewEngine(cfg *config.Config) *Engine {
|
||||
snippetLength := cfg.Indexing.SnippetLength
|
||||
if snippetLength <= 0 {
|
||||
snippetLength = 220
|
||||
}
|
||||
return &Engine{
|
||||
DocsDir: cfg.Storage.DocsDir,
|
||||
IndexDir: cfg.Storage.IndexDir,
|
||||
MetadataDir: cfg.Storage.MetadataDir,
|
||||
SnippetLength: snippetLength,
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Engine) Rebuild(ctx context.Context) (*IndexStats, error) {
|
||||
if strings.TrimSpace(e.DocsDir) == "" {
|
||||
return nil, fmt.Errorf("docs directory is required")
|
||||
}
|
||||
if err := os.MkdirAll(e.IndexDir, 0o755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := os.MkdirAll(e.MetadataDir, 0o755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docFiles, sourceHash, err := e.listDocFiles()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
docs := make([]indexedDoc, 0, len(docFiles))
|
||||
tokenCount := 0
|
||||
for _, file := range docFiles {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
rd, err := parseDocFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(rd.Content) == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
id := rd.ID
|
||||
if id == "" {
|
||||
id = hashString(file + ":" + rd.Title)
|
||||
}
|
||||
termFreq := frequency(tokenize(strings.Join([]string{rd.Title, rd.Content, rd.URL, rd.Type}, " ")))
|
||||
length := 0
|
||||
for _, v := range termFreq {
|
||||
length += v
|
||||
}
|
||||
tokenCount += length
|
||||
|
||||
docs = append(docs, indexedDoc{
|
||||
ID: hashString(file),
|
||||
DocID: id,
|
||||
Title: bestTitle(rd.Title, file),
|
||||
URL: strings.TrimSpace(rd.URL),
|
||||
Type: defaultString(strings.TrimSpace(rd.Type), "document"),
|
||||
Source: strings.TrimSpace(rd.Source),
|
||||
Path: file,
|
||||
Content: collapseWhitespace(rd.Content),
|
||||
TermFreq: termFreq,
|
||||
Length: length,
|
||||
})
|
||||
}
|
||||
|
||||
index := persistedIndex{Version: indexVersion, BuiltAt: time.Now(), Docs: docs}
|
||||
indexPath := filepath.Join(e.IndexDir, indexFileName)
|
||||
if err := writeJSON(indexPath, index); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
meta := persistedMeta{
|
||||
Version: indexVersion,
|
||||
BuiltAt: index.BuiltAt,
|
||||
DocsDir: e.DocsDir,
|
||||
SourceFileHash: sourceHash,
|
||||
DocCount: len(docs),
|
||||
}
|
||||
metaPath := filepath.Join(e.MetadataDir, metaFileName)
|
||||
if err := writeJSON(metaPath, meta); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &IndexStats{
|
||||
Documents: len(docs),
|
||||
Tokens: tokenCount,
|
||||
LastIndexedAt: index.BuiltAt,
|
||||
IndexPath: indexPath,
|
||||
MetadataPath: metaPath,
|
||||
SourceFileHash: sourceHash,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (e *Engine) EnsureIndexed(ctx context.Context) (*IndexStats, error) {
|
||||
metaPath := filepath.Join(e.MetadataDir, metaFileName)
|
||||
b, err := os.ReadFile(metaPath)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return e.Rebuild(ctx)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var meta persistedMeta
|
||||
if err := json.Unmarshal(b, &meta); err != nil {
|
||||
return e.Rebuild(ctx)
|
||||
}
|
||||
|
||||
_, sourceHash, err := e.listDocFiles()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if sourceHash != meta.SourceFileHash {
|
||||
return e.Rebuild(ctx)
|
||||
}
|
||||
|
||||
return &IndexStats{
|
||||
Documents: meta.DocCount,
|
||||
LastIndexedAt: meta.BuiltAt,
|
||||
IndexPath: filepath.Join(e.IndexDir, indexFileName),
|
||||
MetadataPath: metaPath,
|
||||
SourceFileHash: meta.SourceFileHash,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (e *Engine) Search(ctx context.Context, query string, opts SearchOptions) ([]Result, *IndexStats, error) {
|
||||
query = strings.TrimSpace(query)
|
||||
if query == "" {
|
||||
return nil, nil, fmt.Errorf("query is required")
|
||||
}
|
||||
|
||||
stats, err := e.EnsureIndexed(ctx)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
indexPath := filepath.Join(e.IndexDir, indexFileName)
|
||||
b, err := os.ReadFile(indexPath)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
var idx persistedIndex
|
||||
if err := json.Unmarshal(b, &idx); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
limit := opts.Limit
|
||||
if limit <= 0 {
|
||||
limit = 5
|
||||
}
|
||||
threshold := opts.Threshold
|
||||
if threshold < 0 {
|
||||
threshold = 0
|
||||
}
|
||||
|
||||
queryTokens := tokenize(query)
|
||||
if len(queryTokens) == 0 {
|
||||
return nil, stats, nil
|
||||
}
|
||||
qFreq := frequency(queryTokens)
|
||||
|
||||
type scored struct {
|
||||
doc indexedDoc
|
||||
score float64
|
||||
}
|
||||
matches := make([]scored, 0)
|
||||
|
||||
for _, doc := range idx.Docs {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, nil, ctx.Err()
|
||||
default:
|
||||
}
|
||||
score := lexicalScore(qFreq, queryTokens, doc)
|
||||
if score <= 0 {
|
||||
continue
|
||||
}
|
||||
if threshold > 0 && score < threshold {
|
||||
continue
|
||||
}
|
||||
matches = append(matches, scored{doc: doc, score: score})
|
||||
}
|
||||
|
||||
sort.Slice(matches, func(i, j int) bool {
|
||||
if matches[i].score == matches[j].score {
|
||||
return matches[i].doc.Title < matches[j].doc.Title
|
||||
}
|
||||
return matches[i].score > matches[j].score
|
||||
})
|
||||
|
||||
if limit > len(matches) {
|
||||
limit = len(matches)
|
||||
}
|
||||
|
||||
results := make([]Result, 0, limit)
|
||||
for i := 0; i < limit; i++ {
|
||||
d := matches[i].doc
|
||||
results = append(results, Result{
|
||||
ID: d.ID,
|
||||
DocID: d.DocID,
|
||||
Title: d.Title,
|
||||
URL: d.URL,
|
||||
Type: d.Type,
|
||||
Source: d.Source,
|
||||
Path: d.Path,
|
||||
Score: matches[i].score,
|
||||
Snippet: bestSnippet(d.Content, queryTokens, e.SnippetLength),
|
||||
Meta: map[string]any{
|
||||
"length": d.Length,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return results, stats, nil
|
||||
}
|
||||
|
||||
func (e *Engine) listDocFiles() ([]string, string, error) {
|
||||
files := make([]string, 0)
|
||||
h := sha256.New()
|
||||
|
||||
err := filepath.WalkDir(e.DocsDir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
switch ext {
|
||||
case ".json", ".md", ".txt":
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
|
||||
info, statErr := d.Info()
|
||||
if statErr != nil {
|
||||
return statErr
|
||||
}
|
||||
files = append(files, path)
|
||||
fmt.Fprintf(h, "%s|%d|%d\n", path, info.Size(), info.ModTime().UnixNano())
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return []string{}, hashString("empty"), nil
|
||||
}
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
sort.Strings(files)
|
||||
return files, hex.EncodeToString(h.Sum(nil)), nil
|
||||
}
|
||||
|
||||
func parseDocFile(path string) (*rawDoc, error) {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ext := strings.ToLower(filepath.Ext(path))
|
||||
switch ext {
|
||||
case ".json":
|
||||
var d rawDoc
|
||||
if err := json.Unmarshal(b, &d); err == nil && (d.Title != "" || d.Content != "") {
|
||||
return &d, nil
|
||||
}
|
||||
// Not a structured doc JSON, index as raw text fallback.
|
||||
return &rawDoc{Title: filepath.Base(path), Content: string(b), Type: "json", Source: "local"}, nil
|
||||
case ".md":
|
||||
content := string(b)
|
||||
title := markdownTitle(content)
|
||||
return &rawDoc{Title: title, Content: content, Type: "markdown", Source: "local"}, nil
|
||||
default:
|
||||
return &rawDoc{Title: filepath.Base(path), Content: string(b), Type: "text", Source: "local"}, nil
|
||||
}
|
||||
}
|
||||
|
||||
// markdownTitle returns the text of the first non-empty "#"-style heading in
// content, with the leading "#" characters and surrounding whitespace
// removed. It returns "" when there is no such heading.
//
// Lines inside fenced code blocks (delimited by ``` or ~~~) are ignored so
// that, for example, a shell comment in a code sample is not mistaken for the
// document title.
func markdownTitle(content string) string {
	fence := "" // marker of the fenced code block we are inside, if any
	for _, line := range strings.Split(content, "\n") {
		trim := strings.TrimSpace(line)

		if fence != "" {
			// Inside a code block: only the matching marker closes it.
			if strings.HasPrefix(trim, fence) {
				fence = ""
			}
			continue
		}
		switch {
		case strings.HasPrefix(trim, "```"):
			fence = "```"
			continue
		case strings.HasPrefix(trim, "~~~"):
			fence = "~~~"
			continue
		}

		if !strings.HasPrefix(trim, "#") {
			continue
		}
		if title := strings.TrimSpace(strings.TrimLeft(trim, "#")); title != "" {
			return title
		}
	}
	return ""
}
|
||||
|
||||
// writeJSON encodes v as indented JSON and stores it at path.
//
// The data is first written to a temporary file in the same directory and
// then renamed over path, so a concurrent reader (or a crash mid-write) never
// observes a truncated file. The temporary file is removed on failure.
func writeJSON(path string, v any) error {
	b, err := json.MarshalIndent(v, "", " ")
	if err != nil {
		return err
	}

	// The temp file lives next to the target so the rename stays on one
	// file system.
	tmp, err := os.CreateTemp(filepath.Dir(path), filepath.Base(path)+".tmp-*")
	if err != nil {
		return err
	}
	tmpName := tmp.Name()

	if _, err := tmp.Write(b); err != nil {
		tmp.Close()
		os.Remove(tmpName)
		return err
	}
	// CreateTemp uses mode 0600; restore the mode the file is expected to
	// have.
	if err := tmp.Chmod(0o644); err != nil {
		tmp.Close()
		os.Remove(tmpName)
		return err
	}
	if err := tmp.Close(); err != nil {
		os.Remove(tmpName)
		return err
	}
	if err := os.Rename(tmpName, path); err != nil {
		os.Remove(tmpName)
		return err
	}
	return nil
}
|
||||
|
||||
// tokenSeparators replaces the punctuation that tokenize treats as a word
// boundary with spaces. It is built once; a Replacer is safe for concurrent
// use.
var tokenSeparators = strings.NewReplacer(
	",", " ", ".", " ", "(", " ", ")", " ", "[", " ", "]", " ", "{", " ", "}", " ",
	":", " ", ";", " ", "!", " ", "?", " ", "\n", " ", "\r", " ", "\t", " ", "\"", " ", "'", " ", "`", " ",
)

// tokenize splits input into lower-cased search tokens. Whitespace and the
// punctuation listed in tokenSeparators delimit tokens; tokens shorter than
// two bytes are dropped. The returned slice is never nil.
func tokenize(input string) []string {
	parts := strings.Fields(strings.ToLower(tokenSeparators.Replace(input)))
	out := make([]string, 0, len(parts))
	for _, p := range parts {
		if len(p) < 2 {
			continue
		}
		out = append(out, p)
	}
	return out
}
|
||||
|
||||
// frequency counts how often each token occurs in tokens. The returned map
// is never nil.
func frequency(tokens []string) map[string]int {
	counts := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		counts[tok]++
	}
	return counts
}
|
||||
|
||||
func lexicalScore(qFreq map[string]int, queryTokens []string, doc indexedDoc) float64 {
|
||||
if len(doc.TermFreq) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
titleLower := strings.ToLower(doc.Title)
|
||||
urlLower := strings.ToLower(doc.URL)
|
||||
contentLower := strings.ToLower(doc.Content)
|
||||
|
||||
score := 0.0
|
||||
for token, qCount := range qFreq {
|
||||
dCount := doc.TermFreq[token]
|
||||
if dCount == 0 {
|
||||
continue
|
||||
}
|
||||
part := float64(dCount*qCount) / float64(max(1, doc.Length))
|
||||
score += part * 8.0
|
||||
if strings.Contains(titleLower, token) {
|
||||
score += 2.5
|
||||
}
|
||||
if strings.Contains(urlLower, token) {
|
||||
score += 1.2
|
||||
}
|
||||
}
|
||||
|
||||
phrase := strings.Join(queryTokens, " ")
|
||||
if len(queryTokens) > 1 && strings.Contains(contentLower, phrase) {
|
||||
score += 1.5
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
// bestSnippet returns an excerpt of content of at most maxLen bytes (220 when
// maxLen is not positive), with runs of whitespace collapsed to single
// spaces.
//
// Content that already fits is returned whole. Otherwise the excerpt starts
// maxLen/4 bytes before the first occurrence (case-insensitive) of the first
// query token found in the content, or at the beginning if none is found,
// and "..." is appended when text follows the excerpt. The excerpt never
// contains a partial UTF-8 sequence: bytes of a rune cut at either edge are
// dropped (as are invalid bytes inside the window).
func bestSnippet(content string, queryTokens []string, maxLen int) string {
	if maxLen <= 0 {
		maxLen = 220
	}
	flat := strings.Join(strings.Fields(content), " ")
	if flat == "" {
		return ""
	}
	if len(flat) <= maxLen {
		return flat
	}

	lower := strings.ToLower(flat)
	start := 0
	for _, tok := range queryTokens {
		idx := strings.Index(lower, tok)
		if idx < 0 {
			continue
		}
		if len(lower) != len(flat) {
			// Lower-casing changed the byte length of some runes, so
			// idx is only valid in lower; translate it back to flat.
			idx = snippetOffset(lower, flat, idx)
		}
		start = idx - maxLen/4
		if start < 0 {
			start = 0
		}
		break
	}

	end := start + maxLen
	if end > len(flat) {
		end = len(flat)
	}
	// The window is measured in bytes and may cut a multi-byte rune at
	// either edge; ToValidUTF8 drops those dangling bytes.
	snippet := strings.TrimSpace(strings.ToValidUTF8(flat[start:end], ""))
	if end < len(flat) {
		snippet += "..."
	}
	return snippet
}

// snippetOffset converts idx, a byte offset into lower, to the byte offset of
// the same rune position in flat. lower must be the lower-cased form of flat,
// which has the same number of runes. The result is at most len(flat).
func snippetOffset(lower, flat string, idx int) int {
	runes := 0
	for range lower[:idx] {
		runes++
	}
	for off := range flat {
		if runes == 0 {
			return off
		}
		runes--
	}
	return len(flat)
}
|
||||
|
||||
// collapseWhitespace trims s and replaces every run of whitespace inside it
// with a single space.
func collapseWhitespace(s string) string {
	// Fields already ignores leading and trailing whitespace.
	return strings.Join(strings.Fields(s), " ")
}
|
||||
|
||||
// bestTitle picks a display title for a document. It returns the trimmed
// title when that is non-empty; otherwise it derives one from the file name
// in path by dropping the extension and turning underscores into spaces.
// If that is empty as well, it returns "Documentation".
func bestTitle(title, path string) string {
	if trimmed := strings.TrimSpace(title); trimmed != "" {
		return trimmed
	}

	name := filepath.Base(path)
	name = strings.TrimSuffix(name, filepath.Ext(name))
	name = strings.TrimSpace(strings.ReplaceAll(name, "_", " "))
	if name == "" {
		return "Documentation"
	}
	return name
}
|
||||
|
||||
// defaultString returns fallback when v is empty or only whitespace, and v
// itself (untrimmed) otherwise.
func defaultString(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
|
||||
|
||||
// hashString returns a short, stable identifier for s: the first 12 bytes of
// its SHA-256 digest, hex-encoded (24 characters).
func hashString(s string) string {
	digest := sha256.Sum256([]byte(s))
	return hex.EncodeToString(digest[:12])
}
|
||||
|
||||
// max returns the larger of a and b.
func max(a, b int) int {
	if a < b {
		return b
	}
	return a
}
|
||||
@@ -0,0 +1,56 @@
|
||||
package search
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/yourorg/devour/internal/config"
|
||||
)
|
||||
|
||||
func TestEngineRebuildAndSearch(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
docsDir := filepath.Join(tmp, "docs")
|
||||
indexDir := filepath.Join(tmp, "index")
|
||||
metaDir := filepath.Join(tmp, "metadata")
|
||||
if err := os.MkdirAll(docsDir, 0o755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
doc := map[string]any{
|
||||
"id": "1",
|
||||
"title": "HTTP Client",
|
||||
"content": "Use net/http client with timeout",
|
||||
"type": "go-doc",
|
||||
"source": "go",
|
||||
"url": "https://pkg.go.dev/net/http",
|
||||
}
|
||||
b, _ := json.Marshal(doc)
|
||||
if err := os.WriteFile(filepath.Join(docsDir, "doc.json"), b, 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
cfg := config.Default()
|
||||
cfg.Storage.DocsDir = docsDir
|
||||
cfg.Storage.IndexDir = indexDir
|
||||
cfg.Storage.MetadataDir = metaDir
|
||||
|
||||
e := NewEngine(cfg)
|
||||
stats, err := e.Rebuild(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("rebuild failed: %v", err)
|
||||
}
|
||||
if stats.Documents == 0 {
|
||||
t.Fatal("expected documents in index")
|
||||
}
|
||||
|
||||
results, _, err := e.Search(context.Background(), "http timeout", SearchOptions{Limit: 5})
|
||||
if err != nil {
|
||||
t.Fatalf("search failed: %v", err)
|
||||
}
|
||||
if len(results) == 0 {
|
||||
t.Fatal("expected at least one search result")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user