// Source: Devour/internal/vector/store.go
// Last commit: 55885a0e8f "first commit" by Tomas Dvorak, 2026-02-22 10:42:17 +01:00
// Size: 291 lines, 6.3 KiB (Go)

// Package vector provides vector storage capabilities.
package vector
import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"sort"
	"sync"
)
// Config carries the settings used to construct a vector store.
type Config struct {
	// Type selects the backend. NewStore recognizes "memory" and "chromem"
	// and falls back to the in-memory store for any other value.
	Type string `yaml:"type"`

	// PersistDir is the directory MemoryStore prepares for on-disk data.
	// An empty value disables persistence.
	PersistDir string `yaml:"persist_dir"`

	// SimilarityMetric names the similarity measure.
	// NOTE(review): nothing in this file reads it; MemoryStore.Search always
	// uses cosine similarity.
	SimilarityMetric string `yaml:"similarity_metric"`
}
// Document is a single stored entry: its text, its embedding vector, and any
// caller-supplied metadata.
type Document struct {
	// ID identifies the document within a store. MemoryStore.Add fills in an
	// empty ID with a hash of Content.
	ID string

	// Content is the document text.
	Content string

	// Vector is the embedding compared during search. MemoryStore.Search
	// skips documents whose Vector is nil.
	Vector []float32

	// Metadata holds arbitrary key/value data; MemoryStore.Search copies the
	// map reference into each result unchanged.
	Metadata map[string]any
}
// SearchResult is one hit returned by a similarity search.
type SearchResult struct {
	// ID is the ID of the matched document.
	ID string

	// Content is the matched document's text.
	Content string

	// Score is the similarity between the query vector and the document's
	// vector; higher means more similar.
	Score float64

	// Metadata is the matched document's metadata map.
	Metadata map[string]any
}
// scoredDoc pairs a document ID with its similarity score so candidates can
// be ranked before the full results are assembled.
type scoredDoc struct {
	// id is the key of the document in the store.
	id string

	// score is the similarity computed for that document.
	score float64
}
// Store is the contract every vector storage backend implements.
type Store interface {
	// Add inserts the given documents into the store.
	Add(ctx context.Context, docs []*Document) error

	// Search returns the documents most similar to vector, at most limit of
	// them.
	Search(ctx context.Context, vector []float32, limit int) ([]*SearchResult, error)

	// Delete removes the document stored under id.
	Delete(ctx context.Context, id string) error

	// Get returns the document stored under id.
	Get(ctx context.Context, id string) (*Document, error)

	// Count reports how many documents the store holds.
	Count(ctx context.Context) (int, error)

	// Clear removes every document from the store.
	Clear(ctx context.Context) error
}
// NewStore builds the Store selected by config.Type.
//
// "chromem" yields a ChromemStore. "memory", an empty Type, and any
// unrecognized value yield a MemoryStore. A nil config is treated as an empty
// Config, so it also yields a MemoryStore instead of panicking on the
// config.Type dereference.
func NewStore(config *Config) Store {
	if config == nil {
		config = &Config{}
	}
	if config.Type == "chromem" {
		return NewChromemStore(config)
	}
	// "memory" and every unrecognized type share the in-memory default.
	return NewMemoryStore(config)
}
// MemoryStore is a Store that keeps every document in process memory.
type MemoryStore struct {
	// config is the configuration supplied at construction.
	config *Config

	// mu guards docs and vectors.
	mu sync.RWMutex

	// docs maps document ID to the stored document.
	docs map[string]*Document

	// vectors collects the vectors of stored documents. Add appends to it and
	// Delete rebuilds it; Search reads vectors from docs instead.
	vectors [][]float32
}
// NewMemoryStore returns an empty in-memory vector store that uses config.
//
// A nil config is replaced with an empty Config so that methods reading
// s.config.PersistDir do not dereference a nil pointer; an empty Config
// means persistence is disabled.
func NewMemoryStore(config *Config) *MemoryStore {
	if config == nil {
		config = &Config{}
	}
	return &MemoryStore{
		config: config,
		docs:   make(map[string]*Document),
	}
}
// Add stores docs, keyed by ID, replacing any document already stored under
// the same ID.
//
// A document with an empty ID is assigned one derived from its Content via
// generateID; the assignment is written back to the caller's Document. If any
// element of docs is nil, Add returns an error and stores nothing. When a
// persist directory is configured, the result of persist is returned. ctx is
// accepted for interface compatibility and is not consulted.
func (s *MemoryStore) Add(ctx context.Context, docs []*Document) error {
	// Validate before mutating so a bad batch is rejected as a whole rather
	// than panicking halfway through.
	for i, doc := range docs {
		if doc == nil {
			return fmt.Errorf("document at index %d is nil", i)
		}
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	replaced := false
	for _, doc := range docs {
		if doc.ID == "" {
			doc.ID = generateID(doc.Content)
		}
		if _, exists := s.docs[doc.ID]; exists {
			replaced = true
		}
		s.docs[doc.ID] = doc
		s.vectors = append(s.vectors, doc.Vector)
	}

	// Replacing a document would otherwise leave its old vector behind in
	// s.vectors; rebuild from docs so the slice holds one vector per stored
	// document, as Delete does.
	if replaced {
		s.vectors = make([][]float32, 0, len(s.docs))
		for _, doc := range s.docs {
			s.vectors = append(s.vectors, doc.Vector)
		}
	}

	// Persist if directory is set.
	if s.config.PersistDir != "" {
		return s.persist()
	}
	return nil
}
// Search ranks the stored documents by cosine similarity to vector and
// returns at most limit results, best match first.
//
// Documents without a vector are skipped. Documents with equal scores come
// back in no particular order, because the candidates are gathered by
// iterating a map. A limit of zero or less yields an empty, non-nil slice
// rather than a panic, and a limit larger than the number of candidates is
// clamped so no oversized slice is allocated. The returned error is always
// nil. ctx is accepted for interface compatibility and is not consulted.
func (s *MemoryStore) Search(ctx context.Context, vector []float32, limit int) ([]*SearchResult, error) {
	if limit <= 0 {
		return []*SearchResult{}, nil
	}

	s.mu.RLock()
	defer s.mu.RUnlock()

	scored := make([]*scoredDoc, 0, len(s.docs))
	for id, doc := range s.docs {
		if doc.Vector == nil {
			continue
		}
		scored = append(scored, &scoredDoc{
			id:    id,
			score: cosineSimilarity(vector, doc.Vector),
		})
	}
	sortByScore(scored)

	// Clamp before allocating: the capacity must never be derived from an
	// unchecked caller-supplied number.
	if limit > len(scored) {
		limit = len(scored)
	}
	results := make([]*SearchResult, 0, limit)
	for _, hit := range scored[:limit] {
		doc := s.docs[hit.id]
		results = append(results, &SearchResult{
			ID:       doc.ID,
			Content:  doc.Content,
			Score:    hit.score,
			Metadata: doc.Metadata,
		})
	}
	return results, nil
}
// Delete drops the document stored under id, if there is one, and always
// returns nil. ctx is not consulted.
func (s *MemoryStore) Delete(ctx context.Context, id string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	delete(s.docs, id)

	// Recompute the vector list from whatever documents remain; it stays nil
	// when the store is empty.
	var remaining [][]float32
	for _, d := range s.docs {
		remaining = append(remaining, d.Vector)
	}
	s.vectors = remaining

	return nil
}
// ErrNotFound is wrapped by MemoryStore.Get when no document is stored under
// the requested ID. Test for it with errors.Is.
var ErrNotFound = errors.New("document not found")

// Get returns the document stored under id.
//
// The returned pointer is the stored document itself, not a copy. When id is
// unknown, Get returns a nil document and an error that wraps ErrNotFound;
// its text is "document not found: <id>". ctx is not consulted.
func (s *MemoryStore) Get(ctx context.Context, id string) (*Document, error) {
	s.mu.RLock()
	defer s.mu.RUnlock()

	doc, ok := s.docs[id]
	if !ok {
		return nil, fmt.Errorf("%w: %s", ErrNotFound, id)
	}
	return doc, nil
}
// Count reports how many documents are currently stored. The error is always
// nil and ctx is not consulted.
func (s *MemoryStore) Count(ctx context.Context) (int, error) {
	s.mu.RLock()
	total := len(s.docs)
	s.mu.RUnlock()

	return total, nil
}
// Clear empties the store by swapping in a fresh document map and dropping
// the vector list. It always returns nil; ctx is not consulted.
func (s *MemoryStore) Clear(ctx context.Context) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.docs, s.vectors = make(map[string]*Document), nil
	return nil
}
// persist prepares the on-disk location for the store's documents.
//
// It is a no-op when no persist directory is configured. Otherwise it makes
// sure the directory exists (mode 0755) and returns any error from creating
// it. Nothing is written yet: the document file path is computed but
// serialization is still a TODO.
func (s *MemoryStore) persist() error {
	dir := s.config.PersistDir
	if dir == "" {
		return nil
	}

	// Create directory if needed.
	err := os.MkdirAll(dir, 0755)
	if err != nil {
		return err
	}

	// TODO: Implement actual JSON serialization
	documentsPath := filepath.Join(dir, "documents.json")
	_ = documentsPath // Placeholder until the file is actually written.

	return nil
}
// ChromemStore is the Store intended to be backed by chromem-go for
// production use. It currently holds only its configuration.
type ChromemStore struct {
	// config is the configuration supplied at construction.
	config *Config

	// TODO: Add chromem-go client
}
// NewChromemStore returns a ChromemStore that holds the given config.
func NewChromemStore(config *Config) *ChromemStore {
	store := new(ChromemStore)
	store.config = config
	return store
}
// ErrChromemNotImplemented is returned by every ChromemStore method until the
// chromem-go backend is wired in. Compare against it with errors.Is.
var ErrChromemNotImplemented = errors.New("chromem store not implemented")

// Add is not implemented yet and always returns ErrChromemNotImplemented.
func (s *ChromemStore) Add(ctx context.Context, docs []*Document) error {
	// TODO: Implement with chromem-go
	return ErrChromemNotImplemented
}

// Search is not implemented yet and always returns nil results with
// ErrChromemNotImplemented.
func (s *ChromemStore) Search(ctx context.Context, vector []float32, limit int) ([]*SearchResult, error) {
	return nil, ErrChromemNotImplemented
}

// Delete is not implemented yet and always returns ErrChromemNotImplemented.
func (s *ChromemStore) Delete(ctx context.Context, id string) error {
	return ErrChromemNotImplemented
}

// Get is not implemented yet and always returns a nil document with
// ErrChromemNotImplemented.
func (s *ChromemStore) Get(ctx context.Context, id string) (*Document, error) {
	return nil, ErrChromemNotImplemented
}

// Count is not implemented yet and always returns 0 with
// ErrChromemNotImplemented.
func (s *ChromemStore) Count(ctx context.Context) (int, error) {
	return 0, ErrChromemNotImplemented
}

// Clear is not implemented yet and always returns ErrChromemNotImplemented.
func (s *ChromemStore) Clear(ctx context.Context) error {
	return ErrChromemNotImplemented
}
// Helper functions

// generateID derives a document ID from content: the first 12 bytes of the
// content's SHA-256 digest, hex-encoded (24 characters). Equal content always
// maps to the same ID.
func generateID(content string) string {
	const idBytes = 12

	digest := sha256.Sum256([]byte(content))
	prefix := digest[:idBytes]
	return hex.EncodeToString(prefix)
}
// cosineSimilarity returns the cosine of the angle between a and b: their dot
// product divided by the product of their Euclidean norms, accumulated in
// float64.
//
// It returns 0 when the vectors differ in length or when either has zero
// norm, which includes empty vectors.
func cosineSimilarity(a, b []float32) float64 {
	if len(a) != len(b) {
		return 0
	}

	var dot, sumSqA, sumSqB float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		sumSqA += x * x
		sumSqB += y * y
	}

	if sumSqA != 0 && sumSqB != 0 {
		return dot / (sqrt(sumSqA) * sqrt(sumSqB))
	}
	// A zero-norm vector has no direction, so similarity is reported as 0.
	return 0
}
// sqrt returns the square root of x.
//
// It delegates to math.Sqrt, so the result is correctly rounded for every
// magnitude of x, sqrt(0) is 0, and a negative x yields NaN. A fixed number
// of Newton steps starting from x is not enough here: it has not converged
// for inputs far from 1 (for example 1e10 or 1e-10), which would skew cosine
// similarity for vectors that are not close to unit length.
func sqrt(x float64) float64 {
	return math.Sqrt(x)
}
// sortByScore reorders docs in place so that scores are descending.
//
// The sort is stable: entries with equal scores keep their relative input
// order. It uses the standard library's O(n log n) stable sort instead of a
// quadratic insertion sort, which matters because every search sorts all
// scored candidates. Entries must be non-nil.
func sortByScore(docs []*scoredDoc) {
	sort.SliceStable(docs, func(i, j int) bool {
		return docs[i].score > docs[j].score
	})
}