Files
Devour/internal/indexer/indexer.go
Tomas Dvorak 55885a0e8f first commit
2026-02-22 10:42:17 +01:00

272 lines
6.6 KiB
Go

// Package indexer provides document indexing and embedding generation.
package indexer
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"strings"
"sync"
"github.com/yourorg/devour/internal/scraper"
"github.com/yourorg/devour/internal/vector"
)
// Config holds indexer configuration.
//
// In the code visible in this file only ChunkSize (chunkDocument) and
// Model/Dimensions (Stats) are read; the other fields are presumably consumed
// by the embedding provider setup elsewhere — TODO confirm.
type Config struct {
// Provider is presumably the name of the embedding provider; not read in this file.
Provider string `yaml:"provider"`
// Model is the model name; Stats reports it unchanged.
Model string `yaml:"model"`
// Dimensions is the configured vector size; Stats reports it unchanged.
// NOTE(review): nothing in this file checks it against the vectors Embed returns.
Dimensions int `yaml:"dimensions"`
// APIKey is presumably the provider credential; not read in this file.
APIKey string `yaml:"api_key"`
// BatchSize is not read in this file — presumably an embedding batch limit
// applied by the provider; verify against the Embedder implementation.
BatchSize int `yaml:"batch_size"`
// Chunking settings
// ChunkSize is the size limit chunkDocument packs paragraphs into, compared
// against len() of the paragraph text; zero selects the default of 512.
ChunkSize int `yaml:"chunk_size"`
// ChunkOverlap is not read by chunkDocument in this file — TODO confirm
// whether overlap is applied elsewhere.
ChunkOverlap int `yaml:"chunk_overlap"`
}
// Chunk represents a text chunk with its embedding.
//
// Chunks produced by createChunk have every field except Vector filled in;
// indexDocument sets Vector afterwards. Chunks returned by Search carry only
// ID, Content and Metadata.
type Chunk struct {
// ID is derived from the document ID and Position by generateChunkID.
ID string `json:"id"`
// DocID is the ID of the document the chunk was cut from.
DocID string `json:"doc_id"`
// Content is the chunk text that gets embedded.
Content string `json:"content"`
// Vector is the embedding assigned during indexing; omitted from JSON when empty.
Vector []float32 `json:"vector,omitempty"`
// Metadata holds the document's source, title, url, type and timestamp plus
// the chunk position, as built by createChunk.
Metadata map[string]any `json:"metadata"`
// Position is the zero-based index of the chunk within its document.
Position int `json:"position"`
}
// Indexer handles document indexing with embeddings.
//
// It chunks documents, embeds the chunks through an Embedder, writes them to
// a vector store, and remembers which chunks belong to which document so
// they can be deleted later.
type Indexer struct {
// config supplies the chunk size and the model/dimensions reported by Stats.
config *Config
// vectorStore receives the embedded chunks and answers similarity searches.
vectorStore vector.Store
// embedder is nil until SetEmbedder is called; Index and Search refuse to
// run without it.
embedder Embedder
// mu guards chunks. It is not taken around reads or writes of embedder.
mu sync.RWMutex
chunks map[string][]*Chunk // docID -> chunks
}
// Embedder generates embeddings for text.
//
// Callers in this file pair results with inputs by index (embeddings[j]
// belongs to texts[j]), so implementations are expected to return one vector
// per input text, in input order.
type Embedder interface {
// Embed returns the embedding vectors for texts, or an error.
Embed(ctx context.Context, texts []string) ([][]float32, error)
}
// NewIndexer creates an Indexer backed by a vector store built from
// vectorConfig.
//
// A nil config is replaced with an empty Config so that later reads of
// configuration fields (chunk size, model, dimensions) see zero values
// instead of dereferencing a nil pointer. The returned Indexer has no
// embedder; call SetEmbedder before Index or Search.
func NewIndexer(config *Config, vectorConfig *vector.Config) *Indexer {
	if config == nil {
		config = &Config{}
	}
	return &Indexer{
		config:      config,
		vectorStore: vector.NewStore(vectorConfig),
		chunks:      make(map[string][]*Chunk),
	}
}
// SetEmbedder sets the embedding provider used by Index and Search.
//
// The assignment is not guarded by the Indexer's mutex, so it should happen
// before the Indexer is shared between goroutines.
func (i *Indexer) SetEmbedder(embedder Embedder) {
i.embedder = embedder
}
// Index chunks, embeds and stores each document in slice order, stopping at
// the first failure.
//
// It returns an error when no embedder has been set, or the wrapped error of
// the first document that fails. Documents handled before a failure are not
// rolled back.
func (i *Indexer) Index(ctx context.Context, docs []*scraper.Document) error {
	if i.embedder == nil {
		return fmt.Errorf("embedder not configured")
	}

	for idx := range docs {
		doc := docs[idx]
		err := i.indexDocument(ctx, doc)
		if err == nil {
			continue
		}
		return fmt.Errorf("failed to index document %s: %w", doc.ID, err)
	}

	return nil
}
// indexDocument chunks a single document, embeds all of its chunks in one
// Embed call, writes the resulting vectors to the vector store, and records
// the chunks under doc.ID so Delete can find them later.
//
// It returns an error if embedding fails, if the embedder returns a different
// number of vectors than chunks submitted, or if the vector store rejects the
// batch. Nothing is recorded in i.chunks unless the store write succeeds.
func (i *Indexer) indexDocument(ctx context.Context, doc *scraper.Document) error {
	// Chunk the document
	chunks := i.chunkDocument(doc)

	// Generate embeddings for all chunks
	texts := make([]string, len(chunks))
	for j, chunk := range chunks {
		texts[j] = chunk.Content
	}
	embeddings, err := i.embedder.Embed(ctx, texts)
	if err != nil {
		return fmt.Errorf("failed to generate embeddings: %w", err)
	}
	// Vectors are matched to chunks by index below, so a short result would
	// panic and any other mismatch means the pairing cannot be trusted.
	if len(embeddings) != len(chunks) {
		return fmt.Errorf("embedder returned %d embeddings for %d chunks", len(embeddings), len(chunks))
	}

	// Create vector documents
	vecDocs := make([]*vector.Document, len(chunks))
	for j, chunk := range chunks {
		chunk.Vector = embeddings[j]
		vecDocs[j] = &vector.Document{
			ID:       chunk.ID,
			Content:  chunk.Content,
			Vector:   chunk.Vector,
			Metadata: chunk.Metadata,
		}
	}

	// Add to vector store
	if err := i.vectorStore.Add(ctx, vecDocs); err != nil {
		return fmt.Errorf("failed to add to vector store: %w", err)
	}

	// Store chunks.
	// NOTE(review): this overwrites any list recorded earlier for doc.ID; if a
	// re-indexed document now yields fewer chunks, the higher-position chunks
	// written previously are no longer tracked for Delete — confirm whether
	// the store needs them removed.
	i.mu.Lock()
	i.chunks[doc.ID] = chunks
	i.mu.Unlock()

	return nil
}
// chunkDocument splits doc.Content into chunks by packing consecutive
// paragraphs (text separated by "\n\n") together until adding the next one
// would exceed the configured chunk size.
//
// Size is measured as len(paragraph) in bytes, used as a rough stand-in for a
// token count; the "\n\n" separators re-inserted between packed paragraphs
// are not counted. A non-positive ChunkSize falls back to 512. A single
// paragraph larger than the chunk size is not split and becomes a chunk of
// its own. ChunkOverlap is not consulted here.
//
// The result always holds at least one chunk: when the content has no
// non-empty paragraph, the whole content is returned as a single chunk at
// position 0. Positions are assigned sequentially starting at 0.
func (i *Indexer) chunkDocument(doc *scraper.Document) []*Chunk {
	chunkSize := i.config.ChunkSize
	// Treat negative values like "unset": with a negative limit no paragraph
	// would ever fit and every paragraph would end up in its own chunk.
	if chunkSize <= 0 {
		chunkSize = 512 // Default chunk size in tokens (approximate)
	}

	// Simple chunking by paragraphs and size
	content := doc.Content
	paragraphs := strings.Split(content, "\n\n")

	var chunks []*Chunk
	var currentChunk strings.Builder
	var currentSize int
	position := 0

	for _, para := range paragraphs {
		paraSize := len(para) // Approximate token count

		// A paragraph is appended when it fits, or unconditionally when the
		// current chunk is still empty (so oversized paragraphs are kept).
		if currentSize+paraSize <= chunkSize || currentSize == 0 {
			if currentSize > 0 {
				currentChunk.WriteString("\n\n")
			}
			currentChunk.WriteString(para)
			currentSize += paraSize
			continue
		}

		// Paragraph does not fit: emit the chunk built so far and start a new
		// one with this paragraph.
		if currentSize > 0 {
			chunks = append(chunks, i.createChunk(doc, currentChunk.String(), position))
			position++
		}
		currentChunk.Reset()
		currentChunk.WriteString(para)
		currentSize = paraSize
	}

	// Don't forget the last chunk
	if currentSize > 0 {
		chunks = append(chunks, i.createChunk(doc, currentChunk.String(), position))
	}

	// If no chunks were created, create one from the entire content
	if len(chunks) == 0 {
		chunks = append(chunks, i.createChunk(doc, content, 0))
	}

	return chunks
}
// createChunk builds the Chunk for one piece of a document's text.
//
// The chunk ID is derived from the document ID and position, and the metadata
// map copies the document's source, title, URL, type and timestamp alongside
// the position. The Vector field is left unset.
func (i *Indexer) createChunk(doc *scraper.Document, content string, position int) *Chunk {
	meta := map[string]any{
		"source":    doc.Source,
		"title":     doc.Title,
		"url":       doc.URL,
		"type":      doc.Type,
		"position":  position,
		"timestamp": doc.Timestamp,
	}

	c := new(Chunk)
	c.ID = generateChunkID(doc.ID, position)
	c.DocID = doc.ID
	c.Content = content
	c.Metadata = meta
	c.Position = position
	return c
}
// Search embeds query and returns the stored chunks most similar to it.
//
// At most limit results are requested from the vector store; results scoring
// below threshold are then dropped, so fewer than limit chunks (or none, as a
// nil slice) may come back. Returned chunks carry only the ID, Content and
// Metadata reported by the store; DocID, Vector and Position keep their zero
// values.
//
// It returns an error when no embedder is set, when embedding the query fails
// or yields no vector, or when the vector store search fails.
func (i *Indexer) Search(ctx context.Context, query string, limit int, threshold float64) ([]*Chunk, error) {
	if i.embedder == nil {
		return nil, fmt.Errorf("embedder not configured")
	}

	// Generate embedding for query
	embeddings, err := i.embedder.Embed(ctx, []string{query})
	if err != nil {
		return nil, fmt.Errorf("failed to embed query: %w", err)
	}
	// Guard the embeddings[0] access below against an empty result.
	if len(embeddings) == 0 {
		return nil, fmt.Errorf("embedder returned no embedding for query")
	}

	// Search vector store
	results, err := i.vectorStore.Search(ctx, embeddings[0], limit)
	if err != nil {
		return nil, fmt.Errorf("search failed: %w", err)
	}

	// Convert to chunks and filter by threshold
	var chunks []*Chunk
	for _, result := range results {
		if result.Score < threshold {
			continue
		}
		chunks = append(chunks, &Chunk{
			ID:       result.ID,
			Content:  result.Content,
			Metadata: result.Metadata,
		})
	}

	return chunks, nil
}
// Delete removes a document's chunks from the vector store and forgets the
// document.
//
// Deleting an unknown docID is a no-op that returns nil. Every recorded chunk
// is attempted even if an earlier one fails. When any deletion fails, the
// chunks that could not be removed stay recorded under docID so that a later
// Delete call can retry them, and an error wrapping the first failure is
// returned. The document record is dropped only once all chunks are gone.
//
// The Indexer's write lock is held for the whole call, including the vector
// store round-trips.
func (i *Indexer) Delete(ctx context.Context, docID string) error {
	i.mu.Lock()
	defer i.mu.Unlock()

	chunks, ok := i.chunks[docID]
	if !ok {
		return nil
	}

	// Delete chunks from vector store, remembering the ones that fail.
	var (
		remaining []*Chunk
		firstErr  error
	)
	for _, chunk := range chunks {
		if err := i.vectorStore.Delete(ctx, chunk.ID); err != nil {
			remaining = append(remaining, chunk)
			if firstErr == nil {
				firstErr = err
			}
		}
	}

	if firstErr != nil {
		// Keep only the undeleted chunks so they are not orphaned in the store.
		i.chunks[docID] = remaining
		return fmt.Errorf("failed to delete %d of %d chunks for document %s: %w",
			len(remaining), len(chunks), docID, firstErr)
	}

	delete(i.chunks, docID)
	return nil
}
// Stats reports a snapshot of the index: the number of documents tracked by
// this Indexer ("document_count"), the number of entries the vector store
// reports ("chunk_count"), and the configured "dimensions" and "model".
//
// An error from the vector store's Count is returned unchanged.
func (i *Indexer) Stats(ctx context.Context) (map[string]interface{}, error) {
	chunkTotal, err := i.vectorStore.Count(ctx)
	if err != nil {
		return nil, err
	}

	// Only the map length is needed, so the read lock is held briefly.
	i.mu.RLock()
	docTotal := len(i.chunks)
	i.mu.RUnlock()

	stats := make(map[string]interface{}, 4)
	stats["document_count"] = docTotal
	stats["chunk_count"] = chunkTotal
	stats["dimensions"] = i.config.Dimensions
	stats["model"] = i.config.Model
	return stats, nil
}
// generateChunkID derives a deterministic chunk identifier from a document ID
// and chunk position: the first 12 bytes of SHA-256("<docID>-<position>"),
// rendered as 24 lowercase hex characters.
func generateChunkID(docID string, position int) string {
	hasher := sha256.New()
	// hash.Hash.Write never returns an error, so the result is not checked.
	fmt.Fprintf(hasher, "%s-%d", docID, position)
	digest := hasher.Sum(nil)
	return hex.EncodeToString(digest[:12])
}