// Package indexer provides document indexing and embedding generation. package indexer import ( "context" "crypto/sha256" "encoding/hex" "fmt" "strings" "sync" "github.com/yourorg/devour/internal/scraper" "github.com/yourorg/devour/internal/vector" ) // Config holds indexer configuration. type Config struct { Provider string `yaml:"provider"` Model string `yaml:"model"` Dimensions int `yaml:"dimensions"` APIKey string `yaml:"api_key"` BatchSize int `yaml:"batch_size"` // Chunking settings ChunkSize int `yaml:"chunk_size"` ChunkOverlap int `yaml:"chunk_overlap"` } // Chunk represents a text chunk with its embedding. type Chunk struct { ID string `json:"id"` DocID string `json:"doc_id"` Content string `json:"content"` Vector []float32 `json:"vector,omitempty"` Metadata map[string]any `json:"metadata"` Position int `json:"position"` } // Indexer handles document indexing with embeddings. type Indexer struct { config *Config vectorStore vector.Store embedder Embedder mu sync.RWMutex chunks map[string][]*Chunk // docID -> chunks } // Embedder generates embeddings for text. type Embedder interface { Embed(ctx context.Context, texts []string) ([][]float32, error) } // NewIndexer creates a new indexer. func NewIndexer(config *Config, vectorConfig *vector.Config) *Indexer { return &Indexer{ config: config, vectorStore: vector.NewStore(vectorConfig), chunks: make(map[string][]*Chunk), } } // SetEmbedder sets the embedding provider. func (i *Indexer) SetEmbedder(embedder Embedder) { i.embedder = embedder } // Index processes documents and adds them to the vector store. func (i *Indexer) Index(ctx context.Context, docs []*scraper.Document) error { if i.embedder == nil { return fmt.Errorf("embedder not configured") } for _, doc := range docs { if err := i.indexDocument(ctx, doc); err != nil { return fmt.Errorf("failed to index document %s: %w", doc.ID, err) } } return nil } // indexDocument processes a single document. func (i *Indexer) indexDocument(ctx context.Context, doc *scraper.Document) error { // Chunk the document chunks := i.chunkDocument(doc) // Generate embeddings for all chunks texts := make([]string, len(chunks)) for j, chunk := range chunks { texts[j] = chunk.Content } embeddings, err := i.embedder.Embed(ctx, texts) if err != nil { return fmt.Errorf("failed to generate embeddings: %w", err) } // Create vector documents vecDocs := make([]*vector.Document, len(chunks)) for j, chunk := range chunks { chunk.Vector = embeddings[j] vecDocs[j] = &vector.Document{ ID: chunk.ID, Content: chunk.Content, Vector: chunk.Vector, Metadata: chunk.Metadata, } } // Add to vector store if err := i.vectorStore.Add(ctx, vecDocs); err != nil { return fmt.Errorf("failed to add to vector store: %w", err) } // Store chunks i.mu.Lock() i.chunks[doc.ID] = chunks i.mu.Unlock() return nil } // chunkDocument splits a document into chunks. func (i *Indexer) chunkDocument(doc *scraper.Document) []*Chunk { chunkSize := i.config.ChunkSize if chunkSize == 0 { chunkSize = 512 // Default chunk size in tokens (approximate) } // Simple chunking by paragraphs and size content := doc.Content paragraphs := strings.Split(content, "\n\n") var chunks []*Chunk var currentChunk strings.Builder var currentSize int position := 0 for _, para := range paragraphs { paraSize := len(para) // Approximate token count // If paragraph fits, add it if currentSize+paraSize <= chunkSize || currentSize == 0 { if currentSize > 0 { currentChunk.WriteString("\n\n") } currentChunk.WriteString(para) currentSize += paraSize } else { // Save current chunk if currentSize > 0 { chunks = append(chunks, i.createChunk(doc, currentChunk.String(), position)) position++ } currentChunk.Reset() currentChunk.WriteString(para) currentSize = paraSize } } // Don't forget the last chunk if currentSize > 0 { chunks = append(chunks, i.createChunk(doc, currentChunk.String(), position)) } // If no chunks were created, create one from the entire content if len(chunks) == 0 { chunks = append(chunks, i.createChunk(doc, content, 0)) } return chunks } // createChunk creates a chunk from content. func (i *Indexer) createChunk(doc *scraper.Document, content string, position int) *Chunk { return &Chunk{ ID: generateChunkID(doc.ID, position), DocID: doc.ID, Content: content, Metadata: map[string]any{ "source": doc.Source, "title": doc.Title, "url": doc.URL, "type": doc.Type, "position": position, "timestamp": doc.Timestamp, }, Position: position, } } // Search finds similar chunks to the query. func (i *Indexer) Search(ctx context.Context, query string, limit int, threshold float64) ([]*Chunk, error) { if i.embedder == nil { return nil, fmt.Errorf("embedder not configured") } // Generate embedding for query embeddings, err := i.embedder.Embed(ctx, []string{query}) if err != nil { return nil, fmt.Errorf("failed to embed query: %w", err) } // Search vector store results, err := i.vectorStore.Search(ctx, embeddings[0], limit) if err != nil { return nil, fmt.Errorf("search failed: %w", err) } // Convert to chunks and filter by threshold var chunks []*Chunk for _, result := range results { if result.Score >= threshold { chunks = append(chunks, &Chunk{ ID: result.ID, Content: result.Content, Metadata: result.Metadata, }) } } return chunks, nil } // Delete removes a document and its chunks. func (i *Indexer) Delete(ctx context.Context, docID string) error { i.mu.Lock() defer i.mu.Unlock() chunks, ok := i.chunks[docID] if !ok { return nil } // Delete chunks from vector store for _, chunk := range chunks { if err := i.vectorStore.Delete(ctx, chunk.ID); err != nil { fmt.Printf("Warning: failed to delete chunk %s: %v\n", chunk.ID, err) } } delete(i.chunks, docID) return nil } // Stats returns indexing statistics. func (i *Indexer) Stats(ctx context.Context) (map[string]interface{}, error) { count, err := i.vectorStore.Count(ctx) if err != nil { return nil, err } i.mu.RLock() docCount := len(i.chunks) i.mu.RUnlock() return map[string]interface{}{ "document_count": docCount, "chunk_count": count, "dimensions": i.config.Dimensions, "model": i.config.Model, }, nil } // generateChunkID generates a unique ID for a chunk. func generateChunkID(docID string, position int) string { data := fmt.Sprintf("%s-%d", docID, position) hash := sha256.Sum256([]byte(data)) return hex.EncodeToString(hash[:12]) }