mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 12:33:04 +00:00
272 lines
6.6 KiB
Go
272 lines
6.6 KiB
Go
// Package indexer provides document indexing and embedding generation.
|
|
package indexer
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"strings"
|
|
"sync"
|
|
|
|
"github.com/yourorg/devour/internal/scraper"
|
|
"github.com/yourorg/devour/internal/vector"
|
|
)
|
|
|
|
// Config carries the indexer settings: which embedding backend is in use and
// how document text is split into chunks. Field names map to YAML keys via
// the struct tags.
type Config struct {
	// Embedding backend settings. Provider, APIKey and BatchSize are not read
	// by the indexer code in this file; presumably they are consumed by
	// whatever constructs the Embedder (TODO confirm).
	Provider string `yaml:"provider"`

	// Model is reported verbatim by Stats under the "model" key.
	Model string `yaml:"model"`

	// Dimensions is reported verbatim by Stats under the "dimensions" key.
	Dimensions int `yaml:"dimensions"`

	APIKey string `yaml:"api_key"`

	BatchSize int `yaml:"batch_size"`

	// Chunking settings.

	// ChunkSize is the size budget for one chunk. chunkDocument compares it
	// against the byte length of paragraph text and falls back to 512 when
	// the value is zero.
	ChunkSize int `yaml:"chunk_size"`

	// ChunkOverlap is not consulted by chunkDocument as written in this file.
	ChunkOverlap int `yaml:"chunk_overlap"`
}
|
|
|
|
// Chunk is one piece of a document's text, together with its embedding and
// the metadata that describes where it came from.
type Chunk struct {
	// ID identifies the chunk; createChunk derives it from the document ID
	// and the chunk position via generateChunkID.
	ID string `json:"id"`

	// DocID is the ID of the document the chunk was cut from.
	DocID string `json:"doc_id"`

	// Content is the chunk text that gets embedded.
	Content string `json:"content"`

	// Vector is the embedding of Content. It is filled in by indexDocument
	// and left out of the JSON encoding when empty.
	Vector []float32 `json:"vector,omitempty"`

	// Metadata is stored alongside the vector. createChunk fills the keys
	// "source", "title", "url", "type", "position" and "timestamp".
	Metadata map[string]any `json:"metadata"`

	// Position is the zero-based index of the chunk within its document.
	Position int `json:"position"`
}
|
|
|
|
// Indexer handles document indexing with embeddings.
|
|
type Indexer struct {
|
|
config *Config
|
|
vectorStore vector.Store
|
|
embedder Embedder
|
|
mu sync.RWMutex
|
|
chunks map[string][]*Chunk // docID -> chunks
|
|
}
|
|
|
|
// Embedder turns text into embedding vectors.
//
// The indexer relies on Embed returning one vector per input text, in the
// same order as texts: result j is treated as the embedding of texts[j].
type Embedder interface {
	Embed(ctx context.Context, texts []string) ([][]float32, error)
}
|
|
|
|
// NewIndexer creates a new indexer.
|
|
func NewIndexer(config *Config, vectorConfig *vector.Config) *Indexer {
|
|
return &Indexer{
|
|
config: config,
|
|
vectorStore: vector.NewStore(vectorConfig),
|
|
chunks: make(map[string][]*Chunk),
|
|
}
|
|
}
|
|
|
|
// SetEmbedder sets the embedding provider.
|
|
func (i *Indexer) SetEmbedder(embedder Embedder) {
|
|
i.embedder = embedder
|
|
}
|
|
|
|
// Index processes documents and adds them to the vector store.
|
|
func (i *Indexer) Index(ctx context.Context, docs []*scraper.Document) error {
|
|
if i.embedder == nil {
|
|
return fmt.Errorf("embedder not configured")
|
|
}
|
|
|
|
for _, doc := range docs {
|
|
if err := i.indexDocument(ctx, doc); err != nil {
|
|
return fmt.Errorf("failed to index document %s: %w", doc.ID, err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// indexDocument processes a single document.
|
|
func (i *Indexer) indexDocument(ctx context.Context, doc *scraper.Document) error {
|
|
// Chunk the document
|
|
chunks := i.chunkDocument(doc)
|
|
|
|
// Generate embeddings for all chunks
|
|
texts := make([]string, len(chunks))
|
|
for j, chunk := range chunks {
|
|
texts[j] = chunk.Content
|
|
}
|
|
|
|
embeddings, err := i.embedder.Embed(ctx, texts)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to generate embeddings: %w", err)
|
|
}
|
|
|
|
// Create vector documents
|
|
vecDocs := make([]*vector.Document, len(chunks))
|
|
for j, chunk := range chunks {
|
|
chunk.Vector = embeddings[j]
|
|
vecDocs[j] = &vector.Document{
|
|
ID: chunk.ID,
|
|
Content: chunk.Content,
|
|
Vector: chunk.Vector,
|
|
Metadata: chunk.Metadata,
|
|
}
|
|
}
|
|
|
|
// Add to vector store
|
|
if err := i.vectorStore.Add(ctx, vecDocs); err != nil {
|
|
return fmt.Errorf("failed to add to vector store: %w", err)
|
|
}
|
|
|
|
// Store chunks
|
|
i.mu.Lock()
|
|
i.chunks[doc.ID] = chunks
|
|
i.mu.Unlock()
|
|
|
|
return nil
|
|
}
|
|
|
|
// chunkDocument splits a document into chunks.
|
|
func (i *Indexer) chunkDocument(doc *scraper.Document) []*Chunk {
|
|
chunkSize := i.config.ChunkSize
|
|
if chunkSize == 0 {
|
|
chunkSize = 512 // Default chunk size in tokens (approximate)
|
|
}
|
|
|
|
// Simple chunking by paragraphs and size
|
|
content := doc.Content
|
|
paragraphs := strings.Split(content, "\n\n")
|
|
|
|
var chunks []*Chunk
|
|
var currentChunk strings.Builder
|
|
var currentSize int
|
|
position := 0
|
|
|
|
for _, para := range paragraphs {
|
|
paraSize := len(para) // Approximate token count
|
|
|
|
// If paragraph fits, add it
|
|
if currentSize+paraSize <= chunkSize || currentSize == 0 {
|
|
if currentSize > 0 {
|
|
currentChunk.WriteString("\n\n")
|
|
}
|
|
currentChunk.WriteString(para)
|
|
currentSize += paraSize
|
|
} else {
|
|
// Save current chunk
|
|
if currentSize > 0 {
|
|
chunks = append(chunks, i.createChunk(doc, currentChunk.String(), position))
|
|
position++
|
|
}
|
|
|
|
currentChunk.Reset()
|
|
currentChunk.WriteString(para)
|
|
currentSize = paraSize
|
|
}
|
|
}
|
|
|
|
// Don't forget the last chunk
|
|
if currentSize > 0 {
|
|
chunks = append(chunks, i.createChunk(doc, currentChunk.String(), position))
|
|
}
|
|
|
|
// If no chunks were created, create one from the entire content
|
|
if len(chunks) == 0 {
|
|
chunks = append(chunks, i.createChunk(doc, content, 0))
|
|
}
|
|
|
|
return chunks
|
|
}
|
|
|
|
// createChunk creates a chunk from content.
|
|
func (i *Indexer) createChunk(doc *scraper.Document, content string, position int) *Chunk {
|
|
return &Chunk{
|
|
ID: generateChunkID(doc.ID, position),
|
|
DocID: doc.ID,
|
|
Content: content,
|
|
Metadata: map[string]any{
|
|
"source": doc.Source,
|
|
"title": doc.Title,
|
|
"url": doc.URL,
|
|
"type": doc.Type,
|
|
"position": position,
|
|
"timestamp": doc.Timestamp,
|
|
},
|
|
Position: position,
|
|
}
|
|
}
|
|
|
|
// Search finds similar chunks to the query.
|
|
func (i *Indexer) Search(ctx context.Context, query string, limit int, threshold float64) ([]*Chunk, error) {
|
|
if i.embedder == nil {
|
|
return nil, fmt.Errorf("embedder not configured")
|
|
}
|
|
|
|
// Generate embedding for query
|
|
embeddings, err := i.embedder.Embed(ctx, []string{query})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to embed query: %w", err)
|
|
}
|
|
|
|
// Search vector store
|
|
results, err := i.vectorStore.Search(ctx, embeddings[0], limit)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("search failed: %w", err)
|
|
}
|
|
|
|
// Convert to chunks and filter by threshold
|
|
var chunks []*Chunk
|
|
for _, result := range results {
|
|
if result.Score >= threshold {
|
|
chunks = append(chunks, &Chunk{
|
|
ID: result.ID,
|
|
Content: result.Content,
|
|
Metadata: result.Metadata,
|
|
})
|
|
}
|
|
}
|
|
|
|
return chunks, nil
|
|
}
|
|
|
|
// Delete removes a document and its chunks.
|
|
func (i *Indexer) Delete(ctx context.Context, docID string) error {
|
|
i.mu.Lock()
|
|
defer i.mu.Unlock()
|
|
|
|
chunks, ok := i.chunks[docID]
|
|
if !ok {
|
|
return nil
|
|
}
|
|
|
|
// Delete chunks from vector store
|
|
for _, chunk := range chunks {
|
|
if err := i.vectorStore.Delete(ctx, chunk.ID); err != nil {
|
|
fmt.Printf("Warning: failed to delete chunk %s: %v\n", chunk.ID, err)
|
|
}
|
|
}
|
|
|
|
delete(i.chunks, docID)
|
|
return nil
|
|
}
|
|
|
|
// Stats returns indexing statistics.
|
|
func (i *Indexer) Stats(ctx context.Context) (map[string]interface{}, error) {
|
|
count, err := i.vectorStore.Count(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
i.mu.RLock()
|
|
docCount := len(i.chunks)
|
|
i.mu.RUnlock()
|
|
|
|
return map[string]interface{}{
|
|
"document_count": docCount,
|
|
"chunk_count": count,
|
|
"dimensions": i.config.Dimensions,
|
|
"model": i.config.Model,
|
|
}, nil
|
|
}
|
|
|
|
// generateChunkID derives a stable chunk ID from a document ID and a chunk
// position: the first 12 bytes of the SHA-256 digest of "<docID>-<position>",
// hex-encoded into 24 lowercase characters. Equal inputs always give the same
// ID.
func generateChunkID(docID string, position int) string {
	const idBytes = 12 // digest prefix kept; yields 24 hex characters

	key := []byte(fmt.Sprintf("%s-%d", docID, position))
	digest := sha256.Sum256(key)

	return hex.EncodeToString(digest[:idBytes])
}
|