first commit

2026-07-29 07:33:48 +00:00 · 2026-02-22 10:42:17 +01:00
commit 55885a0e8f
239 changed files with 103690 additions and 0 deletions
@@ -0,0 +1,271 @@
+// Package indexer provides document indexing and embedding generation.
+package indexer
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"strings"
+	"sync"
+
+	"github.com/yourorg/devour/internal/scraper"
+	"github.com/yourorg/devour/internal/vector"
+)
+
+// Config holds indexer configuration.
+//
+// Every field carries a yaml struct tag naming the key it is loaded from.
+// Within this file only ChunkSize (chunkDocument) and Model/Dimensions
+// (Stats) are read; Provider, APIKey, BatchSize and ChunkOverlap are not
+// referenced here and are presumably consumed by other packages — TODO
+// confirm.
+type Config struct {
+	Provider   string `yaml:"provider"`
+	Model      string `yaml:"model"`
+	Dimensions int    `yaml:"dimensions"`
+	APIKey     string `yaml:"api_key"`
+	BatchSize  int    `yaml:"batch_size"`
+
+	// Chunking settings
+	//
+	// ChunkSize is compared against byte lengths of paragraphs when a
+	// document is chunked; a value of 0 selects the built-in default of 512.
+	ChunkSize    int `yaml:"chunk_size"`
+	ChunkOverlap int `yaml:"chunk_overlap"`
+}
+
+// Chunk represents a text chunk with its embedding.
+//
+// Chunks built while indexing have every field set. Chunks returned by
+// Indexer.Search carry only ID, Content and Metadata; their DocID, Vector
+// and Position are left at the zero value.
+type Chunk struct {
+	// ID is derived from the owning document's ID and the chunk position.
+	ID string `json:"id"`
+	// DocID is the ID of the document the chunk was cut from.
+	DocID string `json:"doc_id"`
+	// Content is the chunk text that gets embedded.
+	Content string `json:"content"`
+	// Vector is the embedding for Content; it is omitted from JSON when empty.
+	Vector []float32 `json:"vector,omitempty"`
+	// Metadata holds the keys "source", "title", "url", "type", "position"
+	// and "timestamp", copied from the source document at creation time.
+	Metadata map[string]any `json:"metadata"`
+	// Position is the zero-based index of the chunk within its document.
+	Position int `json:"position"`
+}
+
+// Indexer handles document indexing with embeddings.
+//
+// It splits documents into chunks, obtains a vector for each chunk from the
+// configured Embedder, and writes the result to a vector.Store. It also keeps
+// an in-memory record of which chunks belong to which document so that a
+// document can be deleted later.
+type Indexer struct {
+	config      *Config
+	vectorStore vector.Store
+	// embedder is nil until SetEmbedder is called; Index and Search return
+	// an error while it is nil.
+	embedder Embedder
+	// mu guards chunks.
+	mu     sync.RWMutex
+	chunks map[string][]*Chunk // docID -> chunks
+}
+
+// Embedder generates embeddings for text.
+//
+// Embed receives a batch of texts and returns a slice of vectors or an
+// error. The callers in this file pair the result positionally with the
+// input (result j is used as the vector for text j), so implementations are
+// expected to return one vector per text, in input order.
+type Embedder interface {
+	Embed(ctx context.Context, texts []string) ([][]float32, error)
+}
+
+// NewIndexer creates a new indexer.
+//
+// config supplies the chunking and reporting settings. A nil config is
+// replaced by an empty Config so that later reads of it (when chunking
+// documents or building statistics) fall back to defaults instead of
+// dereferencing a nil pointer. vectorConfig is handed unchanged to
+// vector.NewStore.
+//
+// The returned Indexer has no embedder yet; call SetEmbedder before using
+// Index or Search.
+func NewIndexer(config *Config, vectorConfig *vector.Config) *Indexer {
+	if config == nil {
+		config = &Config{}
+	}
+
+	return &Indexer{
+		config:      config,
+		vectorStore: vector.NewStore(vectorConfig),
+		chunks:      make(map[string][]*Chunk),
+	}
+}
+
+// SetEmbedder sets the embedding provider.
+//
+// Index and Search return an "embedder not configured" error for as long as
+// the embedder is nil. The field is assigned without taking i.mu.
+func (i *Indexer) SetEmbedder(embedder Embedder) {
+	i.embedder = embedder
+}
+
+// Index processes documents and adds them to the vector store.
+//
+// Documents are handled one at a time, in slice order. Processing stops at
+// the first failure and the error is returned wrapped with the ID of the
+// offending document; documents indexed before the failure stay indexed.
+//
+// An error is returned when no embedder has been configured, and when docs
+// contains a nil entry (which would otherwise panic as soon as the
+// document's fields are read).
+func (i *Indexer) Index(ctx context.Context, docs []*scraper.Document) error {
+	if i.embedder == nil {
+		return fmt.Errorf("embedder not configured")
+	}
+
+	for idx, doc := range docs {
+		if doc == nil {
+			return fmt.Errorf("document at index %d is nil", idx)
+		}
+		if err := i.indexDocument(ctx, doc); err != nil {
+			return fmt.Errorf("failed to index document %s: %w", doc.ID, err)
+		}
+	}
+
+	return nil
+}
+
+// indexDocument processes a single document.
+//
+// The document is split into chunks, the content of every chunk is embedded
+// in a single Embed call, and the resulting vectors are written to the
+// vector store. On success the chunks are recorded under doc.ID so that
+// Delete can find them later.
+//
+// Two situations are handled explicitly:
+//   - An embedder that returns a different number of vectors than it was
+//     given texts is reported as an error instead of causing an
+//     out-of-range panic.
+//   - When doc.ID has been indexed before, chunks from the earlier run whose
+//     IDs are not produced this time are deleted from the vector store.
+//     Otherwise, replacing the recorded chunk list would leave them in the
+//     store with nothing left that could delete them.
+func (i *Indexer) indexDocument(ctx context.Context, doc *scraper.Document) error {
+	chunks := i.chunkDocument(doc)
+
+	// Collect the text of every chunk so they can be embedded in one call.
+	texts := make([]string, len(chunks))
+	for j, chunk := range chunks {
+		texts[j] = chunk.Content
+	}
+
+	embeddings, err := i.embedder.Embed(ctx, texts)
+	if err != nil {
+		return fmt.Errorf("failed to generate embeddings: %w", err)
+	}
+	if len(embeddings) != len(chunks) {
+		return fmt.Errorf("embedder returned %d embeddings for %d chunks", len(embeddings), len(chunks))
+	}
+
+	// Attach each vector to its chunk and build the store records. The set
+	// of current IDs is used below to spot chunks left over from an earlier
+	// run.
+	vecDocs := make([]*vector.Document, len(chunks))
+	current := make(map[string]struct{}, len(chunks))
+	for j, chunk := range chunks {
+		chunk.Vector = embeddings[j]
+		vecDocs[j] = &vector.Document{
+			ID:       chunk.ID,
+			Content:  chunk.Content,
+			Vector:   chunk.Vector,
+			Metadata: chunk.Metadata,
+		}
+		current[chunk.ID] = struct{}{}
+	}
+
+	if err := i.vectorStore.Add(ctx, vecDocs); err != nil {
+		return fmt.Errorf("failed to add to vector store: %w", err)
+	}
+
+	// The lock is held across the store calls, as Delete does, so the
+	// recorded chunk list cannot change between reading and replacing it.
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	for _, old := range i.chunks[doc.ID] {
+		if _, kept := current[old.ID]; kept {
+			continue
+		}
+		if err := i.vectorStore.Delete(ctx, old.ID); err != nil {
+			// The previous list stays registered so a later Delete can
+			// still reach the leftover chunks.
+			return fmt.Errorf("failed to delete stale chunk %s: %w", old.ID, err)
+		}
+	}
+
+	i.chunks[doc.ID] = chunks
+	return nil
+}
+
+// chunkDocument splits a document into chunks.
+//
+// The content is cut at blank lines ("\n\n") into paragraphs, and
+// consecutive paragraphs are packed into one chunk for as long as their
+// combined size stays within the configured ChunkSize. Sizes are byte
+// lengths (len), used as a rough stand-in for a token count; the "\n\n"
+// separators re-inserted between packed paragraphs are not counted.
+//
+// A non-positive ChunkSize selects the default of 512. A single paragraph
+// larger than the limit is never split: it becomes a chunk of its own.
+// Config.ChunkOverlap is not applied here.
+//
+// The result always holds at least one chunk; if no paragraph had any
+// content, a single chunk with the whole (possibly empty) content is
+// returned. Chunk positions count up from 0 in document order.
+func (i *Indexer) chunkDocument(doc *scraper.Document) []*Chunk {
+	chunkSize := i.config.ChunkSize
+	if chunkSize <= 0 {
+		chunkSize = 512 // Default chunk size in tokens (approximate)
+	}
+
+	content := doc.Content
+
+	var (
+		chunks      []*Chunk
+		pending     strings.Builder
+		pendingSize int
+	)
+
+	for _, para := range strings.Split(content, "\n\n") {
+		paraSize := len(para) // Approximate token count
+
+		// Close the pending chunk when this paragraph would push it past
+		// the limit. An empty pending chunk accepts any paragraph, however
+		// large. The next position is always len(chunks).
+		if pendingSize > 0 && pendingSize+paraSize > chunkSize {
+			chunks = append(chunks, i.createChunk(doc, pending.String(), len(chunks)))
+			pending.Reset()
+			pendingSize = 0
+		}
+
+		if pendingSize > 0 {
+			pending.WriteString("\n\n")
+		}
+		pending.WriteString(para)
+		pendingSize += paraSize
+	}
+
+	// Flush whatever is still pending after the last paragraph.
+	if pendingSize > 0 {
+		chunks = append(chunks, i.createChunk(doc, pending.String(), len(chunks)))
+	}
+
+	// If no chunks were created, create one from the entire content.
+	if len(chunks) == 0 {
+		chunks = append(chunks, i.createChunk(doc, content, 0))
+	}
+
+	return chunks
+}
+
+// createChunk builds the Chunk for one piece of a document.
+//
+// The chunk ID is derived from the document ID and position. The metadata
+// map copies the document's source, title, URL, type and timestamp and
+// records the position; Vector is left unset.
+func (i *Indexer) createChunk(doc *scraper.Document, content string, position int) *Chunk {
+	meta := make(map[string]any, 6)
+	meta["source"] = doc.Source
+	meta["title"] = doc.Title
+	meta["url"] = doc.URL
+	meta["type"] = doc.Type
+	meta["position"] = position
+	meta["timestamp"] = doc.Timestamp
+
+	chunk := &Chunk{
+		DocID:    doc.ID,
+		Content:  content,
+		Metadata: meta,
+		Position: position,
+	}
+	chunk.ID = generateChunkID(doc.ID, position)
+
+	return chunk
+}
+
+// Search finds similar chunks to the query.
+//
+// The query is embedded with the configured embedder and the resulting
+// vector is passed to the vector store together with limit. Results whose
+// score is below threshold are dropped afterwards, so fewer than limit
+// chunks may be returned; when nothing passes the filter the returned slice
+// is nil.
+//
+// Returned chunks carry only ID, Content and Metadata from the store result;
+// DocID, Vector and Position are left at their zero values.
+//
+// An error is returned when no embedder is configured, when embedding or
+// the store search fails, or when the embedder reports success but returns
+// no vector for the query.
+func (i *Indexer) Search(ctx context.Context, query string, limit int, threshold float64) ([]*Chunk, error) {
+	if i.embedder == nil {
+		return nil, fmt.Errorf("embedder not configured")
+	}
+
+	// Generate embedding for query
+	embeddings, err := i.embedder.Embed(ctx, []string{query})
+	if err != nil {
+		return nil, fmt.Errorf("failed to embed query: %w", err)
+	}
+	// Guard the index below: an empty result without an error would
+	// otherwise panic.
+	if len(embeddings) == 0 {
+		return nil, fmt.Errorf("failed to embed query: embedder returned no embeddings")
+	}
+
+	// Search vector store
+	results, err := i.vectorStore.Search(ctx, embeddings[0], limit)
+	if err != nil {
+		return nil, fmt.Errorf("search failed: %w", err)
+	}
+
+	// Convert to chunks and filter by threshold
+	var chunks []*Chunk
+	for _, result := range results {
+		if result.Score < threshold {
+			continue
+		}
+		chunks = append(chunks, &Chunk{
+			ID:       result.ID,
+			Content:  result.Content,
+			Metadata: result.Metadata,
+		})
+	}
+
+	return chunks, nil
+}
+
+// Delete removes a document and its chunks.
+func (i *Indexer) Delete(ctx context.Context, docID string) error {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	chunks, ok := i.chunks[docID]
+	if !ok {
+		return nil
+	}
+
+	// Delete chunks from vector store
+	for _, chunk := range chunks {
+		if err := i.vectorStore.Delete(ctx, chunk.ID); err != nil {
+			fmt.Printf("Warning: failed to delete chunk %s: %v\n", chunk.ID, err)
+		}
+	}
+
+	delete(i.chunks, docID)
+	return nil
+}
+
+// Stats reports a snapshot of the index.
+//
+// The returned map has four keys: "chunk_count" (the count reported by the
+// vector store), "document_count" (the number of documents recorded in
+// memory), and "dimensions" and "model" (copied from the configuration). An
+// error from the vector store's Count is returned unchanged.
+func (i *Indexer) Stats(ctx context.Context) (map[string]interface{}, error) {
+	chunkTotal, err := i.vectorStore.Count(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	stats := map[string]interface{}{
+		"chunk_count": chunkTotal,
+		"dimensions":  i.config.Dimensions,
+		"model":       i.config.Model,
+	}
+
+	// Only the map length is read under the lock.
+	i.mu.RLock()
+	stats["document_count"] = len(i.chunks)
+	i.mu.RUnlock()
+
+	return stats, nil
+}
+
+// generateChunkID derives a chunk ID from a document ID and chunk position.
+//
+// The ID is the first 12 bytes of the SHA-256 digest of "<docID>-<position>",
+// hex-encoded into a 24-character string. The same inputs always yield the
+// same ID.
+func generateChunkID(docID string, position int) string {
+	hasher := sha256.New()
+	fmt.Fprintf(hasher, "%s-%d", docID, position)
+	digest := hasher.Sum(nil)
+	return hex.EncodeToString(digest[:12])
+}