// Devour/internal/indexer/indexer.go

// Package indexer provides document indexing and embedding generation.
package indexer

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"strings"
	"sync"

	"github.com/yourorg/devour/internal/scraper"
	"github.com/yourorg/devour/internal/vector"
)

// Config holds indexer configuration.
//
// Within this file only ChunkSize, Dimensions and Model are read. Provider,
// APIKey, BatchSize and ChunkOverlap are not referenced by the visible code;
// presumably they are consumed where the embedder is constructed — verify
// against callers.
type Config struct {
	// Provider, Model, Dimensions, APIKey and BatchSize describe the embedding
	// backend. Model and Dimensions are reported verbatim by Indexer.Stats.
	Provider   string `yaml:"provider"`
	Model      string `yaml:"model"`
	Dimensions int    `yaml:"dimensions"`
	APIKey     string `yaml:"api_key"`
	BatchSize  int    `yaml:"batch_size"`

	// Chunking settings
	//
	// ChunkSize is the target chunk size. It is compared against the byte
	// length of each paragraph; a zero value selects the default of 512.
	ChunkSize    int `yaml:"chunk_size"`
	// ChunkOverlap is not used by the chunking code in this file.
	ChunkOverlap int `yaml:"chunk_overlap"`
}

// Chunk represents a text chunk with its embedding.
//
// Chunks produced by indexing carry every field. Chunks returned by
// Indexer.Search are rebuilt from vector-store results and only have ID,
// Content and Metadata populated.
type Chunk struct {
	// ID is derived from the document ID and the chunk position (see
	// generateChunkID).
	ID       string         `json:"id"`
	// DocID is the ID of the document the chunk was cut from.
	DocID    string         `json:"doc_id"`
	// Content is the chunk text that gets embedded.
	Content  string         `json:"content"`
	// Vector is the embedding for Content; omitted from JSON when empty.
	Vector   []float32      `json:"vector,omitempty"`
	// Metadata holds the document attributes copied in by createChunk
	// ("source", "title", "url", "type", "position", "timestamp").
	Metadata map[string]any `json:"metadata"`
	// Position is the zero-based index of the chunk within its document.
	Position int            `json:"position"`
}

// Indexer handles document indexing with embeddings.
//
// It chunks documents, asks the configured Embedder for vectors, writes the
// results to the vector store, and remembers which chunks belong to which
// document so they can be deleted later.
type Indexer struct {
	config      *Config
	vectorStore vector.Store
	// embedder is nil until SetEmbedder is called; Index and Search refuse to
	// run without it.
	embedder    Embedder
	// mu guards chunks. It does not guard embedder or config.
	mu          sync.RWMutex
	chunks      map[string][]*Chunk // docID -> chunks
}

// Embedder generates embeddings for text.
//
// The indexer pairs result j with input text j, so implementations are
// expected to return exactly one vector per input, in input order.
type Embedder interface {
	// Embed returns the embedding vectors for texts, or an error.
	Embed(ctx context.Context, texts []string) ([][]float32, error)
}

// NewIndexer creates a new indexer backed by a vector store built from
// vectorConfig.
//
// A nil config is replaced with an empty Config so that methods which read
// configuration (for example Stats) use zero values and defaults instead of
// panicking on a nil pointer. The returned Indexer has no embedder; call
// SetEmbedder before Index or Search.
func NewIndexer(config *Config, vectorConfig *vector.Config) *Indexer {
	if config == nil {
		config = &Config{}
	}

	return &Indexer{
		config:      config,
		vectorStore: vector.NewStore(vectorConfig),
		chunks:      make(map[string][]*Chunk),
	}
}

// SetEmbedder sets the embedding provider used by Index and Search.
//
// The assignment is not synchronized with i.mu, so call this during setup,
// before the indexer is used from other goroutines.
func (i *Indexer) SetEmbedder(embedder Embedder) {
	i.embedder = embedder
}

// Index processes documents and adds them to the vector store.
//
// Documents are indexed sequentially in slice order. Processing stops at the
// first failure, so documents earlier in the slice remain indexed when an
// error is returned. An error is returned when no embedder is configured,
// when ctx is cancelled between documents, when docs contains a nil entry, or
// when indexing a document fails.
func (i *Indexer) Index(ctx context.Context, docs []*scraper.Document) error {
	if i.embedder == nil {
		return fmt.Errorf("embedder not configured")
	}

	for n, doc := range docs {
		// Stop early instead of issuing further embedding calls for a request
		// that has already been cancelled.
		if err := ctx.Err(); err != nil {
			return fmt.Errorf("indexing cancelled before document %d of %d: %w", n+1, len(docs), err)
		}

		// A nil entry would otherwise cause a nil-pointer panic while chunking.
		if doc == nil {
			return fmt.Errorf("failed to index document at position %d: document is nil", n)
		}

		if err := i.indexDocument(ctx, doc); err != nil {
			return fmt.Errorf("failed to index document %s: %w", doc.ID, err)
		}
	}

	return nil
}

// indexDocument processes a single document: it chunks the content, embeds
// every chunk in one Embed call, writes the resulting vectors to the vector
// store, and records the chunks under the document ID.
//
// It returns an error when embedding fails, when the embedder does not return
// exactly one vector per chunk, or when the vector store rejects the write.
// Nothing is recorded in i.chunks unless the vector store write succeeded.
//
// NOTE(review): re-indexing a document that now yields fewer chunks replaces
// the tracked chunk list without deleting the old trailing chunks from the
// vector store — confirm whether that is handled elsewhere.
func (i *Indexer) indexDocument(ctx context.Context, doc *scraper.Document) error {
	// Chunk the document
	chunks := i.chunkDocument(doc)

	// Generate embeddings for all chunks
	texts := make([]string, len(chunks))
	for j, chunk := range chunks {
		texts[j] = chunk.Content
	}

	embeddings, err := i.embedder.Embed(ctx, texts)
	if err != nil {
		return fmt.Errorf("failed to generate embeddings: %w", err)
	}

	// Vectors are matched to chunks by index below; a short or long result
	// would either panic or silently mispair text and vectors.
	if len(embeddings) != len(chunks) {
		return fmt.Errorf("failed to generate embeddings: embedder returned %d vectors for %d chunks", len(embeddings), len(chunks))
	}

	// Create vector documents
	vecDocs := make([]*vector.Document, len(chunks))
	for j, chunk := range chunks {
		chunk.Vector = embeddings[j]
		vecDocs[j] = &vector.Document{
			ID:       chunk.ID,
			Content:  chunk.Content,
			Vector:   chunk.Vector,
			Metadata: chunk.Metadata,
		}
	}

	// Add to vector store
	if err := i.vectorStore.Add(ctx, vecDocs); err != nil {
		return fmt.Errorf("failed to add to vector store: %w", err)
	}

	// Store chunks
	i.mu.Lock()
	i.chunks[doc.ID] = chunks
	i.mu.Unlock()

	return nil
}

// chunkDocument splits a document into chunks.
//
// The content is split on blank lines ("\n\n") and consecutive paragraphs are
// packed into a chunk until adding the next one would exceed the configured
// chunk size. Sizes are measured in bytes (len of the paragraph), which only
// approximates a token count, and the "\n\n" separators are not counted. A
// single paragraph larger than the chunk size is kept whole as its own chunk.
//
// A missing config or a non-positive ChunkSize selects the default of 512.
// At least one chunk is always returned: when the content has no non-empty
// paragraph, a single chunk holding the entire content is produced. Chunk
// positions are consecutive and start at 0.
func (i *Indexer) chunkDocument(doc *scraper.Document) []*Chunk {
	const defaultChunkSize = 512

	chunkSize := defaultChunkSize
	if i.config != nil && i.config.ChunkSize > 0 {
		chunkSize = i.config.ChunkSize
	}

	texts := groupParagraphsBySize(doc.Content, chunkSize)
	if len(texts) == 0 {
		// Nothing but empty paragraphs: fall back to the whole content.
		texts = []string{doc.Content}
	}

	chunks := make([]*Chunk, 0, len(texts))
	for position, text := range texts {
		chunks = append(chunks, i.createChunk(doc, text, position))
	}

	return chunks
}

// groupParagraphsBySize packs the "\n\n"-separated paragraphs of content into
// groups whose summed paragraph byte length does not exceed chunkSize, joining
// the paragraphs of a group with "\n\n". It returns nil when every paragraph
// is empty.
func groupParagraphsBySize(content string, chunkSize int) []string {
	var (
		groups      []string
		current     strings.Builder
		currentSize int
	)

	for _, para := range strings.Split(content, "\n\n") {
		paraSize := len(para)

		// Close the running group when this paragraph would overflow it. An
		// empty group always accepts the paragraph, however large.
		if currentSize > 0 && currentSize+paraSize > chunkSize {
			groups = append(groups, current.String())
			current.Reset()
			currentSize = 0
		}

		if currentSize > 0 {
			current.WriteString("\n\n")
		}
		current.WriteString(para)
		currentSize += paraSize
	}

	// Flush the trailing group.
	if currentSize > 0 {
		groups = append(groups, current.String())
	}

	return groups
}

// createChunk builds the Chunk for one slice of a document's content.
//
// The chunk ID is derived from the document ID and position, and the
// document's descriptive fields are copied into the chunk metadata alongside
// the position.
func (i *Indexer) createChunk(doc *scraper.Document, content string, position int) *Chunk {
	meta := make(map[string]any, 6)
	meta["source"] = doc.Source
	meta["title"] = doc.Title
	meta["url"] = doc.URL
	meta["type"] = doc.Type
	meta["position"] = position
	meta["timestamp"] = doc.Timestamp

	chunk := new(Chunk)
	chunk.ID = generateChunkID(doc.ID, position)
	chunk.DocID = doc.ID
	chunk.Content = content
	chunk.Metadata = meta
	chunk.Position = position
	return chunk
}

// Search finds similar chunks to the query.
//
// The query is embedded with the configured embedder, the vector store is
// asked for up to limit nearest results, and results scoring below threshold
// are dropped. Because the threshold is applied after the store lookup, fewer
// than limit chunks may be returned. Returned chunks carry only ID, Content
// and Metadata; the slice is nil when nothing passes the threshold.
//
// An error is returned when no embedder is configured, when embedding the
// query fails or yields no vector, or when the vector store search fails.
func (i *Indexer) Search(ctx context.Context, query string, limit int, threshold float64) ([]*Chunk, error) {
	if i.embedder == nil {
		return nil, fmt.Errorf("embedder not configured")
	}

	// Generate embedding for query
	embeddings, err := i.embedder.Embed(ctx, []string{query})
	if err != nil {
		return nil, fmt.Errorf("failed to embed query: %w", err)
	}

	// Guard the embeddings[0] access below against an empty result.
	if len(embeddings) == 0 {
		return nil, fmt.Errorf("failed to embed query: embedder returned no embeddings")
	}

	// Search vector store
	results, err := i.vectorStore.Search(ctx, embeddings[0], limit)
	if err != nil {
		return nil, fmt.Errorf("search failed: %w", err)
	}

	// Convert to chunks and filter by threshold
	var chunks []*Chunk
	for _, result := range results {
		if result.Score >= threshold {
			chunks = append(chunks, &Chunk{
				ID:       result.ID,
				Content:  result.Content,
				Metadata: result.Metadata,
			})
		}
	}

	return chunks, nil
}

// Delete removes a document and its chunks.
//
// Every tracked chunk of the document is deleted from the vector store.
// Deleting an unknown document is a no-op that returns nil.
//
// Deletion is attempted for all chunks even if some fail. When any chunk
// cannot be deleted, the chunks that failed stay tracked under docID so a
// later Delete can retry them, and an error wrapping the first failure is
// returned. The document is only forgotten once all its chunks are gone.
//
// The write lock is held for the whole operation, including the vector store
// calls.
func (i *Indexer) Delete(ctx context.Context, docID string) error {
	i.mu.Lock()
	defer i.mu.Unlock()

	chunks, ok := i.chunks[docID]
	if !ok {
		return nil
	}

	// Delete chunks from vector store, remembering the ones that failed so
	// they are not orphaned.
	var (
		remaining []*Chunk
		firstErr  error
	)
	for _, chunk := range chunks {
		if err := i.vectorStore.Delete(ctx, chunk.ID); err != nil {
			remaining = append(remaining, chunk)
			if firstErr == nil {
				firstErr = err
			}
		}
	}

	if firstErr != nil {
		i.chunks[docID] = remaining
		return fmt.Errorf("failed to delete %d of %d chunks for document %s: %w", len(remaining), len(chunks), docID, firstErr)
	}

	delete(i.chunks, docID)
	return nil
}

// Stats reports indexing statistics.
//
// The result has four keys: "document_count" (documents tracked by this
// indexer), "chunk_count" (the count reported by the vector store),
// "dimensions" and "model" (both taken from the configuration). An error from
// the vector store count is returned unchanged.
func (i *Indexer) Stats(ctx context.Context) (map[string]interface{}, error) {
	chunkTotal, err := i.vectorStore.Count(ctx)
	if err != nil {
		return nil, err
	}

	// Read the tracked-document count under the read lock.
	docTotal := func() int {
		i.mu.RLock()
		defer i.mu.RUnlock()
		return len(i.chunks)
	}()

	stats := make(map[string]interface{}, 4)
	stats["document_count"] = docTotal
	stats["chunk_count"] = chunkTotal
	stats["dimensions"] = i.config.Dimensions
	stats["model"] = i.config.Model
	return stats, nil
}

// generateChunkID derives a deterministic ID for the chunk at the given
// position of a document: the SHA-256 of "<docID>-<position>", truncated to
// its first 12 bytes and hex-encoded (24 lowercase hex characters).
func generateChunkID(docID string, position int) string {
	const idBytes = 12

	hasher := sha256.New()
	// hash.Hash writes never fail, so the Fprintf result is not checked.
	fmt.Fprintf(hasher, "%s-%d", docID, position)

	return hex.EncodeToString(hasher.Sum(nil)[:idBytes])
}