mirror of
https://github.com/Dvorinka/Devour.git
synced 2026-06-04 04:23:02 +00:00
first commit
This commit is contained in:
@@ -0,0 +1,271 @@
|
||||
// Package indexer provides document indexing and embedding generation.
|
||||
package indexer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/yourorg/devour/internal/scraper"
|
||||
"github.com/yourorg/devour/internal/vector"
|
||||
)
|
||||
|
||||
// Config carries the settings for embedding generation and document chunking.
//
// Within this file only Model, Dimensions and ChunkSize are read; the other
// fields are presumably consumed where the embedder is constructed
// (NOTE(review): confirm against the callers).
type Config struct {
	// Embedding settings.
	Provider   string `yaml:"provider"`   // embedding provider identifier
	Model      string `yaml:"model"`      // embedding model name; reported by Stats
	Dimensions int    `yaml:"dimensions"` // embedding dimensionality; reported by Stats
	APIKey     string `yaml:"api_key"`    // credential for the embedding provider
	BatchSize  int    `yaml:"batch_size"` // not read in this file

	// Chunking settings.
	ChunkSize    int `yaml:"chunk_size"`    // per-chunk size limit; zero selects the default of 512
	ChunkOverlap int `yaml:"chunk_overlap"` // not read in this file
}
|
||||
|
||||
// Chunk is one piece of a document's text together with its embedding.
//
// Chunks returned by Search carry only ID, Content and Metadata; the
// remaining fields are left at their zero values there.
type Chunk struct {
	ID       string         `json:"id"`               // chunk identifier
	DocID    string         `json:"doc_id"`           // ID of the document the chunk was cut from
	Content  string         `json:"content"`          // chunk text
	Vector   []float32      `json:"vector,omitempty"` // embedding; left out of JSON when empty
	Metadata map[string]any `json:"metadata"`         // attributes stored alongside the chunk
	Position int            `json:"position"`         // zero-based index of the chunk within its document
}
|
||||
|
||||
// Indexer handles document indexing with embeddings.
|
||||
type Indexer struct {
|
||||
config *Config
|
||||
vectorStore vector.Store
|
||||
embedder Embedder
|
||||
mu sync.RWMutex
|
||||
chunks map[string][]*Chunk // docID -> chunks
|
||||
}
|
||||
|
||||
// Embedder turns text into embedding vectors.
type Embedder interface {
	// Embed returns embeddings for texts. Callers in this package read the
	// result by index, so the i-th vector is taken to belong to texts[i].
	Embed(ctx context.Context, texts []string) ([][]float32, error)
}
|
||||
|
||||
// NewIndexer creates a new indexer.
|
||||
func NewIndexer(config *Config, vectorConfig *vector.Config) *Indexer {
|
||||
return &Indexer{
|
||||
config: config,
|
||||
vectorStore: vector.NewStore(vectorConfig),
|
||||
chunks: make(map[string][]*Chunk),
|
||||
}
|
||||
}
|
||||
|
||||
// SetEmbedder sets the embedding provider.
|
||||
func (i *Indexer) SetEmbedder(embedder Embedder) {
|
||||
i.embedder = embedder
|
||||
}
|
||||
|
||||
// Index processes documents and adds them to the vector store.
|
||||
func (i *Indexer) Index(ctx context.Context, docs []*scraper.Document) error {
|
||||
if i.embedder == nil {
|
||||
return fmt.Errorf("embedder not configured")
|
||||
}
|
||||
|
||||
for _, doc := range docs {
|
||||
if err := i.indexDocument(ctx, doc); err != nil {
|
||||
return fmt.Errorf("failed to index document %s: %w", doc.ID, err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// indexDocument processes a single document.
|
||||
func (i *Indexer) indexDocument(ctx context.Context, doc *scraper.Document) error {
|
||||
// Chunk the document
|
||||
chunks := i.chunkDocument(doc)
|
||||
|
||||
// Generate embeddings for all chunks
|
||||
texts := make([]string, len(chunks))
|
||||
for j, chunk := range chunks {
|
||||
texts[j] = chunk.Content
|
||||
}
|
||||
|
||||
embeddings, err := i.embedder.Embed(ctx, texts)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to generate embeddings: %w", err)
|
||||
}
|
||||
|
||||
// Create vector documents
|
||||
vecDocs := make([]*vector.Document, len(chunks))
|
||||
for j, chunk := range chunks {
|
||||
chunk.Vector = embeddings[j]
|
||||
vecDocs[j] = &vector.Document{
|
||||
ID: chunk.ID,
|
||||
Content: chunk.Content,
|
||||
Vector: chunk.Vector,
|
||||
Metadata: chunk.Metadata,
|
||||
}
|
||||
}
|
||||
|
||||
// Add to vector store
|
||||
if err := i.vectorStore.Add(ctx, vecDocs); err != nil {
|
||||
return fmt.Errorf("failed to add to vector store: %w", err)
|
||||
}
|
||||
|
||||
// Store chunks
|
||||
i.mu.Lock()
|
||||
i.chunks[doc.ID] = chunks
|
||||
i.mu.Unlock()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// chunkDocument splits a document into chunks.
|
||||
func (i *Indexer) chunkDocument(doc *scraper.Document) []*Chunk {
|
||||
chunkSize := i.config.ChunkSize
|
||||
if chunkSize == 0 {
|
||||
chunkSize = 512 // Default chunk size in tokens (approximate)
|
||||
}
|
||||
|
||||
// Simple chunking by paragraphs and size
|
||||
content := doc.Content
|
||||
paragraphs := strings.Split(content, "\n\n")
|
||||
|
||||
var chunks []*Chunk
|
||||
var currentChunk strings.Builder
|
||||
var currentSize int
|
||||
position := 0
|
||||
|
||||
for _, para := range paragraphs {
|
||||
paraSize := len(para) // Approximate token count
|
||||
|
||||
// If paragraph fits, add it
|
||||
if currentSize+paraSize <= chunkSize || currentSize == 0 {
|
||||
if currentSize > 0 {
|
||||
currentChunk.WriteString("\n\n")
|
||||
}
|
||||
currentChunk.WriteString(para)
|
||||
currentSize += paraSize
|
||||
} else {
|
||||
// Save current chunk
|
||||
if currentSize > 0 {
|
||||
chunks = append(chunks, i.createChunk(doc, currentChunk.String(), position))
|
||||
position++
|
||||
}
|
||||
|
||||
currentChunk.Reset()
|
||||
currentChunk.WriteString(para)
|
||||
currentSize = paraSize
|
||||
}
|
||||
}
|
||||
|
||||
// Don't forget the last chunk
|
||||
if currentSize > 0 {
|
||||
chunks = append(chunks, i.createChunk(doc, currentChunk.String(), position))
|
||||
}
|
||||
|
||||
// If no chunks were created, create one from the entire content
|
||||
if len(chunks) == 0 {
|
||||
chunks = append(chunks, i.createChunk(doc, content, 0))
|
||||
}
|
||||
|
||||
return chunks
|
||||
}
|
||||
|
||||
// createChunk creates a chunk from content.
|
||||
func (i *Indexer) createChunk(doc *scraper.Document, content string, position int) *Chunk {
|
||||
return &Chunk{
|
||||
ID: generateChunkID(doc.ID, position),
|
||||
DocID: doc.ID,
|
||||
Content: content,
|
||||
Metadata: map[string]any{
|
||||
"source": doc.Source,
|
||||
"title": doc.Title,
|
||||
"url": doc.URL,
|
||||
"type": doc.Type,
|
||||
"position": position,
|
||||
"timestamp": doc.Timestamp,
|
||||
},
|
||||
Position: position,
|
||||
}
|
||||
}
|
||||
|
||||
// Search finds similar chunks to the query.
|
||||
func (i *Indexer) Search(ctx context.Context, query string, limit int, threshold float64) ([]*Chunk, error) {
|
||||
if i.embedder == nil {
|
||||
return nil, fmt.Errorf("embedder not configured")
|
||||
}
|
||||
|
||||
// Generate embedding for query
|
||||
embeddings, err := i.embedder.Embed(ctx, []string{query})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to embed query: %w", err)
|
||||
}
|
||||
|
||||
// Search vector store
|
||||
results, err := i.vectorStore.Search(ctx, embeddings[0], limit)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("search failed: %w", err)
|
||||
}
|
||||
|
||||
// Convert to chunks and filter by threshold
|
||||
var chunks []*Chunk
|
||||
for _, result := range results {
|
||||
if result.Score >= threshold {
|
||||
chunks = append(chunks, &Chunk{
|
||||
ID: result.ID,
|
||||
Content: result.Content,
|
||||
Metadata: result.Metadata,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return chunks, nil
|
||||
}
|
||||
|
||||
// Delete removes a document and its chunks.
|
||||
func (i *Indexer) Delete(ctx context.Context, docID string) error {
|
||||
i.mu.Lock()
|
||||
defer i.mu.Unlock()
|
||||
|
||||
chunks, ok := i.chunks[docID]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete chunks from vector store
|
||||
for _, chunk := range chunks {
|
||||
if err := i.vectorStore.Delete(ctx, chunk.ID); err != nil {
|
||||
fmt.Printf("Warning: failed to delete chunk %s: %v\n", chunk.ID, err)
|
||||
}
|
||||
}
|
||||
|
||||
delete(i.chunks, docID)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stats returns indexing statistics.
|
||||
func (i *Indexer) Stats(ctx context.Context) (map[string]interface{}, error) {
|
||||
count, err := i.vectorStore.Count(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
i.mu.RLock()
|
||||
docCount := len(i.chunks)
|
||||
i.mu.RUnlock()
|
||||
|
||||
return map[string]interface{}{
|
||||
"document_count": docCount,
|
||||
"chunk_count": count,
|
||||
"dimensions": i.config.Dimensions,
|
||||
"model": i.config.Model,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// generateChunkID derives a deterministic chunk ID from a document ID and a
// chunk position: the first 12 bytes of SHA-256("<docID>-<position>"),
// hex-encoded into a 24-character string.
func generateChunkID(docID string, position int) string {
	const idBytes = 12

	hasher := sha256.New()
	// Writing to a hash.Hash never fails, so Fprintf's error is not checked.
	fmt.Fprintf(hasher, "%s-%d", docID, position)

	return hex.EncodeToString(hasher.Sum(nil)[:idBytes])
}
|
||||
Reference in New Issue
Block a user