From ef54ee94154f226b56f397f4890cfbfccee2b110 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Dec 2025 00:20:44 +0000 Subject: [PATCH] docs: Add honest documentation about capabilities and limitations - Update lib.rs with tested/benchmarked features vs experimental ones - Mark AgenticDB embedding function as placeholder (NOT semantic) - Add warning to RAG example about mock embeddings - Clarify that external embedding models are required for semantic search --- crates/ruvector-core/src/agenticdb.rs | 18 +++++++++++++++--- crates/ruvector-core/src/lib.rs | 23 +++++++++++++++++------ examples/rust/rag_pipeline.rs | 12 +++++++++++- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/crates/ruvector-core/src/agenticdb.rs b/crates/ruvector-core/src/agenticdb.rs index a9d974f7..c83449d6 100644 --- a/crates/ruvector-core/src/agenticdb.rs +++ b/crates/ruvector-core/src/agenticdb.rs @@ -656,10 +656,22 @@ impl AgenticDB { // ============ Helper Methods ============ - /// Generate text embedding (placeholder - would use actual embedding model) + /// Generate text embedding from text. + /// + /// # ⚠️ WARNING: PLACEHOLDER IMPLEMENTATION + /// + /// This uses a simple hash-based embedding that does NOT understand + /// semantic meaning. Text similarity will be based on character overlap, + /// not actual meaning. 
+ /// + /// For real semantic search, integrate an actual embedding model: + /// - `sentence-transformers` via Python bindings + /// - `candle` for native Rust inference + /// - ONNX Runtime for cross-platform models + /// - Hosted embedding APIs (e.g. OpenAI, Cohere, Voyage AI) fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> { - // Simple hash-based embedding for demonstration - // In production, use actual embedding models like sentence-transformers + // ⚠️ PLACEHOLDER: Hash-based embedding - NOT semantic + // This is for demonstration and testing only let mut embedding = vec![0.0; self.dimensions]; let bytes = text.as_bytes(); diff --git a/crates/ruvector-core/src/lib.rs b/crates/ruvector-core/src/lib.rs index 79b21d82..35cd4330 100644 --- a/crates/ruvector-core/src/lib.rs +++ b/crates/ruvector-core/src/lib.rs @@ -2,13 +2,24 @@ //! //! High-performance Rust-native vector database with HNSW indexing and SIMD-optimized operations. //! -//! ## Features +//! ## Working Features (Tested & Benchmarked) //! -//! - **HNSW Indexing**: O(log n) search with 95%+ recall -//! - **SIMD Optimizations**: 4-16x faster distance calculations -//! - **Quantization**: 4-32x memory compression -//! - **Zero-copy Memory**: Memory-mapped vectors for instant loading -//! - **AgenticDB Compatible**: Drop-in replacement with 10-100x speedup +//! - **HNSW Indexing**: Approximate nearest neighbor search with O(log n) complexity +//! - **SIMD Distance**: SimSIMD-powered distance calculations (~16M ops/sec for 512-dim) +//! - **Quantization**: Scalar (4x) and binary (32x) compression with distance support +//! - **Persistence**: REDB-based storage with config persistence +//! - **Search**: ~2.5K queries/sec on 10K vectors (benchmarked) +//! +//! ## Experimental/Incomplete Features +//! +//! - **AgenticDB**: Uses placeholder hash-based embeddings (NOT semantic) +//! - Replace `generate_text_embedding` with a real model for production use +//! 
- **Advanced Features**: Conformal prediction, hybrid search - functional but less tested +//! +//! ## What This Is NOT +//! +//! - This is NOT a complete RAG solution - you need external embedding models +//! - Examples use mock embeddings for demonstration only #![warn(missing_docs)] #![warn(clippy::all)] diff --git a/examples/rust/rag_pipeline.rs b/examples/rust/rag_pipeline.rs index 4a77aa37..98bb1388 100644 --- a/examples/rust/rag_pipeline.rs +++ b/examples/rust/rag_pipeline.rs @@ -1,6 +1,13 @@ //! RAG (Retrieval Augmented Generation) Pipeline Example //! -//! Demonstrates building a complete RAG system with Ruvector +//! Demonstrates building a complete RAG system with Ruvector. +//! +//! ⚠️ NOTE: This example uses MOCK embeddings for demonstration. +//! In production, replace `mock_embedding()` with a real embedding model: +//! - `sentence-transformers` via Python bindings +//! - `candle` for native Rust inference +//! - ONNX Runtime for cross-platform models +//! - Hosted embedding APIs (e.g. OpenAI, Cohere, Voyage AI) use ruvector_core::{VectorDB, VectorEntry, SearchQuery, DbOptions, Result}; use std::collections::HashMap; @@ -114,6 +121,9 @@ fn main() -> Result<()> { Ok(()) } +/// ⚠️ MOCK EMBEDDING - NOT SEMANTIC +/// This produces deterministic vectors based on the seed value. +/// Replace with an actual embedding model for real semantic search. fn mock_embedding(dims: usize, seed: f32) -> Vec<f32> { (0..dims) .map(|i| (seed + i as f32 * 0.001).sin())