docs: Add honest documentation about capabilities and limitations

- Update lib.rs with tested/benchmarked features vs experimental ones
- Mark AgenticDB embedding function as placeholder (NOT semantic)
- Add warning to RAG example about mock embeddings
- Clarify that external embedding models are required for semantic search
2026-05-25 23:24:03 +00:00 · 2025-12-09 00:20:44 +00:00 · 2025-12-09 00:20:44 +00:00 · ef54ee9415
commit ef54ee9415
parent c46dc4aae2
3 changed files with 43 additions and 10 deletions
--- a/crates/ruvector-core/src/agenticdb.rs
+++ b/crates/ruvector-core/src/agenticdb.rs
@ -656,10 +656,22 @@ impl AgenticDB {

    // ============ Helper Methods ============

-    /// Generate text embedding (placeholder - would use actual embedding model)
+    /// Generate an embedding vector for the given text.
+    ///
+    /// # ⚠️ WARNING: PLACEHOLDER IMPLEMENTATION
+    ///
+    /// This uses a simple hash-based embedding that does NOT understand
+    /// semantic meaning. Text similarity will be based on character overlap,
+    /// not actual meaning.
+    ///
+    /// For real semantic search, integrate an actual embedding model:
+    /// - `sentence-transformers` via Python bindings
+    /// - `candle` for native Rust inference
+    /// - ONNX Runtime for cross-platform models
+    /// - Hosted embedding APIs (e.g. OpenAI, Voyage AI)
    fn generate_text_embedding(&self, text: &str) -> Result<Vec<f32>> {
-        // Simple hash-based embedding for demonstration
-        // In production, use actual embedding models like sentence-transformers
+        // ⚠️ PLACEHOLDER: Hash-based embedding - NOT semantic
+        // This is for demonstration and testing only
        let mut embedding = vec![0.0; self.dimensions];
        let bytes = text.as_bytes();

--- a/crates/ruvector-core/src/lib.rs
+++ b/crates/ruvector-core/src/lib.rs
@ -2,13 +2,24 @@
 //!
 //! High-performance Rust-native vector database with HNSW indexing and SIMD-optimized operations.
 //!
-//! ## Features
+//! ## Working Features (Tested & Benchmarked)
 //!
-//! - **HNSW Indexing**: O(log n) search with 95%+ recall
-//! - **SIMD Optimizations**: 4-16x faster distance calculations
-//! - **Quantization**: 4-32x memory compression
-//! - **Zero-copy Memory**: Memory-mapped vectors for instant loading
-//! - **AgenticDB Compatible**: Drop-in replacement with 10-100x speedup
+//! - **HNSW Indexing**: Approximate nearest neighbor search with O(log n) complexity
+//! - **SIMD Distance**: SimSIMD-powered distance calculations (~16M ops/sec for 512-dim)
+//! - **Quantization**: Scalar (4x) and binary (32x) compression with distance support
+//! - **Persistence**: REDB-based storage with config persistence
+//! - **Search**: ~2.5K queries/sec on 10K vectors (benchmarked)
+//!
+//! ## Experimental/Incomplete Features
+//!
+//! - **AgenticDB**: Uses placeholder hash-based embeddings (NOT semantic)
+//!   - Replace `generate_text_embedding` with a real embedding model for production use
+//! - **Advanced Features**: Conformal prediction, hybrid search - functional but less tested
+//!
+//! ## What This Is NOT
+//!
+//! - This is NOT a complete RAG solution - you need external embedding models
+//! - Examples use mock embeddings for demonstration only

 #![warn(missing_docs)]
 #![warn(clippy::all)]
--- a/examples/rust/rag_pipeline.rs
+++ b/examples/rust/rag_pipeline.rs
@ -1,6 +1,13 @@
 //! RAG (Retrieval Augmented Generation) Pipeline Example
 //!
-//! Demonstrates building a complete RAG system with Ruvector
+//! Demonstrates the structure of a RAG pipeline built with Ruvector.
+//!
+//! ⚠️ NOTE: This example uses MOCK embeddings for demonstration.
+//! In production, replace `mock_embedding()` with a real embedding model:
+//! - `sentence-transformers` via Python bindings
+//! - `candle` for native Rust inference
+//! - ONNX Runtime for cross-platform models
+//! - Hosted embedding APIs (e.g. OpenAI, Voyage AI)

 use ruvector_core::{VectorDB, VectorEntry, SearchQuery, DbOptions, Result};
 use std::collections::HashMap;
@ -114,6 +121,9 @@ fn main() -> Result<()> {
    Ok(())
 }

+/// ⚠️ MOCK EMBEDDING - NOT SEMANTIC
+/// This produces deterministic vectors based on the `seed` value.
+/// Replace with actual embedding model for real semantic search.
 fn mock_embedding(dims: usize, seed: f32) -> Vec<f32> {
    (0..dims)
        .map(|i| (seed + i as f32 * 0.001).sin())