mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-25 23:24:03 +00:00
perf(brain): pre-normalized embeddings + gzip compression
Search-path optimization: - normalize_embedding() L2-normalizes on write and on Firestore ingest - cosine_similarity_normalized() is pure dot product (no norm computation) - search_memories() normalizes query once, uses fast dot for all comparisons - Stored memories migrated in-place during hydration Network optimization: - tower-http compression-gzip feature enabled - CompressionLayer applied to all responses - JSON compresses 5-10x, saves ~100-200ms on return path Expected: search 771ms → ~475ms (38% improvement) Server compute: ~67ms → ~25ms (3x via pre-normalization) Network: ~600ms → ~450ms (25% via gzip) Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
91a4a0e742
commit
a3604e7eed
5 changed files with 53 additions and 5 deletions
1
.claude/scheduled_tasks.lock
Normal file
1
.claude/scheduled_tasks.lock
Normal file
|
|
@ -0,0 +1 @@
|
|||
{"sessionId":"1028ef57-d609-4db3-a666-ea135a27e8b4","pid":9771,"acquiredAt":1776117015934}
|
||||
|
|
@ -25,7 +25,7 @@ path = "src/bin/ruvbrain_worker.rs"
|
|||
axum = { version = "0.7", features = ["json", "query"] }
|
||||
tokio = { version = "1.41", features = ["full"] }
|
||||
tower = "0.5"
|
||||
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "set-header"] }
|
||||
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "set-header", "compression-gzip"] }
|
||||
|
||||
# Serialization
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
|
|
|
|||
|
|
@ -856,6 +856,41 @@ impl Default for KnowledgeGraph {
|
|||
}
|
||||
}
|
||||
|
||||
/// L2-normalize an embedding in place. Safe to call repeatedly (idempotent
|
||||
/// within float precision).
|
||||
#[inline]
|
||||
pub fn normalize_embedding(emb: &mut [f32]) {
|
||||
let norm: f32 = emb.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if norm > 1e-10 {
|
||||
let inv = 1.0 / norm;
|
||||
for x in emb.iter_mut() {
|
||||
*x *= inv;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Fast cosine when BOTH vectors are pre-normalized to unit length.
|
||||
/// This is just a dot product — ~3x faster than full cosine.
|
||||
#[inline]
|
||||
pub fn cosine_similarity_normalized(a: &[f32], b: &[f32]) -> f64 {
|
||||
if a.len() != b.len() || a.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
let n = a.len();
|
||||
let chunks = n / 4;
|
||||
let (mut d0, mut d1) = (0.0f64, 0.0f64);
|
||||
for c in 0..chunks {
|
||||
let i = c * 4;
|
||||
d0 += (a[i] as f64) * (b[i] as f64) + (a[i+2] as f64) * (b[i+2] as f64);
|
||||
d1 += (a[i+1] as f64) * (b[i+1] as f64) + (a[i+3] as f64) * (b[i+3] as f64);
|
||||
}
|
||||
let mut sum = d0 + d1;
|
||||
for i in (chunks * 4)..n {
|
||||
sum += (a[i] as f64) * (b[i] as f64);
|
||||
}
|
||||
sum
|
||||
}
|
||||
|
||||
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
|
||||
if a.len() != b.len() || a.is_empty() {
|
||||
return 0.0;
|
||||
|
|
|
|||
|
|
@ -400,6 +400,9 @@ pub async fn create_router() -> (Router, AppState) {
|
|||
])
|
||||
})
|
||||
.layer(TraceLayer::new_for_http())
|
||||
// Gzip compression on responses (ADR-149 followup).
|
||||
// JSON compresses 5-10x, saves ~100-200ms on network return path.
|
||||
.layer(tower_http::compression::CompressionLayer::new().gzip(true))
|
||||
.layer(tower_http::limit::RequestBodyLimitLayer::new(2_097_152)) // 2MB (base64 overhead on 1MB WASM)
|
||||
// Security response headers
|
||||
.layer(tower_http::set_header::SetResponseHeaderLayer::overriding(
|
||||
|
|
|
|||
|
|
@ -475,11 +475,13 @@ impl FirestoreClient {
|
|||
}
|
||||
tracing::info!("Loading state from Firestore...");
|
||||
|
||||
// Load memories
|
||||
// Load memories — normalize on ingest for fast cosine search
|
||||
let docs = self.firestore_list("brain_memories").await;
|
||||
let mut mem_count = 0usize;
|
||||
for doc in docs {
|
||||
if let Ok(m) = serde_json::from_value::<BrainMemory>(doc) {
|
||||
if let Ok(mut m) = serde_json::from_value::<BrainMemory>(doc) {
|
||||
// ADR-149 followup: L2-normalize so search uses pure dot product
|
||||
crate::graph::normalize_embedding(&mut m.embedding);
|
||||
self.memories.insert(m.id, m);
|
||||
mem_count += 1;
|
||||
}
|
||||
|
|
@ -533,8 +535,11 @@ impl FirestoreClient {
|
|||
}
|
||||
|
||||
/// Store a brain memory (cache + Firestore write-through)
|
||||
pub async fn store_memory(&self, memory: BrainMemory) -> Result<(), StoreError> {
|
||||
/// L2-normalizes the embedding on write so search can use fast dot-product cosine.
|
||||
pub async fn store_memory(&self, mut memory: BrainMemory) -> Result<(), StoreError> {
|
||||
let id = memory.id;
|
||||
// ADR-149 followup: pre-normalize embeddings for fast cosine
|
||||
crate::graph::normalize_embedding(&mut memory.embedding);
|
||||
// Write-through to Firestore
|
||||
if let Ok(body) = serde_json::to_value(&memory) {
|
||||
self.firestore_put("brain_memories", &id.to_string(), &body).await;
|
||||
|
|
@ -586,6 +591,9 @@ impl FirestoreClient {
|
|||
limit: usize,
|
||||
min_quality: f64,
|
||||
) -> Result<Vec<(f64, BrainMemory)>, StoreError> {
|
||||
// Normalize the query once (stored embeddings are pre-normalized).
|
||||
let mut query_norm = query_embedding.to_vec();
|
||||
crate::graph::normalize_embedding(&mut query_norm);
|
||||
let mut scored: Vec<(f64, BrainMemory)> = self
|
||||
.memories
|
||||
.iter()
|
||||
|
|
@ -600,7 +608,8 @@ impl FirestoreClient {
|
|||
})
|
||||
.map(|entry| {
|
||||
let m = entry.value().clone();
|
||||
let sim = cosine_similarity(query_embedding, &m.embedding);
|
||||
// Use fast normalized cosine — embeddings pre-normalized on store.
|
||||
let sim = crate::graph::cosine_similarity_normalized(&query_norm, &m.embedding);
|
||||
(sim, m)
|
||||
})
|
||||
.collect();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue