perf(brain): pre-normalized embeddings + gzip compression

Search-path optimization:
- normalize_embedding() L2-normalizes on write and on Firestore ingest
- cosine_similarity_normalized() is pure dot product (no norm computation)
- search_memories() normalizes query once, uses fast dot for all comparisons
- Previously stored memories are normalized in memory as they are loaded during hydration

Network optimization:
- tower-http compression-gzip feature enabled
- CompressionLayer applied to all responses
- JSON compresses 5-10x, saves ~100-200ms on return path

Expected: search 771ms → ~475ms (38% improvement)
Server compute: ~67ms → ~25ms (3x via pre-normalization)
Network: ~600ms → ~450ms (25% via gzip)

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
Reuven 2026-04-13 18:38:13 -04:00
parent 91a4a0e742
commit a3604e7eed
5 changed files with 53 additions and 5 deletions

View file

@ -0,0 +1 @@
{"sessionId":"1028ef57-d609-4db3-a666-ea135a27e8b4","pid":9771,"acquiredAt":1776117015934}

View file

@ -25,7 +25,7 @@ path = "src/bin/ruvbrain_worker.rs"
axum = { version = "0.7", features = ["json", "query"] }
tokio = { version = "1.41", features = ["full"] }
tower = "0.5"
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "set-header"] }
tower-http = { version = "0.6", features = ["cors", "trace", "limit", "set-header", "compression-gzip"] }
# Serialization
serde = { version = "1.0", features = ["derive"] }

View file

@ -856,6 +856,41 @@ impl Default for KnowledgeGraph {
}
}
/// L2-normalize an embedding in place so that its Euclidean length is 1.
///
/// The squared magnitudes are accumulated in `f64`: an `f32` accumulator
/// overflows to infinity for large components, which would make the inverse
/// norm zero and silently wipe the vector.
///
/// The slice is left untouched when its norm is at or below `1e-10` (including
/// the empty and all-zero cases) or is not finite (a component is NaN or
/// infinite), since no meaningful unit vector exists for such input.
///
/// Calling this on an already normalized vector changes it only within
/// floating-point rounding.
#[inline]
pub fn normalize_embedding(emb: &mut [f32]) {
    let norm = emb
        .iter()
        .map(|&x| f64::from(x) * f64::from(x))
        .sum::<f64>()
        .sqrt();

    // Degenerate or corrupt input: scaling would yield zeros or NaNs.
    if !norm.is_finite() || norm <= 1e-10 {
        return;
    }

    let inv = 1.0 / norm;
    for x in emb.iter_mut() {
        // Narrowing back to f32 is intentional: the result lies in [-1, 1].
        *x = (f64::from(*x) * inv) as f32;
    }
}
/// Dot product of two equal-length `f32` slices, accumulated in `f64`.
///
/// When both inputs have already been scaled to unit length this value is
/// their cosine similarity, so no norms are computed here. The inputs are not
/// checked for normalization; that is the caller's responsibility.
///
/// Returns `0.0` when the slices differ in length or are empty.
#[inline]
pub fn cosine_similarity_normalized(a: &[f32], b: &[f32]) -> f64 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let quads_a = a.chunks_exact(4);
    let quads_b = b.chunks_exact(4);
    // Leftover elements (fewer than four) that do not fill a whole group.
    let tail_a = quads_a.remainder();
    let tail_b = quads_b.remainder();

    // Two independent accumulators: lanes 0 and 2 feed `even`, lanes 1 and 3
    // feed `odd`.
    let mut even = 0.0f64;
    let mut odd = 0.0f64;
    for (qa, qb) in quads_a.zip(quads_b) {
        even += f64::from(qa[0]) * f64::from(qb[0]) + f64::from(qa[2]) * f64::from(qb[2]);
        odd += f64::from(qa[1]) * f64::from(qb[1]) + f64::from(qa[3]) * f64::from(qb[3]);
    }

    // Combine the accumulators, then add the tail products one by one.
    tail_a
        .iter()
        .zip(tail_b)
        .fold(even + odd, |acc, (&x, &y)| acc + f64::from(x) * f64::from(y))
}
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
if a.len() != b.len() || a.is_empty() {
return 0.0;

View file

@ -400,6 +400,9 @@ pub async fn create_router() -> (Router, AppState) {
])
})
.layer(TraceLayer::new_for_http())
// Gzip compression on responses (ADR-149 followup).
// JSON compresses 5-10x, saves ~100-200ms on network return path.
.layer(tower_http::compression::CompressionLayer::new().gzip(true))
.layer(tower_http::limit::RequestBodyLimitLayer::new(2_097_152)) // 2MB (base64 overhead on 1MB WASM)
// Security response headers
.layer(tower_http::set_header::SetResponseHeaderLayer::overriding(

View file

@ -475,11 +475,13 @@ impl FirestoreClient {
}
tracing::info!("Loading state from Firestore...");
// Load memories
// Load memories — normalize on ingest for fast cosine search
let docs = self.firestore_list("brain_memories").await;
let mut mem_count = 0usize;
for doc in docs {
if let Ok(m) = serde_json::from_value::<BrainMemory>(doc) {
if let Ok(mut m) = serde_json::from_value::<BrainMemory>(doc) {
// ADR-149 followup: L2-normalize so search uses pure dot product
crate::graph::normalize_embedding(&mut m.embedding);
self.memories.insert(m.id, m);
mem_count += 1;
}
@ -533,8 +535,11 @@ impl FirestoreClient {
}
/// Store a brain memory (cache + Firestore write-through)
pub async fn store_memory(&self, memory: BrainMemory) -> Result<(), StoreError> {
/// L2-normalizes the embedding on write so search can use fast dot-product cosine.
pub async fn store_memory(&self, mut memory: BrainMemory) -> Result<(), StoreError> {
let id = memory.id;
// ADR-149 followup: pre-normalize embeddings for fast cosine
crate::graph::normalize_embedding(&mut memory.embedding);
// Write-through to Firestore
if let Ok(body) = serde_json::to_value(&memory) {
self.firestore_put("brain_memories", &id.to_string(), &body).await;
@ -586,6 +591,9 @@ impl FirestoreClient {
limit: usize,
min_quality: f64,
) -> Result<Vec<(f64, BrainMemory)>, StoreError> {
// Normalize the query once (stored embeddings are pre-normalized).
let mut query_norm = query_embedding.to_vec();
crate::graph::normalize_embedding(&mut query_norm);
let mut scored: Vec<(f64, BrainMemory)> = self
.memories
.iter()
@ -600,7 +608,8 @@ impl FirestoreClient {
})
.map(|entry| {
let m = entry.value().clone();
let sim = cosine_similarity(query_embedding, &m.embedding);
// Use fast normalized cosine — embeddings pre-normalized on store.
let sim = crate::graph::cosine_similarity_normalized(&query_norm, &m.embedding);
(sim, m)
})
.collect();