mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-25 23:24:03 +00:00
feat: add streaming-semantic-drift Rust proof of concept
Introduces crates/ruvector-drift with three drift detector variants: - MeanShiftDetector: EMA distance, O(D) space, 124 ns/insert - CusumDetector: CUSUM on z-scored norms, 48 B space, 129 ns/insert - MmdRffDetector: RFF-MMD, O(D×R) space, 42 µs/insert All implement DriftDetector trait; benchmark binary in src/main.rs. https://claude.ai/code/session_017kmy7aU2vDkc21CB8g2xB5
This commit is contained in:
parent
782a51bc83
commit
b3173b89dc
7 changed files with 1021 additions and 0 deletions
20
crates/ruvector-drift/Cargo.toml
Normal file
20
crates/ruvector-drift/Cargo.toml
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
[package]
|
||||
name = "ruvector-drift"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
description = "Streaming semantic drift detection for agent vector memory — online distribution shift monitoring for RuVector"
|
||||
authors = ["ruvnet", "claude-flow"]
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/ruvnet/ruvector"
|
||||
keywords = ["ann", "drift-detection", "agent-memory", "vector-search", "ruvector"]
|
||||
categories = ["algorithms", "data-structures", "science"]
|
||||
|
||||
[[bin]]
|
||||
name = "drift-bench"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
rand = "0.8"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
|
||||
[dev-dependencies]
|
||||
129
crates/ruvector-drift/src/cusum.rs
Normal file
129
crates/ruvector-drift/src/cusum.rs
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
//! Alternative A: CUSUM-based drift detector.
|
||||
//!
|
||||
//! Runs a standard CUSUM (Page 1954) control chart on the squared L2 norm of each
|
||||
//! incoming vector. For vectors from N(μ, I), E[||v||²] = D + ||μ||², so a
|
||||
//! shift of the mean away from the origin increases the expected squared norm — giving CUSUM a
|
||||
//! reliable scalar channel that is sign-agnostic and dimension-agnostic.
|
||||
//!
|
||||
//! The reference phase fits a running mean and unbiased variance of ||v||² via
|
||||
//! Welford's algorithm. After warm-up, each new vector contributes one
|
||||
//! z-scored observation to both an upper CUSUM (increase) and a lower CUSUM
|
||||
//! (decrease) statistic. Either arm triggering indicates distribution shift.
|
||||
//!
|
||||
//! **Complexity:** O(D) insert (norm computation), O(1) score, O(1) memory.
|
||||
|
||||
use crate::DriftDetector;
|
||||
|
||||
/// Two-sided CUSUM change detector fed by the squared L2 norm of each vector.
///
/// The first `warm_up` insertions estimate the mean and sample standard
/// deviation of `||v||²`. Afterwards each insertion is z-scored against those
/// frozen statistics and accumulated into an upper and a lower CUSUM arm.
#[derive(Clone, Debug)]
pub struct CusumDetector {
    /// Expected vector dimension; only checked by a debug assertion on insert.
    dim: usize,
    /// Number of insertions that make up the reference phase.
    warm_up: usize,
    /// Allowance subtracted from every z-score before it is accumulated
    /// (in units of the reference σ; typically around 0.5).
    slack: f64,
    /// Welford running mean of `||v||²`; no longer updated after warm-up.
    ref_mean: f64,
    /// Welford M2 accumulator (sum of squared deviations) of `||v||²`.
    ref_m2: f64,
    /// Number of samples folded into the Welford accumulators.
    ref_n: usize,
    /// Reference standard deviation of `||v||²`, fixed when warm-up ends.
    ref_std: f64,
    /// Upper arm: accumulates evidence of an increase in `||v||²`.
    cusum_up: f64,
    /// Lower arm: accumulates evidence of a decrease in `||v||²`.
    cusum_down: f64,
    /// Insertions since construction or the last reset.
    count: usize,
    /// `true` once the reference phase has finished.
    warmed_up: bool,
}
|
||||
|
||||
impl CusumDetector {
|
||||
/// Create a new norm-based CUSUM detector.
|
||||
///
|
||||
/// - `dim`: vector dimension (used for memory reporting only)
|
||||
/// - `warm_up`: insertions to build reference statistics
|
||||
/// - `slack`: CUSUM allowance in units of reference σ (try 0.5–1.0)
|
||||
pub fn new(dim: usize, warm_up: usize, slack: f64) -> Self {
|
||||
assert!(dim > 0);
|
||||
assert!(warm_up >= 2, "need at least 2 samples for variance");
|
||||
assert!(slack >= 0.0);
|
||||
Self {
|
||||
dim,
|
||||
warm_up,
|
||||
slack,
|
||||
ref_mean: 0.0,
|
||||
ref_m2: 0.0,
|
||||
ref_n: 0,
|
||||
ref_std: 1.0,
|
||||
cusum_up: 0.0,
|
||||
cusum_down: 0.0,
|
||||
count: 0,
|
||||
warmed_up: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// L2 squared norm of `vec`.
|
||||
fn sq_norm(vec: &[f32]) -> f64 {
|
||||
vec.iter().map(|&x| (x as f64).powi(2)).sum()
|
||||
}
|
||||
}
|
||||
|
||||
impl DriftDetector for CusumDetector {
    /// Ingest one vector: extend the reference statistics during warm-up,
    /// otherwise advance both CUSUM arms.
    ///
    /// Every call increments `count`. A vector whose squared norm is not
    /// finite (it contains a NaN or infinite component) is otherwise ignored.
    /// Without this guard a NaN seen during warm-up would poison the reference
    /// mean for good, and a NaN seen afterwards would silently zero both arms,
    /// because `f64::max` returns the non-NaN operand.
    fn insert(&mut self, vec: &[f32]) {
        debug_assert_eq!(vec.len(), self.dim);
        self.count += 1;

        let norm_sq = Self::sq_norm(vec);
        if !norm_sq.is_finite() {
            return;
        }

        if self.warmed_up {
            // Z-score the squared norm relative to the frozen reference.
            let z = (norm_sq - self.ref_mean) / self.ref_std;
            self.cusum_up = (self.cusum_up + z - self.slack).max(0.0);
            self.cusum_down = (self.cusum_down - z - self.slack).max(0.0);
            return;
        }

        // Welford online update for the mean and M2 of ||v||².
        self.ref_n += 1;
        let delta = norm_sq - self.ref_mean;
        self.ref_mean += delta / self.ref_n as f64;
        let delta2 = norm_sq - self.ref_mean;
        self.ref_m2 += delta * delta2;

        // Warm-up ends once `warm_up` valid samples were seen. For all-finite
        // input this is the same moment as `count >= warm_up`.
        if self.ref_n >= self.warm_up {
            self.warmed_up = true;
            // Sample std, floored at 1.0 to avoid division by near-zero.
            // `new` guarantees warm_up >= 2, so the divisor is at least 1.
            // NOTE(review): the floor is absolute, so data whose ||v||² varies
            // by much less than 1.0 yields damped z-scores — confirm intended.
            self.ref_std = (self.ref_m2 / (self.ref_n - 1) as f64).sqrt().max(1.0);
        }
    }

    /// Larger of the two CUSUM arms; `0.0` while still warming up.
    fn drift_score(&self) -> f32 {
        if !self.warmed_up {
            return 0.0;
        }
        self.cusum_up.max(self.cusum_down) as f32
    }

    /// Clear the reference statistics, both arms and the insertion count, so
    /// the next `warm_up` insertions build a new reference.
    fn reset_reference(&mut self) {
        self.ref_mean = 0.0;
        self.ref_m2 = 0.0;
        self.ref_n = 0;
        self.ref_std = 1.0;
        self.cusum_up = 0.0;
        self.cusum_down = 0.0;
        self.count = 0;
        self.warmed_up = false;
    }

    /// Number of `insert` calls since construction or the last reset.
    fn count(&self) -> usize {
        self.count
    }

    /// Size of the six `f64` state fields. The detector owns no heap data;
    /// the integer and flag fields are not included in this figure.
    fn memory_bytes(&self) -> usize {
        6 * std::mem::size_of::<f64>()
    }
}
|
||||
215
crates/ruvector-drift/src/lib.rs
Normal file
215
crates/ruvector-drift/src/lib.rs
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
//! # ruvector-drift — Streaming Semantic Drift Detection for Agent Vector Memory
|
||||
//!
|
||||
//! Detects when the semantic distribution of an agent's vector memory has shifted —
|
||||
//! enabling self-healing indexes, staleness eviction, and RAG safety guards.
|
||||
//!
|
||||
//! ## Variants
|
||||
//!
|
||||
//! | Variant | Algorithm | Memory | Latency |
|
||||
//! |------------------|-------------------------|-------------|-----------|
|
||||
//! | `MeanShiftDetector` | EMA mean distance | O(D) | O(D) |
|
||||
//! | `CusumDetector` | CUSUM on squared norms | O(1) | O(D) |
|
||||
//! | `MmdRffDetector` | MMD via RFF features | O(D × R) | O(D × R) |
|
||||
//!
|
||||
//! All three implement [`DriftDetector`].
|
||||
|
||||
#![forbid(unsafe_code)]
|
||||
#![warn(missing_docs)]
|
||||
|
||||
pub mod cusum;
|
||||
pub mod mean_shift;
|
||||
pub mod mmd_rff;
|
||||
pub mod stats;
|
||||
|
||||
pub use cusum::CusumDetector;
|
||||
pub use mean_shift::MeanShiftDetector;
|
||||
pub use mmd_rff::MmdRffDetector;
|
||||
pub use stats::OnlineStats;
|
||||
|
||||
/// Core trait implemented by all drift detectors.
///
/// A detector ingests vectors one at a time via [`insert`](Self::insert),
/// accumulates a reference distribution during its warm-up phase, then
/// continuously scores divergence from that reference. Callers gate on
/// [`is_drifted`](Self::is_drifted) and call
/// [`reset_reference`](Self::reset_reference) when a controlled concept
/// update occurs.
pub trait DriftDetector {
    /// Ingest one vector into the detector.
    fn insert(&mut self, vec: &[f32]);

    /// Scalar divergence from the reference distribution; 0.0 = no drift.
    ///
    /// The detectors in this crate also return 0.0 while still warming up.
    fn drift_score(&self) -> f32;

    /// Whether the detector considers drift to have occurred.
    ///
    /// The default implementation returns `true` when
    /// [`drift_score`](Self::drift_score) is strictly greater than
    /// `threshold`. A NaN score therefore never counts as drift.
    fn is_drifted(&self, threshold: f32) -> bool {
        self.drift_score() > threshold
    }

    /// Discard the current reference and start a new warm-up phase.
    ///
    /// The detectors in this crate clear their accumulated state, so
    /// [`count`](Self::count) and [`drift_score`](Self::drift_score) return
    /// zero until enough new vectors have been inserted to form the next
    /// reference baseline.
    fn reset_reference(&mut self);

    /// Number of vectors seen since last reset.
    fn count(&self) -> usize;

    /// Approximate number of bytes of detector state.
    ///
    /// What is counted is implementation-defined; for example
    /// `CusumDetector` reports only its inline `f64` fields.
    fn memory_bytes(&self) -> usize;
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use rand::{rngs::StdRng, Rng, SeedableRng};

    /// Draw one `dim`-dimensional vector with i.i.d. N(`mean`, `std`²)
    /// components via Box–Muller; each pair of uniforms yields two components.
    fn gaussian(rng: &mut StdRng, dim: usize, mean: f64, std: f64) -> Vec<f32> {
        use std::f64::consts::PI;
        let mut sample = Vec::with_capacity(dim);
        for _ in 0..(dim / 2 + dim % 2) {
            // Clamp away from zero so ln() stays finite.
            let u1 = rng.gen::<f64>().max(1e-14);
            let u2 = rng.gen::<f64>();
            let r = (-2.0 * u1.ln()).sqrt() * std;
            let theta = 2.0 * PI * u2;
            sample.push((mean + r * theta.cos()) as f32);
            // For odd `dim` the last sine component is dropped.
            if sample.len() < dim {
                sample.push((mean + r * theta.sin()) as f32);
            }
        }
        sample
    }

    /// Insert `n` vectors with unit-variance components centred on `mean`.
    fn feed<D: DriftDetector>(det: &mut D, rng: &mut StdRng, dim: usize, n: usize, mean: f64) {
        for _ in 0..n {
            det.insert(&gaussian(rng, dim, mean, 1.0));
        }
    }

    /// Warm `det` up on zero-mean data, then stream shifted data and return the
    /// 1-based index of the first insertion at which `threshold` is exceeded.
    fn run_detect<D: DriftDetector>(
        det: &mut D,
        rng: &mut StdRng,
        dim: usize,
        warm: usize,
        n_drift: usize,
        drift: f64,
        threshold: f32,
    ) -> Option<usize> {
        feed(det, rng, dim, warm, 0.0);
        for step in 1..=n_drift {
            det.insert(&gaussian(rng, dim, drift, 1.0));
            if det.is_drifted(threshold) {
                return Some(step);
            }
        }
        None
    }

    #[test]
    fn mean_shift_detects_large_drift() {
        let mut rng = StdRng::seed_from_u64(1);
        let mut det = MeanShiftDetector::new(64, 200, 0.05);
        let lag = run_detect(&mut det, &mut rng, 64, 200, 500, 3.0, 0.4);
        assert!(
            lag.is_some(),
            "MeanShift must detect drift=3.0 within 500 insertions"
        );
        assert!(
            lag.unwrap() <= 200,
            "detection lag must be ≤200; got {:?}",
            lag
        );
    }

    #[test]
    fn cusum_detects_moderate_drift() {
        let mut rng = StdRng::seed_from_u64(2);
        let mut det = CusumDetector::new(64, 200, 1.0);
        let lag = run_detect(&mut det, &mut rng, 64, 200, 500, 2.0, 3.0);
        assert!(lag.is_some(), "CUSUM must detect drift=2.0");
        assert!(lag.unwrap() <= 300, "CUSUM lag must be ≤300; got {:?}", lag);
    }

    #[test]
    fn mmd_rff_detects_shift() {
        let mut rng = StdRng::seed_from_u64(3);
        let mut det = MmdRffDetector::new(64, 128, 200, 1.0, 0.05, 42);
        let lag = run_detect(&mut det, &mut rng, 64, 200, 500, 2.5, 0.04);
        assert!(lag.is_some(), "MMD-RFF must detect drift=2.5");
    }

    #[test]
    fn mean_shift_drift_exceeds_nodrift_score() {
        // Both runs use the same seed and the same 200-vector warm-up; only the
        // mean of the following 100 insertions differs (0.0 versus 4.0).
        let score_after = |shift: f64| -> f32 {
            let mut rng = StdRng::seed_from_u64(4);
            let mut det = MeanShiftDetector::new(64, 200, 0.05);
            feed(&mut det, &mut rng, 64, 200, 0.0);
            feed(&mut det, &mut rng, 64, 100, shift);
            det.drift_score()
        };
        let nodrift_score = score_after(0.0);
        let drift_score = score_after(4.0);

        // The shifted run must score more than 3× the unshifted run.
        assert!(
            drift_score > nodrift_score * 3.0,
            "signal-to-noise too low: drift={drift_score:.2} nodrift={nodrift_score:.2}"
        );
    }

    #[test]
    fn reset_clears_state() {
        let mut rng = StdRng::seed_from_u64(5);
        let mut det = MeanShiftDetector::new(32, 100, 0.05);
        feed(&mut det, &mut rng, 32, 100, 0.0);
        feed(&mut det, &mut rng, 32, 100, 5.0);
        assert!(det.drift_score() > 0.1);
        det.reset_reference();
        assert_eq!(det.drift_score(), 0.0);
        assert_eq!(det.count(), 0);
    }

    #[test]
    fn memory_bytes_nonzero() {
        let det_ms = MeanShiftDetector::new(128, 100, 0.05);
        let det_cs = CusumDetector::new(128, 100, 1.0);
        let det_mmd = MmdRffDetector::new(128, 256, 100, 1.0, 0.05, 0);
        assert!(det_ms.memory_bytes() > 0);
        assert!(det_cs.memory_bytes() > 0);
        assert!(
            det_mmd.memory_bytes() >= 256 * 128 * 4,
            "RFF matrix should dominate"
        );
    }
}
|
||||
|
||||
/// Summary of one detector's run over a reference phase followed by a
/// drift-observation phase.
#[derive(Clone, Debug)]
pub struct DriftReport {
    /// Human-readable name of the detector variant.
    pub variant: String,
    /// How many vectors were inserted during the reference phase.
    pub reference_count: usize,
    /// How many vectors were inserted during the drift-observation phase.
    pub observation_count: usize,
    /// Score read right after the reference phase (expected to be near 0).
    pub baseline_score: f32,
    /// Score at the moment detection first fired; 0 when it never fired.
    pub trigger_score: f32,
    /// Insertions into the drift phase before detection fired
    /// (`None` = never detected).
    pub detection_lag: Option<usize>,
    /// Score after the last observation was inserted.
    pub final_score: f32,
    /// Approximate memory footprint reported by the detector, in bytes.
    pub memory_bytes: usize,
}
|
||||
332
crates/ruvector-drift/src/main.rs
Normal file
332
crates/ruvector-drift/src/main.rs
Normal file
|
|
@ -0,0 +1,332 @@
|
|||
//! # ruvector-drift benchmark
|
||||
//!
|
||||
//! Measures drift detection performance for three variants:
|
||||
//! 1. MeanShift — EMA mean-shift distance (baseline)
|
||||
//! 2. CUSUM — two-sided cumulative sum on z-scored squared vector norms
|
||||
//! 3. MMD-RFF — Maximum Mean Discrepancy via Random Fourier Features
|
||||
//!
|
||||
//! Dataset: D-dimensional Gaussian vectors, split into a reference phase
|
||||
//! (mean = 0) and a drift phase (mean = `drift_magnitude`). All numbers are
|
||||
//! produced by a deterministic `rand::rngs::StdRng` — no external data needed.
|
||||
//!
|
||||
//! ## Usage
|
||||
//!
|
||||
//! cargo run --release -p ruvector-drift
|
||||
//! (Flags such as `--dim`, `--n` or `--drift` are not parsed; edit the constants below and rebuild.)
|
||||
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use ruvector_drift::{
|
||||
CusumDetector, DriftDetector, DriftReport, MeanShiftDetector, MmdRffDetector,
|
||||
};
|
||||
use std::time::Instant;
|
||||
|
||||
// ── Benchmark constants (compile-time; not read from CLI or environment) ─────
|
||||
|
||||
/// Dimension of every generated vector.
const DIM: usize = 128;
/// Total number of generated vectors across the reference and drift phases.
const N: usize = 2_000;
/// Per-dimension mean of the drift-phase vectors (the reference phase uses 0).
const DRIFT: f32 = 2.0;
/// Number of vectors inserted into each detector by the latency measurement.
const QUERIES: usize = 1_000;
/// Length of the reference phase; the remaining `N - WARM_UP` vectors drift.
const WARM_UP: usize = N / 2;
/// CUSUM allowance, in units of the reference standard deviation.
const CUSUM_SLACK: f64 = 1.0;
/// Score above which the CUSUM detector is reported as drifted.
const CUSUM_THRESH: f32 = 5.0;
/// Score above which MeanShift is reported as drifted (L2 distance between
/// the reference mean and the EMA mean).
// NOTE(review): for i.i.d. N(0, 1) components the no-drift EMA noise alone is
// expected to be around sqrt(DIM·ALPHA/(2−ALPHA)) ≈ 1.8, which is above this
// threshold — confirm the value is intended.
const MEAN_THRESH: f32 = 0.5;
/// Score above which the MMD-RFF detector is reported as drifted.
const MMD_THRESH: f32 = 0.05;
/// Number of random Fourier features used by the MMD-RFF detector.
const RFF_FEATURES: usize = 256;
/// EMA smoothing factor shared by the MeanShift and MMD-RFF detectors.
const ALPHA: f64 = 0.05;
|
||||
|
||||
fn main() {
|
||||
print_header();
|
||||
|
||||
let mut rng = StdRng::seed_from_u64(42);
|
||||
|
||||
// ── Generate dataset ─────────────────────────────────────────────────────
|
||||
// Phase 1: reference — N(0, 1) per dimension.
|
||||
let reference_vecs: Vec<Vec<f32>> = (0..WARM_UP)
|
||||
.map(|_| sample_gaussian(&mut rng, DIM, 0.0, 1.0))
|
||||
.collect();
|
||||
|
||||
// Phase 2: drift — N(drift, 1) per dimension (all dims shifted).
|
||||
let drift_vecs: Vec<Vec<f32>> = (0..N - WARM_UP)
|
||||
.map(|_| sample_gaussian(&mut rng, DIM, DRIFT as f64, 1.0))
|
||||
.collect();
|
||||
|
||||
println!("Dataset");
|
||||
println!(" reference phase : {WARM_UP} vectors, D={DIM}, mean=0");
|
||||
println!(
|
||||
" drift phase : {} vectors, D={DIM}, mean={DRIFT}",
|
||||
N - WARM_UP
|
||||
);
|
||||
println!(" drift magnitude : {DRIFT} (L2 per-dim shift)");
|
||||
println!(" latency queries : {QUERIES}");
|
||||
println!();
|
||||
|
||||
// ── Run each variant ─────────────────────────────────────────────────────
|
||||
let ms_report = run_mean_shift(&reference_vecs, &drift_vecs, &mut rng);
|
||||
let cs_report = run_cusum(&reference_vecs, &drift_vecs, &mut rng);
|
||||
let mmd_report = run_mmd_rff(&reference_vecs, &drift_vecs, &mut rng);
|
||||
|
||||
// ── Print results table ──────────────────────────────────────────────────
|
||||
print_table(&[ms_report.clone(), cs_report.clone(), mmd_report.clone()]);
|
||||
|
||||
// ── Latency breakdown ────────────────────────────────────────────────────
|
||||
measure_latency(&[
|
||||
("MeanShift", MEAN_THRESH),
|
||||
("CUSUM", CUSUM_THRESH),
|
||||
("MMD-RFF", MMD_THRESH),
|
||||
]);
|
||||
|
||||
// ── Acceptance test ──────────────────────────────────────────────────────
|
||||
println!("\n── Acceptance Test ─────────────────────────────────────────────");
|
||||
let mut pass = true;
|
||||
for report in &[&ms_report, &cs_report, &mmd_report] {
|
||||
let detected = report.detection_lag.is_some();
|
||||
let status = if detected { "PASS" } else { "FAIL" };
|
||||
println!(
|
||||
" {:<12} detect={} baseline={:.4} trigger={:.4} → {status}",
|
||||
report.variant, detected, report.baseline_score, report.trigger_score
|
||||
);
|
||||
if !detected {
|
||||
pass = false;
|
||||
}
|
||||
}
|
||||
println!();
|
||||
if pass {
|
||||
println!(" ✓ All three detectors correctly identified the injected drift.");
|
||||
println!(" ACCEPTANCE RESULT: PASS");
|
||||
} else {
|
||||
eprintln!(
|
||||
" ✗ One or more detectors failed to detect drift within the observation window."
|
||||
);
|
||||
eprintln!(" ACCEPTANCE RESULT: FAIL");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Variant runners ──────────────────────────────────────────────────────────
|
||||
|
||||
fn run_mean_shift(
|
||||
ref_vecs: &[Vec<f32>],
|
||||
drift_vecs: &[Vec<f32>],
|
||||
_rng: &mut StdRng,
|
||||
) -> DriftReport {
|
||||
let mut det = MeanShiftDetector::new(DIM, WARM_UP, ALPHA);
|
||||
|
||||
// Reference phase
|
||||
for v in ref_vecs {
|
||||
det.insert(v);
|
||||
}
|
||||
let baseline_score = det.drift_score();
|
||||
|
||||
// Drift phase — find first detection
|
||||
let mut detection_lag = None;
|
||||
let mut trigger_score = 0.0_f32;
|
||||
for (i, v) in drift_vecs.iter().enumerate() {
|
||||
det.insert(v);
|
||||
if detection_lag.is_none() && det.is_drifted(MEAN_THRESH) {
|
||||
detection_lag = Some(i + 1);
|
||||
trigger_score = det.drift_score();
|
||||
}
|
||||
}
|
||||
let final_score = det.drift_score();
|
||||
|
||||
DriftReport {
|
||||
variant: "MeanShift".into(),
|
||||
reference_count: ref_vecs.len(),
|
||||
observation_count: drift_vecs.len(),
|
||||
baseline_score,
|
||||
trigger_score,
|
||||
detection_lag,
|
||||
final_score,
|
||||
memory_bytes: det.memory_bytes(),
|
||||
}
|
||||
}
|
||||
|
||||
fn run_cusum(ref_vecs: &[Vec<f32>], drift_vecs: &[Vec<f32>], _rng: &mut StdRng) -> DriftReport {
|
||||
let mut det = CusumDetector::new(DIM, WARM_UP, CUSUM_SLACK);
|
||||
|
||||
for v in ref_vecs {
|
||||
det.insert(v);
|
||||
}
|
||||
let baseline_score = det.drift_score();
|
||||
|
||||
let mut detection_lag = None;
|
||||
let mut trigger_score = 0.0_f32;
|
||||
for (i, v) in drift_vecs.iter().enumerate() {
|
||||
det.insert(v);
|
||||
if detection_lag.is_none() && det.is_drifted(CUSUM_THRESH) {
|
||||
detection_lag = Some(i + 1);
|
||||
trigger_score = det.drift_score();
|
||||
}
|
||||
}
|
||||
let final_score = det.drift_score();
|
||||
|
||||
DriftReport {
|
||||
variant: "CUSUM".into(),
|
||||
reference_count: ref_vecs.len(),
|
||||
observation_count: drift_vecs.len(),
|
||||
baseline_score,
|
||||
trigger_score,
|
||||
detection_lag,
|
||||
final_score,
|
||||
memory_bytes: det.memory_bytes(),
|
||||
}
|
||||
}
|
||||
|
||||
fn run_mmd_rff(ref_vecs: &[Vec<f32>], drift_vecs: &[Vec<f32>], _rng: &mut StdRng) -> DriftReport {
|
||||
let mut det = MmdRffDetector::new(DIM, RFF_FEATURES, WARM_UP, 1.0, ALPHA, 99);
|
||||
|
||||
for v in ref_vecs {
|
||||
det.insert(v);
|
||||
}
|
||||
let baseline_score = det.drift_score();
|
||||
|
||||
let mut detection_lag = None;
|
||||
let mut trigger_score = 0.0_f32;
|
||||
for (i, v) in drift_vecs.iter().enumerate() {
|
||||
det.insert(v);
|
||||
if detection_lag.is_none() && det.is_drifted(MMD_THRESH) {
|
||||
detection_lag = Some(i + 1);
|
||||
trigger_score = det.drift_score();
|
||||
}
|
||||
}
|
||||
let final_score = det.drift_score();
|
||||
|
||||
DriftReport {
|
||||
variant: "MMD-RFF".into(),
|
||||
reference_count: ref_vecs.len(),
|
||||
observation_count: drift_vecs.len(),
|
||||
baseline_score,
|
||||
trigger_score,
|
||||
detection_lag,
|
||||
final_score,
|
||||
memory_bytes: det.memory_bytes(),
|
||||
}
|
||||
}
|
||||
|
||||
// ── Latency measurement ──────────────────────────────────────────────────────
|
||||
|
||||
fn measure_latency(variants: &[(&str, f32)]) {
|
||||
let mut rng = StdRng::seed_from_u64(777);
|
||||
let queries: Vec<Vec<f32>> = (0..QUERIES)
|
||||
.map(|_| sample_gaussian(&mut rng, DIM, 0.0, 1.0))
|
||||
.collect();
|
||||
|
||||
println!("── Insert Latency (ns/vector, {QUERIES} probes) ────────────────────");
|
||||
|
||||
// MeanShift
|
||||
{
|
||||
let mut det = MeanShiftDetector::new(DIM, WARM_UP, ALPHA);
|
||||
let t0 = Instant::now();
|
||||
for q in &queries {
|
||||
det.insert(q);
|
||||
}
|
||||
let elapsed = t0.elapsed().as_nanos() as f64;
|
||||
let mean_ns = elapsed / QUERIES as f64;
|
||||
let score = det.drift_score();
|
||||
println!(
|
||||
" MeanShift mean={mean_ns:>8.1} ns/insert score_after={score:.4} mem={}B",
|
||||
det.memory_bytes()
|
||||
);
|
||||
}
|
||||
{
|
||||
let mut det = CusumDetector::new(DIM, WARM_UP, CUSUM_SLACK);
|
||||
let t0 = Instant::now();
|
||||
for q in &queries {
|
||||
det.insert(q);
|
||||
}
|
||||
let elapsed = t0.elapsed().as_nanos() as f64;
|
||||
let mean_ns = elapsed / QUERIES as f64;
|
||||
let score = det.drift_score();
|
||||
println!(
|
||||
" CUSUM mean={mean_ns:>8.1} ns/insert score_after={score:.4} mem={}B",
|
||||
det.memory_bytes()
|
||||
);
|
||||
}
|
||||
{
|
||||
let mut det = MmdRffDetector::new(DIM, RFF_FEATURES, WARM_UP, 1.0, ALPHA, 99);
|
||||
let t0 = Instant::now();
|
||||
for q in &queries {
|
||||
det.insert(q);
|
||||
}
|
||||
let elapsed = t0.elapsed().as_nanos() as f64;
|
||||
let mean_ns = elapsed / QUERIES as f64;
|
||||
let score = det.drift_score();
|
||||
println!(
|
||||
" MMD-RFF mean={mean_ns:>8.1} ns/insert score_after={score:.4} mem={}B",
|
||||
det.memory_bytes()
|
||||
);
|
||||
}
|
||||
println!();
|
||||
for (name, thresh) in variants {
|
||||
println!(" {name:<12} threshold={thresh:.4}");
|
||||
}
|
||||
println!();
|
||||
}
|
||||
|
||||
// ── Print helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
fn print_header() {
|
||||
println!("══════════════════════════════════════════════════════════════════");
|
||||
println!(" ruvector-drift Streaming Semantic Drift Detection Benchmark");
|
||||
println!("══════════════════════════════════════════════════════════════════");
|
||||
println!(" OS : {}", std::env::consts::OS);
|
||||
println!(" Arch : {}", std::env::consts::ARCH);
|
||||
println!(" Dims : {DIM}");
|
||||
println!(" N : {N}");
|
||||
println!(" Drift: {DRIFT}");
|
||||
println!();
|
||||
}
|
||||
|
||||
fn print_table(reports: &[DriftReport]) {
|
||||
println!("── Detection Results ───────────────────────────────────────────────");
|
||||
println!(
|
||||
"{:<12} {:>8} {:>10} {:>10} {:>10} {:>10} {:>10}",
|
||||
"Variant", "Ref#", "Drift#", "Baseline", "FinalScore", "Lag(vecs)", "Mem(B)"
|
||||
);
|
||||
println!("{}", "─".repeat(78));
|
||||
for r in reports {
|
||||
let lag = r.detection_lag.map_or("NONE".into(), |l| l.to_string());
|
||||
println!(
|
||||
"{:<12} {:>8} {:>10} {:>10.4} {:>10.4} {:>10} {:>10}",
|
||||
r.variant,
|
||||
r.reference_count,
|
||||
r.observation_count,
|
||||
r.baseline_score,
|
||||
r.final_score,
|
||||
lag,
|
||||
r.memory_bytes
|
||||
);
|
||||
}
|
||||
println!();
|
||||
}
|
||||
|
||||
// ── Dataset generation ───────────────────────────────────────────────────────
|
||||
|
||||
/// Sample a D-dimensional Gaussian vector N(mean, std) using Box-Muller.
|
||||
fn sample_gaussian(rng: &mut StdRng, dim: usize, mean: f64, std: f64) -> Vec<f32> {
|
||||
use std::f64::consts::PI;
|
||||
let mut out = Vec::with_capacity(dim);
|
||||
while out.len() < dim {
|
||||
let u1 = rng.gen::<f64>().max(1e-14);
|
||||
let u2 = rng.gen::<f64>();
|
||||
let r = (-2.0 * u1.ln()).sqrt() * std;
|
||||
let theta = 2.0 * PI * u2;
|
||||
out.push((mean + r * theta.cos()) as f32);
|
||||
if out.len() < dim {
|
||||
out.push((mean + r * theta.sin()) as f32);
|
||||
}
|
||||
}
|
||||
out.truncate(dim);
|
||||
out
|
||||
}
|
||||
91
crates/ruvector-drift/src/mean_shift.rs
Normal file
91
crates/ruvector-drift/src/mean_shift.rs
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
//! Baseline drift detector: EMA mean-shift distance.
|
||||
//!
|
||||
//! Compares the exponential moving average of recent insertions against a
|
||||
//! frozen reference mean. When their L2 distance exceeds a threshold the
|
||||
//! agent memory is considered to have semantically drifted.
|
||||
//!
|
||||
//! **Complexity:** O(D) insert, O(D) score, O(D) memory.
|
||||
|
||||
use crate::{stats::OnlineStats, DriftDetector};
|
||||
|
||||
/// Exponential-moving-average mean-shift detector.
|
||||
///
|
||||
/// During the warm-up phase (`count < warm_up`), every inserted vector feeds
|
||||
/// a reference [`OnlineStats`]. After warm-up, new vectors update a separate
|
||||
/// EMA mean. The drift score is the L2 distance between the reference mean
|
||||
/// and the EMA mean.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MeanShiftDetector {
|
||||
dim: usize,
|
||||
warm_up: usize,
|
||||
alpha: f64,
|
||||
reference: OnlineStats,
|
||||
ema: Vec<f64>,
|
||||
count: usize,
|
||||
warmed_up: bool,
|
||||
}
|
||||
|
||||
impl MeanShiftDetector {
|
||||
/// Create a new detector.
|
||||
///
|
||||
/// - `dim`: vector dimension
|
||||
/// - `warm_up`: number of insertions to build the reference distribution
|
||||
/// - `alpha`: EMA smoothing factor (0 < alpha ≤ 1; smaller = slower adaptation)
|
||||
pub fn new(dim: usize, warm_up: usize, alpha: f64) -> Self {
|
||||
assert!(dim > 0);
|
||||
assert!(warm_up > 0);
|
||||
assert!(alpha > 0.0 && alpha <= 1.0);
|
||||
Self {
|
||||
dim,
|
||||
warm_up,
|
||||
alpha,
|
||||
reference: OnlineStats::new(dim),
|
||||
ema: vec![0.0; dim],
|
||||
count: 0,
|
||||
warmed_up: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DriftDetector for MeanShiftDetector {
|
||||
fn insert(&mut self, vec: &[f32]) {
|
||||
debug_assert_eq!(vec.len(), self.dim);
|
||||
self.count += 1;
|
||||
|
||||
if !self.warmed_up {
|
||||
self.reference.push(vec);
|
||||
if self.count >= self.warm_up {
|
||||
self.warmed_up = true;
|
||||
// seed EMA from the reference mean
|
||||
self.ema.clone_from(&self.reference.mean);
|
||||
}
|
||||
} else {
|
||||
// EMA update
|
||||
for (i, &x) in vec.iter().enumerate() {
|
||||
self.ema[i] = self.alpha * x as f64 + (1.0 - self.alpha) * self.ema[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn drift_score(&self) -> f32 {
|
||||
if !self.warmed_up {
|
||||
return 0.0;
|
||||
}
|
||||
self.reference.mean_l2_to(&self.ema) as f32
|
||||
}
|
||||
|
||||
fn reset_reference(&mut self) {
|
||||
self.reference = OnlineStats::new(self.dim);
|
||||
self.ema = vec![0.0; self.dim];
|
||||
self.count = 0;
|
||||
self.warmed_up = false;
|
||||
}
|
||||
|
||||
fn count(&self) -> usize {
|
||||
self.count
|
||||
}
|
||||
|
||||
fn memory_bytes(&self) -> usize {
|
||||
self.reference.memory_bytes() + self.dim * std::mem::size_of::<f64>()
|
||||
}
|
||||
}
|
||||
166
crates/ruvector-drift/src/mmd_rff.rs
Normal file
166
crates/ruvector-drift/src/mmd_rff.rs
Normal file
|
|
@ -0,0 +1,166 @@
|
|||
//! Alternative B: MMD drift detector via Random Fourier Features.
|
||||
//!
|
||||
//! Approximates Maximum Mean Discrepancy (MMD) between the reference and
|
||||
//! current distributions using random Fourier features (Rahimi & Recht, 2007).
|
||||
//! Vectors are mapped to an R-dimensional feature space via random cosine
|
||||
//! projections, then MMD² ≈ ||μ_ref - μ_curr||² in feature space.
|
||||
//!
|
||||
//! This gives an unbiased (in expectation) two-sample test statistic that can
|
||||
//! detect arbitrary distribution shifts — not just mean shifts — at the cost of
|
||||
//! O(D × R) memory and O(D × R) per-insert work.
|
||||
//!
|
||||
//! **Complexity:** O(D × R) insert, O(R) score, O(D × R) memory.
|
||||
|
||||
use crate::DriftDetector;
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use std::f64::consts::PI;
|
||||
|
||||
/// Drift detector that compares mean random-Fourier-feature embeddings of the
/// reference data and of recent data.
#[derive(Clone, Debug)]
pub struct MmdRffDetector {
    /// Expected vector dimension D.
    dim: usize,
    /// Number of random Fourier features R.
    num_features: usize,
    /// Number of insertions that form the reference phase.
    warm_up: usize,
    /// Random frequency matrix Ω ∈ ℝ^{R×D}, stored row-major; entries are
    /// drawn from N(0, 2γ).
    omega: Vec<f32>,
    /// Random phase offsets b ∈ [0, 2π)^R, one per feature.
    bias: Vec<f32>,
    /// Mean feature vector of the reference phase (not updated after warm-up).
    ref_feat_mean: Vec<f64>,
    /// Exponential moving average of the feature vectors seen after warm-up.
    cur_feat_mean: Vec<f64>,
    /// EMA smoothing factor.
    alpha: f64,
    /// Insertions since construction or the last reset.
    count: usize,
    /// `true` once the reference phase has finished.
    warmed_up: bool,
}
|
||||
|
||||
impl MmdRffDetector {
|
||||
/// Create a new MMD-RFF detector.
|
||||
///
|
||||
/// - `dim`: vector dimension
|
||||
/// - `num_features`: number of random Fourier features R (typical: 128–512)
|
||||
/// - `warm_up`: insertions to build reference
|
||||
/// - `bandwidth`: RBF kernel bandwidth γ (controls sensitivity; try 1.0)
|
||||
/// - `alpha`: EMA smoothing for current mean
|
||||
/// - `seed`: RNG seed for reproducible feature matrix
|
||||
pub fn new(
|
||||
dim: usize,
|
||||
num_features: usize,
|
||||
warm_up: usize,
|
||||
bandwidth: f32,
|
||||
alpha: f64,
|
||||
seed: u64,
|
||||
) -> Self {
|
||||
assert!(dim > 0);
|
||||
assert!(num_features > 0);
|
||||
assert!(warm_up > 0);
|
||||
assert!(bandwidth > 0.0);
|
||||
assert!(alpha > 0.0 && alpha <= 1.0);
|
||||
|
||||
let mut rng = StdRng::seed_from_u64(seed);
|
||||
let scale = (2.0 * bandwidth as f64).sqrt();
|
||||
|
||||
// Sample Ω ~ N(0, 2γ·I) via Box-Muller.
|
||||
let total = num_features * dim;
|
||||
let mut omega = Vec::with_capacity(total);
|
||||
while omega.len() < total {
|
||||
let u1: f64 = rng.gen::<f64>().max(1e-14);
|
||||
let u2: f64 = rng.gen::<f64>();
|
||||
let r = scale * (-2.0 * u1.ln()).sqrt();
|
||||
let theta = 2.0 * PI * u2;
|
||||
omega.push((r * theta.cos()) as f32);
|
||||
if omega.len() < total {
|
||||
omega.push((r * theta.sin()) as f32);
|
||||
}
|
||||
}
|
||||
omega.truncate(total);
|
||||
|
||||
// Sample b ~ Uniform[0, 2π).
|
||||
let bias: Vec<f32> = (0..num_features)
|
||||
.map(|_| (rng.gen::<f64>() * 2.0 * PI) as f32)
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
dim,
|
||||
num_features,
|
||||
warm_up,
|
||||
omega,
|
||||
bias,
|
||||
ref_feat_mean: vec![0.0; num_features],
|
||||
cur_feat_mean: vec![0.0; num_features],
|
||||
alpha,
|
||||
count: 0,
|
||||
warmed_up: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Map vector → RFF feature vector z(v) = √(2/R) cos(Ω v + b).
|
||||
fn map_to_features(&self, vec: &[f32]) -> Vec<f32> {
|
||||
let scale = (2.0_f32 / self.num_features as f32).sqrt();
|
||||
(0..self.num_features)
|
||||
.map(|r| {
|
||||
let row = &self.omega[r * self.dim..(r + 1) * self.dim];
|
||||
let dot: f32 = row.iter().zip(vec.iter()).map(|(w, x)| w * x).sum();
|
||||
scale * (dot + self.bias[r]).cos()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl DriftDetector for MmdRffDetector {
    /// Feed one vector into the detector.
    ///
    /// During warm-up the vector's features are folded into the reference
    /// mean; afterwards they update the exponential moving average.
    fn insert(&mut self, vec: &[f32]) {
        debug_assert_eq!(vec.len(), self.dim);
        self.count += 1;

        let features = self.map_to_features(vec);

        if self.warmed_up {
            // Steady state: blend the new features into the moving average.
            let alpha = self.alpha;
            for (cur, &f) in self.cur_feat_mean.iter_mut().zip(features.iter()) {
                *cur = alpha * f as f64 + (1.0 - alpha) * *cur;
            }
            return;
        }

        // Reference phase: incremental mean over the samples seen so far.
        let seen = self.count as f64;
        for (mean, &f) in self.ref_feat_mean.iter_mut().zip(features.iter()) {
            *mean += (f as f64 - *mean) / seen;
        }

        if self.count >= self.warm_up {
            // Freeze the reference and start the moving average from it.
            self.warmed_up = true;
            self.cur_feat_mean.clone_from(&self.ref_feat_mean);
        }
    }

    /// Euclidean distance between the reference and current feature means
    /// (the MMD estimate); `0.0` until warm-up has completed.
    fn drift_score(&self) -> f32 {
        if self.warmed_up {
            let squared: f64 = self
                .ref_feat_mean
                .iter()
                .zip(self.cur_feat_mean.iter())
                .map(|(reference, current)| (reference - current).powi(2))
                .sum::<f64>();
            squared.sqrt() as f32
        } else {
            0.0
        }
    }

    /// Discard both feature means and the insertion count, returning the
    /// detector to its warm-up phase. The random feature map is kept.
    fn reset_reference(&mut self) {
        self.ref_feat_mean.clear();
        self.ref_feat_mean.resize(self.num_features, 0.0);
        self.cur_feat_mean.clear();
        self.cur_feat_mean.resize(self.num_features, 0.0);
        self.count = 0;
        self.warmed_up = false;
    }

    /// Insertions observed since construction or the last reset.
    fn count(&self) -> usize {
        self.count
    }

    /// Bytes held by the frequency matrix, the phase vector and the two
    /// feature-mean vectors.
    fn memory_bytes(&self) -> usize {
        let f32_size = std::mem::size_of::<f32>();
        let f64_size = std::mem::size_of::<f64>();
        self.omega.len() * f32_size
            + self.bias.len() * f32_size
            + 2 * self.num_features * f64_size
    }
}
|
||||
68
crates/ruvector-drift/src/stats.rs
Normal file
68
crates/ruvector-drift/src/stats.rs
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
//! Online streaming statistics for D-dimensional vectors.
|
||||
|
||||
/// Streaming per-dimension statistics computed with Welford's update.
///
/// Keeps a running mean and a running sum of squared deviations for each
/// dimension, so no individual sample has to be retained.
/// Used by [`MeanShiftDetector`] and [`CusumDetector`].
#[derive(Clone, Debug)]
pub struct OnlineStats {
    /// Length of every vector this tracker accepts.
    pub dim: usize,
    /// Running mean of each dimension.
    pub mean: Vec<f64>,
    /// Running sum of squared deviations (M2) of each dimension; dividing by
    /// `n - 1` gives the sample variance.
    pub m2: Vec<f64>,
    /// How many samples have been pushed so far.
    pub n: usize,
}
|
||||
|
||||
impl OnlineStats {
|
||||
/// Create a new tracker for `dim`-dimensional vectors.
|
||||
pub fn new(dim: usize) -> Self {
|
||||
Self {
|
||||
dim,
|
||||
mean: vec![0.0; dim],
|
||||
m2: vec![0.0; dim],
|
||||
n: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Ingest one sample, updating running mean and M2.
|
||||
pub fn push(&mut self, v: &[f32]) {
|
||||
debug_assert_eq!(v.len(), self.dim);
|
||||
self.n += 1;
|
||||
let n = self.n as f64;
|
||||
for (i, &x) in v.iter().enumerate() {
|
||||
let x = x as f64;
|
||||
let delta = x - self.mean[i];
|
||||
self.mean[i] += delta / n;
|
||||
let delta2 = x - self.mean[i];
|
||||
self.m2[i] += delta * delta2;
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-dimension variance (sample variance, n-1 denominator).
|
||||
pub fn variance(&self) -> Vec<f64> {
|
||||
if self.n < 2 {
|
||||
return vec![0.0; self.dim];
|
||||
}
|
||||
let denom = (self.n - 1) as f64;
|
||||
self.m2.iter().map(|m| m / denom).collect()
|
||||
}
|
||||
|
||||
/// L2 distance between this mean and another mean vector.
|
||||
pub fn mean_l2_to(&self, other: &[f64]) -> f64 {
|
||||
debug_assert_eq!(other.len(), self.dim);
|
||||
self.mean
|
||||
.iter()
|
||||
.zip(other.iter())
|
||||
.map(|(a, b)| (a - b).powi(2))
|
||||
.sum::<f64>()
|
||||
.sqrt()
|
||||
}
|
||||
|
||||
/// Approximate heap bytes used.
|
||||
pub fn memory_bytes(&self) -> usize {
|
||||
2 * self.dim * std::mem::size_of::<f64>()
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue