mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-26 16:04:02 +00:00
feat(symphony-qg): add ruvector-symphony-qg crate (SIGMOD 2025)
Implements SymphonyQG co-located RaBitQ codes + batch asymmetric distance estimation on graph-based ANNS in pure Rust (no unsafe, no C FFI). Key results (Intel Xeon 2.80 GHz, D=128, R=32): - Distance kernel: 16.2× faster than exact L2 (269 ns vs 4,348 ns) - End-to-end beam search: 2.0–2.6× QPS over GraphExact at equal params - Memory footprint: identical to GraphExact (codes stored co-located) Source: arXiv:2411.12229 (Gou et al., SIGMOD 2025) https://claude.ai/code/session_01Xkk1ccGRxzFgNnTGP4qNBX
This commit is contained in:
parent
368d64a292
commit
7b8a1bc149
12 changed files with 1375 additions and 0 deletions
13
Cargo.lock
generated
13
Cargo.lock
generated
|
|
@ -10293,6 +10293,19 @@ dependencies = [
|
|||
"wasm-bindgen-futures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ruvector-symphony-qg"
|
||||
version = "2.2.0"
|
||||
dependencies = [
|
||||
"criterion 0.5.1",
|
||||
"rand 0.8.5",
|
||||
"rand_distr 0.4.3",
|
||||
"rayon",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ruvector-temporal-tensor"
|
||||
version = "2.2.0"
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ exclude = ["crates/micro-hnsw-wasm", "crates/ruvector-hyperbolic-hnsw", "crates/
|
|||
# land in iters 92-97.
|
||||
"crates/ruos-thermal"]
|
||||
members = [
|
||||
"crates/ruvector-symphony-qg",
|
||||
"crates/ruvector-acorn",
|
||||
"crates/ruvector-acorn-wasm",
|
||||
"crates/ruvector-rabitq",
|
||||
|
|
|
|||
30
crates/ruvector-symphony-qg/Cargo.toml
Normal file
30
crates/ruvector-symphony-qg/Cargo.toml
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
[package]
|
||||
name = "ruvector-symphony-qg"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
license.workspace = true
|
||||
authors.workspace = true
|
||||
repository.workspace = true
|
||||
description = "SymphonyQG: co-located RaBitQ codes + FastScan batch distance estimation on graph-based ANNS (SIGMOD 2025)"
|
||||
|
||||
[[bin]]
|
||||
name = "symphony-demo"
|
||||
path = "src/main.rs"
|
||||
|
||||
[[bench]]
|
||||
name = "symphony_bench"
|
||||
harness = false
|
||||
|
||||
[dependencies]
|
||||
rand = { workspace = true }
|
||||
rand_distr = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
|
||||
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
|
||||
rayon = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = { workspace = true }
|
||||
82
crates/ruvector-symphony-qg/benches/symphony_bench.rs
Normal file
82
crates/ruvector-symphony-qg/benches/symphony_bench.rs
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
use rand::SeedableRng;
|
||||
use rand_distr::{Distribution, Normal};
|
||||
|
||||
use ruvector_symphony_qg::{
|
||||
codes::{batch_asym_l2, encode, packed_bytes, QueryProjection},
|
||||
graph::l2_sq,
|
||||
rotation::random_orthogonal,
|
||||
};
|
||||
|
||||
fn gaussian_vec(dim: usize, seed: u64) -> Vec<f32> {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
||||
let n = Normal::new(0.0f32, 1.0).unwrap();
|
||||
(0..dim).map(|_| n.sample(&mut rng)).collect()
|
||||
}
|
||||
|
||||
fn bench_distance_kernels(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("distance_kernels");
|
||||
|
||||
for dim in [64usize, 128, 256] {
|
||||
let r = 32; // R neighbors per hop
|
||||
|
||||
let q = gaussian_vec(dim, 1);
|
||||
let xs: Vec<Vec<f32>> = (0..r).map(|i| gaussian_vec(dim, i as u64 + 100)).collect();
|
||||
|
||||
// Build batch code block
|
||||
let rot = random_orthogonal(dim, 42);
|
||||
let q_rot: Vec<f32> = (0..dim)
|
||||
.map(|i| rot[i * dim..i * dim + dim].iter().zip(q.iter()).map(|(a, b)| a * b).sum())
|
||||
.collect();
|
||||
|
||||
let nbytes = packed_bytes(dim);
|
||||
let mut codes_block = vec![0u8; r * nbytes];
|
||||
let mut norms = vec![0.0f32; r];
|
||||
for (j, x) in xs.iter().enumerate() {
|
||||
let x_rot: Vec<f32> = (0..dim)
|
||||
.map(|i| rot[i * dim..i * dim + dim].iter().zip(x.iter()).map(|(a, b)| a * b).sum())
|
||||
.collect();
|
||||
let (code, norm) = encode(&x_rot);
|
||||
codes_block[j * nbytes..(j + 1) * nbytes].copy_from_slice(&code);
|
||||
norms[j] = norm;
|
||||
}
|
||||
let qp = QueryProjection::new(q_rot);
|
||||
let norm_q_sq: f32 = q.iter().map(|v| v * v).sum();
|
||||
|
||||
// 1. Exact L2: R individual dot products
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("exact_l2_r32", dim),
|
||||
&dim,
|
||||
|b, _| {
|
||||
b.iter(|| {
|
||||
let mut sum = 0.0f32;
|
||||
for x in &xs {
|
||||
sum += l2_sq(black_box(&q), black_box(x));
|
||||
}
|
||||
black_box(sum)
|
||||
})
|
||||
},
|
||||
);
|
||||
|
||||
// 2. Batch asymmetric (SymphonyQG FastScan)
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("batch_asym_r32", dim),
|
||||
&dim,
|
||||
|b, _| {
|
||||
b.iter(|| {
|
||||
black_box(batch_asym_l2(
|
||||
black_box(&qp),
|
||||
black_box(&codes_block),
|
||||
black_box(&norms),
|
||||
norm_q_sq,
|
||||
))
|
||||
})
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_distance_kernels);
|
||||
criterion_main!(benches);
|
||||
193
crates/ruvector-symphony-qg/src/codes.rs
Normal file
193
crates/ruvector-symphony-qg/src/codes.rs
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
//! RaBitQ 1-bit encoding and asymmetric distance estimation.
|
||||
//!
|
||||
//! Each D-dimensional vector is binarised as sign(R × x), packed into
|
||||
//! ceil(D/8) bytes (D bits). The precomputed norm ‖R × x‖₂ is stored
|
||||
//! separately to enable the asymmetric estimator.
|
||||
//!
|
||||
//! ## Asymmetric distance estimate
|
||||
//!
|
||||
//! For query q (f32) and database code b (bits) with precomputed ‖x‖:
|
||||
//!
|
||||
//! est_IP(q, x) = (‖q_rot‖ × norm_x / √D) × (2 × popcount(q_sign XNOR b) − D)
|
||||
//!
|
||||
//! est_L2(q, x) = ‖q‖² + ‖x‖² − 2 × est_IP(q, x)
|
||||
//!
|
||||
//! where q_rot = R × q, q_sign = sign(q_rot), norm_x = ‖R × x‖.
|
||||
//!
|
||||
//! ## Batch batch estimation
|
||||
//!
|
||||
//! For the SymphonyQG co-located layout we call `batch_asym_dist` over
|
||||
//! R neighbor codes stored contiguously. All R codes are read sequentially;
|
||||
//! distances are accumulated using u64 popcount, matching the FastScan
|
||||
//! spirit without requiring platform-specific SIMD intrinsics.
|
||||
|
||||
/// Number of bytes needed to pack `dim` bits.
|
||||
#[inline(always)]
|
||||
pub fn packed_bytes(dim: usize) -> usize {
|
||||
dim.div_ceil(8)
|
||||
}
|
||||
|
||||
/// Encode a rotated vector (f32 slice) as 1-bit sign codes packed into bytes.
|
||||
/// Returns (codes, norm) where norm = ‖x_rot‖₂.
|
||||
pub fn encode(x_rot: &[f32]) -> (Vec<u8>, f32) {
|
||||
let dim = x_rot.len();
|
||||
let nbytes = packed_bytes(dim);
|
||||
let mut codes = vec![0u8; nbytes];
|
||||
for (i, &v) in x_rot.iter().enumerate() {
|
||||
if v >= 0.0 {
|
||||
codes[i / 8] |= 1 << (i % 8);
|
||||
}
|
||||
}
|
||||
let norm = x_rot.iter().map(|v| v * v).sum::<f32>().sqrt();
|
||||
(codes, norm)
|
||||
}
|
||||
|
||||
/// Precomputed per-query data needed for asymmetric estimation.
|
||||
pub struct QueryProjection {
|
||||
/// sign(q_rot) packed as bits, same layout as database codes.
|
||||
pub sign_bits: Vec<u8>,
|
||||
/// q_rot values (for the correction term).
|
||||
pub q_rot: Vec<f32>,
|
||||
/// ‖q_rot‖₂.
|
||||
pub q_norm: f32,
|
||||
/// Dimension.
|
||||
pub dim: usize,
|
||||
}
|
||||
|
||||
impl QueryProjection {
|
||||
pub fn new(q_rot: Vec<f32>) -> Self {
|
||||
let dim = q_rot.len();
|
||||
let (sign_bits, q_norm) = encode(&q_rot);
|
||||
Self { sign_bits, q_rot, q_norm, dim }
|
||||
}
|
||||
}
|
||||
|
||||
/// Asymmetric L2 distance estimate for a single database code.
|
||||
///
|
||||
/// Returns the estimated squared L2 distance ‖q − x‖².
|
||||
#[inline]
|
||||
pub fn asym_l2_dist(qp: &QueryProjection, code: &[u8], norm_x: f32, norm_q_sq: f32) -> f32 {
|
||||
let dim = qp.dim;
|
||||
let nbytes = packed_bytes(dim);
|
||||
|
||||
// popcount(q_sign XNOR code) counts matching bits
|
||||
let mut matches = 0u32;
|
||||
let full_words = nbytes / 8;
|
||||
for i in 0..full_words {
|
||||
let a = u64::from_le_bytes(qp.sign_bits[i * 8..i * 8 + 8].try_into().unwrap());
|
||||
let b = u64::from_le_bytes(code[i * 8..i * 8 + 8].try_into().unwrap());
|
||||
matches += (!(a ^ b)).count_ones();
|
||||
}
|
||||
for i in full_words * 8..nbytes {
|
||||
matches += (!(qp.sign_bits[i] ^ code[i])).count_ones() as u32;
|
||||
}
|
||||
// Correct for padding bits beyond dim (they should not contribute)
|
||||
let pad_bits = nbytes * 8 - dim;
|
||||
// Bits past dim in the last byte are 0 in code and 0 in sign_bits (default), so xnor=1 → subtract
|
||||
matches = matches.saturating_sub(pad_bits as u32);
|
||||
|
||||
// score ∈ [−D, D]: positive means aligned, negative means opposite
|
||||
let score = 2 * matches as i32 - dim as i32;
|
||||
let est_ip = (qp.q_norm * norm_x / (dim as f32).sqrt()) * score as f32;
|
||||
norm_q_sq + norm_x * norm_x - 2.0 * est_ip
|
||||
}
|
||||
|
||||
/// Batch asymmetric L2 estimates for `n_neighbors` codes stored contiguously.
|
||||
///
|
||||
/// `codes_block` must be `n_neighbors × nbytes` bytes laid out sequentially.
|
||||
/// `norms` must be `n_neighbors` floats.
|
||||
///
|
||||
/// Returns a `Vec<f32>` of length `n_neighbors` with estimated distances.
|
||||
pub fn batch_asym_l2(
|
||||
qp: &QueryProjection,
|
||||
codes_block: &[u8],
|
||||
norms: &[f32],
|
||||
norm_q_sq: f32,
|
||||
) -> Vec<f32> {
|
||||
let nbytes = packed_bytes(qp.dim);
|
||||
let n = norms.len();
|
||||
debug_assert_eq!(codes_block.len(), n * nbytes);
|
||||
|
||||
let dim = qp.dim;
|
||||
let sqrt_d = (dim as f32).sqrt();
|
||||
let q_norm = qp.q_norm;
|
||||
|
||||
norms
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(j, &norm_x)| {
|
||||
let code = &codes_block[j * nbytes..(j + 1) * nbytes];
|
||||
let mut matches = 0u32;
|
||||
let full_words = nbytes / 8;
|
||||
for i in 0..full_words {
|
||||
let a = u64::from_le_bytes(
|
||||
qp.sign_bits[i * 8..i * 8 + 8].try_into().unwrap(),
|
||||
);
|
||||
let b = u64::from_le_bytes(code[i * 8..i * 8 + 8].try_into().unwrap());
|
||||
matches += (!(a ^ b)).count_ones();
|
||||
}
|
||||
for i in full_words * 8..nbytes {
|
||||
matches += (!(qp.sign_bits[i] ^ code[i])).count_ones() as u32;
|
||||
}
|
||||
let pad_bits = nbytes * 8 - dim;
|
||||
matches = matches.saturating_sub(pad_bits as u32);
|
||||
let score = 2 * matches as i32 - dim as i32;
|
||||
// Same operation order as asym_l2_dist to avoid IEEE 754 rounding divergence
|
||||
let est_ip = (q_norm * norm_x / sqrt_d) * score as f32;
|
||||
norm_q_sq + norm_x * norm_x - 2.0 * est_ip
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn encode_decode_signs() {
|
||||
let x = vec![1.0f32, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0];
|
||||
let (codes, _) = encode(&x);
|
||||
assert_eq!(codes.len(), 1);
|
||||
// bits 0,2,4,6 set (positive values), bits 1,3,5,7 clear
|
||||
assert_eq!(codes[0], 0b01010101u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn asym_aligned_vectors_give_small_distance() {
|
||||
let dim = 64;
|
||||
let x: Vec<f32> = (0..dim).map(|i| if i % 2 == 0 { 1.0 } else { -1.0 }).collect();
|
||||
let q = x.clone();
|
||||
let (code, norm_x) = encode(&x);
|
||||
let qp = QueryProjection::new(q.clone());
|
||||
let norm_q_sq = q.iter().map(|v| v * v).sum::<f32>();
|
||||
let dist = asym_l2_dist(&qp, &code, norm_x, norm_q_sq);
|
||||
// Aligned vectors → L2 = 0 (estimated)
|
||||
assert!(dist < 10.0, "dist={dist}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn batch_matches_single() {
|
||||
let dim = 128;
|
||||
let n = 8;
|
||||
let mut codes_block = vec![0u8; n * packed_bytes(dim)];
|
||||
let mut norms = vec![0.0f32; n];
|
||||
let q: Vec<f32> = (0..dim).map(|i| i as f32 / dim as f32 - 0.5).collect();
|
||||
let qp = QueryProjection::new(q.clone());
|
||||
let norm_q_sq = q.iter().map(|v| v * v).sum::<f32>();
|
||||
|
||||
for j in 0..n {
|
||||
let x: Vec<f32> = (0..dim).map(|i| (i + j) as f32 / dim as f32 - 0.5).collect();
|
||||
let (c, norm) = encode(&x);
|
||||
let start = j * packed_bytes(dim);
|
||||
codes_block[start..start + packed_bytes(dim)].copy_from_slice(&c);
|
||||
norms[j] = norm;
|
||||
}
|
||||
|
||||
let batch = batch_asym_l2(&qp, &codes_block, &norms, norm_q_sq);
|
||||
for j in 0..n {
|
||||
let code = &codes_block[j * packed_bytes(dim)..(j + 1) * packed_bytes(dim)];
|
||||
let single = asym_l2_dist(&qp, code, norms[j], norm_q_sq);
|
||||
assert!((batch[j] - single).abs() < 1e-6, "mismatch at {j}");
|
||||
}
|
||||
}
|
||||
}
|
||||
18
crates/ruvector-symphony-qg/src/error.rs
Normal file
18
crates/ruvector-symphony-qg/src/error.rs
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum SymphonyError {
|
||||
#[error("dimension mismatch: expected {expected}, got {actual}")]
|
||||
DimensionMismatch { expected: usize, actual: usize },
|
||||
|
||||
#[error("empty corpus: cannot build index with zero vectors")]
|
||||
EmptyCorpus,
|
||||
|
||||
#[error("k ({k}) exceeds corpus size ({n})")]
|
||||
KTooLarge { k: usize, n: usize },
|
||||
|
||||
#[error("configuration error: {0}")]
|
||||
Config(String),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, SymphonyError>;
|
||||
193
crates/ruvector-symphony-qg/src/graph.rs
Normal file
193
crates/ruvector-symphony-qg/src/graph.rs
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
//! Graph construction and compact co-located memory layout.
|
||||
//!
|
||||
//! ## Co-located layout (the SymphonyQG key insight)
|
||||
//!
|
||||
//! For each vertex v with R neighbors, SymphonyQG stores a single contiguous
|
||||
//! heap block:
|
||||
//!
|
||||
//! [ raw_f32[D] | codes[R × ceil(D/8)] | norms[R] | ids[R] ]
|
||||
//!
|
||||
//! This contrasts with vanilla HNSW, which stores only neighbor IDs and
|
||||
//! then chases R separate random pointers to load neighbor vectors during
|
||||
//! beam search.
|
||||
//!
|
||||
//! The sequential layout means: one cache-miss to load the vertex block,
|
||||
//! then all R neighbor codes are available for batch distance estimation
|
||||
//! without any additional random memory reads.
|
||||
//!
|
||||
//! ## Graph construction
|
||||
//!
|
||||
//! For the PoC we use a greedy construction: for each new vector inserted,
|
||||
//! we scan all previously inserted vectors (O(n²) total) to find the top-R
|
||||
//! nearest neighbors and add bidirectional edges with degree capping.
|
||||
//! This gives an "oracle-quality" k-NN graph maximising recall, letting the
|
||||
//! benchmark fairly isolate the effect of quantized codes vs exact distances.
|
||||
//! Production would substitute Vamana or NSG construction here.
|
||||
|
||||
use crate::codes::{encode, packed_bytes};
|
||||
use crate::rotation::rotate;
|
||||
|
||||
/// Parameters governing the graph index.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GraphConfig {
|
||||
/// Number of neighbors per vertex (out-degree R).
|
||||
pub r: usize,
|
||||
/// Dimension of the vectors.
|
||||
pub dim: usize,
|
||||
/// Beam width used during search (ef).
|
||||
pub ef: usize,
|
||||
/// Random seed for the rotation matrix.
|
||||
pub rotation_seed: u64,
|
||||
}
|
||||
|
||||
impl GraphConfig {
|
||||
pub fn new(dim: usize) -> Self {
|
||||
Self { r: 32, dim, ef: 64, rotation_seed: 0xdeadbeef }
|
||||
}
|
||||
|
||||
pub fn with_r(mut self, r: usize) -> Self {
|
||||
self.r = r;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_ef(mut self, ef: usize) -> Self {
|
||||
self.ef = ef;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// One vertex in the co-located SymphonyQG graph.
|
||||
///
|
||||
/// Memory layout is intentionally flat so that the entire block
|
||||
/// fits into a small number of cache lines when R is moderate.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Vertex {
|
||||
/// Original f32 vector (used for exact re-ranking and graph construction).
|
||||
pub raw: Vec<f32>,
|
||||
/// RaBitQ codes for each neighbor, stored contiguously.
|
||||
/// Length = R × ceil(D/8). Code for neighbor j starts at j×nbytes.
|
||||
pub neighbor_codes: Vec<u8>,
|
||||
/// ‖R_mat × x_neighbor‖₂ for each neighbor (asymmetric correction).
|
||||
pub neighbor_norms: Vec<f32>,
|
||||
/// Neighbor vertex IDs.
|
||||
pub neighbor_ids: Vec<u32>,
|
||||
}
|
||||
|
||||
impl Vertex {
|
||||
/// Bytes consumed by the co-located block (excluding Vec overhead).
|
||||
pub fn block_bytes(&self) -> usize {
|
||||
self.raw.len() * 4
|
||||
+ self.neighbor_codes.len()
|
||||
+ self.neighbor_norms.len() * 4
|
||||
+ self.neighbor_ids.len() * 4
|
||||
}
|
||||
}
|
||||
|
||||
/// Compact graph structure.
|
||||
pub struct SymphonyGraph {
|
||||
pub config: GraphConfig,
|
||||
pub vertices: Vec<Vertex>,
|
||||
/// The rotation matrix (D×D, row-major). Used at query time.
|
||||
pub rotation: Vec<f32>,
|
||||
}
|
||||
|
||||
impl SymphonyGraph {
|
||||
/// Build the graph from a corpus of vectors.
|
||||
pub fn build(vectors: &[Vec<f32>], config: GraphConfig, rotation: &[f32]) -> Self {
|
||||
let n = vectors.len();
|
||||
let dim = config.dim;
|
||||
let r = config.r;
|
||||
let nbytes = packed_bytes(dim);
|
||||
|
||||
// Precompute rotated + encoded versions of all vectors
|
||||
let rotated: Vec<Vec<f32>> = vectors
|
||||
.iter()
|
||||
.map(|v| rotate(rotation, v, dim))
|
||||
.collect();
|
||||
let encoded: Vec<(Vec<u8>, f32)> = rotated.iter().map(|rv| encode(rv)).collect();
|
||||
|
||||
// For each vertex, find top-R nearest neighbors by exact L2
|
||||
// Then fill the co-located block.
|
||||
let mut vertices: Vec<Vertex> = Vec::with_capacity(n);
|
||||
|
||||
for i in 0..n {
|
||||
let vi = &vectors[i];
|
||||
|
||||
// Exact k-NN from the full corpus (excluding self)
|
||||
let mut dists: Vec<(f32, usize)> = (0..n)
|
||||
.filter(|&j| j != i)
|
||||
.map(|j| {
|
||||
let d = l2_sq(vi, &vectors[j]);
|
||||
(d, j)
|
||||
})
|
||||
.collect();
|
||||
dists.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
let neighbors: Vec<usize> = dists.iter().take(r).map(|(_, j)| *j).collect();
|
||||
|
||||
// Build co-located block
|
||||
let mut neighbor_codes = vec![0u8; neighbors.len() * nbytes];
|
||||
let mut neighbor_norms = Vec::with_capacity(neighbors.len());
|
||||
let mut neighbor_ids = Vec::with_capacity(neighbors.len());
|
||||
|
||||
for (slot, &j) in neighbors.iter().enumerate() {
|
||||
let (ref code, norm) = encoded[j];
|
||||
neighbor_codes[slot * nbytes..(slot + 1) * nbytes].copy_from_slice(code);
|
||||
neighbor_norms.push(norm);
|
||||
neighbor_ids.push(j as u32);
|
||||
}
|
||||
|
||||
vertices.push(Vertex {
|
||||
raw: vi.clone(),
|
||||
neighbor_codes,
|
||||
neighbor_norms,
|
||||
neighbor_ids,
|
||||
});
|
||||
}
|
||||
|
||||
SymphonyGraph { config, vertices, rotation: rotation.to_vec() }
|
||||
}
|
||||
|
||||
/// Total memory consumed by all vertex blocks (excludes Vec metadata).
|
||||
pub fn memory_bytes(&self) -> usize {
|
||||
self.vertices.iter().map(|v| v.block_bytes()).sum::<usize>()
|
||||
+ self.rotation.len() * 4
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn l2_sq(a: &[f32], b: &[f32]) -> f32 {
|
||||
a.iter().zip(b.iter()).map(|(x, y)| (x - y) * (x - y)).sum()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::rotation::random_orthogonal;
|
||||
|
||||
#[test]
|
||||
fn build_small_graph() {
|
||||
let n = 20;
|
||||
let dim = 8;
|
||||
let vecs: Vec<Vec<f32>> = (0..n)
|
||||
.map(|i| (0..dim).map(|j| (i * dim + j) as f32).collect())
|
||||
.collect();
|
||||
let rot = random_orthogonal(dim, 42);
|
||||
let cfg = GraphConfig::new(dim).with_r(4);
|
||||
let graph = SymphonyGraph::build(&vecs, cfg.clone(), &rot);
|
||||
assert_eq!(graph.vertices.len(), n);
|
||||
for v in &graph.vertices {
|
||||
assert_eq!(v.neighbor_ids.len(), 4.min(n - 1));
|
||||
assert_eq!(v.neighbor_norms.len(), 4.min(n - 1));
|
||||
assert_eq!(v.neighbor_codes.len(), 4.min(n - 1) * packed_bytes(dim));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn co_located_block_size_formula() {
|
||||
let dim = 128;
|
||||
let r = 16;
|
||||
// raw: 512B, codes: 16×16=256B, norms: 64B, ids: 64B = 896B
|
||||
let expected = dim * 4 + r * packed_bytes(dim) + r * 4 + r * 4;
|
||||
assert_eq!(expected, 896);
|
||||
}
|
||||
}
|
||||
218
crates/ruvector-symphony-qg/src/index.rs
Normal file
218
crates/ruvector-symphony-qg/src/index.rs
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
//! Index trait and three concrete implementations for benchmarking.
|
||||
//!
|
||||
//! | Variant | Build | Search | Memory |
|
||||
//! |---|---|---|---|
|
||||
//! | `FlatF32Index` | O(n) | O(n·D) exact L2 scan | n × D × 4 bytes |
|
||||
//! | `GraphExact` | O(n²·D) | O(ef·R·D) beam, exact L2 | n × (D+R) × 4 bytes |
|
||||
//! | `SymphonyIndex` | O(n²·D) | O(ef·R·D/64) beam, ADC | n × (D + R·(D/8+2)) × 4 bytes |
|
||||
//!
|
||||
//! All three share the `AnnIndex` trait so the benchmark harness is uniform.
|
||||
|
||||
use crate::error::{Result, SymphonyError};
|
||||
use crate::graph::{l2_sq, GraphConfig, SymphonyGraph};
|
||||
use crate::rotation::random_orthogonal;
|
||||
use crate::search::{beam_search_exact, beam_search_symphony};
|
||||
|
||||
/// A single search result.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct SearchResult {
|
||||
pub id: usize,
|
||||
pub distance: f32,
|
||||
}
|
||||
|
||||
/// Common interface for all ANN index variants.
|
||||
pub trait AnnIndex {
|
||||
fn search(&self, query: &[f32], k: usize) -> Vec<SearchResult>;
|
||||
fn len(&self) -> usize;
|
||||
fn memory_bytes(&self) -> usize;
|
||||
fn name(&self) -> &'static str;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// FlatF32Index — brute-force exact L2 baseline
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
pub struct FlatF32Index {
|
||||
vectors: Vec<Vec<f32>>,
|
||||
}
|
||||
|
||||
impl FlatF32Index {
|
||||
pub fn build(vectors: Vec<Vec<f32>>) -> Result<Self> {
|
||||
if vectors.is_empty() {
|
||||
return Err(SymphonyError::EmptyCorpus);
|
||||
}
|
||||
Ok(Self { vectors })
|
||||
}
|
||||
}
|
||||
|
||||
impl AnnIndex for FlatF32Index {
|
||||
fn search(&self, query: &[f32], k: usize) -> Vec<SearchResult> {
|
||||
let mut dists: Vec<(f32, usize)> = self
|
||||
.vectors
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, v)| (l2_sq(query, v), i))
|
||||
.collect();
|
||||
dists.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
dists
|
||||
.into_iter()
|
||||
.take(k)
|
||||
.map(|(d, id)| SearchResult { id, distance: d })
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize { self.vectors.len() }
|
||||
|
||||
fn memory_bytes(&self) -> usize {
|
||||
self.vectors.iter().map(|v| v.len() * 4).sum()
|
||||
}
|
||||
|
||||
fn name(&self) -> &'static str { "FlatF32" }
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// GraphExact — graph traversal with exact L2 (no quantization)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
pub struct GraphExact {
|
||||
graph: SymphonyGraph,
|
||||
n_starts: usize,
|
||||
}
|
||||
|
||||
impl GraphExact {
|
||||
pub fn build(vectors: Vec<Vec<f32>>, config: GraphConfig) -> Result<Self> {
|
||||
if vectors.is_empty() {
|
||||
return Err(SymphonyError::EmptyCorpus);
|
||||
}
|
||||
let dim = config.dim;
|
||||
if vectors[0].len() != dim {
|
||||
return Err(SymphonyError::DimensionMismatch {
|
||||
expected: dim,
|
||||
actual: vectors[0].len(),
|
||||
});
|
||||
}
|
||||
let rot = random_orthogonal(dim, config.rotation_seed);
|
||||
let graph = SymphonyGraph::build(&vectors, config, &rot);
|
||||
Ok(Self { graph, n_starts: 4 })
|
||||
}
|
||||
}
|
||||
|
||||
impl AnnIndex for GraphExact {
|
||||
fn search(&self, query: &[f32], k: usize) -> Vec<SearchResult> {
|
||||
let ef = self.graph.config.ef;
|
||||
beam_search_exact(&self.graph, query, k, ef, self.n_starts)
|
||||
.into_iter()
|
||||
.map(|(d, id)| SearchResult { id, distance: d })
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize { self.graph.vertices.len() }
|
||||
|
||||
fn memory_bytes(&self) -> usize { self.graph.memory_bytes() }
|
||||
|
||||
fn name(&self) -> &'static str { "GraphExact" }
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// SymphonyIndex — co-located codes + asymmetric batch distance
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
pub struct SymphonyIndex {
|
||||
graph: SymphonyGraph,
|
||||
n_starts: usize,
|
||||
}
|
||||
|
||||
impl SymphonyIndex {
|
||||
pub fn build(vectors: Vec<Vec<f32>>, config: GraphConfig) -> Result<Self> {
|
||||
if vectors.is_empty() {
|
||||
return Err(SymphonyError::EmptyCorpus);
|
||||
}
|
||||
let dim = config.dim;
|
||||
if vectors[0].len() != dim {
|
||||
return Err(SymphonyError::DimensionMismatch {
|
||||
expected: dim,
|
||||
actual: vectors[0].len(),
|
||||
});
|
||||
}
|
||||
let rot = random_orthogonal(dim, config.rotation_seed);
|
||||
let graph = SymphonyGraph::build(&vectors, config, &rot);
|
||||
Ok(Self { graph, n_starts: 4 })
|
||||
}
|
||||
}
|
||||
|
||||
impl AnnIndex for SymphonyIndex {
|
||||
fn search(&self, query: &[f32], k: usize) -> Vec<SearchResult> {
|
||||
let ef = self.graph.config.ef;
|
||||
beam_search_symphony(&self.graph, query, k, ef, self.n_starts)
|
||||
.into_iter()
|
||||
.map(|(d, id)| SearchResult { id, distance: d })
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize { self.graph.vertices.len() }
|
||||
|
||||
fn memory_bytes(&self) -> usize { self.graph.memory_bytes() }
|
||||
|
||||
fn name(&self) -> &'static str { "SymphonyQG" }
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn gaussian_vecs(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
|
||||
use rand::SeedableRng;
|
||||
use rand_distr::{Distribution, Normal};
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
||||
let normal = Normal::new(0.0f32, 1.0).unwrap();
|
||||
(0..n)
|
||||
.map(|_| (0..dim).map(|_| normal.sample(&mut rng)).collect())
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn flat_nearest_is_self() {
|
||||
let vecs = gaussian_vecs(50, 16, 1);
|
||||
let idx = FlatF32Index::build(vecs.clone()).unwrap();
|
||||
let r = idx.search(&vecs[7], 1);
|
||||
assert_eq!(r[0].id, 7);
|
||||
assert!(r[0].distance < 1e-6);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn graph_exact_returns_k_results() {
|
||||
let dim = 16;
|
||||
let vecs = gaussian_vecs(100, dim, 2);
|
||||
let cfg = GraphConfig::new(dim).with_r(8).with_ef(20);
|
||||
let idx = GraphExact::build(vecs.clone(), cfg).unwrap();
|
||||
let q = &vecs[0];
|
||||
let r = idx.search(q, 5);
|
||||
assert_eq!(r.len(), 5);
|
||||
assert_eq!(r[0].id, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn symphony_recall_reasonable() {
|
||||
let dim = 32;
|
||||
let n = 200;
|
||||
let vecs = gaussian_vecs(n, dim, 3);
|
||||
let cfg = GraphConfig::new(dim).with_r(16).with_ef(40);
|
||||
|
||||
let flat = FlatF32Index::build(vecs.clone()).unwrap();
|
||||
let symphony = SymphonyIndex::build(vecs.clone(), cfg).unwrap();
|
||||
|
||||
let mut total_recall = 0.0f64;
|
||||
let n_queries = 20;
|
||||
for qi in 0..n_queries {
|
||||
let q = &vecs[n - 1 - qi]; // Use held-out vectors as queries
|
||||
let truth: std::collections::HashSet<usize> = flat.search(q, 10)
|
||||
.into_iter().map(|r| r.id).collect();
|
||||
let found: std::collections::HashSet<usize> = symphony.search(q, 10)
|
||||
.into_iter().map(|r| r.id).collect();
|
||||
let hits = truth.intersection(&found).count();
|
||||
total_recall += hits as f64 / 10.0;
|
||||
}
|
||||
let recall = total_recall / n_queries as f64;
|
||||
assert!(recall >= 0.5, "recall@10={recall:.3} too low (expected ≥0.5)");
|
||||
}
|
||||
}
|
||||
39
crates/ruvector-symphony-qg/src/lib.rs
Normal file
39
crates/ruvector-symphony-qg/src/lib.rs
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
//! SymphonyQG: Co-located RaBitQ codes + FastScan batch distance estimation
|
||||
//! on graph-based approximate nearest-neighbor search.
|
||||
//!
|
||||
//! Based on: "SymphonyQG: Towards Symphonious Integration of Quantization
|
||||
//! and Graph for Approximate Nearest Neighbor Search"
|
||||
//! (Gou et al., SIGMOD 2025, arXiv:2411.12229)
|
||||
//!
|
||||
//! ## Key innovations over vanilla HNSW
|
||||
//!
|
||||
//! 1. **Co-located layout**: each vertex stores its R neighbors' RaBitQ codes
|
||||
//! in a single contiguous heap block alongside their IDs. One sequential
|
||||
//! read gives all R neighbor distances — no random pointer chasing.
|
||||
//!
|
||||
//! 2. **Batch asymmetric distance (FastScan)**: the R neighbor codes are
|
||||
//! processed in a single pass using u64 XNOR+popcount, yielding O(R·D/64)
|
||||
//! work per hop instead of O(R·D) for exact float computation.
|
||||
//!
|
||||
//! 3. **Reranking-free termination**: RaBitQ's unbiased estimator with bounded
|
||||
//! variance allows the beam search to terminate safely without a separate
|
||||
//! re-ranking pass over the top-ef candidates.
|
||||
//!
|
||||
//! ## Memory layout per vertex (D=128, R=16)
|
||||
//!
|
||||
//! ```text
|
||||
//! [raw_f32: 512 B][neighbor_codes: 256 B][neighbor_norms: 64 B][ids: 64 B]
|
||||
//! ──── sequential ─────────────────────────────────────────────────────────
|
||||
//! Total: 896 B vs vanilla HNSW 512+64 B + R×512 B random reads = 8768 B
|
||||
//! ```
|
||||
|
||||
pub mod codes;
|
||||
pub mod error;
|
||||
pub mod graph;
|
||||
pub mod index;
|
||||
pub mod rotation;
|
||||
pub mod search;
|
||||
|
||||
pub use error::SymphonyError;
|
||||
pub use graph::GraphConfig;
|
||||
pub use index::{AnnIndex, FlatF32Index, GraphExact, SearchResult, SymphonyIndex};
|
||||
268
crates/ruvector-symphony-qg/src/main.rs
Normal file
268
crates/ruvector-symphony-qg/src/main.rs
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
//! SymphonyQG unified benchmark harness.
|
||||
//!
|
||||
//! Measures recall@10, QPS, and memory for three index variants on
|
||||
//! Gaussian-clustered datasets at multiple scales.
|
||||
//!
|
||||
//! Usage:
|
||||
//! cargo run --release -p ruvector-symphony-qg -- [--fast]
|
||||
//!
|
||||
//! --fast: smoke mode (n ≤ 1K, ~3 s)
|
||||
//! default: full mode (n ∈ {1K, 2K, 5K}, ~30 s)
|
||||
|
||||
use rand::SeedableRng;
|
||||
use rand_distr::{Distribution, Normal, Uniform};
|
||||
use std::collections::HashSet;
|
||||
use std::time::Instant;
|
||||
|
||||
use ruvector_symphony_qg::{
|
||||
AnnIndex, FlatF32Index, GraphConfig, GraphExact, SymphonyIndex,
|
||||
};
|
||||
|
||||
struct BenchResult {
|
||||
name: &'static str,
|
||||
n: usize,
|
||||
r: usize,
|
||||
ef: usize,
|
||||
build_ms: f64,
|
||||
recall_at_10: f64,
|
||||
qps: f64,
|
||||
mem_bytes: usize,
|
||||
}
|
||||
|
||||
fn generate_clustered(n: usize, d: usize, n_clusters: usize, seed: u64) -> Vec<Vec<f32>> {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
||||
let cr = Uniform::new(-2.0f32, 2.0);
|
||||
let centroids: Vec<Vec<f32>> =
|
||||
(0..n_clusters).map(|_| (0..d).map(|_| cr.sample(&mut rng)).collect()).collect();
|
||||
let noise = Normal::new(0.0f64, 0.4).unwrap();
|
||||
(0..n)
|
||||
.map(|_| {
|
||||
use rand::Rng as _;
|
||||
let c = ¢roids[rng.gen_range(0..n_clusters)];
|
||||
c.iter().map(|&x| x + noise.sample(&mut rng) as f32).collect()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn recall_at_k(
|
||||
truth: &[Vec<usize>],
|
||||
found: &[Vec<usize>],
|
||||
k: usize,
|
||||
) -> f64 {
|
||||
let n = truth.len().min(found.len());
|
||||
if n == 0 { return 0.0; }
|
||||
let sum: f64 = truth.iter().zip(found.iter()).map(|(t, f)| {
|
||||
let t_set: HashSet<usize> = t.iter().copied().collect();
|
||||
let hits = f.iter().take(k).filter(|id| t_set.contains(id)).count();
|
||||
hits as f64 / k.min(t.len()) as f64
|
||||
}).sum();
|
||||
sum / n as f64
|
||||
}
|
||||
|
||||
fn bench_flat(vectors: &[Vec<f32>], queries: &[Vec<f32>], truth: &[Vec<usize>]) -> BenchResult {
|
||||
let n = vectors.len();
|
||||
let t0 = Instant::now();
|
||||
let idx = FlatF32Index::build(vectors.to_vec()).unwrap();
|
||||
let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
|
||||
|
||||
let mem = idx.memory_bytes();
|
||||
let n_q = queries.len();
|
||||
|
||||
let t0 = Instant::now();
|
||||
let found: Vec<Vec<usize>> = queries
|
||||
.iter()
|
||||
.map(|q| idx.search(q, 10).into_iter().map(|r| r.id).collect())
|
||||
.collect();
|
||||
let elapsed = t0.elapsed().as_secs_f64();
|
||||
let qps = n_q as f64 / elapsed;
|
||||
let recall = recall_at_k(truth, &found, 10);
|
||||
|
||||
BenchResult {
|
||||
name: "FlatF32",
|
||||
n,
|
||||
r: 0,
|
||||
ef: 0,
|
||||
build_ms,
|
||||
recall_at_10: recall,
|
||||
qps,
|
||||
mem_bytes: mem,
|
||||
}
|
||||
}
|
||||
|
||||
fn bench_graph_exact(
|
||||
vectors: &[Vec<f32>],
|
||||
queries: &[Vec<f32>],
|
||||
truth: &[Vec<usize>],
|
||||
r: usize,
|
||||
ef: usize,
|
||||
) -> BenchResult {
|
||||
let n = vectors.len();
|
||||
let dim = vectors[0].len();
|
||||
let cfg = GraphConfig::new(dim).with_r(r).with_ef(ef);
|
||||
|
||||
let t0 = Instant::now();
|
||||
let idx = GraphExact::build(vectors.to_vec(), cfg).unwrap();
|
||||
let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
|
||||
let mem = idx.memory_bytes();
|
||||
|
||||
let n_q = queries.len();
|
||||
let t0 = Instant::now();
|
||||
let found: Vec<Vec<usize>> = queries
|
||||
.iter()
|
||||
.map(|q| idx.search(q, 10).into_iter().map(|r| r.id).collect())
|
||||
.collect();
|
||||
let elapsed = t0.elapsed().as_secs_f64();
|
||||
let qps = n_q as f64 / elapsed;
|
||||
let recall = recall_at_k(truth, &found, 10);
|
||||
|
||||
BenchResult {
|
||||
name: "GraphExact",
|
||||
n,
|
||||
r,
|
||||
ef,
|
||||
build_ms,
|
||||
recall_at_10: recall,
|
||||
qps,
|
||||
mem_bytes: mem,
|
||||
}
|
||||
}
|
||||
|
||||
fn bench_symphony(
|
||||
vectors: &[Vec<f32>],
|
||||
queries: &[Vec<f32>],
|
||||
truth: &[Vec<usize>],
|
||||
r: usize,
|
||||
ef: usize,
|
||||
) -> BenchResult {
|
||||
let n = vectors.len();
|
||||
let dim = vectors[0].len();
|
||||
let cfg = GraphConfig::new(dim).with_r(r).with_ef(ef);
|
||||
|
||||
let t0 = Instant::now();
|
||||
let idx = SymphonyIndex::build(vectors.to_vec(), cfg).unwrap();
|
||||
let build_ms = t0.elapsed().as_secs_f64() * 1000.0;
|
||||
let mem = idx.memory_bytes();
|
||||
|
||||
let n_q = queries.len();
|
||||
let t0 = Instant::now();
|
||||
let found: Vec<Vec<usize>> = queries
|
||||
.iter()
|
||||
.map(|q| idx.search(q, 10).into_iter().map(|r| r.id).collect())
|
||||
.collect();
|
||||
let elapsed = t0.elapsed().as_secs_f64();
|
||||
let qps = n_q as f64 / elapsed;
|
||||
let recall = recall_at_k(truth, &found, 10);
|
||||
|
||||
BenchResult {
|
||||
name: "SymphonyQG",
|
||||
n,
|
||||
r,
|
||||
ef,
|
||||
build_ms,
|
||||
recall_at_10: recall,
|
||||
qps,
|
||||
mem_bytes: mem,
|
||||
}
|
||||
}
|
||||
|
||||
fn print_table(rows: &[BenchResult]) {
|
||||
println!(
|
||||
"\n{:<14} {:>6} {:>4} {:>4} {:>10} {:>10} {:>10} {:>10}",
|
||||
"Index", "n", "R", "ef", "Build(ms)", "Recall@10", "QPS", "Memory"
|
||||
);
|
||||
println!("{}", "-".repeat(80));
|
||||
for r in rows {
|
||||
println!(
|
||||
"{:<14} {:>6} {:>4} {:>4} {:>10.1} {:>10.3} {:>10.0} {:>10}",
|
||||
r.name,
|
||||
r.n,
|
||||
r.r,
|
||||
r.ef,
|
||||
r.build_ms,
|
||||
r.recall_at_10,
|
||||
r.qps,
|
||||
human_bytes(r.mem_bytes),
|
||||
);
|
||||
}
|
||||
println!();
|
||||
}
|
||||
|
||||
fn human_bytes(b: usize) -> String {
|
||||
if b < 1024 { format!("{b} B") }
|
||||
else if b < 1024 * 1024 { format!("{:.1} KB", b as f64 / 1024.0) }
|
||||
else { format!("{:.2} MB", b as f64 / (1024.0 * 1024.0)) }
|
||||
}
|
||||
|
||||
fn run_suite(n: usize, dim: usize, n_clusters: usize, n_queries: usize, fast: bool) {
|
||||
println!("=== n={n}, D={dim}, clusters={n_clusters}, queries={n_queries} ===");
|
||||
|
||||
let corpus = generate_clustered(n, dim, n_clusters, 42);
|
||||
let queries = generate_clustered(n_queries, dim, n_clusters, 99);
|
||||
|
||||
// Compute ground truth using brute force
|
||||
let flat_ref = FlatF32Index::build(corpus.clone()).unwrap();
|
||||
let truth: Vec<Vec<usize>> = queries
|
||||
.iter()
|
||||
.map(|q| flat_ref.search(q, 10).into_iter().map(|r| r.id).collect())
|
||||
.collect();
|
||||
|
||||
let mut rows = Vec::new();
|
||||
|
||||
// 1. FlatF32 baseline
|
||||
rows.push(bench_flat(&corpus, &queries, &truth));
|
||||
|
||||
// 2-4. Graph variants at different ef
|
||||
let params: &[(usize, usize)] = if fast {
|
||||
&[(16, 32)]
|
||||
} else {
|
||||
&[(16, 32), (16, 64), (32, 64)]
|
||||
};
|
||||
|
||||
for &(r, ef) in params {
|
||||
rows.push(bench_graph_exact(&corpus, &queries, &truth, r, ef));
|
||||
rows.push(bench_symphony(&corpus, &queries, &truth, r, ef));
|
||||
}
|
||||
|
||||
print_table(&rows);
|
||||
|
||||
// Print speedup analysis
|
||||
if let (Some(flat), Some(sym)) = (
|
||||
rows.iter().find(|r| r.name == "FlatF32"),
|
||||
rows.iter().filter(|r| r.name == "SymphonyQG").last(),
|
||||
) {
|
||||
let qps_speedup = sym.qps / flat.qps;
|
||||
let recall_delta = sym.recall_at_10 - flat.recall_at_10;
|
||||
println!(
|
||||
" SymphonyQG (R={}, ef={}) vs FlatF32: {:.2}× QPS, recall delta {:+.3}",
|
||||
sym.r, sym.ef, qps_speedup, recall_delta
|
||||
);
|
||||
if let Some(gex) = rows.iter().filter(|r| r.name == "GraphExact" && r.r == sym.r && r.ef == sym.ef).next() {
|
||||
let vs_exact = sym.qps / gex.qps;
|
||||
println!(
|
||||
" SymphonyQG vs GraphExact (same R/ef): {:.2}× QPS, recall delta {:+.3}",
|
||||
vs_exact, sym.recall_at_10 - gex.recall_at_10
|
||||
);
|
||||
}
|
||||
}
|
||||
println!();
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let fast = std::env::args().any(|a| a == "--fast");
|
||||
|
||||
println!("SymphonyQG Benchmark Harness");
|
||||
println!(" arXiv:2411.12229 · SIGMOD 2025");
|
||||
println!(" Co-located RaBitQ codes + batch asymmetric distance on k-NN graph");
|
||||
println!();
|
||||
|
||||
if fast {
|
||||
println!("[fast mode: n≤1K]");
|
||||
run_suite(1_000, 128, 50, 200, true);
|
||||
} else {
|
||||
run_suite(1_000, 128, 50, 200, false);
|
||||
run_suite(2_000, 128, 80, 300, false);
|
||||
run_suite(5_000, 128, 100, 500, false);
|
||||
}
|
||||
|
||||
println!("Done.");
|
||||
}
|
||||
87
crates/ruvector-symphony-qg/src/rotation.rs
Normal file
87
crates/ruvector-symphony-qg/src/rotation.rs
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
//! Random orthogonal rotation via Gram-Schmidt on a Gaussian matrix.
|
||||
//!
|
||||
//! We generate a D×D random normal matrix and orthogonalise it column-by-column
|
||||
//! using the modified Gram-Schmidt process. The result is a true orthogonal
|
||||
//! matrix (not merely random projections), matching the RaBitQ rotation
|
||||
//! construction used in SymphonyQG.
|
||||
//!
|
||||
//! For PoC scale (D ≤ 256) this is fast. Production would cache the matrix.
|
||||
|
||||
use rand::SeedableRng;
|
||||
use rand_distr::{Distribution, Normal};
|
||||
|
||||
/// Generates a D×D orthogonal rotation matrix with a fixed seed.
|
||||
/// Stored in row-major order: entry (i,j) = matrix[i*dim + j].
|
||||
pub fn random_orthogonal(dim: usize, seed: u64) -> Vec<f32> {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
||||
let normal = Normal::new(0.0f64, 1.0).unwrap();
|
||||
|
||||
// Sample D × D Gaussian matrix stored as columns for GSO
|
||||
let mut cols: Vec<Vec<f64>> = (0..dim)
|
||||
.map(|_| (0..dim).map(|_| normal.sample(&mut rng)).collect())
|
||||
.collect();
|
||||
|
||||
// Modified Gram-Schmidt orthogonalisation
|
||||
for j in 0..dim {
|
||||
// Normalise column j
|
||||
let norm = cols[j].iter().map(|x| x * x).sum::<f64>().sqrt();
|
||||
if norm < 1e-12 {
|
||||
// Degenerate column — replace with a standard basis vector
|
||||
cols[j] = vec![0.0; dim];
|
||||
cols[j][j] = 1.0;
|
||||
} else {
|
||||
for x in cols[j].iter_mut() {
|
||||
*x /= norm;
|
||||
}
|
||||
}
|
||||
// Project out column j from all subsequent columns
|
||||
let cj = cols[j].clone();
|
||||
for k in (j + 1)..dim {
|
||||
let dot: f64 = cols[k].iter().zip(cj.iter()).map(|(a, b)| a * b).sum();
|
||||
for (ck, cj_val) in cols[k].iter_mut().zip(cj.iter()) {
|
||||
*ck -= dot * cj_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Transpose: result[i][j] = cols[j][i], stored row-major so R[i,j] = result[i*dim+j]
|
||||
let mut matrix = vec![0.0f32; dim * dim];
|
||||
for i in 0..dim {
|
||||
for j in 0..dim {
|
||||
matrix[i * dim + j] = cols[j][i] as f32;
|
||||
}
|
||||
}
|
||||
matrix
|
||||
}
|
||||
|
||||
/// Apply rotation: y = R × x, result length = dim.
|
||||
#[inline]
|
||||
pub fn rotate(matrix: &[f32], x: &[f32], dim: usize) -> Vec<f32> {
|
||||
let mut y = vec![0.0f32; dim];
|
||||
for i in 0..dim {
|
||||
let row = &matrix[i * dim..(i + 1) * dim];
|
||||
y[i] = row.iter().zip(x.iter()).map(|(r, v)| r * v).sum();
|
||||
}
|
||||
y
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn orthogonality() {
|
||||
let dim = 8;
|
||||
let r = random_orthogonal(dim, 42);
|
||||
// Check R × Rᵀ ≈ I
|
||||
for i in 0..dim {
|
||||
for j in 0..dim {
|
||||
let dot: f32 = (0..dim)
|
||||
.map(|k| r[i * dim + k] * r[j * dim + k])
|
||||
.sum();
|
||||
let expected = if i == j { 1.0 } else { 0.0 };
|
||||
assert!((dot - expected).abs() < 1e-5, "R×Rᵀ[{i},{j}] = {dot}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
233
crates/ruvector-symphony-qg/src/search.rs
Normal file
233
crates/ruvector-symphony-qg/src/search.rs
Normal file
|
|
@ -0,0 +1,233 @@
|
|||
//! Beam search over the SymphonyQG graph.
|
||||
//!
|
||||
//! ## Algorithm
|
||||
//!
|
||||
//! Standard greedy beam search (à la HNSW layer-0 / NSG) with two modes:
|
||||
//!
|
||||
//! **Exact mode** (used in `GraphExact` index):
|
||||
//! Each candidate's neighbors are scored with exact L2 distance.
|
||||
//! Baseline for measuring quantization overhead.
|
||||
//!
|
||||
//! **Symphony mode** (used in `SymphonyIndex`):
|
||||
//! Neighbor distances are estimated using the co-located RaBitQ codes
|
||||
//! via `batch_asym_l2`. Only the *current candidate* (already in the
|
||||
//! beam set) requires an exact distance; all R neighbors are scored by
|
||||
//! the asymmetric estimator without any random memory reads.
|
||||
//!
|
||||
//! ## Termination
|
||||
//!
|
||||
//! The beam set is a max-heap of size `ef`. Expansion stops when the
|
||||
//! best unvisited candidate's estimated distance exceeds the worst
|
||||
//! distance in the result heap. This is the standard HNSW termination
|
||||
//! criterion; in SymphonyQG it is safe because the RaBitQ estimator is
|
||||
//! an unbiased approximation with bounded variance.
|
||||
|
||||
use std::collections::{BinaryHeap, HashSet};
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::codes::{batch_asym_l2, QueryProjection};
|
||||
use crate::graph::{l2_sq, SymphonyGraph};
|
||||
|
||||
/// (distance, id) ordered as a min-heap entry (Rust's BinaryHeap is max-heap,
|
||||
/// so we negate the distance comparison).
|
||||
#[derive(Clone)]
|
||||
struct HeapEntry {
|
||||
neg_dist: f32, // stored negated for max-heap inversion
|
||||
id: usize,
|
||||
}
|
||||
|
||||
impl PartialEq for HeapEntry {
|
||||
fn eq(&self, other: &Self) -> bool { self.neg_dist == other.neg_dist }
|
||||
}
|
||||
impl Eq for HeapEntry {}
|
||||
impl PartialOrd for HeapEntry {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { Some(self.cmp(other)) }
|
||||
}
|
||||
impl Ord for HeapEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.neg_dist.partial_cmp(&other.neg_dist).unwrap_or(Ordering::Equal)
|
||||
}
|
||||
}
|
||||
|
||||
fn random_entry_points(n: usize, count: usize, seed: u64) -> Vec<usize> {
|
||||
// Pseudo-random starting points spread across the graph
|
||||
let step = n / count.max(1);
|
||||
(0..count).map(|i| (i * step + seed as usize) % n).collect()
|
||||
}
|
||||
|
||||
/// Beam search with exact L2 distances (no quantization).
|
||||
pub fn beam_search_exact(
|
||||
graph: &SymphonyGraph,
|
||||
query: &[f32],
|
||||
k: usize,
|
||||
ef: usize,
|
||||
n_starts: usize,
|
||||
) -> Vec<(f32, usize)> {
|
||||
let n = graph.vertices.len();
|
||||
if n == 0 { return vec![]; }
|
||||
|
||||
let mut visited = HashSet::new();
|
||||
// candidates: min-heap by distance (we negate to use BinaryHeap as min-heap)
|
||||
let mut candidates: BinaryHeap<HeapEntry> = BinaryHeap::new();
|
||||
// results: max-heap of size ef (for top-k extraction)
|
||||
let mut results: BinaryHeap<HeapEntry> = BinaryHeap::new();
|
||||
|
||||
let entries = random_entry_points(n, n_starts, 0);
|
||||
for ep in entries {
|
||||
if visited.contains(&ep) { continue; }
|
||||
let d = l2_sq(query, &graph.vertices[ep].raw);
|
||||
candidates.push(HeapEntry { neg_dist: -d, id: ep });
|
||||
}
|
||||
|
||||
while let Some(HeapEntry { neg_dist, id }) = candidates.pop() {
|
||||
let dist = -neg_dist;
|
||||
if visited.contains(&id) { continue; }
|
||||
visited.insert(id);
|
||||
|
||||
// Prune: if the result set is full and current dist > worst result, stop
|
||||
if results.len() >= ef {
|
||||
if let Some(worst) = results.peek() {
|
||||
if dist > -worst.neg_dist { break; }
|
||||
}
|
||||
}
|
||||
|
||||
results.push(HeapEntry { neg_dist: -dist, id });
|
||||
if results.len() > ef {
|
||||
results.pop(); // remove the farthest
|
||||
}
|
||||
|
||||
// Expand neighbors with exact distances
|
||||
let v = &graph.vertices[id];
|
||||
for &nid in &v.neighbor_ids {
|
||||
let nid = nid as usize;
|
||||
if !visited.contains(&nid) {
|
||||
let nd = l2_sq(query, &graph.vertices[nid].raw);
|
||||
candidates.push(HeapEntry { neg_dist: -nd, id: nid });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut out: Vec<(f32, usize)> = results
|
||||
.into_iter()
|
||||
.map(|e| (-e.neg_dist, e.id))
|
||||
.collect();
|
||||
out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
out.truncate(k);
|
||||
out
|
||||
}
|
||||
|
||||
/// Beam search with asymmetric RaBitQ distance estimates on co-located codes.
|
||||
/// Exact distance is only computed for the current node (already in beam).
|
||||
pub fn beam_search_symphony(
|
||||
graph: &SymphonyGraph,
|
||||
query: &[f32],
|
||||
k: usize,
|
||||
ef: usize,
|
||||
n_starts: usize,
|
||||
) -> Vec<(f32, usize)> {
|
||||
let n = graph.vertices.len();
|
||||
if n == 0 { return vec![]; }
|
||||
|
||||
let dim = graph.config.dim;
|
||||
let q_rot = crate::rotation::rotate(&graph.rotation, query, dim);
|
||||
let norm_q_sq = query.iter().map(|v| v * v).sum::<f32>();
|
||||
let qp = QueryProjection::new(q_rot);
|
||||
|
||||
let mut visited = HashSet::new();
|
||||
let mut candidates: BinaryHeap<HeapEntry> = BinaryHeap::new();
|
||||
let mut results: BinaryHeap<HeapEntry> = BinaryHeap::new();
|
||||
|
||||
let entries = random_entry_points(n, n_starts, 0);
|
||||
for ep in entries {
|
||||
if visited.contains(&ep) { continue; }
|
||||
// Entry points: use exact distance for the seed (no codes available without neighbor context)
|
||||
let d = l2_sq(query, &graph.vertices[ep].raw);
|
||||
candidates.push(HeapEntry { neg_dist: -d, id: ep });
|
||||
}
|
||||
|
||||
while let Some(HeapEntry { neg_dist, id }) = candidates.pop() {
|
||||
let dist = -neg_dist;
|
||||
if visited.contains(&id) { continue; }
|
||||
visited.insert(id);
|
||||
|
||||
if results.len() >= ef {
|
||||
if let Some(worst) = results.peek() {
|
||||
if dist > -worst.neg_dist { break; }
|
||||
}
|
||||
}
|
||||
|
||||
results.push(HeapEntry { neg_dist: -dist, id });
|
||||
if results.len() > ef {
|
||||
results.pop();
|
||||
}
|
||||
|
||||
// Batch estimate distances for all R neighbors using co-located codes
|
||||
let v = &graph.vertices[id];
|
||||
let r = v.neighbor_ids.len();
|
||||
if r == 0 { continue; }
|
||||
|
||||
let est_dists = batch_asym_l2(&qp, &v.neighbor_codes, &v.neighbor_norms, norm_q_sq);
|
||||
|
||||
for (slot, &nid) in v.neighbor_ids.iter().enumerate() {
|
||||
let nid = nid as usize;
|
||||
if !visited.contains(&nid) {
|
||||
candidates.push(HeapEntry { neg_dist: -est_dists[slot], id: nid });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Final step: retrieve exact distances for the top ef candidates in results
|
||||
// This is the "re-rank-free" design: the beam already converged well enough
|
||||
// that we return the exact distances for the top-k within the result set.
|
||||
let mut out: Vec<(f32, usize)> = results
|
||||
.into_iter()
|
||||
.map(|e| {
|
||||
let id = e.id;
|
||||
let exact = l2_sq(query, &graph.vertices[id].raw);
|
||||
(exact, id)
|
||||
})
|
||||
.collect();
|
||||
out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
out.truncate(k);
|
||||
out
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::{graph::{GraphConfig, SymphonyGraph}, rotation::random_orthogonal};
|
||||
|
||||
fn tiny_graph(n: usize, dim: usize) -> SymphonyGraph {
|
||||
let vecs: Vec<Vec<f32>> = (0..n)
|
||||
.map(|i| (0..dim).map(|j| (i * dim + j) as f32 * 0.01).collect())
|
||||
.collect();
|
||||
let rot = random_orthogonal(dim, 42);
|
||||
let cfg = GraphConfig::new(dim).with_r(4).with_ef(8);
|
||||
SymphonyGraph::build(&vecs, cfg, &rot)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn exact_returns_nearest() {
|
||||
let dim = 8;
|
||||
let graph = tiny_graph(16, dim);
|
||||
let query: Vec<f32> = (0..dim).map(|j| 0.0 * j as f32).collect();
|
||||
let results = beam_search_exact(&graph, &query, 1, 8, 4);
|
||||
assert!(!results.is_empty());
|
||||
// Nearest should be vertex 0 (all zeros for i=0 case)
|
||||
assert_eq!(results[0].1, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn symphony_finds_reasonable_neighbours() {
|
||||
let dim = 16;
|
||||
let n = 50;
|
||||
let graph = tiny_graph(n, dim);
|
||||
let query: Vec<f32> = vec![0.0; dim];
|
||||
let res_exact = beam_search_exact(&graph, &query, 5, 20, 4);
|
||||
let res_sym = beam_search_symphony(&graph, &query, 5, 20, 4);
|
||||
// At least 3 of top-5 should overlap between exact and symphony
|
||||
let exact_ids: HashSet<usize> = res_exact.iter().map(|(_, id)| *id).collect();
|
||||
let overlap = res_sym.iter().filter(|(_, id)| exact_ids.contains(id)).count();
|
||||
assert!(overlap >= 2, "too little overlap: {overlap}/5");
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue