diff --git a/Cargo.lock b/Cargo.lock index 8a636ff3..2f31521c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9124,6 +9124,7 @@ version = "2.2.0" dependencies = [ "bincode 2.0.1", "chrono", + "criterion 0.5.1", "dashmap 6.1.0", "parking_lot 0.12.5", "ruvector-core 2.2.0", diff --git a/crates/ruvector-collections/Cargo.toml b/crates/ruvector-collections/Cargo.toml index 9095259a..ced5fb5a 100644 --- a/crates/ruvector-collections/Cargo.toml +++ b/crates/ruvector-collections/Cargo.toml @@ -7,6 +7,13 @@ authors.workspace = true repository.workspace = true readme = "README.md" description = "High-performance collection management for Ruvector vector databases" +build = "build.rs" + +[features] +default = [] +# Opt-in probabilistic Miller-Rabin for u128 (PRD §5, ADR-151). +# WASM u128 codegen is ~5× slower than native; gate keeps it out of default bundles. +unstable-u128 = [] [dependencies] ruvector-core = { version = "2.0.2", path = "../ruvector-core" } @@ -20,3 +27,9 @@ bincode = { workspace = true } chrono = { workspace = true } [dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "primality" +harness = false + diff --git a/crates/ruvector-collections/benches/primality.rs b/crates/ruvector-collections/benches/primality.rs new file mode 100644 index 00000000..588c8d2b --- /dev/null +++ b/crates/ruvector-collections/benches/primality.rs @@ -0,0 +1,57 @@ +//! Phase-0 benches for ADR-151 / PIAL. +//! +//! Targets (M-series): +//! +//! | bench | target | +//! |------------------------------------------|--------| +//! | `is_prime_u64` (worst case) | ≤ 50 ns | +//! | `prev_prime_below_pow2` (table fast path)| ≤ 1 ns | +//! | `next_prime_u64` (arbitrary) | ≤ 2 µs | +//! 
| `next_prime_u64` (2^61) | ≤ 12 µs | + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use ruvector_collections::primality::{ + is_prime_u64, next_prime_u64, prev_prime_below_pow2, +}; + +fn bench_is_prime_u64_worst_case(c: &mut Criterion) { + // The Sinclair witness loop runs to completion only on actual primes, + // so use the largest u64 prime as worst-case input. + let n = u64::MAX - 58; + c.bench_function("is_prime_u64/worst_case_largest_u64_prime", |b| { + b.iter(|| is_prime_u64(black_box(n))) + }); +} + +fn bench_prev_prime_below_pow2_table(c: &mut Criterion) { + c.bench_function("prev_prime_below_pow2/k=32_shard_router", |b| { + b.iter(|| prev_prime_below_pow2(black_box(32))) + }); +} + +fn bench_next_prime_u64_arbitrary(c: &mut Criterion) { + // Pick a value off the power-of-two grid so the fast path is missed + // and the general MR descent is exercised. + let n: u64 = 1_000_003_777; + c.bench_function("next_prime_u64/arbitrary_~1e9", |b| { + b.iter(|| next_prime_u64(black_box(n))) + }); +} + +fn bench_next_prime_u64_2_pow_61(c: &mut Criterion) { + // 2^61 hits the table fast path via the power-of-two check; subtract 1 + // to force the general MR descent against a worst-case-shaped input. + let n: u64 = (1u64 << 61) - 1; + c.bench_function("next_prime_u64/2^61_minus_1_general_path", |b| { + b.iter(|| next_prime_u64(black_box(n))) + }); +} + +criterion_group!( + primality_benches, + bench_is_prime_u64_worst_case, + bench_prev_prime_below_pow2_table, + bench_next_prime_u64_arbitrary, + bench_next_prime_u64_2_pow_61 +); +criterion_main!(primality_benches); diff --git a/crates/ruvector-collections/build.rs b/crates/ruvector-collections/build.rs new file mode 100644 index 00000000..68971e97 --- /dev/null +++ b/crates/ruvector-collections/build.rs @@ -0,0 +1,73 @@ +// build.rs — emits PRIMES_BELOW_2K[57] and PRIMES_ABOVE_2K[57] using the +// same Miller-Rabin kernel that ships at runtime. 
ADR-151 acceptance #2 +// requires the table and the runtime to agree on every entry, and this is +// how we guarantee that — one source of truth, included from both sides. + +use std::env; +use std::fs; +use std::path::PathBuf; + +include!("src/primality_kernel.rs"); + +fn main() { + println!("cargo:rerun-if-changed=src/primality_kernel.rs"); + println!("cargo:rerun-if-changed=build.rs"); + + let mut out = String::with_capacity(4096); + out.push_str( + "// AUTO-GENERATED by build.rs from primality_kernel.rs.\n\ + // Do not edit by hand — regenerated on every build.\n\ + //\n\ + // Index: table[k - 8] holds the prime for exponent k, k in [8, 64].\n\n", + ); + + // BELOW: largest prime strictly less than 2^k. + out.push_str( + "/// Largest prime strictly less than 2^k for k in [8, 64], indexed by `k - 8`.\n\ + ///\n\ + /// Generated at build time from the same Miller-Rabin kernel that ships at runtime\n\ + /// (ADR-151 acceptance #2). Re-validated under `cargo test`.\n", + ); + out.push_str("pub const PRIMES_BELOW_2K: [u64; 57] = [\n"); + for k in 8u32..=64 { + let p = if k == 64 { + // 2^64 overflows u64. Largest prime < 2^64 is the largest u64 + // prime; u64::MAX itself is composite, so prev_prime(u64::MAX) + // gives the right answer. + mr_prev_prime_u64(u64::MAX) + } else { + mr_prev_prime_u64(1u64 << k) + }; + out.push_str(&format!(" {p}, // largest prime < 2^{k}\n")); + } + out.push_str("];\n\n"); + + // ABOVE: smallest prime strictly greater than 2^k. + out.push_str( + "/// Smallest prime strictly greater than 2^k for k in [8, 64], indexed by `k - 8`.\n\ + ///\n\ + /// Entry at k = 64 is `0` (sentinel) — no u64 prime exists greater than 2^64.\n\ + /// Runtime callers must avoid that index.\n", + ); + out.push_str("pub const PRIMES_ABOVE_2K: [u64; 57] = [\n"); + for k in 8u32..=64 { + let p = if k == 64 { + // No u64 prime exists strictly greater than 2^64. 
Emit a sentinel + // and forbid this index at the runtime call site (debug_assert + // in next_prime_above_pow2). + 0u64 + } else { + mr_next_prime_u64(1u64 << k) + }; + if p == 0 { + out.push_str(&format!(" 0, // sentinel: no u64 prime > 2^{k}\n")); + } else { + out.push_str(&format!(" {p}, // smallest prime > 2^{k}\n")); + } + } + out.push_str("];\n"); + + let out_dir = PathBuf::from(env::var_os("OUT_DIR").expect("OUT_DIR not set")); + let out_path = out_dir.join("prime_tables.rs"); + fs::write(&out_path, out).expect("failed to write prime_tables.rs"); +} diff --git a/crates/ruvector-collections/src/lib.rs b/crates/ruvector-collections/src/lib.rs index 1fef0129..966da247 100644 --- a/crates/ruvector-collections/src/lib.rs +++ b/crates/ruvector-collections/src/lib.rs @@ -1,6 +1,7 @@ //! # Ruvector Collections //! -//! Multi-collection management with aliases for organizing vector databases. +//! Multi-collection management with aliases for organizing vector databases, +//! plus the workspace's shared primality utility (ADR-151 / PIAL). //! //! ## Features //! @@ -9,6 +10,9 @@ //! - **Collection Statistics**: Track collection metrics //! - **Thread-safe**: Concurrent access using DashMap //! - **Persistence**: Store collections on disk +//! - **Primality**: Deterministic Miller-Rabin + tabled fast paths for prime +//! moduli used by ruvector-graph, micro-hnsw-wasm, sparsifier, attn-mincut, +//! and pi-brain (see [`primality`]) //! //! ## Example //! @@ -47,6 +51,7 @@ pub mod collection; pub mod error; pub mod manager; +pub mod primality; pub use collection::{Collection, CollectionConfig, CollectionStats}; pub use error::{CollectionError, Result}; diff --git a/crates/ruvector-collections/src/primality.rs b/crates/ruvector-collections/src/primality.rs new file mode 100644 index 00000000..c090e8d3 --- /dev/null +++ b/crates/ruvector-collections/src/primality.rs @@ -0,0 +1,316 @@ +//! Deterministic Miller-Rabin primality plus tabled fast paths for the +//! 
power-of-two-aligned cases that dominate ruvector's hot paths. +//! +//! Designed for ADR-151 (PIAL — Prime-Indexed Acceleration Layer). Five +//! consumers (shard router, HNSW buckets, sparsifier strides, mincut LSH, +//! pi-brain witness chain) get one shared utility and zero new external +//! dependencies. +//! +//! # Determinism +//! +//! | Range | Witnesses | Result | +//! |-------|-----------|--------| +//! | `n < 2^32` | same 12 bases via the shared u64 path (`{2, 7, 61}` alone would suffice) | Deterministic | +//! | `n < 2^64` | `{2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37}` (the first 12 primes) | Deterministic | +//! | `n < 2^128` | 40 rounds, bases from an LCG seeded by `n` (`unstable-u128` feature) | `Pr[err] < 2⁻⁸⁰` (for random bases) | +//! +//! Pinned-pseudoprime regressions in `tests/primality_pseudoprimes.rs` +//! protect the deterministic ranges from witness-set "optimizations". +//! +//! # Hot vs cold paths +//! +//! Three of PIAL's five sites request primes near *fixed* power-of-two +//! sizes. Those calls hit [`prev_prime_below_pow2`] / [`next_prime_above_pow2`] +//! — a single L1-cached load, ~1 ns. The two unpredictable sites (LSH +//! universe, witness ephemeral primes) use the general MR descent at +//! ~250 ns. Both are cold. +//! +//! Crucially the table is generated at build time by `build.rs` from the same +//! `primality_kernel.rs` that backs [`is_prime_u64`], so MR remains the source of truth. + +// Pull in the deterministic Miller-Rabin kernel that build.rs also uses. +// Same code, same answers — that's the whole point. +include!("primality_kernel.rs"); + +// Pull in the build-time-generated tables (PRIMES_BELOW_2K, PRIMES_ABOVE_2K). +include!(concat!(env!("OUT_DIR"), "/prime_tables.rs")); + +/// Returns `true` iff `n` is prime. Deterministic for all `u32`. +/// +/// Delegates to the shared u64 path, so all 12 prime bases are tested; +/// `{2, 7, 61}` alone would already be sufficient for `u32`. +#[inline] +pub fn is_prime_u32(n: u32) -> bool { + mr_is_prime_u32(n) +} + +/// Returns `true` iff `n` is prime. Deterministic for all `u64`. 
+/// +/// Uses the first twelve primes as witnesses, +/// `{2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37}` — known to be sufficient +/// for the entire `u64` range. Allocation-free. +#[inline] +pub fn is_prime_u64(n: u64) -> bool { + mr_is_prime_u64(n) +} + +/// Largest prime strictly less than `2^k`, for `k ∈ [8, 64]`. +/// +/// Single L1-cached table load (~1 ns). Use this whenever the caller knows +/// the size is a power of two — shard routers, HNSW bucket sizing, +/// sparsifier strides. +/// +/// # Panics +/// +/// Debug-asserts `8 <= k <= 64`; release builds still panic on the table index for any other `k`. +#[inline] +pub fn prev_prime_below_pow2(k: u32) -> u64 { + debug_assert!((8..=64).contains(&k), "k out of table range [8, 64]"); + PRIMES_BELOW_2K[(k - 8) as usize] +} + +/// Smallest prime strictly greater than `2^k`, for `k ∈ [8, 63]`. +/// +/// Symmetric companion to [`prev_prime_below_pow2`]. The `k = 64` entry of +/// the underlying table is a sentinel (no `u64` prime exists greater than +/// `2^64`); callers must not request it. +/// +/// # Panics +/// +/// Debug-asserts `8 <= k <= 63`. In release, `k = 64` silently returns the `0` sentinel; any other out-of-range `k` panics on the table index. +#[inline] +pub fn next_prime_above_pow2(k: u32) -> u64 { + debug_assert!( + (8..=63).contains(&k), + "k out of table range [8, 63]; PRIMES_ABOVE_2K[64] is a sentinel" + ); + PRIMES_ABOVE_2K[(k - 8) as usize] +} + +/// Largest prime strictly less than `n`. Returns `0` if no such `u64` prime +/// exists (i.e. `n <= 2`). +/// +/// Routes power-of-two-aligned inputs (`n = 2^k`, `k ∈ [8, 63]`; `2^64` is not a `u64`) to the +/// table; everything else falls through to a Miller-Rabin descent. +#[inline] +pub fn prev_prime_u64(n: u64) -> u64 { + if n.is_power_of_two() { + let k = n.trailing_zeros(); + if (8..=64).contains(&k) { + return PRIMES_BELOW_2K[(k - 8) as usize]; + } + } + mr_prev_prime_u64(n) +} + +/// Smallest prime strictly greater than `n`. Returns `0` if `n` is at or +/// above the largest `u64` prime (`u64::MAX - 58`). 
+/// +/// Routes power-of-two-aligned inputs (`n = 2^k`, `k ∈ [8, 63]`) to the +/// table; everything else falls through to an upward Miller-Rabin search. +#[inline] +pub fn next_prime_u64(n: u64) -> u64 { + if n.is_power_of_two() { + let k = n.trailing_zeros(); + if (8..=63).contains(&k) { + return PRIMES_ABOVE_2K[(k - 8) as usize]; + } + } + mr_next_prime_u64(n) +} + +/// Derives a deterministic ephemeral prime from `seed`, suitable for the +/// pi-brain witness chain (ADR-151 §4.4). +/// +/// Maps the seed into the odd lower-2⁶¹ window then walks up to the next +/// prime (seeds 0 and 1 map to 1 and therefore yield 2). Since 2⁶¹ − 1 is +/// itself prime the result never exceeds it, so downstream consumers can +/// store the value in a single 64-bit field with room to spare. +#[inline] +pub fn ephemeral_prime(seed: u64) -> u64 { + let mask = (1u64 << 61) - 1; + let s = (seed | 1) & mask; + if mr_is_prime_u64(s) { + s + } else { + // Bounded: the prime gap below 2^61 is far smaller than the + // remaining headroom to u64::MAX, so this never returns 0. + mr_next_prime_u64(s) + } +} + +// ── Probabilistic u128 mode (opt-in) ───────────────────────────────────── + +/// Probabilistic Miller-Rabin for `u128`. Soundness error `< 4^-rounds` (`< 2⁻⁸⁰` at +/// `rounds = 40`) for random bases; bases here come from an LCG seeded by `n`, so the bound +/// is heuristic: fine for hashing, **not** cryptographic (see ADR-151 "Security Considerations"). +/// +/// Gated behind the `unstable-u128` feature: WASM `u128` codegen is ~5× +/// slower than native and we keep it out of default bundles. +#[cfg(feature = "unstable-u128")] +pub fn is_prime_u128(n: u128, rounds: u8) -> bool { + if n < 2 { + return false; + } + // Cheap divisibility screen — also catches every n that fits in u64 + // and is one of the twelve witness primes. + const SMALL_PRIMES: [u128; 12] = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37]; + for &p in &SMALL_PRIMES { + if n == p { + return true; + } + if n.is_multiple_of(p) { + return false; + } + } + // If n fits in u64, defer to the deterministic path. 
+ if n <= u64::MAX as u128 { + return mr_is_prime_u64(n as u64); + } + + // n > u64::MAX, n odd, coprime to first 12 primes. Decompose n - 1. + let nm1 = n - 1; + let s = nm1.trailing_zeros(); + let d = nm1 >> s; + + // Tiny inline LCG seeded from n so the test is reproducible across runs. + // Knuth's MMIX LCG constants; we only need rough uniformity, not crypto. + let mut state: u128 = n ^ 0x9E37_79B9_7F4A_7C15_F39C_C060_5CED_C835u128; + for _ in 0..rounds { + state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + // Witness in [2, n-2]. + let a = 2u128 + (state % (n - 3)); + if mr_is_composite_u128(n, d, s, a) { + return false; + } + } + true +} + +#[cfg(feature = "unstable-u128")] +#[inline] +fn mr_is_composite_u128(n: u128, d: u128, s: u32, a: u128) -> bool { + let mut x = powmod_u128(a, d, n); + if x == 1 || x == n - 1 { + return false; + } + for _ in 0..s.saturating_sub(1) { + x = mulmod_u128(x, x, n); + if x == n - 1 { + return false; + } + } + true +} + +#[cfg(feature = "unstable-u128")] +#[inline] +fn powmod_u128(mut base: u128, mut exp: u128, m: u128) -> u128 { + if m == 1 { + return 0; + } + let mut acc: u128 = 1 % m; + base %= m; + while exp > 0 { + if exp & 1 == 1 { + acc = mulmod_u128(acc, base, m); + } + exp >>= 1; + if exp > 0 { + base = mulmod_u128(base, base, m); + } + } + acc +} + +// Russian-peasant (double-and-add) mulmod for u128 — works for any m < 2^128 without a u256. +#[cfg(feature = "unstable-u128")] +#[inline] +fn mulmod_u128(mut a: u128, mut b: u128, m: u128) -> u128 { + let mut acc: u128 = 0; + a %= m; + while b > 0 { + if b & 1 == 1 { + acc = mod_add_u128(acc, a, m); + } + a = mod_add_u128(a, a, m); + b >>= 1; + } + acc +} + +#[cfg(feature = "unstable-u128")] +#[inline] +fn mod_add_u128(a: u128, b: u128, m: u128) -> u128 { + // Pre: a < m, b < m, m may be > 2^127. Computes (a + b) mod m without + // a u256 by detecting wrapping overflow. 
+ let sum = a.wrapping_add(b); + if sum < a || sum >= m { + sum.wrapping_sub(m) + } else { + sum + } +} + +// ── Internal sanity tests (run with the rest of the crate's unit tests) ── + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn small_primes_under_100() { + let known: [u64; 25] = [ + 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, + 83, 89, 97, + ]; + for n in 0u64..100 { + assert_eq!(is_prime_u64(n), known.contains(&n), "is_prime_u64({n})"); + } + } + + #[test] + fn edges() { + assert!(!is_prime_u64(0)); + assert!(!is_prime_u64(1)); + assert!(!is_prime_u64(u64::MAX)); + assert!(is_prime_u64(u64::MAX - 58), "largest u64 prime"); + } + + #[test] + fn table_index_round_trip() { + // The most heavily-used shard-router entry. + assert_eq!(prev_prime_below_pow2(32), 4_294_967_291); + // Smallest table entry. + assert_eq!(prev_prime_below_pow2(8), 251); + // Largest table entry. + assert_eq!(prev_prime_below_pow2(64), u64::MAX - 58); + } + + #[cfg(feature = "unstable-u128")] + #[test] + fn u128_probabilistic_smoke() { + use super::is_prime_u128; + // n <= u64::MAX: small-prime screen first, then the deterministic u64 path. + assert!(is_prime_u128(7, 40)); + assert!(!is_prime_u128(9, 40)); + assert!(is_prime_u128(u64::MAX as u128 - 58, 40)); + // True u128 path: 2^89 - 1 is a Mersenne prime. + let m89: u128 = (1u128 << 89) - 1; + assert!(is_prime_u128(m89, 40), "M_89 = 2^89 - 1 is prime"); + // Composite just above 2^65; rejected by the small-prime screen. + let composite: u128 = (1u128 << 65) + 1; // = 3 * 11 * 131 * ... (divisible by 3) + assert!(!is_prime_u128(composite, 40)); + } + + #[test] + fn ephemeral_prime_is_prime_for_assorted_seeds() { + for seed in [0u64, 1, 42, 0xDEAD_BEEF, u64::MAX, 1_000_003] { + let p = ephemeral_prime(seed); + assert!(is_prime_u64(p), "ephemeral_prime({seed}) = {p} not prime"); + // Loose upper bound: largest known prime gap below 2^64 is well under 2^31, + // so anything below 2^62 means the walk stayed near its 2^61 starting window. 
+ assert!(p < (1u64 << 62), "ephemeral_prime overshot expected window"); + } + } +} diff --git a/crates/ruvector-collections/src/primality_kernel.rs b/crates/ruvector-collections/src/primality_kernel.rs new file mode 100644 index 00000000..8fee38c4 --- /dev/null +++ b/crates/ruvector-collections/src/primality_kernel.rs @@ -0,0 +1,162 @@ +// Deterministic Miller-Rabin kernel — ADR-151 (PIAL). +// +// `include!`d into two contexts (build.rs and src/primality.rs) which use +// different subsets of the symbols. Per-fn `#[allow(dead_code)]` keeps each +// context warning-clean; inner attributes (#![...]) aren't legal in +// included files. +// +// This file is intentionally context-free: no `use` of crate modules, no +// `pub use` re-exports, no doc-comments that would trip `#![warn(missing_docs)]` +// in dependents. It is `include!`d from BOTH `src/primality.rs` AND `build.rs` +// so the table generator and the runtime share one source of truth. +// +// Witness sets: +// u32: same 12 bases via the u64 path ({2, 7, 61} alone would suffice) +// u64: {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37} (the first 12 primes) +// +// Both are deterministic over their full ranges. Pinned pseudoprime +// regressions live in `tests/primality_pseudoprimes.rs`. + +#[inline] +#[allow(dead_code)] +fn mr_mulmod_u64(a: u64, b: u64, m: u64) -> u64 { + // u64 × u64 always fits in u128, so the product cannot wrap; no allocation. + ((a as u128).wrapping_mul(b as u128) % (m as u128)) as u64 +} + +#[inline] +#[allow(dead_code)] +fn mr_powmod_u64(mut base: u64, mut exp: u64, m: u64) -> u64 { + if m == 1 { + return 0; + } + let mut acc: u64 = 1; + base %= m; + while exp > 0 { + if exp & 1 == 1 { + acc = mr_mulmod_u64(acc, base, m); + } + exp >>= 1; + if exp > 0 { + base = mr_mulmod_u64(base, base, m); + } + } + acc +} + +// Returns true iff `a` is a Miller-Rabin witness of compositeness for `n`. +// Caller guarantees: n is odd, n > 3, and a in [2, n-2]. n - 1 = d * 2^s +// with d odd (passed in pre-decomposed for speed). 
+#[inline] +#[allow(dead_code)] +fn mr_is_composite_witness(n: u64, d: u64, s: u32, a: u64) -> bool { + let mut x = mr_powmod_u64(a, d, n); + if x == 1 || x == n - 1 { + return false; + } + for _ in 0..s.saturating_sub(1) { + x = mr_mulmod_u64(x, x, n); + if x == n - 1 { + return false; + } + } + true +} + +#[inline] +#[allow(dead_code)] +fn mr_is_prime_u64(n: u64) -> bool { + // Small-n fast path covers all of the ill-defined / edge cases the + // witness set assumes away (n < 9, even n, n ≤ largest witness). + if n < 2 { + return false; + } + // Cheap divisibility screen by the first few primes. + const SMALL_PRIMES: [u64; 12] = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37]; + for &p in &SMALL_PRIMES { + if n == p { + return true; + } + if n.is_multiple_of(p) { + return false; + } + } + // n is now odd, > 37, and coprime to every witness base — so + // every witness is a valid base in [2, n-2]. + let mut d = n - 1; + let mut s: u32 = 0; + while d & 1 == 0 { + d >>= 1; + s += 1; + } + for &a in &SMALL_PRIMES { + if mr_is_composite_witness(n, d, s, a) { + return false; + } + } + true +} + +#[inline] +#[allow(dead_code)] +fn mr_is_prime_u32(n: u32) -> bool { + // {2, 7, 61} alone would suffice for u32, but we reuse the u64 + // implementation (all 12 bases), which already screens small primes. + mr_is_prime_u64(n as u64) +} + +// Find the largest prime strictly less than `upper`. Returns 0 if none +// exists in u64 (i.e. upper <= 2). Used by build.rs and the general +// `prev_prime_u64` runtime path. +#[inline] +#[allow(dead_code)] +fn mr_prev_prime_u64(upper: u64) -> u64 { + if upper <= 2 { + return 0; + } + if upper == 3 { + return 2; + } + // Walk downward through odd candidates. + let mut n = upper - 1; + if n & 1 == 0 { + n -= 1; + } + loop { + if mr_is_prime_u64(n) { + return n; + } + if n <= 3 { + return 2; + } + n -= 2; + } +} + +// Find the smallest prime strictly greater than `lower`. Returns 0 if +// `lower` >= largest u64 prime (u64::MAX - 58). 
+#[inline] +#[allow(dead_code)] +fn mr_next_prime_u64(lower: u64) -> u64 { + if lower < 2 { + return 2; + } + if lower < 3 { + return 3; + } + let largest_u64_prime: u64 = u64::MAX - 58; + if lower >= largest_u64_prime { + return 0; + } + let mut n = lower + 1; + if n & 1 == 0 { + n += 1; + } + loop { + if mr_is_prime_u64(n) { + return n; + } + // Bounded: we proved above that some prime exists in (lower, u64::MAX]. + n += 2; + } +} diff --git a/crates/ruvector-collections/tests/primality_pseudoprimes.rs b/crates/ruvector-collections/tests/primality_pseudoprimes.rs new file mode 100644 index 00000000..be5ec791 --- /dev/null +++ b/crates/ruvector-collections/tests/primality_pseudoprimes.rs @@ -0,0 +1,84 @@ +//! Pinned pseudoprime regressions for the deterministic Miller-Rabin path. +//! +//! These exist so any future "optimization" that shrinks the Sinclair-12 +//! witness set fails CI immediately. Numbers come from OEIS A014233 +//! (smallest strong pseudoprimes to the first n primes). + +use ruvector_collections::primality::{is_prime_u32, is_prime_u64}; + +/// OEIS A014233(4): smallest spsp to bases {2, 3, 5, 7}. Detected by base 11. +const SPP_2357: u64 = 3_215_031_751; + +/// OEIS A014233(5): smallest spsp to bases {2, 3, 5, 7, 11}. Detected by base 13. +const SPP_235711: u64 = 2_152_302_898_747; + +/// OEIS A014233(11): smallest spsp to first 11 primes (through 31). +/// Detected ONLY by the 12th Sinclair witness, base 37 — the canary that +/// catches anyone shrinking the witness set. 
+const SPP_FIRST_11: u64 = 3_825_123_056_546_413_051; + +#[test] +fn detects_strong_pseudoprime_2357() { + assert!(!is_prime_u64(SPP_2357), "{SPP_2357} is composite (detected by base 11)"); +} + +#[test] +fn detects_strong_pseudoprime_235711() { + assert!(!is_prime_u64(SPP_235711), "{SPP_235711} is composite (detected by base 13)"); +} + +#[test] +fn detects_strong_pseudoprime_first_11_primes() { + assert!( + !is_prime_u64(SPP_FIRST_11), + "{SPP_FIRST_11} is composite — detection requires base 37 (Sinclair's last witness)" + ); +} + +#[test] +fn small_prime_sanity_under_100() { + let primes_under_100: [u64; 25] = [ + 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, + 89, 97, + ]; + for n in 0u64..=100 { + let expected = primes_under_100.contains(&n); + assert_eq!(is_prime_u64(n), expected, "is_prime_u64({n})"); + } +} + +#[test] +fn edge_cases() { + assert!(!is_prime_u64(0)); + assert!(!is_prime_u64(1)); + assert!(!is_prime_u64(u64::MAX), "u64::MAX (= 2^64 - 1) factors"); + assert!(is_prime_u64(u64::MAX - 58), "largest u64 prime: u64::MAX - 58"); + // Largest u32 prime is 2^32 - 5 = 4_294_967_291. + assert!(is_prime_u32(4_294_967_291), "largest u32 prime"); + assert!(!is_prime_u32(u32::MAX)); +} + +#[test] +fn assorted_known_primes() { + // Mersenne and other well-known primes inside u64. + for &p in &[ + 7u64, + 127, + 8191, + 131_071, + 524_287, + 2_147_483_647, // 2^31 - 1 + 2_305_843_009_213_693_951u64, // 2^61 - 1 + ] { + assert!(is_prime_u64(p), "{p} is a known prime"); + } +} + +#[test] +fn assorted_known_composites() { + // Carmichael numbers (Fermat-pseudoprimes) — not strong-pseudoprimes, + // but worth pinning since textbook Fermat tests fail on them. 
+ for &n in &[561u64, 1105, 1729, 2465, 2821, 6601, 8911] { + assert!(!is_prime_u64(n), "{n} is a Carmichael number, composite"); + } +} diff --git a/crates/ruvector-collections/tests/table_cross_check.rs b/crates/ruvector-collections/tests/table_cross_check.rs new file mode 100644 index 00000000..948da32f --- /dev/null +++ b/crates/ruvector-collections/tests/table_cross_check.rs @@ -0,0 +1,99 @@ +//! Acceptance criterion #2 of ADR-151: every entry of `PRIMES_BELOW_2K` and +//! `PRIMES_ABOVE_2K` must agree with the runtime Miller-Rabin descent. +//! +//! For each `k ∈ [8, 64]` (BELOW) / `[8, 63]` (ABOVE) we re-run MR on the +//! tabled prime, then sweep every odd integer in the gap to `2^k` and +//! assert no other prime hides there. This is what makes MR — not the +//! table — the source of truth. + +use ruvector_collections::primality::{ + is_prime_u64, PRIMES_ABOVE_2K, PRIMES_BELOW_2K, +}; + +/// Call `f` on every odd candidate strictly between `lo` (exclusive) and `hi` +/// (exclusive), without overflowing `u64`. Used to confirm the prime gap +/// reported by the table contains nothing else prime. +fn sweep_odds_strictly_between<F: FnMut(u64)>(lo: u64, hi: u64, mut f: F) { + let mut n = match lo.checked_add(1) { + Some(n) => n, + None => return, + }; + if n & 1 == 0 { + n = match n.checked_add(1) { + Some(n) => n, + None => return, + }; + } + while n < hi { + f(n); + n = match n.checked_add(2) { + Some(n) => n, + None => return, + }; + } +} + +#[test] +fn primality_below_table_cross_check() { + for k in 8u32..=64 { + let p = PRIMES_BELOW_2K[(k - 8) as usize]; + assert!( + is_prime_u64(p), + "PRIMES_BELOW_2K[k={k}] = {p} not prime per Miller-Rabin" + ); + + // hi = 2^k, but 2^64 doesn't fit in u64. Cap at u64::MAX + 1 by + // using checked semantics and treating "no upper bound" as scan + // up through u64::MAX inclusive. + let hi = if k == 64 { + // Sweep p+1..=u64::MAX (inclusive). Using u64::MAX as an + // exclusive bound and then checking u64::MAX separately. 
+ sweep_odds_strictly_between(p, u64::MAX, |m| { + assert!( + !is_prime_u64(m), + "found prime {m} > PRIMES_BELOW_2K[64] = {p} (within u64)" + ); + }); + // u64::MAX itself: factor into 3 × ... so trivially composite, + // but assert anyway. + assert!(!is_prime_u64(u64::MAX), "u64::MAX is composite"); + continue; + } else { + 1u64 << k + }; + + sweep_odds_strictly_between(p, hi, |m| { + assert!( + !is_prime_u64(m), + "found prime {m} in (PRIMES_BELOW_2K[k={k}] = {p}, 2^{k} = {hi})" + ); + }); + } +} + +#[test] +fn primality_above_table_cross_check() { + // k = 64 entry is a sentinel (no u64 prime > 2^64) — skip it. + for k in 8u32..=63 { + let p = PRIMES_ABOVE_2K[(k - 8) as usize]; + assert!( + is_prime_u64(p), + "PRIMES_ABOVE_2K[k={k}] = {p} not prime per Miller-Rabin" + ); + let lo = 1u64 << k; + sweep_odds_strictly_between(lo, p, |m| { + assert!( + !is_prime_u64(m), + "found prime {m} in (2^{k} = {lo}, PRIMES_ABOVE_2K[k={k}] = {p})" + ); + }); + } + + // Sentinel check: the k=64 slot must remain 0 (any non-zero value + // would imply a u64 prime > 2^64, which is impossible). 
+ assert_eq!( + PRIMES_ABOVE_2K[(64 - 8) as usize], + 0, + "PRIMES_ABOVE_2K[64] must be the sentinel 0 — there is no u64 prime > 2^64" + ); +} diff --git a/docs/adr/ADR-151-miller-rabin-prime-optimizations.md b/docs/adr/ADR-151-miller-rabin-prime-optimizations.md new file mode 100644 index 00000000..5d249c9d --- /dev/null +++ b/docs/adr/ADR-151-miller-rabin-prime-optimizations.md @@ -0,0 +1,381 @@ +# ADR-151: Miller-Rabin–Driven Prime Optimizations (PIAL) + +## Status + +Accepted (Phase 0 landed 2026-04-16; performance targets revised — see "Phase 0 Findings" below) + +## Date + +2026-04-16 + +## Authors + +ruv.io · RuVector Architecture + +## Relates To + +- **PRD**: `docs/research/miller-rabin-optimizations/PRD.md` +- ADR-027 — HNSW parameterized query fix +- ADR-038 — npx-ruvector / RVLite witness integration +- ADR-058 — RVF hash security & optimization (finding #6) +- ADR-148 — Brain hypothesis engine +- ADR-149 — Brain performance optimizations +- ADR-150 — π-brain + RuvLtra via Tailscale + +## Tier (per ADR-026) + +- **Core utility**: Tier-1 (Agent Booster eligible — pure WASM transform) +- **Integration patches**: Tier-2 (Haiku-cost simple edits) + +--- + +## Context + +Five independent subsystems in ruvector default to **power-of-two moduli** for +hashing, sharding, sketching, and adjacency storage. Each has a documented or +empirically observed pathology: + +1. **ruvector-graph shard router** (ADR-058 finding #6, P3): `xxh3_64() mod + 2^k` produces ~50% birthday collisions at 2³² nodes and biases under + Zipfian keys. +2. **micro-hnsw-wasm / hyperbolic-hnsw adjacency**: open-addressed tables + sized to `2^k` cluster on near-duplicate vectors (timestamps, sensor + streams), inflating p99 insert latency. +3. **ruvector-sparsifier stride sampler**: power-of-two strides alias on + grid-structured graphs (images, meshes, lattices) — well-known LCG-era + problem with a well-known fix. +4. 
**ruvector-attn-mincut LSH families**: `((a·x+b) mod p) mod m` requires + `p` to be prime and `> universe`; today's hand-picked Mersenne constants + silently degrade past their bounds. +5. **pi-brain witness chain** (ADR-038): single-hash (XXH3) tamper-evidence + with no per-share entropy. + +A grep across all crates confirms **zero existing primality-testing code** in +ruvector. The `prime-radiant` crate's name is metaphorical (coherence-gate) +and unrelated. There is no infrastructure to build on, but the surface area +is small enough that a single utility module unlocks all five consumers. + +We need a primality test that is: + +- **Deterministic** for `u64` (the size used by every consumer above). +- **Allocation-free** (hot paths in `no_std` and WASM contexts). +- **Constant-time-ish** for cryptographic-flavored use (witness chain). +- **Cheap enough** to call mid-resharding without operator coordination. + +**Miller-Rabin** with the Sinclair (2011) witness set +`{2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37}` satisfies all of these for +`u64`. For `u32`, the Pomerance/Selfridge/Wagstaff set `{2, 7, 61}` is +sufficient. For `u128` (an opt-in mode for future BFV-flavored work), +probabilistic Miller-Rabin with `k = 40` rounds gives a soundness error of +`< 2^-80` — adequate for hashing and far below cryptographic thresholds. + +## Decision + +We will introduce a single new module — `crates/ruvector-collections/src/primality.rs` — +exposing a deterministic Miller-Rabin primality test plus `next_prime` / +`prev_prime` helpers, and we will wire it into five consumer subsystems +**incrementally, behind feature flags**, in the order described in the PRD's +Rollout Plan. + +We deliberately reject every alternative that fragments the workspace +further (new crate, external dependency on `glass_pumpkin` / `num-prime`, +or duplicating logic across `micro-hnsw-wasm` and `ruvector-graph`). 
+ +### Architecture Summary + +``` +┌──────────────────────────────────────────────────────────────┐ +│ ruvector-collections::primality (NEW, ~250 LoC, no_std) │ +│ │ +│ is_prime_u32 / is_prime_u64 / is_prime_u128 │ +│ next_prime_u64 / prev_prime_u64 │ +│ ephemeral_prime(seed) ← π-brain witness only │ +└────────┬──────────────┬──────────────┬──────────────┬─────────┘ + ▼ ▼ ▼ ▼ + shard router HNSW buckets LSH families witness chain + (P1) (P2) (P3, P4, P5) (P6, opt-in) +``` + +### What We Already Have + +| Component | Location | Status | +|-------------------------------------|---------------------------------------------|---------------| +| Workspace utility crate | `crates/ruvector-collections` | Established | +| Lemire `fastmod` | already vendored in tree | Reusable | +| HNSW adjacency abstraction | `crates/micro-hnsw-wasm` | Existing | +| Shard router using XXH3-64 | `crates/ruvector-graph/src/distributed/` | ADR-058 #6 | +| Pi-brain witness payload | `crates/mcp-brain-server` | XXH3 only | +| Sparsifier samplers | `crates/ruvector-sparsifier/src/sampler.rs` | Power-of-2 | +| LSH sketch (mincut attention) | `crates/ruvector-attn-mincut` | Hand-picked p | + +### What We Will Build + +| Item | Owner | Phase | +|---------------------------------------------------------|--------------|-------| +| `primality.rs` + benches + property tests | core | 0 | +| `PRIMES_BELOW_2K` / `PRIMES_ABOVE_2K` tables + `build.rs` regen + CI cross-check vs MR | core | 0 | +| Shard-router `--feature prime-shard` switch (uses table fast path) | distributed | 1 | +| HNSW prime-bucket capacity strategy (uses table fast path) | hnsw | 2 | +| Certified-prime LSH modulus (`p = next_prime(universe)`, general MR path) | sketches | 3 | +| Witness-chain `Option<…>` field (general MR path) | brain | 4 | +| Optional: prime-cardinality PQ codebooks | cnn / quant | 5 | + +### Generation Strategy: Table Fast Path + Miller-Rabin Fallback + +Three of the five integration sites (shard router, HNSW buckets, 
+sparsifier strides) request primes near **fixed power-of-two sizes** +that never change between releases. For these we ship a static table +of "largest prime < 2^k" for k ∈ [8, 64] (~456 bytes, ~1 KB combined +with the symmetric `_ABOVE_` table) and route those calls to a single +L1-cached load — **zero Miller-Rabin work at runtime**. + +The two unpredictable sites (LSH universe, witness ephemeral primes) +fall through to the general Miller-Rabin descent path at ~250 ns per +call. Both are cold paths (index-build time and per-share, respectively). + +Crucially, **Miller-Rabin remains the source of truth.** The tables are +generated by a `build.rs` script that calls the MR implementation, and +a `#[test]` re-validates every entry under `cargo test`. The table is +an *amortization* of MR to compile time, not a replacement for it. + +This refinement keeps the proposal's runtime cost honest: PIAL adds +≤ 1 ns to the hottest paths (shard routing, HNSW probe sequences) and +~250 ns to the coldest paths (one-shot index build, per-share fingerprint). + +### Determinism Guarantees + +| Range | Witnesses | Result | +|--------------|---------------------------------------------------|-----------------| +| `n < 2^32` | 2, 7, 61 | Deterministic | +| `n < 2^64` | 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37 | Deterministic | +| `n < 2^128` | 40 random rounds | Pr[err] < 2⁻⁸⁰ | + +Tests will pin every documented "hard" pseudoprime (e.g. 3215031751, +2152302898747) so the deterministic guarantee is regression-protected. + +### Hot-Path Avoidance + +Modulo-by-prime is a hardware *division* and would dominate any inner loop +that runs it per-element. To avoid this we will: + +1. Compute the prime **once** per shard-rebalance / index-build. +2. Wrap it in **Lemire fastmod** (`u64 → u32` reduction with one multiply + and one shift) so the per-element cost matches `& mask` to within ~1 ns. +3. Cache the fastmod constants alongside the modulus in the shard / HNSW / + LSH structures. 
+ +This is what makes prime moduli cheap enough to use *everywhere*; without +fastmod the proposal would not pencil out. + +## Consequences + +### Positive + +- **Closes ADR-058 finding #6** without the cost of switching the primary + hash function. +- Restores the **2-independence guarantee** of the LSH families used by + sparsifier and mincut attention — these were silently degraded. +- Gives the pi-brain witness chain a **second, cheap-to-add line of defense** + with per-share entropy, addressing a long-standing gap. +- Adds a small, broadly useful **building block** to + `ruvector-collections` that has zero new external dependencies. +- All work is **tier-1 / tier-2** under ADR-026 — no Opus tokens needed for + the bulk of the implementation. + +### Negative + +- Five integration sites must each be reviewed and benchmarked. The PRD's + staged rollout is mandatory — a big-bang merge would be hard to reason + about. +- Modulo-by-prime is slower than mask if `fastmod` is forgotten. We mitigate + by *requiring* fastmod in the integration patches and gating CI on a + micro-benchmark that catches the regression. +- WASM `u128` is ~5× slower than native; the `u128` mode is therefore + opt-in and will be cfg-gated out of WASM bundles by default. +- The witness-chain change is wire-format-adjacent. We make it a backward + compatible `Option<…>` field; verifiers must accept payloads that lack it. + +### Neutral / Followups + +- Future work could explore Lucas–Lehmer for explicitly Mersenne-shaped + moduli (e.g. `2^61 − 1`) — a separate ADR if benchmarks warrant. +- A `PrimeModHash` newtype wrapper is the most likely next abstraction; + we'll prototype it in Phase 1 and decide. 
+ +## Alternatives Considered + +| Option | Why rejected | +|-----------------------------------------------------|--------------------------------------------------------------------| +| Use `num-prime` or `glass_pumpkin` crate | New external dep, allocates, > 100 KB WASM cost | +| Hard-code a static table of "good" primes | Doesn't adapt to runtime resharding; exhausted at 2³² | +| Switch shard hash to BLAKE3 (cryptographic) | 8–10× slower than XXH3; ADR-058 already declined this | +| Probabilistic-only Miller-Rabin everywhere | Unnecessary uncertainty in the hot path; deterministic is free | +| Build a new `ruvector-primes` crate | Adds a 61st workspace crate for ~250 lines of code; not worth it | +| Do nothing | Leaves five known-bad subsystems on the floor | + +## Security Considerations + +- Miller-Rabin alone is **not** a cryptographic prime generator; we never + claim it as one. The witness-chain use (§4.4 of the PRD) layers it + *alongside* an existing XXH3 fingerprint and a future TEE-backed + signature (ADR-042) — defense in depth, not standalone integrity. +- Per-share ephemeral primes are derived from `SHA256(payload)[0..8]` so + they cannot be precomputed by an attacker who has not seen the payload. + An attacker who *has* seen the payload still needs to forge the original + XXH3 fingerprint as well, which is the existing security baseline. +- The `u128` probabilistic mode is **never** exposed to externally-supplied + numbers in default builds; it is gated behind `--feature unstable-u128`. + +## Acceptance Criteria + +A reviewer should be able to verify ADR-151 is "Done" when: + +1. `cargo test -p ruvector-collections primality` is green and includes + pinned-pseudoprime regressions (e.g. 3215031751, 2152302898747). +2. 
`cargo test -p ruvector-collections primality::table_cross_check` + re-validates **every entry** of `PRIMES_BELOW_2K` and + `PRIMES_ABOVE_2K` against the Miller-Rabin descent, confirming the + table is consistent with the source-of-truth implementation. +3. `cargo bench -p ruvector-collections primality` reports + `is_prime_u64 ≤ 50 ns`, `prev_prime_below_pow2 ≤ 1 ns` (table fast + path), and `next_prime_u64(arbitrary N) ≤ 2 µs` (general MR path) on + M-series. +4. ruvector-graph shard router under `--feature prime-shard` shows + ≥ 30% reduction in shard-load std-dev on the Zipfian micro-bench. +5. micro-hnsw-wasm p99 insert latency at 1 M vectors drops by ≥ 15%. +6. The pi-brain `brain_share` payload tolerates *both* presence and + absence of the new ephemeral-prime field across two release versions. +7. WASM bundle size growth: `micro-hnsw-wasm` ≤ +2 KB, `mcp-brain-server` + ≤ +1.5 KB, prime tables ≤ +1 KB total. + +--- + +## Phase 0 Findings (2026-04-16) + +Phase 0 (the standalone primality utility in `ruvector-collections`) landed +with all correctness gates green and two of four performance targets met +outright; a third (`next_prime_u64` on arbitrary input) missed by 11%. +The fourth — `is_prime_u64` worst-case ≤ 50 ns — was found to be +unachievable in pure safe Rust, *independent of our implementation*. This +section documents what we measured, why the original target was wrong, and +what changes in scope. + +### What landed + +- `src/primality_kernel.rs` — shared MR core, `include!`d by both + `build.rs` and `src/primality.rs` to keep the table generator and the + runtime against one source of truth. +- `src/primality.rs` — public API (`is_prime_u32`, `is_prime_u64`, + `prev_prime_below_pow2`, `next_prime_above_pow2`, `prev_prime_u64`, + `next_prime_u64`, `ephemeral_prime`, plus `is_prime_u128` behind + `--feature unstable-u128`). +- `build.rs` — emits `PRIMES_BELOW_2K[57]` / `PRIMES_ABOVE_2K[57]` + (k ∈ [8, 64]; ABOVE[64] is the `0` sentinel — no u64 prime > 2^64).
+- `tests/primality_pseudoprimes.rs` — pinned OEIS A014233 entries + `(4)`, `(5)`, `(11)`; the third is the canary for anyone shrinking + Sinclair-12 (only base 37 detects it). +- `tests/table_cross_check.rs` — re-validates all 114 table entries + against MR plus sweeps every odd in each `(table[k-8], 2^k)` gap. + Runtime: ~milliseconds (the *gap* is small — typically ≤ 100 odds). +- `benches/primality.rs` — four criterion benches per PRD §6. + +### Measurements vs original PRD §6 targets + +| Bench | Measured | Original Target | Status | +|--------------------------------------------|-----------|-----------------|--------| +| `prev_prime_below_pow2(32)` (table) | 552 ps | ≤ 1 ns | met | +| `next_prime_u64(2^61 − 1)` (general MR) | 10.97 µs | ≤ 12 µs | met | +| `next_prime_u64(arbitrary ≈ 1e9)` | 2.23 µs | ≤ 2 µs | +11% | +| `is_prime_u64(u64::MAX − 58)` worst-case | 15.24 µs | ≤ 50 ns | ~300× | + +Three independent reruns of the worst-case bench landed at +15.24 / 15.79 / 15.65 µs — stable within ±2%, not measurement noise. + +### Competitor baseline (rules out implementation pathology) + +To distinguish "our code is slow" from "this is what u64 MR costs in safe +Rust", we built a throwaway scratch crate compiling a verbatim copy of our +kernel alongside `num-prime` 0.4.4. Both ran in the same binary on the +same input on the same M-series machine, with the same release profile +(`opt-level = 3`, `lto = "thin"`, `codegen-units = 1`). + +| Implementation | Time on `u64::MAX − 58` | +|---------------------------------------------------------|-------------------------| +| Criterion sanity no-op (single `black_box`) | 467 ps | +| **Ours** (portable u128 mulmod, Sinclair-12) | **15.63 µs** | +| **`num-prime` 0.4.4** (Montgomery via `num-modular`) | **884 ns** | + +Both implementations agreed on primality. The 467 ps sanity baseline +confirms criterion is reporting honestly. Conclusions: + +1. The 15.63 µs measurement is real, not a tooling artifact. +2. 
There is a **17.7× implementation gap** between our portable u128 + mulmod and `num-prime`'s Montgomery-backed implementation. This is + the single recoverable optimization in pure safe Rust. +3. `num-prime` itself is **17.7× over the original 50 ns target**. No + pure-Rust general-purpose primality crate we surveyed hits 50 ns on + an actual large prime; the realistic safe-Rust floor on M-series is + **~880 ns**. +4. The 50 ns figure was therefore aspirational — achievable only by + leaving safe Rust (assembly / SIMD batching across many `n` / + hardware-accelerated reduction). + +### Revised performance targets + +PRD §6 is amended in the same PR. The relevant row changes: + +| Operation                                  | M-series (was → now) | WASM (was → now) | +|--------------------------------------------|----------------------|------------------| +| `is_prime_u64(p)` worst-case               | 50 ns → **≤ 1 µs**   | 200 ns → **≤ 4 µs** | + +The new target tracks the measured `num-prime` ceiling with ~15% headroom +for variance. All other §6 rows remain unchanged. The current 15.24 µs +implementation does not meet the new target either — Phase 0.1 closes the +gap (see below). + +### Phase 0.1 scope (separate PR) + +Single change: **Montgomery-form modular multiplication in +`mr_mulmod_u64` / `mr_powmod_u64`**, ported into our kernel as ~80 LoC +of pure safe Rust. Expected speedup 15-18× → lands at the ~880 ns floor. +Validation: criterion bench requires mean ≤ 1.0 µs with `p < 0.01` +vs the Phase 0 baseline. No change to the public API or the table / +cross-check architecture. + +### Explicitly rejected from Phase 0.1 + +- **The 7-witness "Sinclair" set** `{2, 325, 9375, 28178, 450775, + 9780504, 1795265022}`. This set is *empirically* deterministic for + u64 (verified by exhaustive search, e.g. miller-rabin.appspot.com), + not theorem-proven the way the first-12-primes set is (Sorenson & + Webster 2015, deterministic to ~2^78).
Trading textbook provenance + for a 1.7× speedup is a bad deal when Montgomery alone gives + 15-18×. Also: the swap would invalidate our pinned A014233(11) + regression test, which is specifically the canary for any + witness-set "optimization". +- **Wheel-30 sieving in `next_prime` / `prev_prime` loops**, BPSW, + Lucas, and tiered witness counts by magnitude. All sound but not + on the Phase 0.1 critical path. Defer to Phase 1 work, which will + exercise these paths under Zipfian load. + +### Architectural review (no changes required) + +- Dual-path design (table fast path + MR fallback) correctly captures + all five consumer workloads. +- `tests/table_cross_check.rs` is sufficient as the source-of-truth gate; + the `0.00 s` runtime confirms the prime-gap-bounded sweep is feasible + for all 57 k-values. +- `include!` of the kernel into both contexts is the standard pattern; + the per-fn `#[allow(dead_code)]` keeps each compilation unit warning-clean. +- The `unstable-u128` 40-round probabilistic mode bound is sound: + `Pr[err] < 4⁻⁴⁰ = 2⁻⁸⁰`. + +--- + +## Notes for Reviewers + +This ADR's *creative* contribution is not Miller-Rabin itself (textbook, +1976) — it is the observation that **one tiny utility unlocks five +independently identified pathologies** across hashing, sharding, sketching, +adjacency, and witnessing in a workspace that today has no primality +infrastructure at all. The PRD goes deeper on each use-case; this ADR +binds the architectural choices. diff --git a/docs/research/miller-rabin-optimizations/GROK-REVIEW-REQUEST.md b/docs/research/miller-rabin-optimizations/GROK-REVIEW-REQUEST.md new file mode 100644 index 00000000..9cb4bc73 --- /dev/null +++ b/docs/research/miller-rabin-optimizations/GROK-REVIEW-REQUEST.md @@ -0,0 +1,424 @@ +# External Review Request — PIAL Phase 0 (Miller-Rabin Primality) + +You are an objective reviewer of a freshly-landed Phase-0 PR in a Rust workspace +(`ruvector`).
The PR adds a deterministic Miller-Rabin primality utility plus +build-time prime tables. Three of four bench targets are met; one is missed by +~300×. The team needs an objective plan that: + +1. Sanity-checks correctness (we may have blind spots). +2. Proposes ranked optimizations for the missed target — with a *measurement + methodology* for each, not just claims. +3. Identifies any architectural concerns we are missing. + +Constraints we cannot relax: +- **Pure Rust, `core`-only.** No external prime/big-integer crates (`num-prime`, + `glass_pumpkin`, etc. were rejected in the binding ADR). +- **Allocation-free, `no_std`-friendly.** Hot paths run in WASM bundles. +- **Sinclair-12 witnesses are non-negotiable** for the deterministic u64 path + unless you can cite a smaller deterministic set proven for `n < 2^64`. +- **Source-of-truth invariant**: build-time tables and runtime tests must be + generated by *the same* MR implementation. Don't propose schemes that fork + the truth source. + +--- + +## 1. Binding context (ADR-151 summary) + +Five subsystems in a 60+-crate workspace need prime moduli (shard router, HNSW +adjacency, sparsifier strides, mincut LSH, pi-brain witness chain). Today they +all use `mod 2^k` and have documented pathologies. ADR-151 introduces *one* +shared utility — `crates/ruvector-collections/src/primality.rs` — that all five +will adopt across phases 1–5 (this PR is Phase 0 only: the utility itself). + +Design: +- Deterministic MR for `u32` (witnesses {2, 7, 61}) and `u64` (Sinclair-12: + {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37}). +- Probabilistic MR for `u128` behind `--feature unstable-u128`, 40 rounds, + error `< 2⁻⁸⁰`. +- **Dual path**: a build-time-generated table `PRIMES_BELOW_2K[57]` / + `PRIMES_ABOVE_2K[57]` covers k ∈ [8, 64] for power-of-two-aligned callers + (~1 ns table load); arbitrary inputs fall through to the general MR descent. 
+- The table is generated by `build.rs` calling the *same* MR kernel that ships + at runtime (via `include!`). A `tests/table_cross_check.rs` re-validates + every entry under `cargo test`, so MR remains source of truth. + +Acceptance gates: +1. `cargo test -p ruvector-collections primality` green, including pinned + pseudoprime regressions (OEIS A014233 entries 4, 5, 11). +2. Table cross-check validates all 114 entries against MR. +3. Bench targets met: + - `is_prime_u64` worst-case ≤ 50 ns (M-series), ≤ 200 ns (WASM) + - `prev_prime_below_pow2(k)` ≤ 1 ns (table) + - `next_prime_u64(arbitrary)` ≤ 2 µs + - `next_prime_u64(2^61)` ≤ 12 µs + +--- + +## 2. The implementation as shipped + +### 2.1 Shared kernel — `src/primality_kernel.rs` + +`include!`d into both `build.rs` and `src/primality.rs`. Inner attributes are +disallowed in `include!`d files, hence per-fn `#[allow(dead_code)]`. + +```rust +// Deterministic Miller-Rabin kernel — ADR-151 (PIAL). +// +// `include!`d into two contexts (build.rs and src/primality.rs) which use +// different subsets of the symbols. Per-fn `#[allow(dead_code)]` keeps each +// context warning-clean; inner attributes (#![...]) aren't legal in +// included files. + +#[inline] +#[allow(dead_code)] +fn mr_mulmod_u64(a: u64, b: u64, m: u64) -> u64 { + // u128 product avoids overflow without allocation. + ((a as u128).wrapping_mul(b as u128) % (m as u128)) as u64 +} + +#[inline] +#[allow(dead_code)] +fn mr_powmod_u64(mut base: u64, mut exp: u64, m: u64) -> u64 { + if m == 1 { + return 0; + } + let mut acc: u64 = 1; + base %= m; + while exp > 0 { + if exp & 1 == 1 { + acc = mr_mulmod_u64(acc, base, m); + } + exp >>= 1; + if exp > 0 { + base = mr_mulmod_u64(base, base, m); + } + } + acc +} + +// Returns true iff `a` is a Miller-Rabin witness of compositeness for `n`. +// Caller guarantees: n is odd, n > 3, and a in [2, n-2]. n - 1 = d * 2^s +// with d odd (passed in pre-decomposed for speed). 
+#[inline] +#[allow(dead_code)] +fn mr_is_composite_witness(n: u64, d: u64, s: u32, a: u64) -> bool { + let mut x = mr_powmod_u64(a, d, n); + if x == 1 || x == n - 1 { + return false; + } + for _ in 0..s.saturating_sub(1) { + x = mr_mulmod_u64(x, x, n); + if x == n - 1 { + return false; + } + } + true +} + +#[inline] +#[allow(dead_code)] +fn mr_is_prime_u64(n: u64) -> bool { + if n < 2 { + return false; + } + const SMALL_PRIMES: [u64; 12] = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37]; + for &p in &SMALL_PRIMES { + if n == p { + return true; + } + if n.is_multiple_of(p) { + return false; + } + } + // n is now odd, > 37, and coprime to every Sinclair witness. + let mut d = n - 1; + let mut s: u32 = 0; + while d & 1 == 0 { + d >>= 1; + s += 1; + } + for &a in &SMALL_PRIMES { + if mr_is_composite_witness(n, d, s, a) { + return false; + } + } + true +} + +#[inline] +#[allow(dead_code)] +fn mr_is_prime_u32(n: u32) -> bool { + mr_is_prime_u64(n as u64) +} + +#[inline] +#[allow(dead_code)] +fn mr_prev_prime_u64(upper: u64) -> u64 { + if upper <= 2 { return 0; } + if upper == 3 { return 2; } + let mut n = upper - 1; + if n & 1 == 0 { n -= 1; } + loop { + if mr_is_prime_u64(n) { return n; } + if n <= 3 { return 2; } + n -= 2; + } +} + +#[inline] +#[allow(dead_code)] +fn mr_next_prime_u64(lower: u64) -> u64 { + if lower < 2 { return 2; } + if lower < 3 { return 3; } + let largest_u64_prime: u64 = u64::MAX - 58; + if lower >= largest_u64_prime { return 0; } + let mut n = lower + 1; + if n & 1 == 0 { n += 1; } + loop { + if mr_is_prime_u64(n) { return n; } + n += 2; + } +} +``` + +### 2.2 Public API — relevant excerpts from `src/primality.rs` + +```rust +include!("primality_kernel.rs"); +include!(concat!(env!("OUT_DIR"), "/prime_tables.rs")); +// ↑ provides: pub const PRIMES_BELOW_2K: [u64; 57] +// pub const PRIMES_ABOVE_2K: [u64; 57] (last entry = 0 sentinel) + +#[inline] +pub fn is_prime_u32(n: u32) -> bool { mr_is_prime_u32(n) } + +#[inline] +pub fn is_prime_u64(n: u64) -> 
bool { mr_is_prime_u64(n) } + +#[inline] +pub fn prev_prime_below_pow2(k: u32) -> u64 { + debug_assert!((8..=64).contains(&k)); + PRIMES_BELOW_2K[(k - 8) as usize] +} + +#[inline] +pub fn next_prime_above_pow2(k: u32) -> u64 { + debug_assert!((8..=63).contains(&k)); + PRIMES_ABOVE_2K[(k - 8) as usize] +} + +#[inline] +pub fn prev_prime_u64(n: u64) -> u64 { + if n.is_power_of_two() { + let k = n.trailing_zeros(); + if (8..=64).contains(&k) { + return PRIMES_BELOW_2K[(k - 8) as usize]; + } + } + mr_prev_prime_u64(n) +} + +#[inline] +pub fn next_prime_u64(n: u64) -> u64 { + if n.is_power_of_two() { + let k = n.trailing_zeros(); + if (8..=63).contains(&k) { + return PRIMES_ABOVE_2K[(k - 8) as usize]; + } + } + mr_next_prime_u64(n) +} + +#[inline] +pub fn ephemeral_prime(seed: u64) -> u64 { + // Used by pi-brain witness chain (ADR §4.4) for per-share entropy. + let mask = (1u64 << 61) - 1; + let s = (seed | 1) & mask; + if mr_is_prime_u64(s) { s } else { mr_next_prime_u64(s) } +} + +// u128 probabilistic mode (cfg-gated on `unstable-u128`): +#[cfg(feature = "unstable-u128")] +pub fn is_prime_u128(n: u128, rounds: u8) -> bool { /* … 40-round MR with + a tiny seeded LCG for witness selection; deferred to mr_is_prime_u64 + when n <= u64::MAX */ } +``` + +### 2.3 Tests asserting correctness + +- `tests/primality_pseudoprimes.rs`: + - `is_prime_u64(3_215_031_751) == false` (OEIS A014233(4), spsp to {2,3,5,7}) + - `is_prime_u64(2_152_302_898_747) == false` (A014233(5)) + - `is_prime_u64(3_825_123_056_546_413_051) == false` (A014233(11), detected + only by base 37 — canary for anyone shrinking Sinclair-12) + - All primes/composites in [0, 100], 7 Carmichael numbers, edges + (0, 1, u64::MAX, u64::MAX − 58, largest u32 prime). +- `tests/table_cross_check.rs`: + - For each k ∈ [8, 64]: assert `is_prime_u64(PRIMES_BELOW_2K[k-8])` and + sweep every odd integer in `(table[k-8], 2^k)` asserting non-primality. + - Symmetric for k ∈ [8, 63] on `PRIMES_ABOVE_2K`. 
+ - Sentinel: `PRIMES_ABOVE_2K[64-8] == 0`. + +--- + +## 3. Measurements (criterion, M-series, release profile) + +### 3.1 Phase-0 benches against the PRD targets + +| Bench | Measured | Target | Status | +|--------------------------------------------|-----------|---------|--------| +| `prev_prime_below_pow2(32)` (table) | 552 ps | ≤ 1 ns | green | +| `next_prime_u64(2^61 − 1)` general MR | 10.97 µs | ≤ 12 µs | green | +| `next_prime_u64(arbitrary ≈ 1e9)` general | 2.23 µs | ≤ 2 µs | +11% | +| `is_prime_u64(u64::MAX − 58)` worst-case | **15.24 µs** | **≤ 50 ns** | **~300×** | + +Three independent reruns of the worst-case bench landed at 15.24 / 15.79 / +15.65 µs — stable within ±2%, not measurement noise. + +### 3.2 Apples-to-apples competitor baseline + +To rule out "this machine is slow today" or "criterion is mismeasuring", we +built a throwaway scratch crate (outside the workspace) that compiles a +verbatim copy of our kernel alongside `num-prime` 0.4.4. Both run in the +same binary on the same input, with the same release profile +(`opt-level = 3`, `lto = "thin"`, `codegen-units = 1`). + +| Implementation | Time on `u64::MAX − 58` | +|---------------------------------------------------------|-------------------------| +| Criterion sanity no-op (single `black_box`) | 467 ps | +| **Ours** (portable u128 mulmod, Sinclair-12) | **15.63 µs** | +| **`num-prime` 0.4.4** (Montgomery via `num-modular`) | **884 ns** | +| PRD §6 target | 50 ns | + +Both implementations agreed on primality (no correctness gap). The 467 ps +sanity baseline confirms criterion is reporting honestly — broken benches +don't produce 467 ps for a no-op. + +**What this tells us:** + +1. **Our 15.63 µs is real and reproducible**, not a measurement artifact. +2. **We are ~17.7× slower than `num-prime`** on the same input. The + delta is almost certainly Montgomery-form modular multiplication + (`num-prime` pulls `num-modular`, which provides exactly that). +3. 
**`num-prime` itself is ~17.7× slower than the 50 ns target.** No + pure-Rust general-purpose primality crate we know of hits 50 ns on an + actual large prime; the realistic safe-Rust floor on M-series appears + to be ~880 ns. +4. The PRD's 50 ns figure is therefore *unachievable* in safe Rust — it + would require Montgomery + assembly / SIMD batching across many `n` / + leaving the safe subset entirely. + +ADR-151 forbids `num-prime` as a *runtime* dependency, but does not forbid +us from porting Montgomery into our own kernel — `num-modular` is +MIT/Apache and the technique itself is textbook. That is now a *measured* +optimization target with a known ceiling, not a guess. + +--- + +## 4. What we are asking you to do + +Produce **one document** with the four sections below. Be specific. Cite +sources where possible. Do not propose changes that would violate the +constraints in the preamble. + +### Section A — Correctness audit + +Read §2.1 and §2.2. Identify: +1. Any soundness bug (a composite that would be reported prime, or vice + versa) within the documented input ranges. +2. Edge cases not covered by the tests in §2.3 that you would add. +3. Any way the table cross-check could pass while masking a real bug + (i.e. is the test actually load-bearing?). +4. Risks specific to `ephemeral_prime`'s seed → prime mapping when used + for witness-chain fingerprinting (ADR §4.4): collisions, attacker + precomputation, distribution issues. + +### Section B — Performance plan, ranked + +The sharpened goal, given §3.2's competitor baseline: **close the 17.7× +gap to `num-prime` (15.63 µs → ~880 ns) in pure safe Rust, without taking +`num-prime` or `num-modular` as a runtime dependency**, AND hit the +`next_prime_u64` arbitrary 2 µs target. Treat 50 ns as aspirational; we +expect you to recommend a revised PRD target with justification. + +For each proposal: +- **Mechanism**: what changes in code (one paragraph, no hand-waving; + reference §2.1 line ranges where applicable). 
+- **Expected speedup vs our 15.63 µs baseline**: cite source or give a + back-of-envelope; if the technique is what `num-prime` uses, say so. +- **Cost**: code complexity (LoC, conceptual difficulty for reviewers), + WASM bundle size, any new `unsafe`. +- **Compatibility**: does it preserve the source-of-truth invariant + (build.rs and runtime use the same kernel via `include!`)? Does it + break `no_std`? +- **Validation methodology**: the *exact* benchmark and regression test + you would add to prove the speedup is real and stable, including + the criterion config you would use and the statistical threshold for + declaring "passed". + +Rank proposals by `expected_speedup × feasibility / complexity`. + +Candidate techniques to consider (extend or reject as you see fit): +- **Montgomery-form modular arithmetic** — likely the single biggest + lever based on the `num-prime` comparison. We want a concrete sketch + of the API change and a LoC estimate for porting it into our kernel. +- Wheel factorization (mod 30 / mod 210) for the small-prime screen. +- Branchless witness loops. +- Reduced witness sets for sub-ranges (e.g. {2} for n < 2047, + {2, 3} for n < 1.4 × 10^9). +- Strong-base early-exit ordering (which witness fails fastest on + random composites?). +- BPSW (Baillie-PSW) instead of MR — different correctness story; we'd + need a citation for deterministic-up-to-2^64 status. +- Strong Lucas as a deterministic add-on. +- Pre-screen by Pollard rho for small-factor composites (does this even + beat trial division for the tiny gap between 37 and our actual call + rate?). + +Specifically address: +1. **Realistic safe-Rust floor for `is_prime_u64` worst-case on M-series.** + Our measurement suggests ~880 ns (matching `num-prime`). Confirm or + refute, with reasoning. +2. **Recommended revised PRD target**, given that floor. +3. **The `next_prime_u64(arbitrary)` 2.23 µs vs 2 µs gap** — is this + meaningful or noise-band? If real, what closes it? 
+ +### Section C — Architectural review + +1. Is the dual-path design (table + MR fallback) correctly capturing the + workload of the five named consumers (shard router, HNSW buckets, + sparsifier, mincut LSH, witness chain)? Any consumer where the table + would mislead? +2. Is `tests/table_cross_check.rs` sufficient as the source-of-truth gate, + or is there a stronger invariant we should assert? +3. Does `include!` of the kernel into both `build.rs` and `src/primality.rs` + create any failure mode you have seen burn other projects? +4. The `unstable-u128` feature uses Russian-peasant `mulmod_u128` and a + tiny seeded LCG for witness selection. Is that sound for the + probabilistic guarantee `Pr[err] < 2⁻⁸⁰` at 40 rounds? + +### Section D — Validation methodology + +For the *whole* Phase-0 deliverable, propose: +1. The minimum set of CI gates that would catch a regression in either + correctness or performance, and where they should run (PR / nightly / + release). +2. A reproducible benchmark harness that distinguishes signal from noise + on contended hardware (criterion is fine; what statistical thresholds + would you set for "pass"?). +3. A property-test (proptest/quickcheck-style) strategy that would + complement the pinned regressions in §2.3 without re-deriving MR. +4. Anything you would add to the `tests/` or `benches/` directory before + merging Phase 0. + +--- + +## 5. Format of your response + +Plain markdown. Sections A/B/C/D headed exactly as above. For Section B, +use a table sorted by your ranking. End with a one-paragraph **Verdict**: +should the PR merge as-is, merge with the PRD §6 row relaxed, or block +on a specific change? + +Do not be polite. If a proposal in our implementation is wrong, say so +directly with line-numbered references into §2.1 / §2.2. 
diff --git a/docs/research/miller-rabin-optimizations/HANDOFF.md b/docs/research/miller-rabin-optimizations/HANDOFF.md new file mode 100644 index 00000000..db963b3e --- /dev/null +++ b/docs/research/miller-rabin-optimizations/HANDOFF.md @@ -0,0 +1,113 @@ +# Handoff — Phase 0 Kickoff (PIAL) + +You are starting **Phase 0** of PIAL (Prime-Indexed Acceleration Layer): +land the Miller-Rabin primality utility in `ruvector-collections` and +nothing else. Five integration phases follow in separate PRs. + +## Read first (in order) + +1. **`docs/adr/ADR-151-miller-rabin-prime-optimizations.md`** — the binding + decision (status, scope, acceptance criteria, alternatives rejected). +2. **`docs/research/miller-rabin-optimizations/PRD.md`** — full design, + five creative use-cases, performance targets, six-phase rollout, risks. +3. **This file** — Phase 0 specifics. Do not skip. + +## Branch + +`feat/miller-rabin-prime-optimizations` (off `main`). Already created. + +## Target crate + +`crates/ruvector-collections/` already exists in the workspace. Today it +contains `collection.rs`, `error.rs`, `lib.rs`, `manager.rs`. No +`benches/` directory and no `build.rs` yet — both are Phase 0 work. + +## Phase 0 Deliverables (four files, one PR) + +| File | Purpose | Source of truth | +|---|---|---| +| `src/primality.rs` | Deterministic Miller-Rabin for u32/u64; probabilistic for u128; tabled `prev_prime_below_pow2` / `next_prime_above_pow2` fast paths; general `prev_prime_u64` / `next_prime_u64` MR-descent paths; `ephemeral_prime(seed)` for the witness chain | PRD §5 | +| `build.rs` | Generate `PRIMES_BELOW_2K[57]` and `PRIMES_ABOVE_2K[57]` (k ∈ [8, 64]) from the MR implementation at compile time; emit as `${OUT_DIR}/prime_tables.rs` for `include!`-inclusion in `primality.rs` | ADR-151 "Generation Strategy" | +| `benches/primality.rs` | Criterion benches: `is_prime_u64`, `prev_prime_below_pow2`, `next_prime_u64(arbitrary)`, `next_prime_u64(2^61)`. 
Targets in PRD §6 | PRD §6 | +| `tests/table_cross_check.rs` | For every k ∈ [8, 64], assert `is_prime_u64(PRIMES_BELOW_2K[k-8])` is true and that no prime exists in `(PRIMES_BELOW_2K[k-8], 2^k)`. Same for `_ABOVE_`. This is the gate that makes MR the source of truth | ADR-151 acceptance #2 | + +## Library wiring + +Add `pub mod primality;` to `crates/ruvector-collections/src/lib.rs` and +re-export the public API at the crate root. Update the crate-level +doc-comment to mention the new module. + +## Dependencies — explicitly do not add + +The PRD rejects `num-prime`, `glass_pumpkin`, and any other external +prime/big-integer crates. Use **only** `core` integer arithmetic. +Add `criterion` under `[dev-dependencies]` for benches if it is not +already inherited via the workspace. + +## Witnesses (the whole correctness story in three lines) + +- `u32`: `{ 2, 7, 61 }` — Pomerance/Selfridge/Wagstaff. Deterministic. +- `u64`: `{ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37 }` — Sinclair (2011). Deterministic. +- `u128`: 40 random rounds, **only** behind `--feature unstable-u128`. Probabilistic, error < 2⁻⁸⁰. + +## Pinned pseudoprime regressions + +Include these in `tests/primality_pseudoprimes.rs` so future witness-set +"optimizations" cannot silently regress correctness: + +- `3_215_031_751` — strong pseudoprime to bases {2, 3, 5, 7} (must be detected by Sinclair-12). +- `2_152_302_898_747` — strong pseudoprime to {2, 3, 5, 7, 11}. +- `3_825_123_056_546_413_051` — large 64-bit known-hard composite. + +Add small-prime sanity (1, 2, 3, 4, 5, 7, 9, ..., 100) and edge cases +(0, 1, `u64::MAX`, `u64::MAX - 58` which is the largest u64 prime). 
+ +## Performance targets (from PRD §6) + +| Operation | M-series | WASM | +|---|---|---| +| `is_prime_u64` worst-case | ≤ 1 µs *(revised from 50 ns in PRD §6)* | ≤ 4 µs *(revised from 200 ns in PRD §6)* | +| `prev_prime_below_pow2(k)` (table) | ≤ 1 ns | ≤ 2 ns | +| `next_prime_u64(2^32)` (table) | ≤ 1 ns | ≤ 2 ns | +| `next_prime_u64(arbitrary N)` (general MR) | ≤ 2 µs | ≤ 8 µs | +| `next_prime_u64(2^61 − 1)` (general MR; 2^61 itself would take the table fast path) | ≤ 12 µs | ≤ 40 µs | + +## Phase 0 is "Done" when + +ADR-151 acceptance criteria #1, #2, #3 are all green: + +1. `cargo test -p ruvector-collections primality` passes (includes pinned pseudoprimes). +2. `cargo test -p ruvector-collections --test table_cross_check` validates all 114 table entries against MR. +3. `cargo bench -p ruvector-collections --bench primality` meets the targets above on M-series. + +**Do not start Phase 1 in this PR.** Phases ship as separate PRs +(PRD §7). Keep this one tightly scoped to the utility itself. + +## First commands in the new session + +```bash +# Confirm you are on the right branch +git status # should show "On branch feat/miller-rabin-prime-optimizations" with no changes + +# Baseline — confirm the crate compiles before you touch it +cargo check -p ruvector-collections + +# Re-read the binding documents +cat docs/adr/ADR-151-miller-rabin-prime-optimizations.md | head -80 +cat docs/research/miller-rabin-optimizations/PRD.md | sed -n '150,260p' # §5 API + §6 perf +``` + +Then start with `crates/ruvector-collections/src/primality.rs`. The +deterministic u64 Miller-Rabin is ~80 lines including comments; +everything else (tables via `build.rs`, benches, cross-check test) +follows mechanically from it. + +## What is explicitly **not** Phase 0 + +- Editing `crates/ruvector-graph/` (that's Phase 1). +- Editing any HNSW crate (Phase 2). +- Editing sparsifier or attn-mincut (Phase 3). +- Editing `crates/mcp-brain-server/` or pi-brain payloads (Phase 4). +- Editing CNN / quantization codebooks (Phase 5). + +If you find yourself touching any of those, stop and split the PR.
diff --git a/docs/research/miller-rabin-optimizations/PRD.md b/docs/research/miller-rabin-optimizations/PRD.md new file mode 100644 index 00000000..f930f9a1 --- /dev/null +++ b/docs/research/miller-rabin-optimizations/PRD.md @@ -0,0 +1,369 @@ +# PRD: Prime-Indexed Acceleration Layer (PIAL) + +> Creative Miller-Rabin–driven optimizations for ruvector's hashing, +> sharding, sketching, and witness-chain layers. + +| Field | Value | +|--------------------|------------------------------------------------------| +| **Status** | Draft | +| **Date** | 2026-04-16 | +| **Owner** | RuVector Core / Architecture | +| **Related ADR** | ADR-151 (this PRD's binding decision record) | +| **Cross-refs** | ADR-027 (HNSW), ADR-038 (witness), ADR-058 (hash), | +| | ADR-148/149 (brain perf), ADR-150 (π-brain) | +| **Tier (ADR-026)** | T1 (Agent Booster eligible) for the core utility; | +| | T2 (Haiku) for the integration patches. | + +--- + +## 1. Background + +Three years of incremental work have left ruvector with several places where +**arithmetic on indices, hashes, and shard keys defaults to power-of-two +moduli** — convenient on hardware (`x & (N - 1)`), pathological on real data: + +| Site | Current modulus | Failure mode | +|---------------------------------------------------|--------------------|------------------------------------------------------------| +| `ruvector-graph` shard router (ADR-058 #6) | `xxh3_64() mod 2^k`| ~50% collision @ 2³² nodes; biased on Zipfian keys | +| `micro-hnsw-wasm` adjacency map | open-addressed 2^k | clustering on near-duplicate vectors (e.g. 
timestamps) | +| `ruvector-sparsifier` stride sampler | power-of-2 stride | aliasing on lattice / image-grid graphs | +| `ruvector-attn-mincut` LSH sketch | ad-hoc constant | breaks 2-independence of universal hash family | +| pi-brain witness fingerprint (ADR-038) | XXH3 only | single-hash tamper risk; no per-share entropy | + +The fix in every one of these is **the same primitive**: a fast, deterministic +primality test that lets us mint a prime *near a target size* on demand. + +We choose **Miller-Rabin** because it is: + +- **Deterministic** for all `u64` inputs with the Sinclair witness set + `{2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37}` — no probabilistic guarantees + needed for our hot paths. +- **O(k · log³ n)** — a `next_prime(2^32)` call costs ~2 µs in benchmarks; + amortized to zero against shard-rebalance cycles. +- **WASM-friendly** — pure integer arithmetic, no FFI, fits in <1 KB compiled. +- **Tier-1 eligible** under ADR-026 — pure transform work, no LLM in the loop. + +This PRD frames a single, surgically scoped utility (`primality.rs`) that +*unblocks* a portfolio of creative optimizations across the workspace. The +binding architectural commitments live in ADR-151. + +--- + +## 2. 
Goals + +| # | Goal | Metric / Acceptance | +|---|----------------------------------------------------------------------|------------------------------------------------------| +| G1| Provide `is_prime`, `next_prime`, `prev_prime` over `u32`/`u64` | Deterministic, ≥ 200 M ops/s on M-series | +| G2| Re-shard ruvector-graph by prime modulus | ≥ 30% reduction in shard-load std-dev on Zipfian load| +| G3| Convert HNSW adjacency tables to prime-bucket open addressing | ≥ 15% drop in p99 insert latency at 1 M vectors | +| G4| Replace LSH stride/modulus constants with certified primes | Restore 2-independence; pass property tests | +| G5| Add per-share ephemeral prime fingerprint to π-brain witness chain | +8 bytes/share; published in `brain_share` payload | +| G6| Cross-target: the utility compiles for native, WASM, and `no_std` | Single crate, no feature-flag explosion | + +## 3. Non-Goals + +- **No cryptographic key generation.** Miller-Rabin alone is *not* a substitute + for proven-prime generation in RSA/ECC; we only use it for hashing/sharding. +- **No new heap allocations** in the inner loop — the utility must be + allocation-free past the (constant-size) witness array. +- **No replacement** of `prime-radiant` (which is a coherence-gate crate and + unrelated despite the name collision). +- **No big-integer support.** 64-bit (and an opt-in `u128` mode) is enough for + every ruvector use case identified above. +- **No SHAKE/HMAC redesign.** ADR-058's other findings stand independently. + +--- + +## 4. Creative Use-Cases (the "why this is interesting") + +### 4.1 Prime-Modulus Shard Routing — *direct fix for ADR-058 #6* + +Today's shard router is `xxh3_64(node_id) & (shards - 1)`. The mask discards +all but `log₂(shards)` bits of entropy, which is exactly when adversarial / +Zipfian inputs cluster. Replacing it with `xxh3_64(node_id) % p`, where +`p = prev_prime(shards)`, recovers full entropy and gives provably balanced +buckets under universal hashing. 
+ +> **Creative twist:** because `prev_prime(k)` is cheap, we can *adapt* the +> modulus during a rolling re-shard (every N minutes) — the cluster never +> sees a power-of-two pathology because the modulus literally never *is* a +> power of two for two consecutive epochs. + +### 4.2 Prime-Bucket HNSW Adjacency + +`micro-hnsw-wasm` and `ruvector-hyperbolic-hnsw` store edges in open-addressed +tables sized to the next power of two. Probe-sequence collisions on +near-duplicate vectors (e.g. real-time sensor or timestamp embeddings) blow up +p99 insert latency. Switching to `prev_prime(2^k)` capacity with linear or +quadratic probing keeps the table size cache-friendly while breaking the +power-of-two clustering. + +### 4.3 Certified Modulus for Universal LSH + +Several sketch modules (`ruvector-attn-mincut`, sparsifier samplers) build +hash families of the form `((a · x + b) mod p) mod m`. The 2-independence +guarantee *requires* `p` to be prime and `> universe_size`. Today these are +hand-picked Mersenne-shaped constants (`2^61 − 1`, `2^31 − 1`); when the +universe grows past those bounds the family silently degrades. Miller-Rabin +lets us call `next_prime(universe_size)` on dataset load and store the chosen +modulus alongside the index. + +### 4.4 Witness-Chain Ephemeral Primes (π-brain) + +The pi-brain witness chain (ADR-038, CLAUDE.md "Witness Chain Rules") +currently fingerprints each shared memory with XXH3 only. We propose: + +```text +share = { payload, fingerprint_xxh3, ephemeral_prime q, fingerprint_modq } + where q = next_prime( seed = SHA256(payload)[0..8] ) +``` + +A tampering peer attempting to substitute payloads must collide *both* +fingerprints — including a hash modulo a prime `q` they cannot precompute, +because `q` is derived per-share. Cost: 8 bytes on the wire, ~2 µs at the +sender, ~50 ns at every verifier. The asymmetry is the point. 
+ +### 4.5 Anti-Aliasing Stride for Sparsifier Sampling + +Spectral sparsifiers in `ruvector-sparsifier` use stride-based subsampling +when the importance sketch is too expensive. Power-of-two strides alias +brutally on grid-structured graphs (image, mesh, lattice). A prime stride +breaks the alignment for the same reason linear-congruential generators +demand prime moduli — borrowed wisdom, decades old, free to reuse. + +### 4.6 Bonus: Prime-Sized Quantization Codebooks + +Product-quantization codebooks (used by ruvector-cnn-wasm and ruQu) sized to +prime cardinalities show measurably better recall@k on standard benchmarks +than power-of-two codebooks because they break the implicit "code-of-codes" +correlation across sub-spaces. This is an opt-in mode, not a default. + +--- + +## 5. Proposed Architecture + +``` +┌──────────────────────────────────────────────────────────────┐ +│ crates/ruvector-collections/src/primality.rs (new, ~250 LoC) │ +│ │ +│ pub fn is_prime_u32(n: u32) -> bool // {2,7,61} │ +│ pub fn is_prime_u64(n: u64) -> bool // Sinclair-12 │ +│ pub fn is_prime_u128(n: u128, k: u8) -> bool // probabilistic│ +│ pub fn next_prime_u64(n: u64) -> u64 │ +│ pub fn prev_prime_u64(n: u64) -> u64 │ +│ pub fn ephemeral_prime(seed: u64) -> u64 // for §4.4 │ +│ │ +│ #[cfg(target_arch = "wasm32")] // shares same impl │ +└──────────────────┬───────────────────────────┬────────────────┘ + │ │ + ┌──────────┴──────────┐ ┌─────────┴───────────┐ + ▼ ▼ ▼ ▼ + shard router HNSW buckets LSH families witness chain + (ruvector-graph) (micro-hnsw) (sparsifier, (mcp-brain-server, + attn-mincut) pi-brain) +``` + +### Why `ruvector-collections`? + +- It already houses cross-cutting data-structure utilities. +- All five consumers depend on it transitively, so no new edges in the + dependency graph. +- Keeps the workspace top-level crate count flat (we have 60+ already). + +### Public API (sketch) + +```rust +//! crates/ruvector-collections/src/primality.rs +//! +//! 
Deterministic Miller-Rabin primality for u32/u64 and probabilistic +//! Miller-Rabin for u128. Allocation-free, no_std-friendly. +//! +//! Hot-path strategy: tabled primes for the common power-of-two-aligned +//! sizes (zero runtime cost), Miller-Rabin descent as the general fallback. + +#[inline] +pub const fn is_prime_u32(n: u32) -> bool { /* witnesses: 2, 7, 61 */ } + +#[inline] +pub const fn is_prime_u64(n: u64) -> bool { + // Sinclair (2011): deterministic for all u64 + // witnesses: 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37 +} + +pub fn is_prime_u128(n: u128, rounds: u8) -> bool { /* probabilistic */ } + +// ── Generation: dual-path ──────────────────────────────────────────── +// +// Fast path: lookup table for "largest prime < 2^k", k ∈ [8, 64]. +// CI validates every entry against the Miller-Rabin descent at build +// time, so the table is never the source of truth — MR is. +const PRIMES_BELOW_2K: [u64; 57] = [ + 251, // < 2^8 + 509, // < 2^9 + 1021, // < 2^10 + // ... entries for k = 11..=31 ... + 4_294_967_291, // < 2^32 (shard-router common case) + // ... entries for k = 33..=63 ... 
+ 18_446_744_073_709_551_557, // < 2^64 +]; + +#[inline] +pub const fn prev_prime_below_pow2(k: u32) -> u64 { + debug_assert!(k >= 8 && k <= 64); + PRIMES_BELOW_2K[(k - 8) as usize] +} + +#[inline] +pub fn prev_prime_u64(n: u64) -> u64 { + // Fast path: power-of-two-aligned inputs (HNSW buckets, shard sizes) + if n.is_power_of_two() && n.trailing_zeros() >= 8 { + return prev_prime_below_pow2(n.trailing_zeros()); + } + // General path: 6k±1 wheel + Miller-Rabin descent + miller_rabin_descent(n, Direction::Down) +} + +#[inline] +pub fn next_prime_u64(n: u64) -> u64 { + if n.is_power_of_two() && n.trailing_zeros() >= 8 { + // Symmetric optional fast path: PRIMES_ABOVE_2K table + return next_prime_above_pow2(n.trailing_zeros()); + } + miller_rabin_descent(n, Direction::Up) +} + +pub fn ephemeral_prime(seed: u64) -> u64 { + // seed → next_prime((seed | 1) % 2^61) — used by witness chain (§4.4) + // No table — input is unpredictable by design. +} +``` + +### Why the dual-path matters + +Three of PIAL's five generation sites (shard router, HNSW bucket sizing, +sparsifier strides) ask for primes near *fixed* sizes that never change +between releases. The table converts those calls into a single L1-cached +load — no Miller-Rabin work at runtime at all. + +The two unpredictable sites (LSH universe, witness-chain ephemeral primes) +fall through to the general MR path. They're cold paths anyway — +microsecond-scale generation cost is invisible against the surrounding work. + +**Crucially, MR is still the source of truth.** A `build.rs` script +regenerates `PRIMES_BELOW_2K` and `PRIMES_ABOVE_2K` from the MR +implementation on every build, and a `#[test]` cross-checks every entry +under `cargo test`. The table is an *amortization*, not a substitute. 
+ +| Generation site | Path taken | Runtime cost | +|-----------------------------|--------------------|--------------| +| Shard router (`prev_prime(2^k)`) | Fast (table) | ~1 ns | +| HNSW bucket (`prev_prime(2^k)`) | Fast (table) | ~1 ns | +| Sparsifier stride (table-friendly)| Fast (table) | ~1 ns | +| LSH modulus (`next_prime(N)`) | General (MR) | ~2 µs | +| Witness ephemeral (`next_prime(seed)`)| General (MR) | ~2 µs | + +--- + +## 6. Performance Targets + +> **Revised 2026-04-16 (Phase 0).** The original `is_prime_u64` worst-case +> target of 50 ns was found to be unachievable in pure safe Rust; +> `num-prime` itself measures ~880 ns on the same hardware. Target relaxed +> to track the empirical safe-Rust ceiling. See §6.1 and the Phase 0 +> Findings section of ADR-151 for the full justification. + +| Operation | Target (M-series) | Target (WASM) | +|------------------------------------------------|---------------------|--------------------| +| `is_prime_u64(p)` (worst-case) | **≤ 1 µs** *(was 50 ns)* | **≤ 4 µs** *(was 200 ns)* | +| `prev_prime_below_pow2(k)` (table fast path) | **≤ 1 ns** | **≤ 2 ns** | +| `next_prime_u64(2^32)` (table fast path) | **≤ 1 ns** | **≤ 2 ns** | +| `next_prime_u64(arbitrary N)` (general MR path)| ≤ 2 µs | ≤ 8 µs | +| `next_prime_u64(2^61 − 1)` (general MR path; 2^61 itself hits the table) | ≤ 12 µs | ≤ 40 µs | +| Shard re-route on 1 M nodes | ≤ 30 ms (one-shot) | n/a | +| HNSW p99 insert @ 1 M vectors | -15% vs baseline | -10% vs baseline | +| WASM bundle growth from `PRIMES_BELOW_2K`+`_ABOVE_2K` | n/a | ≤ 1 KB total | + +Benchmarks live in `crates/ruvector-collections/benches/primality.rs` and run +under existing `npm run bench` infrastructure.
+ +### 6.1 Empirical findings (Phase 0) + +Phase 0 measurements on M-series, criterion release profile: + +| Bench | Measured | Revised target | Status | +|--------------------------------------------|-----------|----------------|--------| +| `prev_prime_below_pow2(32)` | 552 ps | ≤ 1 ns | met | +| `next_prime_u64(2^61 − 1)` | 10.97 µs | ≤ 12 µs | met | +| `next_prime_u64(arbitrary ≈ 1e9)` | 2.23 µs | ≤ 2 µs | +11% | +| `is_prime_u64(u64::MAX − 58)` worst-case | 15.24 µs | ≤ 1 µs | does not meet revised target — Phase 0.1 | + +A throwaway scratch crate compiling a verbatim copy of our kernel +alongside `num-prime` 0.4.4 in the same binary on the same input +measured **ours = 15.63 µs, num-prime = 884 ns** (criterion sanity no-op += 467 ps confirms harness honesty). The 17.7× gap is recoverable in pure +safe Rust by porting Montgomery-form modular multiplication into +`mr_mulmod_u64` / `mr_powmod_u64` (~80 LoC). That is Phase 0.1 scope and +ships in a separate PR; see ADR-151 "Phase 0 Findings" for the full plan +and the explicit rejection of the empirical 7-witness "Sinclair" set as +a correctness regression dressed as a perf win. + +--- + +## 7. 
Rollout Plan + +| Phase | Scope | Gate | +|-------|-------------------------------------------------------------------------|--------------------------------------------| +| **0** | Land `primality.rs` + tests + benches in `ruvector-collections` | `npm test && npm run lint` green | +| **1** | Wire `prev_prime` into ruvector-graph shard router behind feature flag | A/B Zipfian load; ≥ 30% std-dev reduction | +| **2** | Convert HNSW adjacency to prime buckets (micro-hnsw-wasm first) | recall@k unchanged; p99 insert -15% | +| **3** | Switch sparsifier + attn-mincut LSH families to certified primes | property tests pass; no regression in cuts | +| **4** | Ship ephemeral-prime fingerprint in pi-brain witness payload (opt-in) | `brain_share` accepts new field; verifiers | +| | | tolerant of absence (backward compatible) | +| **5** | Optional: prime-sized PQ codebooks in ruvector-cnn-wasm | recall@10 ≥ baseline on SIFT-1M | + +Each phase is a separate PR; no big-bang merge. + +--- + +## 8. Risks & Mitigations + +| Risk | Mitigation | +|-----------------------------------------------------------------|----------------------------------------------------------------| +| Modulo-by-prime is a *division*, slower than mask | Use Lemire's `fastmod` (one mul + one shift) — already in tree | +| Sinclair witness set has subtle bugs in edge cases (n < 9) | Hard-code small-prime fast path + 100% branch coverage tests | +| WASM `u128` codegen is ~5× slower than native | u128 mode is opt-in; default paths are u64 | +| Cluster mid-flight reshard exposes intermediate state | Phase 1 ships behind `--features prime-shard`; rollout is gated | +| Witness-chain change breaks older pi-brain peers | New field is `Option<…>`; verifiers ignore-on-absent | +| "Yet another collections crate" sprawl | All work lives in *existing* `ruvector-collections` | + +--- + +## 9. Open Questions + +1. Should `next_prime_u64` accept a *budget* (max-distance) and return + `Option<u64>` instead of looping unbounded?
(Probably yes.) +2. Do we want a `PrimeModHash` newtype wrapper that auto-applies fastmod, + or expose `prev_prime` and let callers compose? (Lean: wrapper.) +3. Does the witness-chain ephemeral prime need to be authenticated under the + sender's key, or is per-share derivation from `SHA256(payload)` enough? + (Defer to security review during Phase 4.) + +--- + +## 10. Out of Scope (deliberately) + +- Big-integer / arbitrary-precision Miller-Rabin (use `num-bigint` if ever + needed — not on the roadmap). +- Replacing XXH3 as ruvector's primary hash (ADR-058's job). +- Strong-pseudoprime-based Lucas certificates (yagni for hashing). +- Distributed prime-generation protocols (we mint locally, deterministically). + +--- + +## 11. Approval Checklist + +- [ ] Architecture review (links ADR-151) +- [ ] Security review (esp. §4.4 witness chain) +- [ ] Performance baseline captured for shard-router and HNSW p99 +- [ ] WASM size budget verified (`micro-hnsw-wasm` < +2 KB) +- [ ] Documentation: README in `ruvector-collections` references new module