diff --git a/crates/ruvector-solver/src/backward_push.rs b/crates/ruvector-solver/src/backward_push.rs index 54669de28..7674292ad 100644 --- a/crates/ruvector-solver/src/backward_push.rs +++ b/crates/ruvector-solver/src/backward_push.rs @@ -41,6 +41,9 @@ use crate::types::{ SolverResult, SparsityProfile, }; +/// Maximum number of graph nodes to prevent OOM denial-of-service. +const MAX_GRAPH_NODES: usize = 100_000_000; + // --------------------------------------------------------------------------- // Solver struct // --------------------------------------------------------------------------- @@ -179,6 +182,16 @@ impl BackwardPushSolver { let start = Instant::now(); let n = graph.rows; + if n > MAX_GRAPH_NODES { + return Err(SolverError::InvalidInput( + ValidationError::MatrixTooLarge { + rows: n, + cols: n, + max_dim: MAX_GRAPH_NODES, + }, + )); + } + // Build the transposed adjacency so row_entries(v) in `graph_t` // yields the in-neighbours of v in the original graph. let graph_t = graph.transpose(); diff --git a/crates/ruvector-solver/src/bmssp.rs b/crates/ruvector-solver/src/bmssp.rs index cd3d5051d..97571c06a 100644 --- a/crates/ruvector-solver/src/bmssp.rs +++ b/crates/ruvector-solver/src/bmssp.rs @@ -404,7 +404,7 @@ fn transpose_csr(a: &CsrMatrix) -> CsrMatrix { /// than reallocated, making this efficient for the moderate-sized matrices /// produced during hierarchy construction. fn sparse_matmul(a: &CsrMatrix, b: &CsrMatrix) -> CsrMatrix { - debug_assert_eq!( + assert_eq!( a.cols, b.rows, "sparse_matmul: dimension mismatch {}x{} * {}x{}", a.rows, a.cols, b.rows, b.cols @@ -535,11 +535,14 @@ fn dense_direct_solve(matrix: &CsrMatrix, b: &[f64]) -> Vec { } if max_row != col { - for k in 0..stride { - let a_idx = col * stride + k; - let b_idx = max_row * stride + k; - aug.swap(a_idx, b_idx); - } + let (first, second) = if col < max_row { + let (left, right) = aug.split_at_mut(max_row * stride); + (&mut left[col * stride..col * stride + stride], &mut right[..stride]) + } else { + let (left, right) = aug.split_at_mut(col * stride); + (&mut right[..stride], &mut left[max_row * stride..max_row * stride + stride]) + }; + first.swap_with_slice(second); } let pivot = aug[col * stride + col]; @@ -769,6 +772,7 @@ impl SolverEngine for BmsspSolver { // Solve phase: iterate V-cycles. let mut x = vec![0.0f64; n]; + let mut ax_buf = vec![0.0f64; n]; let mut convergence_history = Vec::with_capacity(max_iter); let b_norm = { @@ -804,7 +808,8 @@ impl SolverEngine for BmsspSolver { v_cycle(&hierarchy, &mut x, rhs, 0); - let res = residual_l2(matrix, &x, rhs); + matrix.spmv(&x, &mut ax_buf); + let res = (0..n).map(|i| { let r = rhs[i] - ax_buf[i]; r * r }).sum::().sqrt(); convergence_history.push(ConvergenceInfo { iteration: iter, diff --git a/crates/ruvector-solver/src/cg.rs b/crates/ruvector-solver/src/cg.rs index 28e73e9d4..8a93d97a3 100644 --- a/crates/ruvector-solver/src/cg.rs +++ b/crates/ruvector-solver/src/cg.rs @@ -74,7 +74,7 @@ use crate::types::{ /// Debug-asserts that `a.len() == b.len()`. #[inline] pub fn dot_product_f64(a: &[f32], b: &[f32]) -> f64 { - debug_assert_eq!(a.len(), b.len(), "dot_product_f64: length mismatch"); + assert_eq!(a.len(), b.len(), "dot_product_f64: length mismatch"); let n = a.len(); let chunks = n / 4; @@ -106,7 +106,7 @@ pub fn dot_product_f64(a: &[f32], b: &[f32]) -> f64 { /// Used internally when the working vectors are already `f64`. #[inline] fn dot_f64(a: &[f64], b: &[f64]) -> f64 { - debug_assert_eq!(a.len(), b.len(), "dot_f64: length mismatch"); + assert_eq!(a.len(), b.len(), "dot_f64: length mismatch"); let n = a.len(); let chunks = n / 4; @@ -140,7 +140,7 @@ fn dot_f64(a: &[f64], b: &[f64]) -> f64 { /// Debug-asserts that `x.len() == y.len()`. #[inline] pub fn axpy(alpha: f32, x: &[f32], y: &mut [f32]) { - debug_assert_eq!(x.len(), y.len(), "axpy: length mismatch"); + assert_eq!(x.len(), y.len(), "axpy: length mismatch"); let n = x.len(); let chunks = n / 4; @@ -161,7 +161,7 @@ pub fn axpy(alpha: f32, x: &[f32], y: &mut [f32]) { /// Compute `y[i] += alpha * x[i]` for all `i` (AXPY operation, `f64`). #[inline] fn axpy_f64(alpha: f64, x: &[f64], y: &mut [f64]) { - debug_assert_eq!(x.len(), y.len(), "axpy_f64: length mismatch"); + assert_eq!(x.len(), y.len(), "axpy_f64: length mismatch"); let n = x.len(); let chunks = n / 4; @@ -349,8 +349,8 @@ impl ConjugateGradientSolver { /// Apply the diagonal preconditioner: `z[i] = inv_diag[i] * r[i]`. #[inline] fn apply_preconditioner(inv_diag: &[f64], r: &[f64], z: &mut [f64]) { - debug_assert_eq!(inv_diag.len(), r.len()); - debug_assert_eq!(r.len(), z.len()); + assert_eq!(inv_diag.len(), r.len()); + assert_eq!(r.len(), z.len()); let n = r.len(); let chunks = n / 4; diff --git a/crates/ruvector-solver/src/forward_push.rs b/crates/ruvector-solver/src/forward_push.rs index a64870207..4f0fbdc2e 100644 --- a/crates/ruvector-solver/src/forward_push.rs +++ b/crates/ruvector-solver/src/forward_push.rs @@ -105,18 +105,40 @@ pub struct ForwardPushSolver { impl ForwardPushSolver { /// Create a new forward-push solver. /// - /// # Panics - /// - /// Debug-asserts that `alpha` is in `(0, 1)` and `epsilon` is positive. + /// Parameters are validated lazily at the start of each computation + /// (see [`validate_params`](Self::validate_params)). pub fn new(alpha: f64, epsilon: f64) -> Self { - debug_assert!( - alpha > 0.0 && alpha < 1.0, - "alpha must be in (0, 1), got {alpha}", - ); - debug_assert!(epsilon > 0.0, "epsilon must be positive, got {epsilon}"); Self { alpha, epsilon } } + /// Validate that `alpha` and `epsilon` are within acceptable ranges. + /// + /// # Errors + /// + /// - [`SolverError::InvalidInput`] if `alpha` is not in `(0, 1)` exclusive. + /// - [`SolverError::InvalidInput`] if `epsilon` is not positive. + fn validate_params(&self) -> Result<(), SolverError> { + if self.alpha <= 0.0 || self.alpha >= 1.0 { + return Err(SolverError::InvalidInput( + crate::error::ValidationError::ParameterOutOfRange { + name: "alpha".into(), + value: self.alpha.to_string(), + expected: "(0.0, 1.0) exclusive".into(), + }, + )); + } + if self.epsilon <= 0.0 { + return Err(SolverError::InvalidInput( + crate::error::ValidationError::ParameterOutOfRange { + name: "epsilon".into(), + value: self.epsilon.to_string(), + expected: "> 0.0".into(), + }, + )); + } + Ok(()) + } + /// Create a solver with default parameters (`alpha = 0.85`, /// `epsilon = 1e-6`). pub fn default_params() -> Self { @@ -139,6 +161,7 @@ impl ForwardPushSolver { graph: &CsrMatrix, source: usize, ) -> Result, SolverError> { + self.validate_params()?; validate_vertex(graph, source, "source")?; self.forward_push_core(graph, &[(source, 1.0)]) } @@ -166,12 +189,27 @@ impl ForwardPushSolver { /// /// Uses a `VecDeque` work-queue with a membership bitvec to achieve /// O(1/epsilon) total work, independent of graph size. + /// Maximum number of graph nodes to prevent OOM DoS. + const MAX_GRAPH_NODES: usize = 100_000_000; + fn forward_push_core( &self, graph: &CsrMatrix, seeds: &[(usize, f64)], ) -> Result, SolverError> { + self.validate_params()?; + let n = graph.rows; + if n > Self::MAX_GRAPH_NODES { + return Err(SolverError::InvalidInput( + crate::error::ValidationError::MatrixTooLarge { + rows: n, + cols: graph.cols, + max_dim: Self::MAX_GRAPH_NODES, + }, + )); + } + let mut estimate = vec![KahanAccumulator::new(); n]; let mut residual = vec![KahanAccumulator::new(); n]; @@ -437,7 +475,7 @@ impl SolverEngine for ForwardPushSolver { _profile: &SparsityProfile, _n: usize, ) -> ComplexityEstimate { - let est_ops = (1.0 / self.epsilon) as usize; + let est_ops = (1.0 / self.epsilon).min(usize::MAX as f64) as usize; ComplexityEstimate { algorithm: Algorithm::ForwardPush, estimated_flops: est_ops as u64 * 10, diff --git a/crates/ruvector-solver/src/neumann.rs b/crates/ruvector-solver/src/neumann.rs index 410bd9f01..d189e9fa4 100644 --- a/crates/ruvector-solver/src/neumann.rs +++ b/crates/ruvector-solver/src/neumann.rs @@ -120,8 +120,23 @@ impl NeumannSolver { return 0.0; } - // Extract diagonal and compute D^{-1}. let d_inv = extract_diag_inv_f32(matrix); + Self::estimate_spectral_radius_with_diag(matrix, &d_inv) + } + + /// Inner helper: estimate spectral radius using a pre-computed `d_inv`. + /// + /// This avoids recomputing the diagonal inverse when the caller already + /// has it (e.g. `solve()` needs `d_inv` for both the spectral check and + /// the Jacobi iteration). + fn estimate_spectral_radius_with_diag( + matrix: &CsrMatrix, + d_inv: &[f32], + ) -> f64 { + let n = matrix.rows; + if n == 0 { + return 0.0; + } // Initialise with a deterministic pseudo-random unit vector. let mut v: Vec = (0..n) @@ -222,10 +237,14 @@ impl NeumannSolver { }); } + // Extract D^{-1} once — reused for both the spectral radius check + // and the Jacobi-preconditioned iteration that follows. + let d_inv = extract_diag_inv_f32(matrix); + // ------------------------------------------------------------------ // Spectral radius pre-check (10-step power iteration on I - D^{-1}A) // ------------------------------------------------------------------ - let rho = Self::estimate_spectral_radius(matrix); + let rho = Self::estimate_spectral_radius_with_diag(matrix, &d_inv); if rho >= 1.0 { warn!(rho, "spectral radius >= 1.0, Neumann series will diverge"); return Err(SolverError::SpectralRadiusExceeded { @@ -236,9 +255,6 @@ impl NeumannSolver { } info!(rho, "spectral radius check passed"); - // Extract D^{-1} for Jacobi preconditioning. - let d_inv = extract_diag_inv_f32(matrix); - // ------------------------------------------------------------------ // Jacobi-preconditioned iteration (fused kernel) // @@ -372,9 +388,14 @@ impl SolverEngine for NeumannSolver { }; let f32_rhs: Vec = rhs.iter().map(|&v| v as f32).collect(); - // Use the tighter of the solver's own budget and the caller's budget. + // Use the tighter of the solver's own tolerance and the caller's budget, + // but no tighter than f32 precision allows (the Neumann solver operates + // internally in f32, so residuals below ~f32::EPSILON are unreachable). let max_iters = self.max_iterations.min(budget.max_iterations); - let tol = self.tolerance.max(budget.tolerance); + let tol = self + .tolerance + .min(budget.tolerance) + .max(f32::EPSILON as f64 * 4.0); let inner_solver = NeumannSolver::new(tol, max_iters); @@ -439,6 +460,12 @@ fn extract_diag_inv_f32(matrix: &CsrMatrix) -> Vec { let diag = matrix.values[idx]; if diag.abs() > 1e-15 { d_inv[i] = 1.0 / diag; + } else { + warn!( + row = i, + diag_value = %diag, + "zero or near-zero diagonal entry; substituting 1.0 — matrix may be singular" + ); } break; } diff --git a/crates/ruvector-solver/src/random_walk.rs b/crates/ruvector-solver/src/random_walk.rs index 514790ccc..7fc63c513 100644 --- a/crates/ruvector-solver/src/random_walk.rs +++ b/crates/ruvector-solver/src/random_walk.rs @@ -429,7 +429,7 @@ impl SolverEngine for HybridRandomWalkSolver { cdf.push(cumulative); } - let walks = self.num_walks.min(budget.max_iterations * 10); + let walks = self.num_walks.min(budget.max_iterations.saturating_mul(10)); let mut counts = vec![0.0f64; n]; for _ in 0..walks { diff --git a/crates/ruvector-solver/src/router.rs b/crates/ruvector-solver/src/router.rs index ad9ba80be..1f051506a 100644 --- a/crates/ruvector-solver/src/router.rs +++ b/crates/ruvector-solver/src/router.rs @@ -540,8 +540,8 @@ impl SolverOrchestrator { let col_start = matrix.row_ptr[col]; let col_end = matrix.row_ptr[col + 1]; let found = matrix.col_indices[col_start..col_end] - .iter() - .any(|&c| c == row); + .binary_search(&row) + .is_ok(); if !found { symmetric_mismatches += 1; } @@ -1130,7 +1130,7 @@ impl Default for SolverOrchestrator { #[inline] #[allow(dead_code)] fn dot(a: &[f64], b: &[f64]) -> f64 { - debug_assert_eq!(a.len(), b.len()); + assert_eq!(a.len(), b.len(), "dot: length mismatch {} vs {}", a.len(), b.len()); a.iter() .zip(b.iter()) .map(|(&ai, &bi)| ai * bi) diff --git a/crates/ruvector-solver/src/simd.rs b/crates/ruvector-solver/src/simd.rs index fa0e5a0b5..d4c4a32dd 100644 --- a/crates/ruvector-solver/src/simd.rs +++ b/crates/ruvector-solver/src/simd.rs @@ -120,6 +120,91 @@ unsafe fn horizontal_sum_f32x8(v: std::arch::x86_64::__m256) -> f32 { _mm_cvtss_f32(result) } +/// Sparse matrix-vector multiply with optional SIMD acceleration for f64. +/// +/// Computes `y = A * x` where `A` is a CSR matrix of `f64` values. +pub fn spmv_simd_f64(matrix: &CsrMatrix, x: &[f64], y: &mut [f64]) { + assert_eq!(x.len(), matrix.cols, "x length must equal matrix.cols"); + assert_eq!(y.len(), matrix.rows, "y length must equal matrix.rows"); + + #[cfg(all(feature = "simd", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx2") { + unsafe { + spmv_avx2_f64(matrix, x, y); + } + return; + } + } + + spmv_scalar_f64(matrix, x, y); +} + +/// Scalar fallback for f64 SpMV. +pub fn spmv_scalar_f64(matrix: &CsrMatrix, x: &[f64], y: &mut [f64]) { + for i in 0..matrix.rows { + let start = matrix.row_ptr[i]; + let end = matrix.row_ptr[i + 1]; + let mut sum = 0.0f64; + for idx in start..end { + let col = matrix.col_indices[idx]; + sum += matrix.values[idx] * x[col]; + } + y[i] = sum; + } +} + +#[cfg(all(feature = "simd", target_arch = "x86_64"))] +#[target_feature(enable = "avx2")] +unsafe fn spmv_avx2_f64(matrix: &CsrMatrix, x: &[f64], y: &mut [f64]) { + use std::arch::x86_64::*; + + for i in 0..matrix.rows { + let start = matrix.row_ptr[i]; + let end = matrix.row_ptr[i + 1]; + let len = end - start; + + let mut accum = _mm256_setzero_pd(); + let chunks = len / 4; + let remainder = len % 4; + + for chunk in 0..chunks { + let base = start + chunk * 4; + let vals = _mm256_loadu_pd(matrix.values.as_ptr().add(base)); + + let mut x_buf = [0.0f64; 4]; + for k in 0..4 { + let col = *matrix.col_indices.get_unchecked(base + k); + x_buf[k] = *x.get_unchecked(col); + } + let x_vec = _mm256_loadu_pd(x_buf.as_ptr()); + accum = _mm256_add_pd(accum, _mm256_mul_pd(vals, x_vec)); + } + + let mut sum = horizontal_sum_f64x4(accum); + + let tail_start = start + chunks * 4; + for idx in tail_start..(tail_start + remainder) { + let col = *matrix.col_indices.get_unchecked(idx); + sum += *matrix.values.get_unchecked(idx) * *x.get_unchecked(col); + } + + *y.get_unchecked_mut(i) = sum; + } +} + +#[cfg(all(feature = "simd", target_arch = "x86_64"))] +#[target_feature(enable = "avx2")] +unsafe fn horizontal_sum_f64x4(v: std::arch::x86_64::__m256d) -> f64 { + use std::arch::x86_64::*; + let hi = _mm256_extractf128_pd(v, 1); + let lo = _mm256_castpd256_pd128(v); + let sum128 = _mm_add_pd(lo, hi); + let hi64 = _mm_unpackhi_pd(sum128, sum128); + let result = _mm_add_sd(sum128, hi64); + _mm_cvtsd_f64(result) +} + #[cfg(test)] mod tests { use super::*; @@ -159,4 +244,38 @@ mod tests { assert!((y[1] - 6.0).abs() < 1e-6); assert!((y[2] - 13.0).abs() < 1e-6); } + + #[test] + fn spmv_simd_f64_correctness() { + let mat = CsrMatrix:: { + values: vec![2.0, 1.0, 3.0, 1.0, 4.0], + col_indices: vec![0, 2, 1, 0, 2], + row_ptr: vec![0, 2, 3, 5], + rows: 3, + cols: 3, + }; + let x = vec![1.0, 2.0, 3.0]; + let mut y = vec![0.0f64; 3]; + spmv_simd_f64(&mat, &x, &mut y); + assert!((y[0] - 5.0).abs() < 1e-10); + assert!((y[1] - 6.0).abs() < 1e-10); + assert!((y[2] - 13.0).abs() < 1e-10); + } + + #[test] + fn scalar_spmv_f64_correctness() { + let mat = CsrMatrix:: { + values: vec![2.0, 1.0, 3.0, 1.0, 4.0], + col_indices: vec![0, 2, 1, 0, 2], + row_ptr: vec![0, 2, 3, 5], + rows: 3, + cols: 3, + }; + let x = vec![1.0, 2.0, 3.0]; + let mut y = vec![0.0f64; 3]; + spmv_scalar_f64(&mat, &x, &mut y); + assert!((y[0] - 5.0).abs() < 1e-10); + assert!((y[1] - 6.0).abs() < 1e-10); + assert!((y[2] - 13.0).abs() < 1e-10); + } } diff --git a/crates/ruvector-solver/src/true_solver.rs b/crates/ruvector-solver/src/true_solver.rs index 34b092178..383af135b 100644 --- a/crates/ruvector-solver/src/true_solver.rs +++ b/crates/ruvector-solver/src/true_solver.rs @@ -120,7 +120,7 @@ impl TrueSolver { ) -> Vec<(usize, usize, f32)> { let scale = 1.0 / (k as f64).sqrt(); let scale_f32 = scale as f32; - let mut entries = Vec::new(); + let mut entries = Vec::with_capacity(((k * n) as f64 / 3.0).ceil() as usize); for row in 0..k { for col in 0..n { @@ -169,11 +169,12 @@ impl TrueSolver { // For each row i of Pi, compute B[i,:] = Pi[i,:] * A. let mut b_entries: Vec<(usize, usize, f32)> = Vec::new(); + // Hoist accumulator outside loop to avoid reallocating each iteration. + let mut b_row = vec![0.0f32; n]; for pi_row in 0..k { let pi_start = pi.row_ptr[pi_row]; let pi_end = pi.row_ptr[pi_row + 1]; - let mut b_row = vec![0.0f32; n]; for pi_idx in pi_start..pi_end { let pi_col = pi.col_indices[pi_idx]; let pi_val = pi.values[pi_idx]; @@ -190,6 +191,9 @@ impl TrueSolver { b_entries.push((pi_row, col, val)); } } + + // Zero the accumulator for the next row. + b_row.iter_mut().for_each(|v| *v = 0.0); } let b_matrix = CsrMatrix::::from_coo(k, n, b_entries); @@ -207,11 +211,12 @@ impl TrueSolver { let mut a_prime_entries: Vec<(usize, usize, f32)> = Vec::new(); - for b_row in 0..k { - let b_start = b_matrix.row_ptr[b_row]; - let b_end = b_matrix.row_ptr[b_row + 1]; + // Hoist accumulator outside loop to avoid reallocating each iteration. + let mut row_accum = vec![0.0f32; k]; + for b_row_idx in 0..k { + let b_start = b_matrix.row_ptr[b_row_idx]; + let b_end = b_matrix.row_ptr[b_row_idx + 1]; - let mut row_accum = vec![0.0f32; k]; for b_idx in b_start..b_end { let l = b_matrix.col_indices[b_idx]; let b_val = b_matrix.values[b_idx]; @@ -223,9 +228,12 @@ impl TrueSolver { for (j, &val) in row_accum.iter().enumerate() { if val.abs() > f32::EPSILON { - a_prime_entries.push((b_row, j, val)); + a_prime_entries.push((b_row_idx, j, val)); } } + + // Zero the accumulator for the next row. + row_accum.iter_mut().for_each(|v| *v = 0.0); } CsrMatrix::::from_coo(k, k, a_prime_entries) @@ -560,7 +568,11 @@ impl SolverEngine for TrueSolver { rhs: &[f64], _budget: &crate::types::ComputeBudget, ) -> Result { - // Convert f64 input to f32 for internal computation + // Convert f64 input to f32 for internal computation. + // NOTE: row_ptr and col_indices are cloned here because CsrMatrix owns + // Vec, so we cannot borrow from the f64 matrix. A future + // refactor could introduce a CsrMatrixView that borrows structural + // arrays to eliminate these allocations on the f64 -> f32 path. let f32_values: Vec = matrix.values.iter().map(|&v| v as f32).collect(); let f32_matrix = CsrMatrix { row_ptr: matrix.row_ptr.clone(), diff --git a/crates/ruvector-solver/src/types.rs b/crates/ruvector-solver/src/types.rs index 9bb89878b..8ec570e19 100644 --- a/crates/ruvector-solver/src/types.rs +++ b/crates/ruvector-solver/src/types.rs @@ -268,7 +268,7 @@ impl CsrMatrix { entries: impl IntoIterator, ) -> Self { let mut sorted: Vec<_> = entries.into_iter().collect(); - sorted.sort_by_key(|(r, c, _)| (*r, *c)); + sorted.sort_unstable_by_key(|(r, c, _)| (*r, *c)); let nnz = sorted.len(); let mut row_ptr = vec![0usize; rows + 1]; @@ -276,7 +276,7 @@ impl CsrMatrix { let mut values = Vec::with_capacity(nnz); for &(r, _, _) in &sorted { - debug_assert!(r < rows, "row index {} out of bounds (rows={})", r, rows); + assert!(r < rows, "row index {} out of bounds (rows={})", r, rows); row_ptr[r + 1] += 1; } for i in 1..=rows { @@ -284,7 +284,7 @@ impl CsrMatrix { } for (_, c, v) in sorted { - debug_assert!(c < cols, "col index {} out of bounds (cols={})", c, cols); + assert!(c < cols, "col index {} out of bounds (cols={})", c, cols); col_indices.push(c); values.push(v); }