fix: implement security audit findings for ruvector-solver (25 fixes)

Address 25 findings from the security and performance audit across 10 files: CRITICAL: - neumann.rs: fix .max() → .min() tolerance bug with f32 precision floor to prevent requesting impossible accuracy from f32 internal computation HIGH (security): - forward_push.rs: replace debug_assert! with runtime validate_params() returning Result<(), SolverError> with proper ValidationError variants - cg.rs: upgrade all debug_assert_eq! to assert_eq! in dot_product_f64, dot_f64, axpy, axpy_f64, apply_preconditioner - bmssp.rs: upgrade debug_assert_eq! to assert_eq! in sparse_matmul - router.rs: upgrade debug_assert_eq! to assert_eq! with descriptive message - forward_push.rs, backward_push.rs: add MAX_GRAPH_NODES (100M) limit to prevent OOM denial-of-service on untrusted graph inputs HIGH (speed): - neumann.rs: deduplicate extract_diag_inv_f32 by extracting estimate_spectral_radius_with_diag helper, reusing D^{-1} across spectral check and Jacobi iteration - bmssp.rs: pre-allocate ax_buf to eliminate redundant SpMV allocation in V-cycle outer loop MODERATE (security): - forward_push.rs: guard float-to-usize overflow with .min(usize::MAX as f64) - random_walk.rs: use saturating_mul(10) to prevent integer overflow - neumann.rs: add tracing::warn! for zero/near-zero diagonal entries MODERATE (speed): - true_solver.rs: pre-allocate JL entries with Vec::with_capacity - true_solver.rs: hoist dense row accumulators outside loops in project_matrix and compute_reduced_matrix - router.rs: use binary_search instead of linear .any() for symmetry check - bmssp.rs: optimize row swap with split_at_mut + swap_with_slice LOW: - types.rs: sort_by_key → sort_unstable_by_key in from_coo_generic - types.rs: upgrade debug_assert! to assert! for bounds checks - simd.rs: add complete f64 SIMD path (spmv_simd_f64, spmv_avx2_f64, spmv_scalar_f64) with AVX2 intrinsics and tests - true_solver.rs: document structural cloning limitation with future refactoring note https://claude.ai/code/session_01TiqLbr2DaNAntQHaVeLfiR
2026-06-01 06:10:31 +00:00 · 2026-02-20 12:40:13 +00:00 · 2026-02-20 12:40:13 +00:00 · 27e4eab783
commit 27e4eab783
parent 7ef759fa09
10 changed files with 258 additions and 44 deletions
--- a/crates/ruvector-solver/src/backward_push.rs
+++ b/crates/ruvector-solver/src/backward_push.rs
@ -41,6 +41,9 @@ use crate::types::{
    SolverResult, SparsityProfile,
 };

+/// Maximum number of graph nodes to prevent OOM denial-of-service.
+const MAX_GRAPH_NODES: usize = 100_000_000;
+
 // ---------------------------------------------------------------------------
 // Solver struct
 // ---------------------------------------------------------------------------
@ -179,6 +182,16 @@ impl BackwardPushSolver {
        let start = Instant::now();
        let n = graph.rows;

+        if n > MAX_GRAPH_NODES {
+            return Err(SolverError::InvalidInput(
+                ValidationError::MatrixTooLarge {
+                    rows: n,
+                    cols: n,
+                    max_dim: MAX_GRAPH_NODES,
+                },
+            ));
+        }
+
        // Build the transposed adjacency so row_entries(v) in `graph_t`
        // yields the in-neighbours of v in the original graph.
        let graph_t = graph.transpose();
--- a/crates/ruvector-solver/src/bmssp.rs
+++ b/crates/ruvector-solver/src/bmssp.rs
@ -404,7 +404,7 @@ fn transpose_csr(a: &CsrMatrix<f64>) -> CsrMatrix<f64> {
 /// than reallocated, making this efficient for the moderate-sized matrices
 /// produced during hierarchy construction.
 fn sparse_matmul(a: &CsrMatrix<f64>, b: &CsrMatrix<f64>) -> CsrMatrix<f64> {
-    debug_assert_eq!(
+    assert_eq!(
        a.cols, b.rows,
        "sparse_matmul: dimension mismatch {}x{} * {}x{}",
        a.rows, a.cols, b.rows, b.cols
@ -535,11 +535,14 @@ fn dense_direct_solve(matrix: &CsrMatrix<f64>, b: &[f64]) -> Vec<f64> {
        }

        if max_row != col {
-            for k in 0..stride {
-                let a_idx = col * stride + k;
-                let b_idx = max_row * stride + k;
-                aug.swap(a_idx, b_idx);
-            }
+            let (first, second) = if col < max_row {
+                let (left, right) = aug.split_at_mut(max_row * stride);
+                (&mut left[col * stride..col * stride + stride], &mut right[..stride])
+            } else {
+                let (left, right) = aug.split_at_mut(col * stride);
+                (&mut right[..stride], &mut left[max_row * stride..max_row * stride + stride])
+            };
+            first.swap_with_slice(second);
        }

        let pivot = aug[col * stride + col];
@ -769,6 +772,7 @@ impl SolverEngine for BmsspSolver {

        // Solve phase: iterate V-cycles.
        let mut x = vec![0.0f64; n];
+        let mut ax_buf = vec![0.0f64; n];
        let mut convergence_history = Vec::with_capacity(max_iter);

        let b_norm = {
@ -804,7 +808,8 @@ impl SolverEngine for BmsspSolver {

            v_cycle(&hierarchy, &mut x, rhs, 0);

-            let res = residual_l2(matrix, &x, rhs);
+            matrix.spmv(&x, &mut ax_buf);
+            let res = (0..n).map(|i| { let r = rhs[i] - ax_buf[i]; r * r }).sum::<f64>().sqrt();

            convergence_history.push(ConvergenceInfo {
                iteration: iter,
--- a/crates/ruvector-solver/src/cg.rs
+++ b/crates/ruvector-solver/src/cg.rs
@ -74,7 +74,7 @@ use crate::types::{
 /// Debug-asserts that `a.len() == b.len()`.
 #[inline]
 pub fn dot_product_f64(a: &[f32], b: &[f32]) -> f64 {
-    debug_assert_eq!(a.len(), b.len(), "dot_product_f64: length mismatch");
+    assert_eq!(a.len(), b.len(), "dot_product_f64: length mismatch");

    let n = a.len();
    let chunks = n / 4;
@ -106,7 +106,7 @@ pub fn dot_product_f64(a: &[f32], b: &[f32]) -> f64 {
 /// Used internally when the working vectors are already `f64`.
 #[inline]
 fn dot_f64(a: &[f64], b: &[f64]) -> f64 {
-    debug_assert_eq!(a.len(), b.len(), "dot_f64: length mismatch");
+    assert_eq!(a.len(), b.len(), "dot_f64: length mismatch");

    let n = a.len();
    let chunks = n / 4;
@ -140,7 +140,7 @@ fn dot_f64(a: &[f64], b: &[f64]) -> f64 {
 /// Debug-asserts that `x.len() == y.len()`.
 #[inline]
 pub fn axpy(alpha: f32, x: &[f32], y: &mut [f32]) {
-    debug_assert_eq!(x.len(), y.len(), "axpy: length mismatch");
+    assert_eq!(x.len(), y.len(), "axpy: length mismatch");

    let n = x.len();
    let chunks = n / 4;
@ -161,7 +161,7 @@ pub fn axpy(alpha: f32, x: &[f32], y: &mut [f32]) {
 /// Compute `y[i] += alpha * x[i]` for all `i` (AXPY operation, `f64`).
 #[inline]
 fn axpy_f64(alpha: f64, x: &[f64], y: &mut [f64]) {
-    debug_assert_eq!(x.len(), y.len(), "axpy_f64: length mismatch");
+    assert_eq!(x.len(), y.len(), "axpy_f64: length mismatch");

    let n = x.len();
    let chunks = n / 4;
@ -349,8 +349,8 @@ impl ConjugateGradientSolver {
    /// Apply the diagonal preconditioner: `z[i] = inv_diag[i] * r[i]`.
    #[inline]
    fn apply_preconditioner(inv_diag: &[f64], r: &[f64], z: &mut [f64]) {
-        debug_assert_eq!(inv_diag.len(), r.len());
-        debug_assert_eq!(r.len(), z.len());
+        assert_eq!(inv_diag.len(), r.len());
+        assert_eq!(r.len(), z.len());

        let n = r.len();
        let chunks = n / 4;
--- a/crates/ruvector-solver/src/forward_push.rs
+++ b/crates/ruvector-solver/src/forward_push.rs
@ -105,18 +105,40 @@ pub struct ForwardPushSolver {
 impl ForwardPushSolver {
    /// Create a new forward-push solver.
    ///
-    /// # Panics
-    ///
-    /// Debug-asserts that `alpha` is in `(0, 1)` and `epsilon` is positive.
+    /// Parameters are validated lazily at the start of each computation
+    /// (see [`validate_params`](Self::validate_params)).
    pub fn new(alpha: f64, epsilon: f64) -> Self {
-        debug_assert!(
-            alpha > 0.0 && alpha < 1.0,
-            "alpha must be in (0, 1), got {alpha}",
-        );
-        debug_assert!(epsilon > 0.0, "epsilon must be positive, got {epsilon}");
        Self { alpha, epsilon }
    }

+    /// Validate that `alpha` and `epsilon` are within acceptable ranges.
+    ///
+    /// # Errors
+    ///
+    /// - [`SolverError::InvalidInput`] if `alpha` is not in `(0, 1)` exclusive.
+    /// - [`SolverError::InvalidInput`] if `epsilon` is not positive.
+    fn validate_params(&self) -> Result<(), SolverError> {
+        if self.alpha <= 0.0 || self.alpha >= 1.0 {
+            return Err(SolverError::InvalidInput(
+                crate::error::ValidationError::ParameterOutOfRange {
+                    name: "alpha".into(),
+                    value: self.alpha.to_string(),
+                    expected: "(0.0, 1.0) exclusive".into(),
+                },
+            ));
+        }
+        if self.epsilon <= 0.0 {
+            return Err(SolverError::InvalidInput(
+                crate::error::ValidationError::ParameterOutOfRange {
+                    name: "epsilon".into(),
+                    value: self.epsilon.to_string(),
+                    expected: "> 0.0".into(),
+                },
+            ));
+        }
+        Ok(())
+    }
+
    /// Create a solver with default parameters (`alpha = 0.85`,
    /// `epsilon = 1e-6`).
    pub fn default_params() -> Self {
@ -139,6 +161,7 @@ impl ForwardPushSolver {
        graph: &CsrMatrix<f64>,
        source: usize,
    ) -> Result<Vec<(usize, f64)>, SolverError> {
+        self.validate_params()?;
        validate_vertex(graph, source, "source")?;
        self.forward_push_core(graph, &[(source, 1.0)])
    }
@ -166,12 +189,27 @@ impl ForwardPushSolver {
    ///
    /// Uses a `VecDeque` work-queue with a membership bitvec to achieve
    /// O(1/epsilon) total work, independent of graph size.
+    /// Maximum number of graph nodes to prevent OOM DoS.
+    const MAX_GRAPH_NODES: usize = 100_000_000;
+
    fn forward_push_core(
        &self,
        graph: &CsrMatrix<f64>,
        seeds: &[(usize, f64)],
    ) -> Result<Vec<(usize, f64)>, SolverError> {
+        self.validate_params()?;
+
        let n = graph.rows;
+        if n > Self::MAX_GRAPH_NODES {
+            return Err(SolverError::InvalidInput(
+                crate::error::ValidationError::MatrixTooLarge {
+                    rows: n,
+                    cols: graph.cols,
+                    max_dim: Self::MAX_GRAPH_NODES,
+                },
+            ));
+        }
+
        let mut estimate = vec![KahanAccumulator::new(); n];
        let mut residual = vec![KahanAccumulator::new(); n];

@ -437,7 +475,7 @@ impl SolverEngine for ForwardPushSolver {
        _profile: &SparsityProfile,
        _n: usize,
    ) -> ComplexityEstimate {
-        let est_ops = (1.0 / self.epsilon) as usize;
+        let est_ops = (1.0 / self.epsilon).min(usize::MAX as f64) as usize;
        ComplexityEstimate {
            algorithm: Algorithm::ForwardPush,
            estimated_flops: est_ops as u64 * 10,
--- a/crates/ruvector-solver/src/neumann.rs
+++ b/crates/ruvector-solver/src/neumann.rs
@ -120,8 +120,23 @@ impl NeumannSolver {
            return 0.0;
        }

-        // Extract diagonal and compute D^{-1}.
        let d_inv = extract_diag_inv_f32(matrix);
+        Self::estimate_spectral_radius_with_diag(matrix, &d_inv)
+    }
+
+    /// Inner helper: estimate spectral radius using a pre-computed `d_inv`.
+    ///
+    /// This avoids recomputing the diagonal inverse when the caller already
+    /// has it (e.g. `solve()` needs `d_inv` for both the spectral check and
+    /// the Jacobi iteration).
+    fn estimate_spectral_radius_with_diag(
+        matrix: &CsrMatrix<f32>,
+        d_inv: &[f32],
+    ) -> f64 {
+        let n = matrix.rows;
+        if n == 0 {
+            return 0.0;
+        }

        // Initialise with a deterministic pseudo-random unit vector.
        let mut v: Vec<f32> = (0..n)
@ -222,10 +237,14 @@ impl NeumannSolver {
            });
        }

+        // Extract D^{-1} once — reused for both the spectral radius check
+        // and the Jacobi-preconditioned iteration that follows.
+        let d_inv = extract_diag_inv_f32(matrix);
+
        // ------------------------------------------------------------------
        // Spectral radius pre-check (10-step power iteration on I - D^{-1}A)
        // ------------------------------------------------------------------
-        let rho = Self::estimate_spectral_radius(matrix);
+        let rho = Self::estimate_spectral_radius_with_diag(matrix, &d_inv);
        if rho >= 1.0 {
            warn!(rho, "spectral radius >= 1.0, Neumann series will diverge");
            return Err(SolverError::SpectralRadiusExceeded {
@ -236,9 +255,6 @@ impl NeumannSolver {
        }
        info!(rho, "spectral radius check passed");

-        // Extract D^{-1} for Jacobi preconditioning.
-        let d_inv = extract_diag_inv_f32(matrix);
-
        // ------------------------------------------------------------------
        // Jacobi-preconditioned iteration (fused kernel)
        //
@ -372,9 +388,14 @@ impl SolverEngine for NeumannSolver {
        };
        let f32_rhs: Vec<f32> = rhs.iter().map(|&v| v as f32).collect();

-        // Use the tighter of the solver's own budget and the caller's budget.
+        // Use the tighter of the solver's own tolerance and the caller's budget,
+        // but no tighter than f32 precision allows (the Neumann solver operates
+        // internally in f32, so residuals below ~f32::EPSILON are unreachable).
        let max_iters = self.max_iterations.min(budget.max_iterations);
-        let tol = self.tolerance.max(budget.tolerance);
+        let tol = self
+            .tolerance
+            .min(budget.tolerance)
+            .max(f32::EPSILON as f64 * 4.0);

        let inner_solver = NeumannSolver::new(tol, max_iters);

@ -439,6 +460,12 @@ fn extract_diag_inv_f32(matrix: &CsrMatrix<f32>) -> Vec<f32> {
                let diag = matrix.values[idx];
                if diag.abs() > 1e-15 {
                    d_inv[i] = 1.0 / diag;
+                } else {
+                    warn!(
+                        row = i,
+                        diag_value = %diag,
+                        "zero or near-zero diagonal entry; substituting 1.0 — matrix may be singular"
+                    );
                }
                break;
            }
--- a/crates/ruvector-solver/src/random_walk.rs
+++ b/crates/ruvector-solver/src/random_walk.rs
@ -429,7 +429,7 @@ impl SolverEngine for HybridRandomWalkSolver {
            cdf.push(cumulative);
        }

-        let walks = self.num_walks.min(budget.max_iterations * 10);
+        let walks = self.num_walks.min(budget.max_iterations.saturating_mul(10));
        let mut counts = vec![0.0f64; n];

        for _ in 0..walks {
--- a/crates/ruvector-solver/src/router.rs
+++ b/crates/ruvector-solver/src/router.rs
@ -540,8 +540,8 @@ impl SolverOrchestrator {
                    let col_start = matrix.row_ptr[col];
                    let col_end = matrix.row_ptr[col + 1];
                    let found = matrix.col_indices[col_start..col_end]
-                        .iter()
-                        .any(|&c| c == row);
+                        .binary_search(&row)
+                        .is_ok();
                    if !found {
                        symmetric_mismatches += 1;
                    }
@ -1130,7 +1130,7 @@ impl Default for SolverOrchestrator {
 #[inline]
 #[allow(dead_code)]
 fn dot(a: &[f64], b: &[f64]) -> f64 {
-    debug_assert_eq!(a.len(), b.len());
+    assert_eq!(a.len(), b.len(), "dot: length mismatch {} vs {}", a.len(), b.len());
    a.iter()
        .zip(b.iter())
        .map(|(&ai, &bi)| ai * bi)
--- a/crates/ruvector-solver/src/simd.rs
+++ b/crates/ruvector-solver/src/simd.rs
@ -120,6 +120,91 @@ unsafe fn horizontal_sum_f32x8(v: std::arch::x86_64::__m256) -> f32 {
    _mm_cvtss_f32(result)
 }

+/// Sparse matrix-vector multiply with optional SIMD acceleration for f64.
+///
+/// Computes `y = A * x` where `A` is a CSR matrix of `f64` values.
+pub fn spmv_simd_f64(matrix: &CsrMatrix<f64>, x: &[f64], y: &mut [f64]) {
+    assert_eq!(x.len(), matrix.cols, "x length must equal matrix.cols");
+    assert_eq!(y.len(), matrix.rows, "y length must equal matrix.rows");
+
+    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
+    {
+        if is_x86_feature_detected!("avx2") {
+            unsafe {
+                spmv_avx2_f64(matrix, x, y);
+            }
+            return;
+        }
+    }
+
+    spmv_scalar_f64(matrix, x, y);
+}
+
+/// Scalar fallback for f64 SpMV.
+pub fn spmv_scalar_f64(matrix: &CsrMatrix<f64>, x: &[f64], y: &mut [f64]) {
+    for i in 0..matrix.rows {
+        let start = matrix.row_ptr[i];
+        let end = matrix.row_ptr[i + 1];
+        let mut sum = 0.0f64;
+        for idx in start..end {
+            let col = matrix.col_indices[idx];
+            sum += matrix.values[idx] * x[col];
+        }
+        y[i] = sum;
+    }
+}
+
+#[cfg(all(feature = "simd", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+unsafe fn spmv_avx2_f64(matrix: &CsrMatrix<f64>, x: &[f64], y: &mut [f64]) {
+    use std::arch::x86_64::*;
+
+    for i in 0..matrix.rows {
+        let start = matrix.row_ptr[i];
+        let end = matrix.row_ptr[i + 1];
+        let len = end - start;
+
+        let mut accum = _mm256_setzero_pd();
+        let chunks = len / 4;
+        let remainder = len % 4;
+
+        for chunk in 0..chunks {
+            let base = start + chunk * 4;
+            let vals = _mm256_loadu_pd(matrix.values.as_ptr().add(base));
+
+            let mut x_buf = [0.0f64; 4];
+            for k in 0..4 {
+                let col = *matrix.col_indices.get_unchecked(base + k);
+                x_buf[k] = *x.get_unchecked(col);
+            }
+            let x_vec = _mm256_loadu_pd(x_buf.as_ptr());
+            accum = _mm256_add_pd(accum, _mm256_mul_pd(vals, x_vec));
+        }
+
+        let mut sum = horizontal_sum_f64x4(accum);
+
+        let tail_start = start + chunks * 4;
+        for idx in tail_start..(tail_start + remainder) {
+            let col = *matrix.col_indices.get_unchecked(idx);
+            sum += *matrix.values.get_unchecked(idx) * *x.get_unchecked(col);
+        }
+
+        *y.get_unchecked_mut(i) = sum;
+    }
+}
+
+#[cfg(all(feature = "simd", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+unsafe fn horizontal_sum_f64x4(v: std::arch::x86_64::__m256d) -> f64 {
+    use std::arch::x86_64::*;
+    let hi = _mm256_extractf128_pd(v, 1);
+    let lo = _mm256_castpd256_pd128(v);
+    let sum128 = _mm_add_pd(lo, hi);
+    let hi64 = _mm_unpackhi_pd(sum128, sum128);
+    let result = _mm_add_sd(sum128, hi64);
+    _mm_cvtsd_f64(result)
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@ -159,4 +244,38 @@ mod tests {
        assert!((y[1] - 6.0).abs() < 1e-6);
        assert!((y[2] - 13.0).abs() < 1e-6);
    }
+
+    #[test]
+    fn spmv_simd_f64_correctness() {
+        let mat = CsrMatrix::<f64> {
+            values: vec![2.0, 1.0, 3.0, 1.0, 4.0],
+            col_indices: vec![0, 2, 1, 0, 2],
+            row_ptr: vec![0, 2, 3, 5],
+            rows: 3,
+            cols: 3,
+        };
+        let x = vec![1.0, 2.0, 3.0];
+        let mut y = vec![0.0f64; 3];
+        spmv_simd_f64(&mat, &x, &mut y);
+        assert!((y[0] - 5.0).abs() < 1e-10);
+        assert!((y[1] - 6.0).abs() < 1e-10);
+        assert!((y[2] - 13.0).abs() < 1e-10);
+    }
+
+    #[test]
+    fn scalar_spmv_f64_correctness() {
+        let mat = CsrMatrix::<f64> {
+            values: vec![2.0, 1.0, 3.0, 1.0, 4.0],
+            col_indices: vec![0, 2, 1, 0, 2],
+            row_ptr: vec![0, 2, 3, 5],
+            rows: 3,
+            cols: 3,
+        };
+        let x = vec![1.0, 2.0, 3.0];
+        let mut y = vec![0.0f64; 3];
+        spmv_scalar_f64(&mat, &x, &mut y);
+        assert!((y[0] - 5.0).abs() < 1e-10);
+        assert!((y[1] - 6.0).abs() < 1e-10);
+        assert!((y[2] - 13.0).abs() < 1e-10);
+    }
 }
--- a/crates/ruvector-solver/src/true_solver.rs
+++ b/crates/ruvector-solver/src/true_solver.rs
@ -120,7 +120,7 @@ impl TrueSolver {
    ) -> Vec<(usize, usize, f32)> {
        let scale = 1.0 / (k as f64).sqrt();
        let scale_f32 = scale as f32;
-        let mut entries = Vec::new();
+        let mut entries = Vec::with_capacity(((k * n) as f64 / 3.0).ceil() as usize);

        for row in 0..k {
            for col in 0..n {
@ -169,11 +169,12 @@ impl TrueSolver {
        // For each row i of Pi, compute B[i,:] = Pi[i,:] * A.
        let mut b_entries: Vec<(usize, usize, f32)> = Vec::new();

+        // Hoist accumulator outside loop to avoid reallocating each iteration.
+        let mut b_row = vec![0.0f32; n];
        for pi_row in 0..k {
            let pi_start = pi.row_ptr[pi_row];
            let pi_end = pi.row_ptr[pi_row + 1];

-            let mut b_row = vec![0.0f32; n];
            for pi_idx in pi_start..pi_end {
                let pi_col = pi.col_indices[pi_idx];
                let pi_val = pi.values[pi_idx];
@ -190,6 +191,9 @@ impl TrueSolver {
                    b_entries.push((pi_row, col, val));
                }
            }
+
+            // Zero the accumulator for the next row.
+            b_row.iter_mut().for_each(|v| *v = 0.0);
        }

        let b_matrix = CsrMatrix::<f32>::from_coo(k, n, b_entries);
@ -207,11 +211,12 @@ impl TrueSolver {

        let mut a_prime_entries: Vec<(usize, usize, f32)> = Vec::new();

-        for b_row in 0..k {
-            let b_start = b_matrix.row_ptr[b_row];
-            let b_end = b_matrix.row_ptr[b_row + 1];
+        // Hoist accumulator outside loop to avoid reallocating each iteration.
+        let mut row_accum = vec![0.0f32; k];
+        for b_row_idx in 0..k {
+            let b_start = b_matrix.row_ptr[b_row_idx];
+            let b_end = b_matrix.row_ptr[b_row_idx + 1];

-            let mut row_accum = vec![0.0f32; k];
            for b_idx in b_start..b_end {
                let l = b_matrix.col_indices[b_idx];
                let b_val = b_matrix.values[b_idx];
@ -223,9 +228,12 @@ impl TrueSolver {

            for (j, &val) in row_accum.iter().enumerate() {
                if val.abs() > f32::EPSILON {
-                    a_prime_entries.push((b_row, j, val));
+                    a_prime_entries.push((b_row_idx, j, val));
                }
            }
+
+            // Zero the accumulator for the next row.
+            row_accum.iter_mut().for_each(|v| *v = 0.0);
        }

        CsrMatrix::<f32>::from_coo(k, k, a_prime_entries)
@ -560,7 +568,11 @@ impl SolverEngine for TrueSolver {
        rhs: &[f64],
        _budget: &crate::types::ComputeBudget,
    ) -> Result<SolverResult, SolverError> {
-        // Convert f64 input to f32 for internal computation
+        // Convert f64 input to f32 for internal computation.
+        // NOTE: row_ptr and col_indices are cloned here because CsrMatrix owns
+        // Vec<usize>, so we cannot borrow from the f64 matrix. A future
+        // refactor could introduce a CsrMatrixView that borrows structural
+        // arrays to eliminate these allocations on the f64 -> f32 path.
        let f32_values: Vec<f32> = matrix.values.iter().map(|&v| v as f32).collect();
        let f32_matrix = CsrMatrix {
            row_ptr: matrix.row_ptr.clone(),
--- a/crates/ruvector-solver/src/types.rs
+++ b/crates/ruvector-solver/src/types.rs
@ -268,7 +268,7 @@ impl<T: Copy + Default + std::ops::AddAssign> CsrMatrix<T> {
        entries: impl IntoIterator<Item = (usize, usize, T)>,
    ) -> Self {
        let mut sorted: Vec<_> = entries.into_iter().collect();
-        sorted.sort_by_key(|(r, c, _)| (*r, *c));
+        sorted.sort_unstable_by_key(|(r, c, _)| (*r, *c));

        let nnz = sorted.len();
        let mut row_ptr = vec![0usize; rows + 1];
@ -276,7 +276,7 @@ impl<T: Copy + Default + std::ops::AddAssign> CsrMatrix<T> {
        let mut values = Vec::with_capacity(nnz);

        for &(r, _, _) in &sorted {
-            debug_assert!(r < rows, "row index {} out of bounds (rows={})", r, rows);
+            assert!(r < rows, "row index {} out of bounds (rows={})", r, rows);
            row_ptr[r + 1] += 1;
        }
        for i in 1..=rows {
@ -284,7 +284,7 @@ impl<T: Copy + Default + std::ops::AddAssign> CsrMatrix<T> {
        }

        for (_, c, v) in sorted {
-            debug_assert!(c < cols, "col index {} out of bounds (cols={})", c, cols);
+            assert!(c < cols, "col index {} out of bounds (cols={})", c, cols);
            col_indices.push(c);
            values.push(v);
        }