diff --git a/crates/ruvector-solver/src/backward_push.rs b/crates/ruvector-solver/src/backward_push.rs
index 54669de28..7674292ad 100644
--- a/crates/ruvector-solver/src/backward_push.rs
+++ b/crates/ruvector-solver/src/backward_push.rs
@@ -41,6 +41,9 @@ use crate::types::{
     SolverResult, SparsityProfile,
 };
 
+/// Maximum number of graph nodes to prevent OOM denial-of-service.
+const MAX_GRAPH_NODES: usize = 100_000_000;
+
 // ---------------------------------------------------------------------------
 // Solver struct
 // ---------------------------------------------------------------------------
@@ -179,6 +182,16 @@ impl BackwardPushSolver {
         let start = Instant::now();
         let n = graph.rows;
 
+        if n > MAX_GRAPH_NODES {
+            return Err(SolverError::InvalidInput(
+                ValidationError::MatrixTooLarge {
+                    rows: n,
+                    cols: n,
+                    max_dim: MAX_GRAPH_NODES,
+                },
+            ));
+        }
+
         // Build the transposed adjacency so row_entries(v) in `graph_t`
         // yields the in-neighbours of v in the original graph.
         let graph_t = graph.transpose();
diff --git a/crates/ruvector-solver/src/bmssp.rs b/crates/ruvector-solver/src/bmssp.rs
index cd3d5051d..97571c06a 100644
--- a/crates/ruvector-solver/src/bmssp.rs
+++ b/crates/ruvector-solver/src/bmssp.rs
@@ -404,7 +404,7 @@ fn transpose_csr(a: &CsrMatrix<f64>) -> CsrMatrix<f64> {
 /// than reallocated, making this efficient for the moderate-sized matrices
 /// produced during hierarchy construction.
 fn sparse_matmul(a: &CsrMatrix<f64>, b: &CsrMatrix<f64>) -> CsrMatrix<f64> {
-    debug_assert_eq!(
+    assert_eq!(
         a.cols, b.rows,
         "sparse_matmul: dimension mismatch {}x{} * {}x{}",
         a.rows, a.cols, b.rows, b.cols
@@ -535,11 +535,14 @@ fn dense_direct_solve(matrix: &CsrMatrix<f64>, b: &[f64]) -> Vec<f64> {
         }
 
         if max_row != col {
-            for k in 0..stride {
-                let a_idx = col * stride + k;
-                let b_idx = max_row * stride + k;
-                aug.swap(a_idx, b_idx);
-            }
+            let (first, second) = if col < max_row {
+                let (left, right) = aug.split_at_mut(max_row * stride);
+                (&mut left[col * stride..col * stride + stride], &mut right[..stride])
+            } else {
+                let (left, right) = aug.split_at_mut(col * stride);
+                (&mut right[..stride], &mut left[max_row * stride..max_row * stride + stride])
+            };
+            first.swap_with_slice(second);
         }
 
         let pivot = aug[col * stride + col];
@@ -769,6 +772,7 @@ impl SolverEngine for BmsspSolver {
 
         // Solve phase: iterate V-cycles.
         let mut x = vec![0.0f64; n];
+        let mut ax_buf = vec![0.0f64; n];
         let mut convergence_history = Vec::with_capacity(max_iter);
 
         let b_norm = {
@@ -804,7 +808,8 @@ impl SolverEngine for BmsspSolver {
 
             v_cycle(&hierarchy, &mut x, rhs, 0);
 
-            let res = residual_l2(matrix, &x, rhs);
+            matrix.spmv(&x, &mut ax_buf);
+            let res = (0..n).map(|i| { let r = rhs[i] - ax_buf[i]; r * r }).sum::<f64>().sqrt();
 
             convergence_history.push(ConvergenceInfo {
                 iteration: iter,
diff --git a/crates/ruvector-solver/src/cg.rs b/crates/ruvector-solver/src/cg.rs
index 28e73e9d4..8a93d97a3 100644
--- a/crates/ruvector-solver/src/cg.rs
+++ b/crates/ruvector-solver/src/cg.rs
@@ -74,7 +74,7 @@ use crate::types::{
 /// Debug-asserts that `a.len() == b.len()`.
 #[inline]
 pub fn dot_product_f64(a: &[f32], b: &[f32]) -> f64 {
-    debug_assert_eq!(a.len(), b.len(), "dot_product_f64: length mismatch");
+    assert_eq!(a.len(), b.len(), "dot_product_f64: length mismatch");
 
     let n = a.len();
     let chunks = n / 4;
@@ -106,7 +106,7 @@ pub fn dot_product_f64(a: &[f32], b: &[f32]) -> f64 {
 /// Used internally when the working vectors are already `f64`.
 #[inline]
 fn dot_f64(a: &[f64], b: &[f64]) -> f64 {
-    debug_assert_eq!(a.len(), b.len(), "dot_f64: length mismatch");
+    assert_eq!(a.len(), b.len(), "dot_f64: length mismatch");
 
     let n = a.len();
     let chunks = n / 4;
@@ -140,7 +140,7 @@ fn dot_f64(a: &[f64], b: &[f64]) -> f64 {
 /// Debug-asserts that `x.len() == y.len()`.
 #[inline]
 pub fn axpy(alpha: f32, x: &[f32], y: &mut [f32]) {
-    debug_assert_eq!(x.len(), y.len(), "axpy: length mismatch");
+    assert_eq!(x.len(), y.len(), "axpy: length mismatch");
 
     let n = x.len();
     let chunks = n / 4;
@@ -161,7 +161,7 @@ pub fn axpy(alpha: f32, x: &[f32], y: &mut [f32]) {
 /// Compute `y[i] += alpha * x[i]` for all `i` (AXPY operation, `f64`).
 #[inline]
 fn axpy_f64(alpha: f64, x: &[f64], y: &mut [f64]) {
-    debug_assert_eq!(x.len(), y.len(), "axpy_f64: length mismatch");
+    assert_eq!(x.len(), y.len(), "axpy_f64: length mismatch");
 
     let n = x.len();
     let chunks = n / 4;
@@ -349,8 +349,8 @@ impl ConjugateGradientSolver {
     /// Apply the diagonal preconditioner: `z[i] = inv_diag[i] * r[i]`.
     #[inline]
     fn apply_preconditioner(inv_diag: &[f64], r: &[f64], z: &mut [f64]) {
-        debug_assert_eq!(inv_diag.len(), r.len());
-        debug_assert_eq!(r.len(), z.len());
+        assert_eq!(inv_diag.len(), r.len());
+        assert_eq!(r.len(), z.len());
 
         let n = r.len();
         let chunks = n / 4;
diff --git a/crates/ruvector-solver/src/forward_push.rs b/crates/ruvector-solver/src/forward_push.rs
index a64870207..4f0fbdc2e 100644
--- a/crates/ruvector-solver/src/forward_push.rs
+++ b/crates/ruvector-solver/src/forward_push.rs
@@ -105,18 +105,40 @@ pub struct ForwardPushSolver {
 impl ForwardPushSolver {
     /// Create a new forward-push solver.
     ///
-    /// # Panics
-    ///
-    /// Debug-asserts that `alpha` is in `(0, 1)` and `epsilon` is positive.
+    /// Parameters are validated lazily at the start of each computation
+    /// (see [`validate_params`](Self::validate_params)).
     pub fn new(alpha: f64, epsilon: f64) -> Self {
-        debug_assert!(
-            alpha > 0.0 && alpha < 1.0,
-            "alpha must be in (0, 1), got {alpha}",
-        );
-        debug_assert!(epsilon > 0.0, "epsilon must be positive, got {epsilon}");
         Self { alpha, epsilon }
     }
 
+    /// Validate that `alpha` and `epsilon` are within acceptable ranges.
+    ///
+    /// # Errors
+    ///
+    /// - [`SolverError::InvalidInput`] if `alpha` is not in `(0, 1)` exclusive.
+    /// - [`SolverError::InvalidInput`] if `epsilon` is not positive.
+    fn validate_params(&self) -> Result<(), SolverError> {
+        if self.alpha <= 0.0 || self.alpha >= 1.0 {
+            return Err(SolverError::InvalidInput(
+                crate::error::ValidationError::ParameterOutOfRange {
+                    name: "alpha".into(),
+                    value: self.alpha.to_string(),
+                    expected: "(0.0, 1.0) exclusive".into(),
+                },
+            ));
+        }
+        if self.epsilon <= 0.0 {
+            return Err(SolverError::InvalidInput(
+                crate::error::ValidationError::ParameterOutOfRange {
+                    name: "epsilon".into(),
+                    value: self.epsilon.to_string(),
+                    expected: "> 0.0".into(),
+                },
+            ));
+        }
+        Ok(())
+    }
+
     /// Create a solver with default parameters (`alpha = 0.85`,
     /// `epsilon = 1e-6`).
     pub fn default_params() -> Self {
@@ -139,6 +161,7 @@ impl ForwardPushSolver {
         graph: &CsrMatrix<f64>,
         source: usize,
     ) -> Result<Vec<(usize, f64)>, SolverError> {
+        self.validate_params()?;
         validate_vertex(graph, source, "source")?;
         self.forward_push_core(graph, &[(source, 1.0)])
     }
@@ -166,12 +189,27 @@ impl ForwardPushSolver {
     ///
     /// Uses a `VecDeque` work-queue with a membership bitvec to achieve
     /// O(1/epsilon) total work, independent of graph size.
+    /// Maximum number of graph nodes to prevent OOM DoS.
+    const MAX_GRAPH_NODES: usize = 100_000_000;
+
     fn forward_push_core(
         &self,
         graph: &CsrMatrix<f64>,
         seeds: &[(usize, f64)],
     ) -> Result<Vec<(usize, f64)>, SolverError> {
+        self.validate_params()?;
+
         let n = graph.rows;
+        if n > Self::MAX_GRAPH_NODES {
+            return Err(SolverError::InvalidInput(
+                crate::error::ValidationError::MatrixTooLarge {
+                    rows: n,
+                    cols: graph.cols,
+                    max_dim: Self::MAX_GRAPH_NODES,
+                },
+            ));
+        }
+
         let mut estimate = vec![KahanAccumulator::new(); n];
         let mut residual = vec![KahanAccumulator::new(); n];
 
@@ -437,7 +475,7 @@ impl SolverEngine for ForwardPushSolver {
         _profile: &SparsityProfile,
         _n: usize,
     ) -> ComplexityEstimate {
-        let est_ops = (1.0 / self.epsilon) as usize;
+        let est_ops = (1.0 / self.epsilon).min(usize::MAX as f64) as usize;
         ComplexityEstimate {
             algorithm: Algorithm::ForwardPush,
             estimated_flops: est_ops as u64 * 10,
diff --git a/crates/ruvector-solver/src/neumann.rs b/crates/ruvector-solver/src/neumann.rs
index 410bd9f01..d189e9fa4 100644
--- a/crates/ruvector-solver/src/neumann.rs
+++ b/crates/ruvector-solver/src/neumann.rs
@@ -120,8 +120,23 @@ impl NeumannSolver {
             return 0.0;
         }
 
-        // Extract diagonal and compute D^{-1}.
         let d_inv = extract_diag_inv_f32(matrix);
+        Self::estimate_spectral_radius_with_diag(matrix, &d_inv)
+    }
+
+    /// Inner helper: estimate spectral radius using a pre-computed `d_inv`.
+    ///
+    /// This avoids recomputing the diagonal inverse when the caller already
+    /// has it (e.g. `solve()` needs `d_inv` for both the spectral check and
+    /// the Jacobi iteration).
+    fn estimate_spectral_radius_with_diag(
+        matrix: &CsrMatrix<f32>,
+        d_inv: &[f32],
+    ) -> f64 {
+        let n = matrix.rows;
+        if n == 0 {
+            return 0.0;
+        }
 
         // Initialise with a deterministic pseudo-random unit vector.
         let mut v: Vec<f32> = (0..n)
@@ -222,10 +237,14 @@ impl NeumannSolver {
             });
         }
 
+        // Extract D^{-1} once — reused for both the spectral radius check
+        // and the Jacobi-preconditioned iteration that follows.
+        let d_inv = extract_diag_inv_f32(matrix);
+
         // ------------------------------------------------------------------
         // Spectral radius pre-check (10-step power iteration on I - D^{-1}A)
         // ------------------------------------------------------------------
-        let rho = Self::estimate_spectral_radius(matrix);
+        let rho = Self::estimate_spectral_radius_with_diag(matrix, &d_inv);
         if rho >= 1.0 {
             warn!(rho, "spectral radius >= 1.0, Neumann series will diverge");
             return Err(SolverError::SpectralRadiusExceeded {
@@ -236,9 +255,6 @@ impl NeumannSolver {
         }
         info!(rho, "spectral radius check passed");
 
-        // Extract D^{-1} for Jacobi preconditioning.
-        let d_inv = extract_diag_inv_f32(matrix);
-
         // ------------------------------------------------------------------
         // Jacobi-preconditioned iteration (fused kernel)
         //
@@ -372,9 +388,14 @@ impl SolverEngine for NeumannSolver {
         };
         let f32_rhs: Vec<f32> = rhs.iter().map(|&v| v as f32).collect();
 
-        // Use the tighter of the solver's own budget and the caller's budget.
+        // Use the tighter of the solver's own tolerance and the caller's budget,
+        // but no tighter than f32 precision allows (the Neumann solver operates
+        // internally in f32, so residuals below ~f32::EPSILON are unreachable).
         let max_iters = self.max_iterations.min(budget.max_iterations);
-        let tol = self.tolerance.max(budget.tolerance);
+        let tol = self
+            .tolerance
+            .min(budget.tolerance)
+            .max(f32::EPSILON as f64 * 4.0);
 
         let inner_solver = NeumannSolver::new(tol, max_iters);
 
@@ -439,6 +460,12 @@ fn extract_diag_inv_f32(matrix: &CsrMatrix<f32>) -> Vec<f32> {
                 let diag = matrix.values[idx];
                 if diag.abs() > 1e-15 {
                     d_inv[i] = 1.0 / diag;
+                } else {
+                    warn!(
+                        row = i,
+                        diag_value = %diag,
+                        "zero or near-zero diagonal entry; substituting 1.0 — matrix may be singular"
+                    );
                 }
                 break;
             }
diff --git a/crates/ruvector-solver/src/random_walk.rs b/crates/ruvector-solver/src/random_walk.rs
index 514790ccc..7fc63c513 100644
--- a/crates/ruvector-solver/src/random_walk.rs
+++ b/crates/ruvector-solver/src/random_walk.rs
@@ -429,7 +429,7 @@ impl SolverEngine for HybridRandomWalkSolver {
             cdf.push(cumulative);
         }
 
-        let walks = self.num_walks.min(budget.max_iterations * 10);
+        let walks = self.num_walks.min(budget.max_iterations.saturating_mul(10));
         let mut counts = vec![0.0f64; n];
 
         for _ in 0..walks {
diff --git a/crates/ruvector-solver/src/router.rs b/crates/ruvector-solver/src/router.rs
index ad9ba80be..1f051506a 100644
--- a/crates/ruvector-solver/src/router.rs
+++ b/crates/ruvector-solver/src/router.rs
@@ -540,8 +540,8 @@ impl SolverOrchestrator {
                     let col_start = matrix.row_ptr[col];
                     let col_end = matrix.row_ptr[col + 1];
                     let found = matrix.col_indices[col_start..col_end]
-                        .iter()
-                        .any(|&c| c == row);
+                        .binary_search(&row)
+                        .is_ok();
                     if !found {
                         symmetric_mismatches += 1;
                     }
@@ -1130,7 +1130,7 @@ impl Default for SolverOrchestrator {
 #[inline]
 #[allow(dead_code)]
 fn dot(a: &[f64], b: &[f64]) -> f64 {
-    debug_assert_eq!(a.len(), b.len());
+    assert_eq!(a.len(), b.len(), "dot: length mismatch {} vs {}", a.len(), b.len());
     a.iter()
         .zip(b.iter())
         .map(|(&ai, &bi)| ai * bi)
diff --git a/crates/ruvector-solver/src/simd.rs b/crates/ruvector-solver/src/simd.rs
index fa0e5a0b5..d4c4a32dd 100644
--- a/crates/ruvector-solver/src/simd.rs
+++ b/crates/ruvector-solver/src/simd.rs
@@ -120,6 +120,91 @@ unsafe fn horizontal_sum_f32x8(v: std::arch::x86_64::__m256) -> f32 {
     _mm_cvtss_f32(result)
 }
 
+/// Sparse matrix-vector multiply with optional SIMD acceleration for f64.
+///
+/// Computes `y = A * x` where `A` is a CSR matrix of `f64` values.
+pub fn spmv_simd_f64(matrix: &CsrMatrix<f64>, x: &[f64], y: &mut [f64]) {
+    assert_eq!(x.len(), matrix.cols, "x length must equal matrix.cols");
+    assert_eq!(y.len(), matrix.rows, "y length must equal matrix.rows");
+
+    #[cfg(all(feature = "simd", target_arch = "x86_64"))]
+    {
+        if is_x86_feature_detected!("avx2") {
+            unsafe {
+                spmv_avx2_f64(matrix, x, y);
+            }
+            return;
+        }
+    }
+
+    spmv_scalar_f64(matrix, x, y);
+}
+
+/// Scalar fallback for f64 SpMV.
+pub fn spmv_scalar_f64(matrix: &CsrMatrix<f64>, x: &[f64], y: &mut [f64]) {
+    for i in 0..matrix.rows {
+        let start = matrix.row_ptr[i];
+        let end = matrix.row_ptr[i + 1];
+        let mut sum = 0.0f64;
+        for idx in start..end {
+            let col = matrix.col_indices[idx];
+            sum += matrix.values[idx] * x[col];
+        }
+        y[i] = sum;
+    }
+}
+
+#[cfg(all(feature = "simd", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+unsafe fn spmv_avx2_f64(matrix: &CsrMatrix<f64>, x: &[f64], y: &mut [f64]) {
+    use std::arch::x86_64::*;
+
+    for i in 0..matrix.rows {
+        let start = matrix.row_ptr[i];
+        let end = matrix.row_ptr[i + 1];
+        let len = end - start;
+
+        let mut accum = _mm256_setzero_pd();
+        let chunks = len / 4;
+        let remainder = len % 4;
+
+        for chunk in 0..chunks {
+            let base = start + chunk * 4;
+            let vals = _mm256_loadu_pd(matrix.values.as_ptr().add(base));
+
+            let mut x_buf = [0.0f64; 4];
+            for k in 0..4 {
+                let col = *matrix.col_indices.get_unchecked(base + k);
+                x_buf[k] = *x.get_unchecked(col);
+            }
+            let x_vec = _mm256_loadu_pd(x_buf.as_ptr());
+            accum = _mm256_add_pd(accum, _mm256_mul_pd(vals, x_vec));
+        }
+
+        let mut sum = horizontal_sum_f64x4(accum);
+
+        let tail_start = start + chunks * 4;
+        for idx in tail_start..(tail_start + remainder) {
+            let col = *matrix.col_indices.get_unchecked(idx);
+            sum += *matrix.values.get_unchecked(idx) * *x.get_unchecked(col);
+        }
+
+        *y.get_unchecked_mut(i) = sum;
+    }
+}
+
+#[cfg(all(feature = "simd", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+unsafe fn horizontal_sum_f64x4(v: std::arch::x86_64::__m256d) -> f64 {
+    use std::arch::x86_64::*;
+    let hi = _mm256_extractf128_pd(v, 1);
+    let lo = _mm256_castpd256_pd128(v);
+    let sum128 = _mm_add_pd(lo, hi);
+    let hi64 = _mm_unpackhi_pd(sum128, sum128);
+    let result = _mm_add_sd(sum128, hi64);
+    _mm_cvtsd_f64(result)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -159,4 +244,38 @@ mod tests {
         assert!((y[1] - 6.0).abs() < 1e-6);
         assert!((y[2] - 13.0).abs() < 1e-6);
     }
+
+    #[test]
+    fn spmv_simd_f64_correctness() {
+        let mat = CsrMatrix::<f64> {
+            values: vec![2.0, 1.0, 3.0, 1.0, 4.0],
+            col_indices: vec![0, 2, 1, 0, 2],
+            row_ptr: vec![0, 2, 3, 5],
+            rows: 3,
+            cols: 3,
+        };
+        let x = vec![1.0, 2.0, 3.0];
+        let mut y = vec![0.0f64; 3];
+        spmv_simd_f64(&mat, &x, &mut y);
+        assert!((y[0] - 5.0).abs() < 1e-10);
+        assert!((y[1] - 6.0).abs() < 1e-10);
+        assert!((y[2] - 13.0).abs() < 1e-10);
+    }
+
+    #[test]
+    fn scalar_spmv_f64_correctness() {
+        let mat = CsrMatrix::<f64> {
+            values: vec![2.0, 1.0, 3.0, 1.0, 4.0],
+            col_indices: vec![0, 2, 1, 0, 2],
+            row_ptr: vec![0, 2, 3, 5],
+            rows: 3,
+            cols: 3,
+        };
+        let x = vec![1.0, 2.0, 3.0];
+        let mut y = vec![0.0f64; 3];
+        spmv_scalar_f64(&mat, &x, &mut y);
+        assert!((y[0] - 5.0).abs() < 1e-10);
+        assert!((y[1] - 6.0).abs() < 1e-10);
+        assert!((y[2] - 13.0).abs() < 1e-10);
+    }
 }
diff --git a/crates/ruvector-solver/src/true_solver.rs b/crates/ruvector-solver/src/true_solver.rs
index 34b092178..383af135b 100644
--- a/crates/ruvector-solver/src/true_solver.rs
+++ b/crates/ruvector-solver/src/true_solver.rs
@@ -120,7 +120,7 @@ impl TrueSolver {
     ) -> Vec<(usize, usize, f32)> {
         let scale = 1.0 / (k as f64).sqrt();
         let scale_f32 = scale as f32;
-        let mut entries = Vec::new();
+        let mut entries = Vec::with_capacity(((k * n) as f64 / 3.0).ceil() as usize);
 
         for row in 0..k {
             for col in 0..n {
@@ -169,11 +169,12 @@ impl TrueSolver {
         // For each row i of Pi, compute B[i,:] = Pi[i,:] * A.
         let mut b_entries: Vec<(usize, usize, f32)> = Vec::new();
 
+        // Hoist accumulator outside loop to avoid reallocating each iteration.
+        let mut b_row = vec![0.0f32; n];
         for pi_row in 0..k {
             let pi_start = pi.row_ptr[pi_row];
             let pi_end = pi.row_ptr[pi_row + 1];
 
-            let mut b_row = vec![0.0f32; n];
             for pi_idx in pi_start..pi_end {
                 let pi_col = pi.col_indices[pi_idx];
                 let pi_val = pi.values[pi_idx];
@@ -190,6 +191,9 @@ impl TrueSolver {
                     b_entries.push((pi_row, col, val));
                 }
             }
+
+            // Zero the accumulator for the next row.
+            b_row.iter_mut().for_each(|v| *v = 0.0);
         }
 
         let b_matrix = CsrMatrix::<f32>::from_coo(k, n, b_entries);
@@ -207,11 +211,12 @@ impl TrueSolver {
 
         let mut a_prime_entries: Vec<(usize, usize, f32)> = Vec::new();
 
-        for b_row in 0..k {
-            let b_start = b_matrix.row_ptr[b_row];
-            let b_end = b_matrix.row_ptr[b_row + 1];
+        // Hoist accumulator outside loop to avoid reallocating each iteration.
+        let mut row_accum = vec![0.0f32; k];
+        for b_row_idx in 0..k {
+            let b_start = b_matrix.row_ptr[b_row_idx];
+            let b_end = b_matrix.row_ptr[b_row_idx + 1];
 
-            let mut row_accum = vec![0.0f32; k];
             for b_idx in b_start..b_end {
                 let l = b_matrix.col_indices[b_idx];
                 let b_val = b_matrix.values[b_idx];
@@ -223,9 +228,12 @@ impl TrueSolver {
 
             for (j, &val) in row_accum.iter().enumerate() {
                 if val.abs() > f32::EPSILON {
-                    a_prime_entries.push((b_row, j, val));
+                    a_prime_entries.push((b_row_idx, j, val));
                 }
             }
+
+            // Zero the accumulator for the next row.
+            row_accum.iter_mut().for_each(|v| *v = 0.0);
         }
 
         CsrMatrix::<f32>::from_coo(k, k, a_prime_entries)
@@ -560,7 +568,11 @@ impl SolverEngine for TrueSolver {
         rhs: &[f64],
         _budget: &crate::types::ComputeBudget,
     ) -> Result<SolverResult, SolverError> {
-        // Convert f64 input to f32 for internal computation
+        // Convert f64 input to f32 for internal computation.
+        // NOTE: row_ptr and col_indices are cloned here because CsrMatrix owns
+        // Vec<usize>, so we cannot borrow from the f64 matrix. A future
+        // refactor could introduce a CsrMatrixView that borrows structural
+        // arrays to eliminate these allocations on the f64 -> f32 path.
         let f32_values: Vec<f32> = matrix.values.iter().map(|&v| v as f32).collect();
         let f32_matrix = CsrMatrix {
             row_ptr: matrix.row_ptr.clone(),
diff --git a/crates/ruvector-solver/src/types.rs b/crates/ruvector-solver/src/types.rs
index 9bb89878b..8ec570e19 100644
--- a/crates/ruvector-solver/src/types.rs
+++ b/crates/ruvector-solver/src/types.rs
@@ -268,7 +268,7 @@ impl<T: Copy + Default + std::ops::AddAssign> CsrMatrix<T> {
         entries: impl IntoIterator<Item = (usize, usize, T)>,
     ) -> Self {
         let mut sorted: Vec<_> = entries.into_iter().collect();
-        sorted.sort_by_key(|(r, c, _)| (*r, *c));
+        sorted.sort_unstable_by_key(|(r, c, _)| (*r, *c));
 
         let nnz = sorted.len();
         let mut row_ptr = vec![0usize; rows + 1];
@@ -276,7 +276,7 @@ impl<T: Copy + Default + std::ops::AddAssign> CsrMatrix<T> {
         let mut values = Vec::with_capacity(nnz);
 
         for &(r, _, _) in &sorted {
-            debug_assert!(r < rows, "row index {} out of bounds (rows={})", r, rows);
+            assert!(r < rows, "row index {} out of bounds (rows={})", r, rows);
             row_ptr[r + 1] += 1;
         }
         for i in 1..=rows {
@@ -284,7 +284,7 @@ impl<T: Copy + Default + std::ops::AddAssign> CsrMatrix<T> {
         }
 
         for (_, c, v) in sorted {
-            debug_assert!(c < cols, "col index {} out of bounds (cols={})", c, cols);
+            assert!(c < cols, "col index {} out of bounds (cols={})", c, cols);
             col_indices.push(c);
             values.push(v);
         }