ruvector/crates/ruvector-cnn/tests/simd_test.rs
ruvnet f6c684aba0 docs(sdk): add deep planning review for ruvector Python SDK
Seven-file design review at docs/sdk/ covering the binding strategy,
API surface, M1-M4 milestones, risks, and a one-page decision record
for shipping a Python SDK.

Recommended path: **PyO3 + maturin, single in-tree
`crates/ruvector-py/` cdylib, abi3-py39 wheel via cibuildwheel,
`pyo3-asyncio` over a singleton tokio runtime.**

Why:
- The existing `*-node` NAPI templates (e.g.
  `crates/ruvector-diskann-node/src/lib.rs`) already prove out the
  opaque-handle + `Arc<RwLock<…>>` shape PyO3 mirrors line-for-line —
  ~70% port, ~30% lifetime gymnastics.
- abi3 collapses the wheel matrix from ~25 (5 CPython versions × 5
  platforms) to 5 (one wheel per platform, all py3.9+).
- Singleton tokio runtime avoids the "one runtime per call" overhead
  while remaining compatible with asyncio + uvloop.

Milestone shape (each with explicit scope + acceptance tests):

  M1 — RaBitQ-only Python wheel. Just the published
       `ruvector-rabitq` crate exposed via PyO3. Smallest possible
       useful surface. ~600 LoC, 3 weeks.
  M2 — ruLake. Async via pyo3-asyncio. Witness verify exposed.
       ~900 LoC, 4 weeks.
  M3 — Embeddings + ML helpers. Wrap consumer-facing parts of
       `ruvector-cnn` / `ruvllm`. ~700 LoC, 3 weeks.
  M4 — A2A agent client. Wrap `rvagent-a2a` so Python apps can
       dispatch tasks to A2A peers, including signed AgentCard
       discovery. ~800 LoC, 4 weeks.

Three acceptance gates apply to the whole effort:
  1. A Python user can do RAG over 1 M vectors in <5 lines.
  2. An asyncio user can stream A2A task updates without thread
     fights.
  3. `pip install ruvector` takes <10 s on a stock machine.

Top 3 risks identified:
  R1 — tokio runtime + PyO3 + asyncio/uvloop interop. Mitigation:
       single lazy runtime, `pyo3-asyncio` shim.
  R3 — wheel size. M4 budget is 22 MB; A2A deps (axum + reqwest +
       rustls) could blow it. Mitigation: feature-gate axum/reqwest
       behind `agent` extra; default install is rabitq + rulake only.
  R7 — PyPI name squat on `ruvector`. Mitigation: register placeholder
       before M1 ships.

Nuance discovered: `ruvector-rabitq` has **no** sibling `*-node` or
`*-wasm` crate — unlike most consumer crates. M1 is therefore clean
greenfield: no parity-pressure to match a flaky NAPI signature, and
it confirms rabitq alone is the right starter target rather than the
umbrella `ruvector` crate the npm package wraps.

Planning doc only; no implementation.

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-04-25 20:28:54 -04:00

747 lines
20 KiB
Rust

//! Tests for SIMD-accelerated operations
//!
//! Tests cover:
//! - SIMD vs scalar equivalence
//! - dot_product accuracy
//! - Edge cases (empty, misaligned, remainder handling)
use ruvector_cnn::simd;
use ruvector_cnn::simd::scalar;
// ============================================================================
// Dot Product Tests
// ============================================================================
/// Compares the SIMD dot product with the scalar reference on one
/// 8-element vector pair.
#[test]
fn test_dot_product_simd_vs_scalar() {
    let lhs = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let rhs = vec![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];

    let vectorized = simd::dot_product_simd(&lhs, &rhs);
    let reference = scalar::dot_product_scalar(&lhs, &rhs);

    // The two paths must agree to within a small absolute tolerance.
    let gap = (vectorized - reference).abs();
    assert!(gap < 1e-5, "SIMD: {}, Scalar: {}", vectorized, reference);
}
/// Compares the SIMD dot product with the scalar reference on two
/// 512-element ramps (one ascending, one descending).
#[test]
fn test_dot_product_large_vector() {
    // 512 elements so the main SIMD loop is exercised.
    let len = 512;
    let ascending: Vec<f32> = (0..len).map(|idx| (idx as f32) * 0.01).collect();
    let descending: Vec<f32> = (0..len).map(|idx| ((len - idx) as f32) * 0.01).collect();

    let vectorized = simd::dot_product_simd(&ascending, &descending);
    let reference = scalar::dot_product_scalar(&ascending, &descending);

    // Larger epsilon than the small-vector tests to allow for accumulated error.
    let gap = (vectorized - reference).abs();
    assert!(gap < 1.0, "SIMD: {}, Scalar: {}", vectorized, reference);
}
/// Sweeps a range of vector lengths and checks that the SIMD and scalar
/// dot products agree within a relative or absolute tolerance.
#[test]
fn test_dot_product_various_sizes() {
    // Lengths chosen to exercise different SIMD code paths.
    let lengths = [
        1, 3, 7, 8, 9, 15, 16, 17, 31, 32, 63, 64, 100, 128, 255, 256,
    ];

    for len in lengths {
        let lhs: Vec<f32> = (0..len).map(|idx| (idx as f32) * 0.1).collect();
        let rhs: Vec<f32> = (0..len).map(|idx| ((len - idx) as f32) * 0.1).collect();

        let vectorized = simd::dot_product_simd(&lhs, &rhs);
        let reference = scalar::dot_product_scalar(&lhs, &rhs);

        let abs_diff = (vectorized - reference).abs();
        // Fall back to the absolute difference when the reference is
        // too close to zero for a meaningful ratio.
        let magnitude = reference.abs();
        let rel_error = if magnitude > 1e-10 {
            abs_diff / magnitude
        } else {
            abs_diff
        };

        assert!(
            rel_error < 1e-4 || abs_diff < 1e-4,
            "Size {}: SIMD={}, Scalar={}, diff={}",
            len,
            vectorized,
            reference,
            abs_diff
        );
    }
}
/// The dot product of two empty vectors is expected to be exactly zero.
#[test]
fn test_dot_product_empty() {
    let lhs = Vec::<f32>::new();
    let rhs = Vec::<f32>::new();

    let dot = simd::dot_product_simd(&lhs, &rhs);

    assert_eq!(dot, 0.0);
}
/// A one-element dot product reduces to a single multiplication: 3 * 4 = 12.
#[test]
fn test_dot_product_single_element() {
    let lhs = vec![3.0];
    let rhs = vec![4.0];

    let dot = simd::dot_product_simd(&lhs, &rhs);

    let error = (dot - 12.0).abs();
    assert!(error < 1e-6);
}
/// Checks a hand-computed dot product whose terms have mixed signs.
#[test]
fn test_dot_product_negative_values() {
    let lhs = vec![-1.0, -2.0, 3.0, 4.0];
    let rhs = vec![2.0, -3.0, -4.0, 5.0];

    let dot = simd::dot_product_simd(&lhs, &rhs);

    // (-1*2) + (-2*-3) + (3*-4) + (4*5) = -2 + 6 - 12 + 20 = 12
    let error = (dot - 12.0).abs();
    assert!(error < 1e-5);
}
/// Dotting 1..=8 with a vector of ones must yield the sum 1+2+...+8 = 36.
#[test]
fn test_dot_product_known_value() {
    let values = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let ones = vec![1.0; 8];

    let dot = simd::dot_product_simd(&values, &ones);

    let error = (dot - 36.0).abs();
    assert!(error < 1e-5);
}
/// Checks numerical precision when large (1e6) and small (1e-6) magnitudes
/// are multiplied together.
///
/// Each pairwise product is approximately 1.0, so the full dot product is
/// approximately 4.0. The SIMD result is checked both against the scalar
/// reference and against that known value.
#[test]
fn test_dot_product_large_small_values() {
    let a = vec![1e6, 1e-6, 1e6, 1e-6];
    let b = vec![1e-6, 1e6, 1e-6, 1e6];
    let simd_result = simd::dot_product_simd(&a, &b);
    let scalar_result = scalar::dot_product_scalar(&a, &b);

    // An absolute tolerance of 1.0 would accept a 25% error on a result of
    // ~4.0, so use a bound tight enough to actually exercise precision while
    // still leaving generous headroom over single-precision rounding.
    assert!(
        (simd_result - scalar_result).abs() < 1e-3,
        "SIMD: {}, Scalar: {}",
        simd_result,
        scalar_result
    );

    // Four products of ~1.0 each must sum to ~4.0.
    assert!(
        (simd_result - 4.0).abs() < 1e-3,
        "SIMD: {}, expected ~4.0",
        simd_result
    );
}
// ============================================================================
// ReLU Tests
// ============================================================================
/// Compares SIMD ReLU with the scalar reference over 32 values spanning
/// -8.0 to 7.5 in steps of 0.5.
#[test]
fn test_relu_simd_vs_scalar() {
    let samples: Vec<f32> = (-16..16).map(|step| step as f32 * 0.5).collect();
    let mut simd_out = vec![0.0; samples.len()];
    let mut scalar_out = vec![0.0; samples.len()];

    simd::relu_simd(&samples, &mut simd_out);
    scalar::relu_scalar(&samples, &mut scalar_out);

    for (idx, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
        let gap = (*simd_val - *scalar_val).abs();
        assert!(
            gap < 1e-6,
            "Index {}: SIMD={}, Scalar={}",
            idx,
            simd_val,
            scalar_val
        );
    }
}
/// ReLU over strictly negative inputs must produce all zeros.
#[test]
fn test_relu_all_negative() {
    // -1.0, -2.0, ..., -8.0
    let negatives: Vec<f32> = (1..=8).map(|k| -(k as f32)).collect();
    let mut rectified = vec![0.0; 8];

    simd::relu_simd(&negatives, &mut rectified);

    for value in rectified.iter().copied() {
        assert_eq!(value, 0.0);
    }
}
/// ReLU over strictly positive inputs must leave them unchanged.
#[test]
fn test_relu_all_positive() {
    // 1.0, 2.0, ..., 8.0
    let positives: Vec<f32> = (1..=8).map(|k| k as f32).collect();
    let mut rectified = vec![0.0; 8];

    simd::relu_simd(&positives, &mut rectified);

    assert_eq!(rectified, positives);
}
/// ReLU over alternating signs must zero the negatives and keep the positives.
#[test]
fn test_relu_mixed() {
    let alternating = vec![-1.0, 2.0, -3.0, 4.0, -5.0, 6.0, -7.0, 8.0];
    let expected = vec![0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0];
    let mut rectified = vec![0.0; 8];

    simd::relu_simd(&alternating, &mut rectified);

    assert_eq!(rectified, expected);
}
/// Compares SIMD and scalar ReLU element-for-element on 1024 values
/// ranging from -512.0 to 511.0.
#[test]
fn test_relu_large_batch() {
    let len = 1024;
    let samples: Vec<f32> = (0..len).map(|idx| (idx as f32) - 512.0).collect();
    let mut simd_out = vec![0.0; len];
    let mut scalar_out = vec![0.0; len];

    simd::relu_simd(&samples, &mut simd_out);
    scalar::relu_scalar(&samples, &mut scalar_out);

    // Exact equality is required here, not a tolerance.
    for (idx, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
        assert_eq!(*simd_val, *scalar_val, "Mismatch at index {}", idx);
    }
}
// ============================================================================
// ReLU6 Tests
// ============================================================================
/// Compares SIMD ReLU6 with the scalar reference over 24 values spanning
/// -4.0 to 7.5 in steps of 0.5.
#[test]
fn test_relu6_simd_vs_scalar() {
    let samples: Vec<f32> = (-8..16).map(|step| step as f32 * 0.5).collect();
    let mut simd_out = vec![0.0; samples.len()];
    let mut scalar_out = vec![0.0; samples.len()];

    simd::relu6_simd(&samples, &mut simd_out);
    scalar::relu6_scalar(&samples, &mut scalar_out);

    for (idx, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
        let gap = (*simd_val - *scalar_val).abs();
        assert!(
            gap < 1e-6,
            "Index {}: SIMD={}, Scalar={}",
            idx,
            simd_val,
            scalar_val
        );
    }
}
/// ReLU6 must clamp negatives to 0 and values above 6 down to 6.
#[test]
fn test_relu6_clamps() {
    let samples = vec![-1.0, 2.0, 7.0, 4.0, -5.0, 10.0, 3.0, 8.0];
    let expected = vec![0.0, 2.0, 6.0, 4.0, 0.0, 6.0, 3.0, 6.0];
    let mut clamped = vec![0.0; 8];

    simd::relu6_simd(&samples, &mut clamped);

    assert_eq!(clamped, expected);
}
/// Probes ReLU6 exactly at, just below, and just above its two clamp points.
#[test]
fn test_relu6_boundary() {
    let probes = vec![0.0, 6.0, -0.001, 6.001];
    let mut clamped = vec![0.0; 4];

    simd::relu6_simd(&probes, &mut clamped);

    let (at_zero, at_six, below_zero, above_six) =
        (clamped[0], clamped[1], clamped[2], clamped[3]);

    assert!(at_zero.abs() < 1e-6); // 0 -> 0
    assert!((at_six - 6.0).abs() < 1e-6); // 6 -> 6
    assert!(below_zero.abs() < 1e-6); // -0.001 -> 0
    assert!((above_six - 6.0).abs() < 1e-6); // 6.001 -> 6
}
// ============================================================================
// Batch Normalization Tests
// ============================================================================
/// Compares SIMD batch normalization with the scalar reference using
/// gamma=1, beta=0, mean=0, var=1 on a 4-channel, 64-element input.
#[test]
fn test_batch_norm_simd_vs_scalar() {
    let channels = 4;
    let spatial = 16;
    let total = channels * spatial;

    let activations: Vec<f32> = (0..total).map(|idx| (idx as f32) * 0.1).collect();
    let scale = vec![1.0; channels];
    let shift = vec![0.0; channels];
    let running_mean = vec![0.0; channels];
    let running_var = vec![1.0; channels];
    let eps = 1e-5;

    let mut simd_out = vec![0.0; activations.len()];
    let mut scalar_out = vec![0.0; activations.len()];

    simd::batch_norm_simd(
        &activations,
        &mut simd_out,
        &scale,
        &shift,
        &running_mean,
        &running_var,
        eps,
        channels,
    );
    scalar::batch_norm_scalar(
        &activations,
        &mut scalar_out,
        &scale,
        &shift,
        &running_mean,
        &running_var,
        eps,
        channels,
    );

    for (idx, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
        let gap = (*simd_val - *scalar_val).abs();
        assert!(
            gap < 1e-4,
            "Index {}: SIMD={}, Scalar={}",
            idx,
            simd_val,
            scalar_val
        );
    }
}
/// With gamma=1, beta=0, mean=0, var=1 the batch-norm output should
/// (approximately) reproduce its input.
#[test]
fn test_batch_norm_identity() {
    let channels = 8;
    let spatial = 4;
    let total = channels * spatial;

    let activations: Vec<f32> = (0..total).map(|idx| (idx as f32) * 0.1).collect();
    let scale = vec![1.0; channels];
    let shift = vec![0.0; channels];
    let running_mean = vec![0.0; channels];
    let running_var = vec![1.0; channels];
    let mut normalized = vec![0.0; activations.len()];

    simd::batch_norm_simd(
        &activations,
        &mut normalized,
        &scale,
        &shift,
        &running_mean,
        &running_var,
        1e-5,
        channels,
    );

    for (idx, (original, result)) in activations.iter().zip(&normalized).enumerate() {
        let gap = (*original - *result).abs();
        assert!(
            gap < 0.01,
            "Index {}: input={}, output={}",
            idx,
            original,
            result
        );
    }
}
/// Feeds inputs equal to the supplied per-channel means and expects
/// every normalized output to be approximately zero.
#[test]
fn test_batch_norm_normalization() {
    let channels = 2;
    // Values alternate between the ch0 mean (5) and the ch1 mean (10).
    let activations = vec![5.0, 10.0, 5.0, 10.0];
    let scale = vec![1.0; 2];
    let shift = vec![0.0; 2];
    let running_mean = vec![5.0, 10.0];
    let running_var = vec![1.0; 2];
    let mut normalized = vec![0.0; 4];

    simd::batch_norm_simd(
        &activations,
        &mut normalized,
        &scale,
        &shift,
        &running_mean,
        &running_var,
        1e-5,
        channels,
    );

    // (5 - 5) / sqrt(1 + eps) = 0
    // (10 - 10) / sqrt(1 + eps) = 0
    for value in normalized.iter().copied() {
        assert!(value.abs() < 0.01, "Expected ~0, got {}", value);
    }
}
// ============================================================================
// 3x3 Convolution Tests
// ============================================================================
/// Compares the SIMD 3x3 convolution with the scalar reference on an
/// 8x8 input with 3 input channels, 4 output channels, stride 1, padding 1.
#[test]
fn test_conv_3x3_simd_vs_scalar() {
    let in_h = 8;
    let in_w = 8;
    let in_c = 3;
    let out_c = 4;
    let stride = 1;
    let padding = 1;

    let input_len = in_h * in_w * in_c;
    let kernel_len = out_c * 3 * 3 * in_c;
    let image: Vec<f32> = (0..input_len).map(|idx| (idx as f32) * 0.01).collect();
    let weights: Vec<f32> = (0..kernel_len).map(|idx| (idx as f32) * 0.001).collect();

    // Output extent for a 3-wide window with the given padding and stride.
    let out_h = (in_h + 2 * padding - 3) / stride + 1;
    let out_w = (in_w + 2 * padding - 3) / stride + 1;
    let output_len = out_h * out_w * out_c;

    let mut simd_out = vec![0.0; output_len];
    let mut scalar_out = vec![0.0; output_len];

    simd::conv_3x3_simd(
        &image,
        &weights,
        &mut simd_out,
        in_h,
        in_w,
        in_c,
        out_c,
        stride,
        padding,
    );
    scalar::conv_3x3_scalar(
        &image,
        &weights,
        &mut scalar_out,
        in_h,
        in_w,
        in_c,
        out_c,
        stride,
        padding,
    );

    for (idx, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
        let gap = (*simd_val - *scalar_val).abs();
        assert!(
            gap < 0.1,
            "Index {}: SIMD={}, Scalar={}",
            idx,
            simd_val,
            scalar_val
        );
    }
}
// ============================================================================
// Depthwise 3x3 Convolution Tests
// ============================================================================
/// Compares the SIMD depthwise 3x3 convolution with the scalar reference
/// on an 8x8 input with 4 channels, stride 1, padding 1.
#[test]
fn test_depthwise_conv_3x3_simd_vs_scalar() {
    let h = 8;
    let w = 8;
    let c = 4;
    let stride = 1;
    let padding = 1;

    let image: Vec<f32> = (0..h * w * c).map(|idx| (idx as f32) * 0.01).collect();
    // Nine kernel weights per channel.
    let weights: Vec<f32> = (0..c * 9).map(|idx| (idx as f32) * 0.01).collect();

    // Output extent for a 3-wide window with the given padding and stride.
    let out_h = (h + 2 * padding - 3) / stride + 1;
    let out_w = (w + 2 * padding - 3) / stride + 1;
    let output_len = out_h * out_w * c;

    let mut simd_out = vec![0.0; output_len];
    let mut scalar_out = vec![0.0; output_len];

    simd::depthwise_conv_3x3_simd(&image, &weights, &mut simd_out, h, w, c, stride, padding);
    scalar::depthwise_conv_3x3_scalar(
        &image,
        &weights,
        &mut scalar_out,
        h,
        w,
        c,
        stride,
        padding,
    );

    for (idx, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
        let gap = (*simd_val - *scalar_val).abs();
        assert!(
            gap < 0.1,
            "Index {}: SIMD={}, Scalar={}",
            idx,
            simd_val,
            scalar_val
        );
    }
}
// ============================================================================
// Global Average Pooling Tests
// ============================================================================
/// Compares SIMD global average pooling with the scalar reference on a
/// 4x4 input with 8 channels.
#[test]
fn test_global_avg_pool_simd_vs_scalar() {
    let h = 4;
    let w = 4;
    let c = 8;

    let feature_map: Vec<f32> = (0..h * w * c).map(|idx| (idx as f32) * 0.1).collect();
    // One pooled value per channel.
    let mut simd_out = vec![0.0; c];
    let mut scalar_out = vec![0.0; c];

    simd::global_avg_pool_simd(&feature_map, &mut simd_out, h, w, c);
    scalar::global_avg_pool_scalar(&feature_map, &mut scalar_out, h, w, c);

    for (channel, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
        let gap = (*simd_val - *scalar_val).abs();
        assert!(
            gap < 1e-4,
            "Channel {}: SIMD={}, Scalar={}",
            channel,
            simd_val,
            scalar_val
        );
    }
}
/// Averaging a constant feature map must return that constant per channel.
#[test]
fn test_global_avg_pool_uniform_input() {
    let h = 4;
    let w = 4;
    let c = 4;

    // Every element is 2.0, so every channel average should be 2.0.
    let feature_map = vec![2.0; h * w * c];
    let mut pooled = vec![0.0; c];

    simd::global_avg_pool_simd(&feature_map, &mut pooled, h, w, c);

    for value in pooled.iter().copied() {
        assert!((value - 2.0).abs() < 1e-5);
    }
}
// ============================================================================
// Max Pooling 2x2 Tests
// ============================================================================
/// Compares SIMD 2x2 max pooling with the scalar reference on an 8x8
/// input with 4 channels and stride 2.
#[test]
fn test_max_pool_2x2_simd_vs_scalar() {
    let h = 8;
    let w = 8;
    let c = 4;
    let stride = 2;

    let feature_map: Vec<f32> = (0..h * w * c).map(|idx| (idx as f32) * 0.1).collect();

    let out_h = h / stride;
    let out_w = w / stride;
    let output_len = out_h * out_w * c;
    let mut simd_out = vec![0.0; output_len];
    let mut scalar_out = vec![0.0; output_len];

    simd::max_pool_2x2_simd(&feature_map, &mut simd_out, h, w, c, stride);
    scalar::max_pool_2x2_scalar(&feature_map, &mut scalar_out, h, w, c, stride);

    for (idx, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
        let gap = (*simd_val - *scalar_val).abs();
        assert!(
            gap < 1e-5,
            "Index {}: SIMD={}, Scalar={}",
            idx,
            simd_val,
            scalar_val
        );
    }
}
/// Checks 2x2 max pooling against hand-computed maxima on a 4x4,
/// single-channel input holding the values 1..=16.
#[test]
fn test_max_pool_2x2_finds_max() {
    // 4x4 input, 1 channel: 1.0, 2.0, ..., 16.0
    let grid: Vec<f32> = (1..=16).map(|v| v as f32).collect();
    let mut pooled = vec![0.0; 4];

    simd::max_pool_2x2_simd(&grid, &mut pooled, 4, 4, 1, 2);

    // 2x2 windows:
    // [1,2,5,6] -> 6
    // [3,4,7,8] -> 8
    // [9,10,13,14] -> 14
    // [11,12,15,16] -> 16
    assert_eq!(pooled, vec![6.0, 8.0, 14.0, 16.0]);
}
// ============================================================================
// Edge Cases
// ============================================================================
/// Smoke test: the SIMD ReLU and ReLU6 kernels must not panic on
/// zero-length buffers.
#[test]
fn test_simd_empty_input() {
    let no_input = Vec::<f32>::new();
    let mut no_output = Vec::<f32>::new();

    // Completing without a panic is the only expectation.
    simd::relu_simd(&no_input, &mut no_output);
    simd::relu6_simd(&no_input, &mut no_output);
}
/// Runs SIMD ReLU on one-element buffers: a positive value passes
/// through and a negative value becomes zero.
#[test]
fn test_simd_single_element() {
    let mut result = vec![0.0];

    let positive = vec![5.0];
    simd::relu_simd(&positive, &mut result);
    assert_eq!(result[0], 5.0);

    // The same output buffer is reused for the negative case.
    let negative = vec![-5.0];
    simd::relu_simd(&negative, &mut result);
    assert_eq!(result[0], 0.0);
}
/// Compares SIMD and scalar ReLU on lengths that are not multiples of 8,
/// so any leftover elements past the last full SIMD chunk are covered.
#[test]
fn test_simd_remainder_handling() {
    // None of these lengths aligns with the SIMD width.
    let lengths = [3, 7, 9, 15, 17, 25, 33];

    for len in lengths {
        // Centre the ramp around zero so both signs appear.
        let samples: Vec<f32> = (0..len)
            .map(|idx| (idx as f32) - (len as f32 / 2.0))
            .collect();
        let mut simd_out = vec![0.0; len];
        let mut scalar_out = vec![0.0; len];

        simd::relu_simd(&samples, &mut simd_out);
        scalar::relu_scalar(&samples, &mut scalar_out);

        for (idx, (simd_val, scalar_val)) in simd_out.iter().zip(&scalar_out).enumerate() {
            assert_eq!(
                *simd_val, *scalar_val,
                "Size {}, index {}: SIMD={}, Scalar={}",
                len, idx, simd_val, scalar_val
            );
        }
    }
}
// ============================================================================
// Scalar Function Tests (for reference)
// ============================================================================
/// Checks the scalar swish activation, `x * sigmoid(x)`, against
/// hand-computed values for every element of the input.
#[test]
fn test_scalar_swish() {
    let input = vec![0.0, 1.0, -1.0, 2.0];
    let mut output = vec![0.0; 4];
    scalar::swish_scalar(&input, &mut output);
    // swish(0) = 0
    assert!(output[0].abs() < 1e-6);
    // swish(1) = 1 * sigmoid(1) ~ 0.731
    assert!((output[1] - 0.731).abs() < 0.01);
    // swish(-1) = -1 * sigmoid(-1) ~ -0.268
    assert!((output[2] - (-0.268)).abs() < 0.01);
    // swish(2) = 2 * sigmoid(2) ~ 1.762; without this check the last input
    // element would be computed but never verified.
    assert!((output[3] - 1.762).abs() < 0.01);
}
/// Checks the scalar hard-swish activation, `x * relu6(x + 3) / 6`, at
/// both saturation regions and at zero, covering every input element.
#[test]
fn test_scalar_hard_swish() {
    let input = vec![-4.0, -3.0, 0.0, 3.0, 4.0];
    let mut output = vec![0.0; 5];
    scalar::hard_swish_scalar(&input, &mut output);
    assert!(output[0].abs() < 1e-5); // -4 -> 0
    assert!(output[1].abs() < 1e-5); // -3 -> 0
    assert!(output[2].abs() < 1e-5); // 0 -> 0
    assert!((output[3] - 3.0).abs() < 1e-5); // 3 -> 3
    // Above the upper knee relu6(x + 3) saturates at 6, so the function is
    // the identity: 4 * 6 / 6 = 4. Without this check the last input element
    // would be computed but never verified.
    assert!((output[4] - 4.0).abs() < 1e-5); // 4 -> 4
}
/// Checks the scalar sigmoid at its midpoint and near both asymptotes.
#[test]
fn test_scalar_sigmoid() {
    let input = vec![0.0, 10.0, -10.0];
    let mut output = vec![0.0; 3];
    scalar::sigmoid_scalar(&input, &mut output);
    assert!((output[0] - 0.5).abs() < 1e-5); // sigmoid(0) = 0.5
    assert!((output[1] - 1.0).abs() < 0.001); // sigmoid(10) ~ 1.0
    // sigmoid(-10) ~ 0.0. Sigmoid is never negative, so bound the value on
    // both sides; an upper bound alone would accept any negative result.
    assert!(output[2] >= 0.0 && output[2] < 0.001);
}
// ============================================================================
// Platform Detection Tests
// ============================================================================
/// Platform-agnostic smoke test: the dispatching dot product must build
/// and give the right answer on whatever target runs it.
#[test]
fn test_simd_feature_detection() {
    let ones = vec![1.0f32; 16];
    let twos = vec![2.0f32; 16];

    // Expected to take whichever SIMD path suits the current platform.
    let dot = simd::dot_product_simd(&ones, &twos);

    // 16 terms of 1 * 2.
    let error = (dot - 32.0).abs();
    assert!(error < 1e-5);
}
/// x86_64 only: reports whether AVX2 is available and checks that the dot
/// product is correct either way.
#[test]
#[cfg(target_arch = "x86_64")]
fn test_avx2_detection() {
    // Informational only; the result does not affect the assertion below.
    let avx2_present = is_x86_feature_detected!("avx2");
    println!("AVX2 available: {}", avx2_present);

    // The test must pass regardless of AVX2 availability.
    let lhs = vec![1.0f32; 32];
    let rhs = vec![1.0f32; 32];
    let dot = simd::dot_product_simd(&lhs, &rhs);

    let error = (dot - 32.0).abs();
    assert!(error < 1e-5);
}
/// aarch64 only: checks the dot product of two 32-element all-ones vectors.
#[test]
#[cfg(target_arch = "aarch64")]
fn test_neon_available() {
    // NEON is always available on aarch64, so no runtime detection is needed.
    let lhs = vec![1.0f32; 32];
    let rhs = vec![1.0f32; 32];

    let dot = simd::dot_product_simd(&lhs, &rhs);

    let error = (dot - 32.0).abs();
    assert!(error < 1e-5);
}
// ============================================================================
// Numerical Stability Edge Cases
// ============================================================================
/// A +infinity term multiplied by 1.0 must make the whole dot product +infinity.
#[test]
fn test_dot_product_inf_handling() {
    let with_inf = vec![f32::INFINITY, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    let ones = vec![1.0f32; 8];

    let dot = simd::dot_product_simd(&with_inf, &ones);

    assert!(dot.is_infinite() && dot > 0.0);
}
/// A NaN term in the input must propagate to the dot-product result.
#[test]
fn test_dot_product_nan_propagation() {
    let with_nan = vec![f32::NAN, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    let ones = vec![1.0f32; 8];

    let dot = simd::dot_product_simd(&with_nan, &ones);

    assert!(dot.is_nan());
}
/// Runs SIMD ReLU over infinities, NaN, and ordinary values, and checks
/// each output slot.
#[test]
fn test_activation_with_special_values() {
    let specials = vec![
        f32::INFINITY,
        f32::NEG_INFINITY,
        f32::NAN,
        0.0,
        1.0,
        -1.0,
        6.0,
        100.0,
    ];
    let mut rectified = vec![0.0; 8];

    simd::relu_simd(&specials, &mut rectified);

    let from_pos_inf = rectified[0];
    let from_neg_inf = rectified[1];
    let from_nan = rectified[2];

    assert!(from_pos_inf.is_infinite() && from_pos_inf > 0.0); // inf stays inf
    assert_eq!(from_neg_inf, 0.0); // -inf becomes 0

    // NaN handling depends on backend: AVX2 `_mm256_max_ps(NaN, 0)` returns
    // the second operand (0.0) per Intel's unordered-comparison semantics,
    // while a scalar `f32::max` propagates NaN. Both behaviors are
    // legitimate ReLU implementations, so accept either.
    assert!(
        from_nan.is_nan() || from_nan == 0.0,
        "expected NaN or 0.0 for ReLU(NaN), got {}",
        from_nan
    );

    // Remaining ordinary inputs: 0, 1, -1, 6, 100.
    let expected_tail = [0.0, 1.0, 0.0, 6.0, 100.0];
    for (got, want) in rectified[3..].iter().zip(expected_tail) {
        assert_eq!(*got, want);
    }
}