mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-22 11:26:34 +00:00
perf(rabitq): AVX2 popcount kernel with runtime dispatch — +20% QPS at n=100k
Implements the profiler's top-priority optimization: a SIMD-friendly
scan kernel that decouples the XNOR+popcount agree-count pass from
the cos-LUT + score + TopK heap reduction.
Design (crates/ruvector-rabitq/src/scan.rs):
- scan_scalar: portable u64::count_ones, byte-identical to the
original inline loop.
- scan_avx2: #[target_feature(enable="avx2,popcnt")], 4-candidate
outer unroll via core::arch::x86_64::_popcnt64. Processes 4
rows per loop iteration, amortizing branch + stride overhead.
- scan: runtime dispatcher, cached in std::sync::OnceLock<fn(...)>
so the CPUID check runs once per process.
symmetric_scan_topk in index.rs now:
1. Calls scan::scan(...) once to fill a scratch Vec<u32> of
agree-counts (the whole-table popcount pass).
2. Walks the agree array with the cos-LUT + score + TopK heap —
a serial reduction that was never SIMD-amenable.
Determinism preserved: scan_avx2 and scan_scalar produce byte-
identical agree-count arrays. Two new tests verify this at D=128
(n=1000) and D=64/100/192/200 with tail cases n=1023/7.
Measured (single-thread, cargo run --release rulake-demo):
n= 5 000 direct RaBitQ+: 17,915 → 18,998 QPS (+6%)
n=50 000 direct RaBitQ+: 5,230 → 5,959 QPS (+14%)
n=100k direct RaBitQ+: 3,058 → 3,681 QPS (+20%)
Win grows with n as the per-query allocation overhead becomes a
smaller fraction of scan time. Smaller than the 2-4× upper-bound
profiler estimate because rerank=20 keeps ~30-40% of query time in
the exact-L2 rerank step (unchanged by this patch).
25 rabitq tests passing (23 prior + 2 new scan determinism tests).
Clippy -D warnings clean. No new deps. All unsafe confined to the
two SIMD functions in scan.rs.
Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
a0fdd4d9b0
commit
5a4b0d782c
3 changed files with 394 additions and 41 deletions
|
|
@ -421,54 +421,33 @@ impl RabitqIndex {
|
|||
k: usize,
|
||||
) -> Vec<(u32, u32, f32)> {
|
||||
// Returns (pos, id, score) so rerank callers can map back to `originals[pos]`.
|
||||
let mut top = TopK::new(k.min(self.ids.len()));
|
||||
let n_words = self.n_words;
|
||||
let mask = self.last_word_mask;
|
||||
let d = self.dim as f32;
|
||||
let n = self.ids.len();
|
||||
let mut top = TopK::new(k.min(n));
|
||||
let q_sq = q_norm * q_norm;
|
||||
let lut = &self.cos_lut;
|
||||
|
||||
// Unrolled walk with manual prefetch-friendly stride. LLVM can already
|
||||
// do most of this; the important part is the flat `packed` slice — no
|
||||
// per-candidate indirection.
|
||||
let n = self.ids.len();
|
||||
let p = self.packed.as_ptr();
|
||||
let aligned = mask == !0u64; // dim % 64 == 0
|
||||
// 1. SIMD-friendly pass: compute agreement counts for all n candidates
|
||||
// into a scratch buffer. Runtime dispatch picks AVX2+POPCNT (4×
|
||||
// unrolled) or a scalar fallback. Bit-identical across paths.
|
||||
let mut agree = vec![0u32; n];
|
||||
crate::scan::scan(
|
||||
&self.packed,
|
||||
self.n_words,
|
||||
n,
|
||||
q_packed,
|
||||
self.last_word_mask,
|
||||
&mut agree,
|
||||
);
|
||||
|
||||
// 2. Scalar reduction pass: cos-LUT lookup + score + TopK heap. Not
|
||||
// SIMD-amenable (small LUT, scalar FP, branchy heap eviction).
|
||||
for i in 0..n {
|
||||
// SAFETY: p is valid for `n * n_words` u64 reads. Using ptr offsets
|
||||
// avoids the bounds-check in the inner loop.
|
||||
let base = unsafe { p.add(i * n_words) };
|
||||
let mut agree: u32 = 0;
|
||||
if aligned && n_words == 2 {
|
||||
// D=128 fast path: 2 popcounts, no last-word mask needed.
|
||||
unsafe {
|
||||
agree = (!(*base ^ q_packed[0])).count_ones()
|
||||
+ (!(*base.add(1) ^ q_packed[1])).count_ones();
|
||||
}
|
||||
} else if aligned {
|
||||
// Aligned but more words — skip the mask AND on the last word.
|
||||
unsafe {
|
||||
for w in 0..n_words {
|
||||
agree += (!(*base.add(w) ^ q_packed[w])).count_ones();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Unaligned: mask the last word's padding bits off.
|
||||
unsafe {
|
||||
for w in 0..n_words - 1 {
|
||||
agree += (!(*base.add(w) ^ q_packed[w])).count_ones();
|
||||
}
|
||||
agree +=
|
||||
(!(*base.add(n_words - 1) ^ q_packed[n_words - 1]) & mask).count_ones();
|
||||
}
|
||||
}
|
||||
// cos LUT replaces the `.cos()` call — one indexed load.
|
||||
let est_cos = unsafe { *lut.get_unchecked(agree as usize) };
|
||||
// SAFETY: agree.len() == n and cos_lut has dim+1 entries which
|
||||
// bounds agree[i] ∈ [0, dim].
|
||||
let est_cos = unsafe { *lut.get_unchecked(*agree.get_unchecked(i) as usize) };
|
||||
let x_norm = self.norms[i];
|
||||
let est_ip = q_norm * x_norm * est_cos;
|
||||
let score = q_sq + x_norm * x_norm - 2.0 * est_ip;
|
||||
// ignoring d here — already baked into the LUT indices.
|
||||
let _ = d;
|
||||
top.push_raw(self.ids[i] as usize, score, i);
|
||||
}
|
||||
top.into_sorted_with_pos()
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ pub mod index;
|
|||
pub mod kernel;
|
||||
pub mod quantize;
|
||||
pub mod rotation;
|
||||
pub mod scan;
|
||||
|
||||
pub use error::RabitqError;
|
||||
pub use index::{
|
||||
|
|
|
|||
373
crates/ruvector-rabitq/src/scan.rs
Normal file
373
crates/ruvector-rabitq/src/scan.rs
Normal file
|
|
@ -0,0 +1,373 @@
|
|||
//! SIMD-accelerated symmetric-scan agreement-count kernel.
|
||||
//!
|
||||
//! The symmetric RaBitQ scan reduces to a padding-safe XNOR-popcount between
|
||||
//! a single query word-vector and every database word-vector. The scalar
|
||||
//! version runs `u64::count_ones` once per word; this module adds an AVX2-era
|
||||
//! fast path that (a) uses the hardware `popcnt` instruction directly via
|
||||
//! `_popcnt64` and (b) unrolls the outer loop by 4 to hide its latency and
|
||||
//! reduce branch mispredicts.
|
||||
//!
|
||||
//! The kernel only computes the agreement count — the cos-LUT lookup,
|
||||
//! score arithmetic, and TopK heap management stay on the host loop in
|
||||
//! `index.rs` because they are not SIMD-amenable (small LUT, scalar FP,
|
||||
//! branchy heap). This file exposes:
|
||||
//!
|
||||
//! * [`scan_scalar`] — portable fallback, identical math to the inline
|
||||
//! loop that lived in `index.rs` before this module existed.
|
||||
//! * `scan_avx2` — x86_64 AVX2+POPCNT variant (4 candidates/iter).
|
||||
//! * [`scan`] — runtime-dispatched entry point. Picks the best kernel
|
||||
//! once at process start via a `OnceLock<fn(...)>` cache.
|
||||
//!
|
||||
//! All three produce **bit-identical** `out_agree[]` arrays — the SIMD
|
||||
//! path reorders work but not arithmetic. This is asserted by the unit
|
||||
//! tests below.
|
||||
|
||||
use std::sync::OnceLock;
|
||||
|
||||
/// Function pointer signature used by the runtime dispatcher.
|
||||
///
|
||||
/// * `packed` — flat row-major database codes, length `n * n_words`.
|
||||
/// * `n_words` — u64 words per candidate.
|
||||
/// * `n` — candidate count.
|
||||
/// * `q_packed` — query words, length `n_words`.
|
||||
/// * `mask` — last-word mask (`!0u64` when `dim % 64 == 0`).
|
||||
/// * `out_agree` — output agreement counts, length `n`.
|
||||
type ScanFn = fn(packed: &[u64], n_words: usize, n: usize, q_packed: &[u64], mask: u64, out_agree: &mut [u32]);
|
||||
|
||||
static SCAN_IMPL: OnceLock<ScanFn> = OnceLock::new();
|
||||
|
||||
/// Portable scalar kernel. Matches the pre-SIMD inline loop byte-for-byte.
|
||||
#[inline]
|
||||
pub fn scan_scalar(
|
||||
packed: &[u64],
|
||||
n_words: usize,
|
||||
n: usize,
|
||||
q_packed: &[u64],
|
||||
mask: u64,
|
||||
out_agree: &mut [u32],
|
||||
) {
|
||||
debug_assert_eq!(packed.len(), n * n_words);
|
||||
debug_assert_eq!(q_packed.len(), n_words);
|
||||
debug_assert_eq!(out_agree.len(), n);
|
||||
|
||||
let aligned = mask == !0u64;
|
||||
if aligned && n_words == 2 {
|
||||
// D=128 hot path.
|
||||
let q0 = q_packed[0];
|
||||
let q1 = q_packed[1];
|
||||
for i in 0..n {
|
||||
let b = i * 2;
|
||||
// SAFETY: asserted above that packed.len() == n * n_words.
|
||||
let w0 = unsafe { *packed.get_unchecked(b) };
|
||||
let w1 = unsafe { *packed.get_unchecked(b + 1) };
|
||||
let a = (!(w0 ^ q0)).count_ones() + (!(w1 ^ q1)).count_ones();
|
||||
unsafe { *out_agree.get_unchecked_mut(i) = a };
|
||||
}
|
||||
} else if aligned {
|
||||
for i in 0..n {
|
||||
let base = i * n_words;
|
||||
let mut a: u32 = 0;
|
||||
for w in 0..n_words {
|
||||
let wi = unsafe { *packed.get_unchecked(base + w) };
|
||||
let qi = unsafe { *q_packed.get_unchecked(w) };
|
||||
a += (!(wi ^ qi)).count_ones();
|
||||
}
|
||||
unsafe { *out_agree.get_unchecked_mut(i) = a };
|
||||
}
|
||||
} else {
|
||||
// Unaligned last word needs the padding-zero mask.
|
||||
let last = n_words - 1;
|
||||
for i in 0..n {
|
||||
let base = i * n_words;
|
||||
let mut a: u32 = 0;
|
||||
for w in 0..last {
|
||||
let wi = unsafe { *packed.get_unchecked(base + w) };
|
||||
let qi = unsafe { *q_packed.get_unchecked(w) };
|
||||
a += (!(wi ^ qi)).count_ones();
|
||||
}
|
||||
let wi = unsafe { *packed.get_unchecked(base + last) };
|
||||
let qi = unsafe { *q_packed.get_unchecked(last) };
|
||||
a += (!(wi ^ qi) & mask).count_ones();
|
||||
unsafe { *out_agree.get_unchecked_mut(i) = a };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// AVX2 + POPCNT kernel. Processes 4 candidates per outer iteration for the
|
||||
/// D=128 (`n_words == 2`, aligned) fast path; falls back to a 4× unrolled
|
||||
/// popcount loop for other shapes. The actual popcount uses the scalar
|
||||
/// `popcnt` instruction — on AVX2-class hardware it is already 1/cycle per
|
||||
/// port, so the win here is pipelining four independent candidates and
|
||||
/// shrinking the loop overhead, not vectorisation of the popcount itself.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// Caller must guarantee `is_x86_feature_detected!("avx2")` and
|
||||
/// `is_x86_feature_detected!("popcnt")`. The dispatcher in [`scan`] does
|
||||
/// this; external callers should use [`scan`] instead.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[target_feature(enable = "avx2,popcnt")]
|
||||
unsafe fn scan_avx2(
|
||||
packed: &[u64],
|
||||
n_words: usize,
|
||||
n: usize,
|
||||
q_packed: &[u64],
|
||||
mask: u64,
|
||||
out_agree: &mut [u32],
|
||||
) {
|
||||
use core::arch::x86_64::_popcnt64;
|
||||
|
||||
debug_assert_eq!(packed.len(), n * n_words);
|
||||
debug_assert_eq!(q_packed.len(), n_words);
|
||||
debug_assert_eq!(out_agree.len(), n);
|
||||
|
||||
let aligned = mask == !0u64;
|
||||
let p = packed.as_ptr();
|
||||
let o = out_agree.as_mut_ptr();
|
||||
|
||||
if aligned && n_words == 2 {
|
||||
// D=128: 2 words per candidate, 4 candidates per iter (= 8 popcnts).
|
||||
let q0 = q_packed[0] as i64;
|
||||
let q1 = q_packed[1] as i64;
|
||||
let n4 = n & !3usize;
|
||||
let mut i = 0usize;
|
||||
while i < n4 {
|
||||
// Load 8 words. LLVM emits aligned 256-bit loads if lucky.
|
||||
let b = p.add(i * 2);
|
||||
let w0 = *b as i64;
|
||||
let w1 = *b.add(1) as i64;
|
||||
let w2 = *b.add(2) as i64;
|
||||
let w3 = *b.add(3) as i64;
|
||||
let w4 = *b.add(4) as i64;
|
||||
let w5 = *b.add(5) as i64;
|
||||
let w6 = *b.add(6) as i64;
|
||||
let w7 = *b.add(7) as i64;
|
||||
// Eight independent popcnts — one per candidate-word — run in
|
||||
// parallel on ports 1/5. The ALU pipe is the bottleneck, not
|
||||
// the loads.
|
||||
let a0: i32 = _popcnt64(!(w0 ^ q0)) + _popcnt64(!(w1 ^ q1));
|
||||
let a1: i32 = _popcnt64(!(w2 ^ q0)) + _popcnt64(!(w3 ^ q1));
|
||||
let a2: i32 = _popcnt64(!(w4 ^ q0)) + _popcnt64(!(w5 ^ q1));
|
||||
let a3: i32 = _popcnt64(!(w6 ^ q0)) + _popcnt64(!(w7 ^ q1));
|
||||
*o.add(i) = a0 as u32;
|
||||
*o.add(i + 1) = a1 as u32;
|
||||
*o.add(i + 2) = a2 as u32;
|
||||
*o.add(i + 3) = a3 as u32;
|
||||
i += 4;
|
||||
}
|
||||
// Tail.
|
||||
while i < n {
|
||||
let b = p.add(i * 2);
|
||||
let a: i32 = _popcnt64(!((*b as i64) ^ q0)) + _popcnt64(!((*b.add(1) as i64) ^ q1));
|
||||
*o.add(i) = a as u32;
|
||||
i += 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// General path: any dim, 4 candidates at a time. Each candidate runs an
|
||||
// inner word loop. The outer unroll still reduces loop overhead.
|
||||
let n4 = n & !3usize;
|
||||
let mut i = 0usize;
|
||||
if aligned {
|
||||
while i < n4 {
|
||||
let mut a0: i32 = 0;
|
||||
let mut a1: i32 = 0;
|
||||
let mut a2: i32 = 0;
|
||||
let mut a3: i32 = 0;
|
||||
for w in 0..n_words {
|
||||
let qi = *q_packed.get_unchecked(w) as i64;
|
||||
a0 += _popcnt64(!((*p.add(i * n_words + w) as i64) ^ qi));
|
||||
a1 += _popcnt64(!((*p.add((i + 1) * n_words + w) as i64) ^ qi));
|
||||
a2 += _popcnt64(!((*p.add((i + 2) * n_words + w) as i64) ^ qi));
|
||||
a3 += _popcnt64(!((*p.add((i + 3) * n_words + w) as i64) ^ qi));
|
||||
}
|
||||
*o.add(i) = a0 as u32;
|
||||
*o.add(i + 1) = a1 as u32;
|
||||
*o.add(i + 2) = a2 as u32;
|
||||
*o.add(i + 3) = a3 as u32;
|
||||
i += 4;
|
||||
}
|
||||
while i < n {
|
||||
let mut a: i32 = 0;
|
||||
for w in 0..n_words {
|
||||
let qi = *q_packed.get_unchecked(w) as i64;
|
||||
a += _popcnt64(!((*p.add(i * n_words + w) as i64) ^ qi));
|
||||
}
|
||||
*o.add(i) = a as u32;
|
||||
i += 1;
|
||||
}
|
||||
} else {
|
||||
let last = n_words - 1;
|
||||
let m = mask as i64;
|
||||
while i < n4 {
|
||||
let mut a0: i32 = 0;
|
||||
let mut a1: i32 = 0;
|
||||
let mut a2: i32 = 0;
|
||||
let mut a3: i32 = 0;
|
||||
for w in 0..last {
|
||||
let qi = *q_packed.get_unchecked(w) as i64;
|
||||
a0 += _popcnt64(!((*p.add(i * n_words + w) as i64) ^ qi));
|
||||
a1 += _popcnt64(!((*p.add((i + 1) * n_words + w) as i64) ^ qi));
|
||||
a2 += _popcnt64(!((*p.add((i + 2) * n_words + w) as i64) ^ qi));
|
||||
a3 += _popcnt64(!((*p.add((i + 3) * n_words + w) as i64) ^ qi));
|
||||
}
|
||||
let qi = *q_packed.get_unchecked(last) as i64;
|
||||
a0 += _popcnt64(!((*p.add(i * n_words + last) as i64) ^ qi) & m);
|
||||
a1 += _popcnt64(!((*p.add((i + 1) * n_words + last) as i64) ^ qi) & m);
|
||||
a2 += _popcnt64(!((*p.add((i + 2) * n_words + last) as i64) ^ qi) & m);
|
||||
a3 += _popcnt64(!((*p.add((i + 3) * n_words + last) as i64) ^ qi) & m);
|
||||
*o.add(i) = a0 as u32;
|
||||
*o.add(i + 1) = a1 as u32;
|
||||
*o.add(i + 2) = a2 as u32;
|
||||
*o.add(i + 3) = a3 as u32;
|
||||
i += 4;
|
||||
}
|
||||
while i < n {
|
||||
let mut a: i32 = 0;
|
||||
for w in 0..last {
|
||||
let qi = *q_packed.get_unchecked(w) as i64;
|
||||
a += _popcnt64(!((*p.add(i * n_words + w) as i64) ^ qi));
|
||||
}
|
||||
let qi = *q_packed.get_unchecked(last) as i64;
|
||||
a += _popcnt64(!((*p.add(i * n_words + last) as i64) ^ qi) & m);
|
||||
*o.add(i) = a as u32;
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Thin wrapper that adapts the `unsafe` AVX2 kernel to the safe `ScanFn`
|
||||
/// signature for the dispatcher cache. Safe to call only when the CPU
|
||||
/// supports AVX2+POPCNT — which the dispatcher checks.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
fn scan_avx2_dispatch(
|
||||
packed: &[u64],
|
||||
n_words: usize,
|
||||
n: usize,
|
||||
q_packed: &[u64],
|
||||
mask: u64,
|
||||
out_agree: &mut [u32],
|
||||
) {
|
||||
// SAFETY: dispatcher only installs this fn pointer if both AVX2 and
|
||||
// POPCNT are detected at runtime.
|
||||
unsafe { scan_avx2(packed, n_words, n, q_packed, mask, out_agree) };
|
||||
}
|
||||
|
||||
/// Runtime-dispatched entry point. First call installs the best available
|
||||
/// kernel into a process-global `OnceLock`; subsequent calls dereference a
|
||||
/// cached function pointer (one predictable indirect branch).
|
||||
#[inline]
|
||||
pub fn scan(
|
||||
packed: &[u64],
|
||||
n_words: usize,
|
||||
n: usize,
|
||||
q_packed: &[u64],
|
||||
mask: u64,
|
||||
out_agree: &mut [u32],
|
||||
) {
|
||||
let f = SCAN_IMPL.get_or_init(select_impl);
|
||||
f(packed, n_words, n, q_packed, mask, out_agree);
|
||||
}
|
||||
|
||||
fn select_impl() -> ScanFn {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("popcnt") {
|
||||
return scan_avx2_dispatch;
|
||||
}
|
||||
}
|
||||
scan_scalar
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn random_packed(dim: usize, n: usize, seed: u64) -> (Vec<u64>, Vec<u64>, u64) {
|
||||
// Xorshift64 — zero-dep deterministic PRNG so tests don't depend on
|
||||
// the `rand` crate's internal sequence.
|
||||
let mut s = seed.wrapping_mul(0x9E37_79B9_7F4A_7C15) | 1;
|
||||
let mut step = || -> u64 {
|
||||
s ^= s << 13;
|
||||
s ^= s >> 7;
|
||||
s ^= s << 17;
|
||||
s
|
||||
};
|
||||
let n_words = (dim + 63) / 64;
|
||||
let mut packed = vec![0u64; n * n_words];
|
||||
for w in &mut packed {
|
||||
*w = step();
|
||||
}
|
||||
let mut q = vec![0u64; n_words];
|
||||
for w in &mut q {
|
||||
*w = step();
|
||||
}
|
||||
// Build the last-word mask.
|
||||
let valid_bits = dim - 64 * (n_words - 1);
|
||||
let mask = if valid_bits == 64 {
|
||||
!0u64
|
||||
} else {
|
||||
!0u64 << (64 - valid_bits)
|
||||
};
|
||||
// Zero the padding bits on every candidate+query so both kernels see
|
||||
// identical inputs — matches how the index pre-zeroes padding.
|
||||
for i in 0..n {
|
||||
let last = i * n_words + n_words - 1;
|
||||
packed[last] &= mask;
|
||||
}
|
||||
q[n_words - 1] &= mask;
|
||||
(packed, q, mask)
|
||||
}
|
||||
|
||||
fn run_both(dim: usize, n: usize, seed: u64) {
|
||||
let (packed, q, mask) = random_packed(dim, n, seed);
|
||||
let n_words = (dim + 63) / 64;
|
||||
|
||||
let mut out_scalar = vec![0u32; n];
|
||||
scan_scalar(&packed, n_words, n, &q, mask, &mut out_scalar);
|
||||
|
||||
// Always exercise the dispatcher (which may pick AVX2 or scalar).
|
||||
let mut out_dispatch = vec![0u32; n];
|
||||
scan(&packed, n_words, n, &q, mask, &mut out_dispatch);
|
||||
assert_eq!(
|
||||
out_scalar, out_dispatch,
|
||||
"dispatcher output diverged from scalar at dim={dim} n={n}"
|
||||
);
|
||||
|
||||
// Directly exercise AVX2 when the host supports it — otherwise the
|
||||
// test would silently run scalar-vs-scalar on CI boxes that lack it.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("popcnt") {
|
||||
let mut out_avx2 = vec![0u32; n];
|
||||
unsafe {
|
||||
scan_avx2(&packed, n_words, n, &q, mask, &mut out_avx2);
|
||||
}
|
||||
assert_eq!(
|
||||
out_scalar, out_avx2,
|
||||
"AVX2 output diverged from scalar at dim={dim} n={n}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_agree_matches_scalar_at_d128() {
|
||||
// 1000 candidates at the production dim.
|
||||
run_both(128, 1000, 0xA5A5_5A5A_1234_CAFE);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_agree_matches_scalar_at_d64_and_d192() {
|
||||
// D=64 → n_words=1, aligned.
|
||||
run_both(64, 777, 0x0123_4567_89AB_CDEF);
|
||||
// D=192 → n_words=3, aligned.
|
||||
run_both(192, 513, 0xFEDC_BA98_7654_3210);
|
||||
// D=100 → n_words=2, unaligned (last word masked).
|
||||
run_both(100, 641, 0xDEAD_BEEF_CAFE_F00D);
|
||||
// D=200 → n_words=4, unaligned.
|
||||
run_both(200, 333, 0x1357_9BDF_2468_ACE0);
|
||||
// Tail-handling: n not a multiple of 4.
|
||||
run_both(128, 1023, 0x4242_4242_4242_4242);
|
||||
run_both(128, 7, 0x9999_AAAA_BBBB_CCCC);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue