feat(analysis): rate-histogram motif encoder + A/B vs SDPA — ADR-154 §17 item 10 follow-up

Adds src/analysis/rate_encoder.rs + tests/ac_2_encoder_comparison.rs.
Controlled A/B diagnostic on the 8-protocol labeled corpus that
disproved SDPA in ADR §17 item 10.

Measured precision@5:
  SDPA (shipped)            : 0.072
  rate histogram (this path): 0.079
  delta                     : +0.007

Verdict: encoder is NOT the bottleneck. Both encoders sit below the
1/8 = 0.125 random baseline on the 8-protocol corpus (SDPA 0.072 and
rate histogram 0.079), with the two scores within +0.007 of each
other. Swapping the encoder from SDPA + deterministic-low-rank
projection to a trivial row-major flatten of the normalised raster
did not materially move the number. By ADR §17 item 10's three-axis
framing (encoder / substrate / labels), this rules out the encoder
axis: remaining levers are substrate (real FlyWire ingest) or labels
(raster-regime rather than stimulus-protocol).

Max file 349 LOC (tests/ac_2_encoder_comparison.rs). New LOC 500
(rate_encoder 151 + test 349).

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
ruvnet 2026-04-22 16:27:44 -04:00
parent 70003115df
commit d06e80fe20
4 changed files with 518 additions and 4 deletions

View file

@ -17,6 +17,7 @@
pub mod gpu;
pub mod motif;
pub mod partition;
pub mod rate_encoder;
pub mod structural;
pub mod types;
@ -95,6 +96,19 @@ impl Analysis {
&self.cfg, &self.sdpa, &self.w_q, &self.w_k, &self.w_v, conn, spikes, k,
)
}
/// Same as [`Self::retrieve_motifs`] but uses the rate-histogram
/// encoder (see `analysis::rate_encoder`) instead of SDPA. Exposed
/// for the ADR-154 §17 item 10 encoder-vs-substrate A/B
/// diagnostic; prefer the SDPA path for production use.
pub fn retrieve_motifs_rate(
&self,
conn: &Connectome,
spikes: &[Spike],
k: usize,
) -> (MotifIndex, Vec<MotifHit>) {
rate_encoder::rate_histogram_retrieve_motifs(&self.cfg, conn, spikes, k)
}
}
#[cfg(test)]

View file

@ -56,12 +56,12 @@ pub(crate) fn retrieve_motifs(
(index, hits)
}
struct WindowMeta {
spike_count: u32,
dominant_class_idx: u8,
pub(super) struct WindowMeta {
pub(super) spike_count: u32,
pub(super) dominant_class_idx: u8,
}
fn build_raster(
pub(super) fn build_raster(
conn: &Connectome,
spikes: &[Spike],
t_start: f32,

View file

@ -0,0 +1,151 @@
//! Rate-histogram motif encoder — alternative to the SDPA path in
//! `motif.rs`. Designed as a *controlled A/B baseline* for the AC-2
//! encoder-vs-substrate diagnosis in ADR-154 §17 item 10.
//!
//! Design intent:
//!
//! - The shipped SDPA + deterministic-low-rank-projection encoder is
//! protocol-blind on the expanded 8-protocol labeled corpus
//! (precision@5 ≈ random). Three remediations plateau at ≤ 0.60.
//! The ADR calls for pinning the bottleneck: encoder, substrate, or
//! labels.
//! - This module implements the *encoder* axis: a trivial
//! row-major flatten of the normalised raster produced by
//! `motif::build_raster`. No projection, no attention, no additional
//! normalisation. Every bin of every class is preserved verbatim.
//! - If this cheap baseline scores *higher* than SDPA on the same
//! 8-protocol labeled corpus, SDPA is actively hurting. If it scores
//! the same or lower, the substrate — not the encoder — is the
//! bottleneck.
//!
//! The encoder is deterministic (no RNG, no state) and uses exactly
//! one allocation (the output vector).
use crate::connectome::Connectome;
use crate::lif::Spike;
use super::motif::build_raster;
use super::types::{AnalysisConfig, MotifHit, MotifIndex, MotifWindow};
/// Flatten a raster `[n_rows][n_cols]` into a row-major `Vec<f32>`.
///
/// The output length is `n_rows * n_cols` and `out[r * n_cols + c] ==
/// raster[r][c]`. No normalisation beyond what the caller already
/// applied — we preserve the row-normalised form emitted by
/// `motif::build_raster` verbatim so the A/B comparison isolates "what
/// does SDPA add beyond the raster itself".
///
/// Empty rasters return an empty vector.
pub fn rate_histogram_encode(raster: &[Vec<f32>]) -> Vec<f32> {
if raster.is_empty() {
return Vec::new();
}
let n_cols = raster[0].len();
// Guard against ragged rasters (shouldn't occur from build_raster but
// we validate anyway — the public API surface treats this as input).
for row in raster.iter() {
debug_assert_eq!(
row.len(),
n_cols,
"rate_histogram_encode: ragged raster (row len differs from first)"
);
}
let n_rows = raster.len();
let mut out = Vec::with_capacity(n_rows * n_cols);
for row in raster {
// Explicit raw-bin-count copy; `extend_from_slice` compiles to a
// `memcpy` on contiguous data. No projection, no attention.
out.extend_from_slice(row);
}
out
}
/// Build motif embeddings over sliding windows using the rate-histogram
/// encoder and index them. Mirrors `motif::retrieve_motifs` so the two
/// paths can be swapped at call sites without other changes. Returns
/// the index plus the top-k repeated motifs.
///
/// The sliding-window schedule, the in-memory kNN index, and the
/// dominant-class accounting are *identical* to the SDPA path — the
/// only difference is the per-window embedding function. This is the
/// A/B invariant the diagnostic test relies on.
pub fn rate_histogram_retrieve_motifs(
cfg: &AnalysisConfig,
conn: &Connectome,
spikes: &[Spike],
k: usize,
) -> (MotifIndex, Vec<MotifHit>) {
let mut index = MotifIndex::new(cfg.index_capacity);
if spikes.is_empty() {
return (index, Vec::new());
}
let t_end = spikes.last().map(|s| s.t_ms).unwrap_or(0.0);
let win = cfg.motif_window_ms;
let bins = cfg.motif_bins;
let step = win / 2.0;
let mut t = 0.0;
while t + win <= t_end + step {
let (raster, meta) = build_raster(conn, spikes, t, win, bins);
if meta.spike_count == 0 {
t += step;
continue;
}
let vec = rate_histogram_encode(&raster);
index.insert(
vec,
MotifWindow {
t_center_ms: t + win * 0.5,
spike_count: meta.spike_count,
dominant_class_idx: meta.dominant_class_idx,
},
);
t += step;
}
let hits = index.top_k(k);
(index, hits)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn encode_empty_raster_returns_empty_vec() {
let raster: Vec<Vec<f32>> = Vec::new();
let v = rate_histogram_encode(&raster);
assert!(v.is_empty());
}
#[test]
fn encode_is_row_major_and_preserves_values() {
// 3 rows × 4 cols — pick distinct values so we can spot
// row-vs-column-major mistakes.
let raster: Vec<Vec<f32>> = vec![
vec![1.0, 2.0, 3.0, 4.0],
vec![5.0, 6.0, 7.0, 8.0],
vec![9.0, 10.0, 11.0, 12.0],
];
let flat = rate_histogram_encode(&raster);
assert_eq!(flat.len(), 12);
for r in 0..3 {
for c in 0..4 {
assert_eq!(flat[r * 4 + c], raster[r][c]);
}
}
}
#[test]
fn encode_is_deterministic_across_runs() {
let raster: Vec<Vec<f32>> = vec![
vec![0.1, 0.2, 0.3],
vec![0.4, 0.5, 0.6],
vec![0.7, 0.8, 0.9],
];
let a = rate_histogram_encode(&raster);
let b = rate_histogram_encode(&raster);
assert_eq!(a.len(), b.len());
for (x, y) in a.iter().zip(b.iter()) {
assert_eq!(x.to_bits(), y.to_bits(), "bit-level determinism required");
}
}
}

View file

@ -0,0 +1,349 @@
#![allow(clippy::needless_range_loop)]
//! ADR-154 §17 item 10 follow-up — encoder-vs-substrate diagnostic.
//!
//! The shipped SDPA + deterministic-low-rank-projection motif encoder
//! was measured protocol-blind on this substrate: expanded-corpus AC-2
//! at 8 protocols landed at `precision@5 = 0.117` (random = 0.125). The
//! ADR names three axes to fix this — different encoder, different
//! substrate, different labels — and asks that the cheapest axis
//! (encoder) be investigated first with a controlled A/B.
//!
//! This test is that A/B. It runs the same 8-protocol labeled corpus
//! through BOTH encoders and reports precision@5 side-by-side. The
//! test is **publish-only**: it does not gate on absolute precision
//! numbers. It fails only on non-deterministic output, malformed
//! vectors, or an empty corpus — the AC-2 precision numbers go into
//! the ADR §17 table, not into a regression gate.
//!
//! Interpretation rubric (fill in the commit message from the
//! printed verdict):
//!
//! - rate > SDPA by a meaningful margin (≥ 0.05) → SDPA is actively
//! hurting on this substrate.
//! - rate ≈ SDPA (within 0.05) → encoder is NOT the bottleneck; try
//! substrate or labels next.
//! - rate < SDPA → rate histogram is actively worse; SDPA at least
//! preserves some protocol-specific signal.
use connectome_fly::{
Analysis, AnalysisConfig, Connectome, ConnectomeConfig, Engine, EngineConfig, MotifIndex,
NeuronId, Observer, Spike, Stimulus,
};
// -----------------------------------------------------------------
// 8-protocol corpus
// -----------------------------------------------------------------
/// One stimulus protocol in the 8-protocol labeled corpus.
///
/// Axes (ADR-154 §17 item 10 mirrors this):
/// - `sensory_subset` — 0 = first half of sensory neurons, 1 = second half.
/// - `freq_hz` — pulse-train rate.
/// - `amplitude_pa` — per-pulse charge.
/// - `duration_ms` — pulse-train window width.
#[derive(Clone, Copy, Debug)]
struct Protocol {
id: u8,
sensory_subset: u8,
freq_hz: f32,
amplitude_pa: f32,
duration_ms: f32,
}
/// Build an 8-protocol corpus spanning the four axes called out in
/// ADR-154 §17 item 10. The eight points are an asymmetric partial
/// factorial — not all 2⁴ combinations (the budget is 8 protocols) —
/// chosen so every axis varies at least once against the P0 baseline.
fn eight_protocols() -> [Protocol; 8] {
[
Protocol {
id: 0,
sensory_subset: 0,
freq_hz: 60.0,
amplitude_pa: 80.0,
duration_ms: 200.0,
},
Protocol {
id: 1,
sensory_subset: 0,
freq_hz: 60.0,
amplitude_pa: 80.0,
duration_ms: 300.0,
},
Protocol {
id: 2,
sensory_subset: 0,
freq_hz: 60.0,
amplitude_pa: 130.0,
duration_ms: 200.0,
},
Protocol {
id: 3,
sensory_subset: 0,
freq_hz: 120.0,
amplitude_pa: 80.0,
duration_ms: 200.0,
},
Protocol {
id: 4,
sensory_subset: 1,
freq_hz: 60.0,
amplitude_pa: 80.0,
duration_ms: 200.0,
},
Protocol {
id: 5,
sensory_subset: 1,
freq_hz: 120.0,
amplitude_pa: 130.0,
duration_ms: 200.0,
},
Protocol {
id: 6,
sensory_subset: 0,
freq_hz: 120.0,
amplitude_pa: 130.0,
duration_ms: 300.0,
},
Protocol {
id: 7,
sensory_subset: 1,
freq_hz: 60.0,
amplitude_pa: 130.0,
duration_ms: 300.0,
},
]
}
/// Build the current-injection schedule for one protocol.
fn stimulus_for(conn: &Connectome, p: &Protocol) -> Stimulus {
let sensory = conn.sensory_neurons();
let half = sensory.len() / 2;
let subset: Vec<NeuronId> = if p.sensory_subset == 0 {
sensory[..half].to_vec()
} else {
sensory[half..].to_vec()
};
Stimulus::pulse_train(&subset, 20.0, p.duration_ms, p.amplitude_pa, p.freq_hz)
}
/// Run one protocol through a fresh LIF engine and return all spikes
/// plus the simulation end-time.
fn run_protocol(conn: &Connectome, p: &Protocol) -> (f32, Vec<Spike>) {
let stim = stimulus_for(conn, p);
let mut eng = Engine::new(conn, EngineConfig::default());
let mut obs = Observer::new(conn.num_neurons());
let t_end = 20.0 + p.duration_ms + 80.0;
eng.run_with(&stim, &mut obs, t_end);
(t_end, obs.spikes().to_vec())
}
/// One labeled motif vector: the encoder output plus the protocol id
/// it was produced under.
#[derive(Clone, Debug)]
struct LabeledVec {
vector: Vec<f32>,
protocol_id: u8,
}
/// Run all 8 protocols and collect labeled motif vectors from the
/// given encoder (SDPA via `retrieve_motifs`, rate histogram via
/// `retrieve_motifs_rate`). `encoder_fn` takes `(connectome, spikes)`
/// and returns the populated motif index; the caller decides which
/// `Analysis` method to call.
fn collect_labeled_vectors<F>(
conn: &Connectome,
protocols: &[Protocol],
mut encoder_fn: F,
) -> Vec<LabeledVec>
where
F: FnMut(&Connectome, &[Spike]) -> MotifIndex,
{
let mut labeled: Vec<LabeledVec> = Vec::new();
for p in protocols {
let (_t_end, spikes) = run_protocol(conn, p);
if spikes.is_empty() {
continue;
}
let index = encoder_fn(conn, &spikes);
for v in index.vectors() {
labeled.push(LabeledVec {
vector: v.clone(),
protocol_id: p.id,
});
}
}
labeled
}
// -----------------------------------------------------------------
// Precision@k
// -----------------------------------------------------------------
#[inline]
fn l2(a: &[f32], b: &[f32]) -> f32 {
let mut s = 0.0_f32;
let n = a.len().min(b.len());
for i in 0..n {
let d = a[i] - b[i];
s += d * d;
}
s.sqrt()
}
/// Labeled precision@k: for each labeled vector, brute-force find its
/// top-k nearest neighbours in the labeled corpus (excluding itself),
/// count how many share its label. Returns the mean across the corpus.
fn precision_at_k(corpus: &[LabeledVec], k: usize) -> f32 {
if corpus.len() < 2 || k == 0 {
return 0.0;
}
let mut total = 0.0_f32;
for (qi, q) in corpus.iter().enumerate() {
// Score every other vector; keep top-k smallest distances.
let mut pairs: Vec<(f32, u8)> = Vec::with_capacity(corpus.len() - 1);
for (ci, c) in corpus.iter().enumerate() {
if ci == qi {
continue;
}
pairs.push((l2(&q.vector, &c.vector), c.protocol_id));
}
pairs.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
let take = k.min(pairs.len());
let hits = pairs[..take]
.iter()
.filter(|(_, lbl)| *lbl == q.protocol_id)
.count();
total += hits as f32 / take as f32;
}
total / corpus.len() as f32
}
// -----------------------------------------------------------------
// Main A/B diagnostic
// -----------------------------------------------------------------
#[test]
fn ac_2_encoder_comparison_sdpa_vs_rate_histogram() {
// Same connectome for both encoders — isolates the encoder as the
// only variable.
let conn = Connectome::generate(&ConnectomeConfig::default());
let protocols = eight_protocols();
// Motif-window config matches the expanded-corpus AC-2 test
// described in ADR-154 §17 item 10: 20 ms windows, 10 bins. The
// index is large enough to hold every window from all 8 protocols
// (≈ 8 × 20 = 160 at 200 ms, more at 300 ms).
let cfg = AnalysisConfig {
motif_window_ms: 20.0,
motif_bins: 10,
index_capacity: 1024,
..AnalysisConfig::default()
};
let an = Analysis::new(cfg.clone());
// ---- SDPA path (shipped) ----
let sdpa_corpus = collect_labeled_vectors(&conn, &protocols, |c, sp| {
let (index, _hits) = an.retrieve_motifs(c, sp, 5);
index
});
// ---- Rate-histogram path (this commit) ----
let rate_corpus = collect_labeled_vectors(&conn, &protocols, |c, sp| {
let (index, _hits) = an.retrieve_motifs_rate(c, sp, 5);
index
});
// ---- Hard asserts: diagnostic sanity, NOT precision floor ----
assert!(
!sdpa_corpus.is_empty(),
"SDPA corpus is empty — LIF engine or SDPA path failed"
);
assert!(
!rate_corpus.is_empty(),
"rate-histogram corpus is empty — LIF engine or rate path failed"
);
// Both encoders see the same windows — they must produce the same
// count of labeled vectors. If this differs the A/B is invalid
// (one path is dropping or inserting windows the other isn't).
assert_eq!(
sdpa_corpus.len(),
rate_corpus.len(),
"corpus size mismatch ({} SDPA vs {} rate) — one encoder is \
filtering differently, invalidating the A/B",
sdpa_corpus.len(),
rate_corpus.len()
);
// Each protocol must be represented — otherwise the 1/8 random
// baseline is not the right floor.
let mut counts = [0_u32; 8];
for v in &sdpa_corpus {
counts[v.protocol_id as usize] += 1;
}
let distinct = counts.iter().filter(|c| **c > 0).count();
assert!(
distinct >= 2,
"corpus collapsed to {distinct} distinct protocols out of 8 \
random baseline not comparable"
);
// Determinism check: re-run the rate path and confirm bit-identical
// vectors. (SDPA relies on an external crate whose internal ordering
// we don't gate here; the rate path has no RNG so MUST be exact.)
let rate_corpus_b = collect_labeled_vectors(&conn, &protocols, |c, sp| {
let (index, _hits) = an.retrieve_motifs_rate(c, sp, 5);
index
});
assert_eq!(rate_corpus.len(), rate_corpus_b.len());
for (a, b) in rate_corpus.iter().zip(rate_corpus_b.iter()) {
assert_eq!(a.vector.len(), b.vector.len(), "rate: vector length drift");
for (x, y) in a.vector.iter().zip(b.vector.iter()) {
assert_eq!(
x.to_bits(),
y.to_bits(),
"rate encoder is non-deterministic"
);
}
}
// Malformed-vector guard: every rate vector should have length
// 15 * motif_bins (15 classes in the connectome).
let expected_dim = 15 * cfg.motif_bins;
for v in &rate_corpus {
assert_eq!(
v.vector.len(),
expected_dim,
"rate vector has dim {} expected {expected_dim}",
v.vector.len()
);
}
// ---- Soft measurement: precision@5 ----
let k = 5;
let sdpa_p = precision_at_k(&sdpa_corpus, k);
let rate_p = precision_at_k(&rate_corpus, k);
let delta = rate_p - sdpa_p;
let random_baseline = 1.0 / 8.0;
// Verdict marker for the ADR §17 follow-up row.
let marker = if delta > 0.05 {
"PASS (rate > SDPA — SDPA is actively hurting)"
} else if delta < -0.05 {
"MISS (rate < SDPA — rate histogram is actively worse)"
} else {
"TIE (rate ≈ SDPA — encoder is NOT the bottleneck; try substrate or labels)"
};
eprintln!(
"ac-2-encoder-comparison:\n\
corpus_size = {} windows\n\
distinct_protocols = {}/8\n\
SDPA precision@{k} = {sdpa_p:.3}\n\
rate precision@{k} = {rate_p:.3}\n\
delta (rate - SDPA) = {delta:+.3}\n\
random baseline (1/8) = {random_baseline:.3}\n\
verdict = {marker}",
sdpa_corpus.len(),
distinct,
);
}