mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-30 20:43:38 +00:00
feat(ruvector-hailo-cluster): require fingerprint when --cache > 0 (ADR-172 §2a, iter 101)
Both `ruvector-hailo-embed` and `ruvector-hailo-cluster-bench` now refuse to start when `--cache > 0` is requested with an empty fingerprint, unless the operator explicitly opts in via `--allow-empty-fingerprint`.

Empty-fingerprint + cache was the silent stale-serve risk: with no fingerprint to bind cache entries to, a worker running a different (or unset) HEF version could poison the cache, and clients would never notice. The gate fires before any RPC, with an error that names ADR-172 §2a so future operators searching the codebase land at the rationale.

Three new CLI tests in tests/embed_cli.rs:
- empty-fp + cache, no opt-in -> non-zero exit, gate message on stderr
- --allow-empty-fingerprint -> success (escape hatch for legacy fleets)
- --fingerprint <hex> + cache -> success (intended path)

ADR-172 §2a marked MITIGATED, roadmap row updated.

125 tests green under --features tls (79 lib + 6 + 12 + 9 + 3 + 6 + 2 + 8); clippy --all-targets -D warnings clean for default + tls feature configs.

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
165d317793
commit
d8b66d49dc
4 changed files with 152 additions and 6 deletions
|
|
@ -44,6 +44,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
let mut quiet = false;
|
||||
let mut fingerprint: String = String::new();
|
||||
let mut auto_fingerprint = false;
|
||||
// ADR-172 §2a iter-101 gate — see embed.rs for the rationale; same
|
||||
// refusal applies here because bench drives the same cluster code.
|
||||
let mut allow_empty_fingerprint = false;
|
||||
let mut validate_fleet = false;
|
||||
// 0 = no background health-checker. >0 = probe every N seconds in
|
||||
// a background tokio task; mismatched fingerprints get hard-ejected
|
||||
|
|
@ -68,6 +71,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
"--quiet" => { quiet = true; i += 1; }
|
||||
"--fingerprint" => { fingerprint = args.get(i + 1).cloned().unwrap_or_default(); i += 2; }
|
||||
"--auto-fingerprint" => { auto_fingerprint = true; i += 1; }
|
||||
"--allow-empty-fingerprint" => { allow_empty_fingerprint = true; i += 1; }
|
||||
"--validate-fleet" => { validate_fleet = true; i += 1; }
|
||||
"--health-check" => {
|
||||
health_check_secs = args.get(i + 1).and_then(|s| s.parse().ok()).unwrap_or(0);
|
||||
|
|
@ -159,6 +163,17 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
}
|
||||
}
|
||||
|
||||
// ADR-172 §2a mitigation (iter 101): same gate as embed.rs — refuse
|
||||
// to enable cache without a fingerprint binding it.
|
||||
if cache_cap > 0 && fingerprint.is_empty() && !allow_empty_fingerprint {
|
||||
return Err(
|
||||
"refusing --cache > 0 with empty fingerprint (ADR-172 §2a); pass \
|
||||
--fingerprint <hex> or --auto-fingerprint, or opt out with \
|
||||
--allow-empty-fingerprint"
|
||||
.into(),
|
||||
);
|
||||
}
|
||||
|
||||
let cluster = Arc::new({
|
||||
let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint)?;
|
||||
match (cache_cap, cache_ttl_secs) {
|
||||
|
|
@ -487,6 +502,9 @@ OPTIONS:
|
|||
fingerprints. Empty = no enforcement.
|
||||
--auto-fingerprint Probe one worker for its fingerprint
|
||||
and use that as the expected value.
|
||||
--allow-empty-fingerprint Opt out of the ADR-172 §2a safety gate
|
||||
that refuses --cache > 0 with empty fp.
|
||||
Risks silent stale-serve from drift.
|
||||
--validate-fleet Probe every worker on startup;
|
||||
refuse to bench (exit 2) if fleet
|
||||
has 0 healthy workers. Pairs with
|
||||
|
|
|
|||
|
|
@ -41,6 +41,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
let mut validate_fleet = false;
|
||||
let mut validate_only = false;
|
||||
let mut auto_fingerprint = false;
|
||||
// ADR-172 §2a iter-101 gate: if --cache > 0 is requested but the
|
||||
// fingerprint is empty (and didn't get filled in by --auto-fingerprint),
|
||||
// refuse to start unless the operator explicitly opted in.
|
||||
let mut allow_empty_fingerprint = false;
|
||||
let mut request_id: String = String::new();
|
||||
// "head" (default), "full", "none". head = first 8 components;
|
||||
// full = entire vector; none = drop the vector, keep dim + latency.
|
||||
|
|
@ -88,6 +92,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
"--validate-fleet" => { validate_fleet = true; i += 1; }
|
||||
"--validate-only" => { validate_only = true; validate_fleet = true; i += 1; }
|
||||
"--auto-fingerprint" => { auto_fingerprint = true; i += 1; }
|
||||
"--allow-empty-fingerprint" => { allow_empty_fingerprint = true; i += 1; }
|
||||
"--request-id" => { request_id = args.get(i + 1).cloned().unwrap_or_default(); i += 2; }
|
||||
"--output" => {
|
||||
let v = args.get(i + 1).cloned().unwrap_or_default();
|
||||
|
|
@ -205,6 +210,20 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
}
|
||||
}
|
||||
|
||||
// ADR-172 §2a HIGH-MEDIUM mitigation (iter 101): refuse to enable
|
||||
// the in-process cache without a fingerprint to bind it to. An empty
|
||||
// fingerprint means *any* worker can poison the cache (silent stale
|
||||
// serve from a mismatched HEF/vocab). Operators who explicitly want
|
||||
// the legacy behavior pass --allow-empty-fingerprint.
|
||||
if cache_cap > 0 && fingerprint.is_empty() && !allow_empty_fingerprint {
|
||||
return Err(
|
||||
"refusing --cache > 0 with empty fingerprint (ADR-172 §2a); pass \
|
||||
--fingerprint <hex> or --auto-fingerprint, or opt out with \
|
||||
--allow-empty-fingerprint"
|
||||
.into(),
|
||||
);
|
||||
}
|
||||
|
||||
let cluster = {
|
||||
let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint)?;
|
||||
match (cache_cap, cache_ttl_secs) {
|
||||
|
|
@ -496,6 +515,12 @@ OPTIONS:
|
|||
and use that as the expected value.
|
||||
Pairs with --validate-fleet to
|
||||
auto-discover then enforce homogeneity.
|
||||
--allow-empty-fingerprint Opt out of the ADR-172 §2a safety gate
|
||||
that refuses --cache > 0 when the
|
||||
fingerprint is empty. Useful only for
|
||||
legacy fleets that haven't published a
|
||||
fingerprint yet; risks silent stale-
|
||||
serve from a mismatched HEF.
|
||||
--request-id <id> Caller-supplied tracing token sent
|
||||
with every embed RPC via gRPC
|
||||
metadata. Workers' tracing spans
|
||||
|
|
|
|||
|
|
@ -172,6 +172,104 @@ fn embed_cli_version_flag_prints_pkg_name_and_version() {
|
|||
}
|
||||
}
|
||||
|
||||
/// ADR-172 §2a iter-101 gate: `--cache > 0` with no fingerprint and no
/// `--allow-empty-fingerprint` must fail loud, before any RPC fires.
///
/// Runs the embed CLI with `--cache 16` and no fingerprint flags, then
/// checks for a non-zero exit, a gate message on stderr, and empty stdout.
#[test]
fn embed_cli_cache_with_empty_fingerprint_refuses_without_opt_in() {
    let port = free_port();
    // Third argument is presumably the fingerprint the fake worker reports
    // (empty here) — confirm against `spawn_fakeworker`.
    let mut worker = spawn_fakeworker(port, 4, "");
    let addr = format!("127.0.0.1:{}", port);

    // Capture the launch result without unwrapping it yet: the fake worker
    // must be reaped first, otherwise a failure to start the CLI would
    // panic here and leak the child process.
    let run = Command::new(EMBED)
        .args([
            "--workers", addr.as_str(),
            "--dim", "4",
            "--cache", "16",
            "--text", "hello",
            "--quiet",
        ])
        .output();

    let _ = worker.kill();
    let _ = worker.wait();

    let out = run.expect("run embed");

    assert!(!out.status.success(), "expected non-zero exit");
    let stderr = String::from_utf8_lossy(&out.stderr);
    assert!(
        stderr.contains("ADR-172 §2a") || stderr.contains("empty fingerprint"),
        "stderr should reference the ADR-172 §2a gate, got: {}",
        stderr
    );
    // No vector should have been emitted on stdout.
    let stdout = String::from_utf8_lossy(&out.stdout);
    assert!(stdout.trim().is_empty(), "stdout should be empty, got: {}", stdout);
}
|
||||
|
||||
/// The escape hatch: `--allow-empty-fingerprint` lets legacy fleets keep
/// their old behavior (cache enabled, fingerprint empty).
///
/// Runs the embed CLI with `--cache 16 --allow-empty-fingerprint` and checks
/// that it exits successfully and prints a JSON line for the input text.
#[test]
fn embed_cli_cache_with_explicit_opt_in_runs() {
    let port = free_port();
    // Third argument is presumably the fingerprint the fake worker reports
    // (empty here) — confirm against `spawn_fakeworker`.
    let mut worker = spawn_fakeworker(port, 4, "");
    let addr = format!("127.0.0.1:{}", port);

    // Capture the launch result without unwrapping it yet: the fake worker
    // must be reaped first, otherwise a failure to start the CLI would
    // panic here and leak the child process.
    let run = Command::new(EMBED)
        .args([
            "--workers", addr.as_str(),
            "--dim", "4",
            "--cache", "16",
            "--allow-empty-fingerprint",
            "--text", "hello",
            "--quiet",
        ])
        .output();

    let _ = worker.kill();
    let _ = worker.wait();

    let out = run.expect("run embed");

    assert!(
        out.status.success(),
        "expected success with --allow-empty-fingerprint, got {:?}, stderr: {}",
        out.status,
        String::from_utf8_lossy(&out.stderr),
    );
    let stdout = String::from_utf8_lossy(&out.stdout);
    assert!(
        stdout.contains("\"text\":\"hello\""),
        "expected JSON line, got: {}",
        stdout
    );
}
|
||||
|
||||
/// The intended path: pass `--fingerprint <hex>` and the gate is happy.
///
/// Runs the embed CLI with `--cache 16 --fingerprint fp:test` against a fake
/// worker started with the same `"fp:test"` value and checks for a
/// successful exit.
#[test]
fn embed_cli_cache_with_fingerprint_passes_gate() {
    let port = free_port();
    // Third argument is presumably the fingerprint the fake worker reports;
    // it matches the `--fingerprint` value below — confirm against
    // `spawn_fakeworker`.
    let mut worker = spawn_fakeworker(port, 4, "fp:test");
    let addr = format!("127.0.0.1:{}", port);

    // Capture the launch result without unwrapping it yet: the fake worker
    // must be reaped first, otherwise a failure to start the CLI would
    // panic here and leak the child process.
    let run = Command::new(EMBED)
        .args([
            "--workers", addr.as_str(),
            "--dim", "4",
            "--cache", "16",
            "--fingerprint", "fp:test",
            "--text", "hello",
            "--quiet",
        ])
        .output();

    let _ = worker.kill();
    let _ = worker.wait();

    let out = run.expect("run embed");

    assert!(
        out.status.success(),
        "expected success with --fingerprint set, got {:?}, stderr: {}",
        out.status,
        String::from_utf8_lossy(&out.stderr),
    );
}
|
||||
|
||||
#[test]
|
||||
fn embed_cli_validate_fleet_with_wrong_fingerprint_exits_nonzero() {
|
||||
let port = free_port();
|
||||
|
|
|
|||
|
|
@ -82,14 +82,19 @@ the expected model fingerprint to be dispatched to.
|
|||
|
||||
### 2. Cache integrity / poisoning — MEDIUM
|
||||
|
||||
**2a. Empty `expected_model_fingerprint` skips integrity check.**
|
||||
**2a. Empty `expected_model_fingerprint` skips integrity check.** [✅ MITIGATED — iter 101]
|
||||
Default-empty in CLI flags, tests, demos, and examples. Operator opting
|
||||
into `--auto-fingerprint` is the only thing protecting them — and
|
||||
into `--auto-fingerprint` was the only thing protecting them — and
|
||||
auto-fingerprint trusts the first-reachable worker.
|
||||
|
||||
*Mitigation:* Make `--fingerprint <hex>` required when `--cache > 0`
|
||||
(opt-out via explicit `--allow-empty-fingerprint`). Document the
|
||||
"silently serve stale" failure mode in BENCHMARK.md.
|
||||
*Mitigation (shipped iter 101):* Both `ruvector-hailo-embed` and
|
||||
`ruvector-hailo-cluster-bench` now refuse to start when `--cache > 0`
|
||||
is requested with an empty fingerprint, unless the operator explicitly
|
||||
opts in via `--allow-empty-fingerprint`. Refusal happens before any RPC
|
||||
fires; the error message names ADR-172 §2a so operators searching for
|
||||
it land here. Tested end-to-end via 3 new cases in `tests/embed_cli.rs`:
|
||||
(1) refusal without opt-in, (2) success with `--allow-empty-fingerprint`,
|
||||
(3) success with `--fingerprint <hex>` set.
|
||||
|
||||
**2b. Worker-reported fingerprint is trusted blindly.**
|
||||
A malicious worker can claim any fingerprint. Cache key includes the
|
||||
|
|
@ -219,7 +224,7 @@ session key. Out-of-band key exchange via QR code at provisioning.
|
|||
| 92 | HIGH | 1b — mTLS client auth | --require-client-cert worker flag (✅ shipped iter 100 via RUVECTOR_TLS_CLIENT_CA) |
|
||||
| 92 | MEDIUM | 5c — cargo-audit CI | new workflow + initial vuln triage |
|
||||
| 93 | MEDIUM | 3a — drop root | new user + udev rule + install.sh update |
|
||||
| 93 | MEDIUM | 2a — fp required with cache | CLI flag enforcement + docs |
|
||||
| 93 | MEDIUM | 2a — fp required with cache | CLI flag enforcement + docs (✅ shipped iter 101) |
|
||||
| 94 | MEDIUM | 3b — per-peer rate limit | governor interceptor |
|
||||
| 94 | MEDIUM | 2b — auto-fp quorum requirement | discover_fingerprint quorum mode |
|
||||
| 95 | MEDIUM | 3c — log text hash mode | --log-text-content flag |
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue