From d8b66d49dcb0ef0ecd5fc8676a652e55a237123e Mon Sep 17 00:00:00 2001 From: ruvnet Date: Sat, 2 May 2026 10:16:12 -0400 Subject: [PATCH] =?UTF-8?q?feat(ruvector-hailo-cluster):=20require=20finge?= =?UTF-8?q?rprint=20when=20--cache=20>=200=20(ADR-172=20=C2=A72a,=20iter?= =?UTF-8?q?=20101)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both `ruvector-hailo-embed` and `ruvector-hailo-cluster-bench` now refuse to start when `--cache > 0` is requested with an empty fingerprint, unless the operator explicitly opts in via `--allow-empty-fingerprint`. Empty-fingerprint + cache was the silent stale-serve risk: with no fingerprint to bind cache entries to, a vector produced by any worker running a different (or unknown) HEF version would be cached and served as if current, and clients would never notice. The gate fires before any RPC, with an error that names ADR-172 §2a so future operators searching the codebase land at the rationale. Three new CLI tests in tests/embed_cli.rs: - empty-fp + cache, no opt-in -> non-zero exit, gate message on stderr - --allow-empty-fingerprint -> success (escape hatch for legacy fleets) - --fingerprint + cache -> success (intended path) ADR-172 §2a marked MITIGATED, roadmap row updated. 125 tests green under --features tls (79 lib + 6 + 12 + 9 + 3 + 6 + 2 + 8); clippy --all-targets -D warnings clean for default + tls feature configs. 
Co-Authored-By: claude-flow --- .../ruvector-hailo-cluster/src/bin/bench.rs | 18 ++++ .../ruvector-hailo-cluster/src/bin/embed.rs | 25 +++++ .../ruvector-hailo-cluster/tests/embed_cli.rs | 98 +++++++++++++++++++ .../ADR-172-ruvector-hailo-security-review.md | 17 ++-- 4 files changed, 152 insertions(+), 6 deletions(-) diff --git a/crates/ruvector-hailo-cluster/src/bin/bench.rs b/crates/ruvector-hailo-cluster/src/bin/bench.rs index 67647086e..12e47afd5 100644 --- a/crates/ruvector-hailo-cluster/src/bin/bench.rs +++ b/crates/ruvector-hailo-cluster/src/bin/bench.rs @@ -44,6 +44,9 @@ fn main() -> Result<(), Box> { let mut quiet = false; let mut fingerprint: String = String::new(); let mut auto_fingerprint = false; + // ADR-172 §2a iter-101 gate — see embed.rs for the rationale; same + // refusal applies here because bench drives the same cluster code. + let mut allow_empty_fingerprint = false; let mut validate_fleet = false; // 0 = no background health-checker. >0 = probe every N seconds in // a background tokio task; mismatched fingerprints get hard-ejected @@ -68,6 +71,7 @@ fn main() -> Result<(), Box> { "--quiet" => { quiet = true; i += 1; } "--fingerprint" => { fingerprint = args.get(i + 1).cloned().unwrap_or_default(); i += 2; } "--auto-fingerprint" => { auto_fingerprint = true; i += 1; } + "--allow-empty-fingerprint" => { allow_empty_fingerprint = true; i += 1; } "--validate-fleet" => { validate_fleet = true; i += 1; } "--health-check" => { health_check_secs = args.get(i + 1).and_then(|s| s.parse().ok()).unwrap_or(0); @@ -159,6 +163,17 @@ fn main() -> Result<(), Box> { } } + // ADR-172 §2a mitigation (iter 101): same gate as embed.rs — refuse + // to enable cache without a fingerprint binding it. 
+ if cache_cap > 0 && fingerprint.is_empty() && !allow_empty_fingerprint { + return Err( + "refusing --cache > 0 with empty fingerprint (ADR-172 §2a); pass \ + --fingerprint or --auto-fingerprint, or opt out with \ + --allow-empty-fingerprint" + .into(), + ); + } + let cluster = Arc::new({ let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint)?; match (cache_cap, cache_ttl_secs) { @@ -487,6 +502,9 @@ OPTIONS: fingerprints. Empty = no enforcement. --auto-fingerprint Probe one worker for its fingerprint and use that as the expected value. + --allow-empty-fingerprint Opt out of the ADR-172 §2a safety gate + that refuses --cache > 0 with empty fp. + Risks silent stale-serve from drift. --validate-fleet Probe every worker on startup; refuse to bench (exit 2) if fleet has 0 healthy workers. Pairs with diff --git a/crates/ruvector-hailo-cluster/src/bin/embed.rs b/crates/ruvector-hailo-cluster/src/bin/embed.rs index ac121cb28..9751adf5e 100644 --- a/crates/ruvector-hailo-cluster/src/bin/embed.rs +++ b/crates/ruvector-hailo-cluster/src/bin/embed.rs @@ -41,6 +41,10 @@ fn main() -> Result<(), Box> { let mut validate_fleet = false; let mut validate_only = false; let mut auto_fingerprint = false; + // ADR-172 §2a iter-101 gate: if --cache > 0 is requested but the + // fingerprint is empty (and didn't get filled in by --auto-fingerprint), + // refuse to start unless the operator explicitly opted in. + let mut allow_empty_fingerprint = false; let mut request_id: String = String::new(); // "head" (default), "full", "none". head = first 8 components; // full = entire vector; none = drop the vector, keep dim + latency. 
@@ -88,6 +92,7 @@ fn main() -> Result<(), Box> { "--validate-fleet" => { validate_fleet = true; i += 1; } "--validate-only" => { validate_only = true; validate_fleet = true; i += 1; } "--auto-fingerprint" => { auto_fingerprint = true; i += 1; } + "--allow-empty-fingerprint" => { allow_empty_fingerprint = true; i += 1; } "--request-id" => { request_id = args.get(i + 1).cloned().unwrap_or_default(); i += 2; } "--output" => { let v = args.get(i + 1).cloned().unwrap_or_default(); @@ -205,6 +210,20 @@ fn main() -> Result<(), Box> { } } + // ADR-172 §2a HIGH-MEDIUM mitigation (iter 101): refuse to enable + // the in-process cache without a fingerprint to bind it to. An empty + // fingerprint means *any* worker can poison the cache (silent stale + // serve from a mismatched HEF/vocab). Operators who explicitly want + // the legacy behavior pass --allow-empty-fingerprint. + if cache_cap > 0 && fingerprint.is_empty() && !allow_empty_fingerprint { + return Err( + "refusing --cache > 0 with empty fingerprint (ADR-172 §2a); pass \ + --fingerprint or --auto-fingerprint, or opt out with \ + --allow-empty-fingerprint" + .into(), + ); + } + let cluster = { let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint)?; match (cache_cap, cache_ttl_secs) { @@ -496,6 +515,12 @@ OPTIONS: and use that as the expected value. Pairs with --validate-fleet to auto-discover then enforce homogeneity. + --allow-empty-fingerprint Opt out of the ADR-172 §2a safety gate + that refuses --cache > 0 when the + fingerprint is empty. Useful only for + legacy fleets that haven't published a + fingerprint yet; risks silent stale- + serve from a mismatched HEF. --request-id Caller-supplied tracing token sent with every embed RPC via gRPC metadata. 
Workers' tracing spans diff --git a/crates/ruvector-hailo-cluster/tests/embed_cli.rs b/crates/ruvector-hailo-cluster/tests/embed_cli.rs index 88221af7f..e13c9c0a3 100644 --- a/crates/ruvector-hailo-cluster/tests/embed_cli.rs +++ b/crates/ruvector-hailo-cluster/tests/embed_cli.rs @@ -172,6 +172,104 @@ fn embed_cli_version_flag_prints_pkg_name_and_version() { } } +#[test] +fn embed_cli_cache_with_empty_fingerprint_refuses_without_opt_in() { + // ADR-172 §2a iter-101 gate: --cache > 0 with no fingerprint and + // no --allow-empty-fingerprint must fail loud, before any RPC fires. + let port = free_port(); + let mut worker = spawn_fakeworker(port, 4, ""); + + let out = Command::new(EMBED) + .args([ + "--workers", &format!("127.0.0.1:{}", port), + "--dim", "4", + "--cache", "16", + "--text", "hello", + "--quiet", + ]) + .output() + .expect("run embed"); + + let _ = worker.kill(); + let _ = worker.wait(); + + assert!(!out.status.success(), "expected non-zero exit"); + let stderr = String::from_utf8_lossy(&out.stderr); + assert!( + stderr.contains("ADR-172 §2a") || stderr.contains("empty fingerprint"), + "stderr should reference the ADR-172 §2a gate, got: {}", + stderr + ); + // No vector should have been emitted on stdout. + let stdout = String::from_utf8_lossy(&out.stdout); + assert!(stdout.trim().is_empty(), "stdout should be empty, got: {}", stdout); +} + +#[test] +fn embed_cli_cache_with_explicit_opt_in_runs() { + // The escape hatch: --allow-empty-fingerprint lets legacy fleets + // keep their old behavior (cache enabled, fingerprint empty). 
+ let port = free_port(); + let mut worker = spawn_fakeworker(port, 4, ""); + + let out = Command::new(EMBED) + .args([ + "--workers", &format!("127.0.0.1:{}", port), + "--dim", "4", + "--cache", "16", + "--allow-empty-fingerprint", + "--text", "hello", + "--quiet", + ]) + .output() + .expect("run embed"); + + let _ = worker.kill(); + let _ = worker.wait(); + + assert!( + out.status.success(), + "expected success with --allow-empty-fingerprint, got {:?}, stderr: {}", + out.status, + String::from_utf8_lossy(&out.stderr), + ); + let stdout = String::from_utf8_lossy(&out.stdout); + assert!( + stdout.contains("\"text\":\"hello\""), + "expected JSON line, got: {}", + stdout + ); +} + +#[test] +fn embed_cli_cache_with_fingerprint_passes_gate() { + // The intended path: pass --fingerprint and the gate is happy. + let port = free_port(); + let mut worker = spawn_fakeworker(port, 4, "fp:test"); + + let out = Command::new(EMBED) + .args([ + "--workers", &format!("127.0.0.1:{}", port), + "--dim", "4", + "--cache", "16", + "--fingerprint", "fp:test", + "--text", "hello", + "--quiet", + ]) + .output() + .expect("run embed"); + + let _ = worker.kill(); + let _ = worker.wait(); + + assert!( + out.status.success(), + "expected success with --fingerprint set, got {:?}, stderr: {}", + out.status, + String::from_utf8_lossy(&out.stderr), + ); +} + #[test] fn embed_cli_validate_fleet_with_wrong_fingerprint_exits_nonzero() { let port = free_port(); diff --git a/docs/adr/ADR-172-ruvector-hailo-security-review.md b/docs/adr/ADR-172-ruvector-hailo-security-review.md index 06672c89e..62ecfe557 100644 --- a/docs/adr/ADR-172-ruvector-hailo-security-review.md +++ b/docs/adr/ADR-172-ruvector-hailo-security-review.md @@ -82,14 +82,19 @@ the expected model fingerprint to be dispatched to. ### 2. Cache integrity / poisoning — MEDIUM -**2a. Empty `expected_model_fingerprint` skips integrity check.** +**2a. 
Empty `expected_model_fingerprint` skips integrity check.** [✅ MITIGATED — iter 101] Default-empty in CLI flags, tests, demos, and examples. Operator opting -into `--auto-fingerprint` is the only thing protecting them — and +into `--auto-fingerprint` was the only thing protecting them — and auto-fingerprint trusts the first-reachable worker. -*Mitigation:* Make `--fingerprint ` required when `--cache > 0` -(opt-out via explicit `--allow-empty-fingerprint`). Document the -"silently serve stale" failure mode in BENCHMARK.md. +*Mitigation (shipped iter 101):* Both `ruvector-hailo-embed` and +`ruvector-hailo-cluster-bench` now refuse to start when `--cache > 0` +is requested with an empty fingerprint, unless the operator explicitly +opts in via `--allow-empty-fingerprint`. Refusal happens before any RPC +fires; the error message names ADR-172 §2a so operators searching for +it land here. Tested end-to-end via 3 new cases in `tests/embed_cli.rs`: +(1) refusal without opt-in, (2) success with `--allow-empty-fingerprint`, +(3) success with `--fingerprint ` set. **2b. Worker-reported fingerprint is trusted blindly.** A malicious worker can claim any fingerprint. Cache key includes the @@ -219,7 +224,7 @@ session key. Out-of-band key exchange via QR code at provisioning. | 92 | HIGH | 1b — mTLS client auth | --require-client-cert worker flag (✅ shipped iter 100 via RUVECTOR_TLS_CLIENT_CA) | | 92 | MEDIUM | 5c — cargo-audit CI | new workflow + initial vuln triage | | 93 | MEDIUM | 3a — drop root | new user + udev rule + install.sh update | -| 93 | MEDIUM | 2a — fp required with cache | CLI flag enforcement + docs | +| 93 | MEDIUM | 2a — fp required with cache | CLI flag enforcement + docs (✅ shipped iter 101) | | 94 | MEDIUM | 3b — per-peer rate limit | governor interceptor | | 94 | MEDIUM | 2b — auto-fp quorum requirement | discover_fingerprint quorum mode | | 95 | MEDIUM | 3c — log text hash mode | --log-text-content flag |