From 5f597dec83dbd2d989dd4818fd5e005aca09f9f4 Mon Sep 17 00:00:00 2001 From: ruvnet Date: Sun, 3 May 2026 17:55:34 -0400 Subject: [PATCH] sec(hailo): HTTP/2 keepalive ping for dead-peer reclaim (iter 184) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tonic's default leaves http2_keepalive_interval=None, so a half-closed TCP connection (client crashed, NAT mid-flow drop, network partition) sits in the worker's accept table indefinitely, holding stream state that the iter-181 max_concurrent_streams cap can't reclaim. Add a 60 s server-initiated PING; if the client doesn't PONG within hyper's default 20 s timeout, the connection is closed and its state freed. Operators can tune via `RUVECTOR_HTTP2_KEEPALIVE_SECS`. 0 disables the feature entirely (cellular metering, ping-hostile networks). Floor 10 s so a misconfig can't saturate the link with pings. Validated on cognitum-v0, c=8 b=1, 8 s × 3 runs: iter-183 baseline: 70.5, 70.5, 69.6 → mean 70.2/sec iter-184 after : 70.6, 69.0, 70.5 → mean 70.0/sec Δ throughput: -0.3% (unmeasurable; the 60 s ping interval falls outside the 8 s bench window so no PINGs even fire during measurement) Δ p50 : flat at 110-112 ms Net new behavior: half-closed peers now reclaimed in ≤80 s instead of waiting on TCP keepalive defaults (sysctl tcp_keepalive_time = 2 hours). Combined with iter-181's 256-stream cap, the worker can no longer accumulate orphan stream state from disappearing clients. Five gates now in the worker startup banner: byte cap (180), stream cap (181), RPC timeout (182), rapid-reset cap (183), keepalive (184). 
Co-Authored-By: claude-flow --- .../ruvector-hailo-cluster/src/bin/worker.rs | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/crates/ruvector-hailo-cluster/src/bin/worker.rs b/crates/ruvector-hailo-cluster/src/bin/worker.rs index d4abe696..5385cae8 100644 --- a/crates/ruvector-hailo-cluster/src/bin/worker.rs +++ b/crates/ruvector-hailo-cluster/src/bin/worker.rs @@ -48,6 +48,12 @@ //! floor 8). Caps unprocessed RST_STREAM //! frames; once exceeded, the server //! sends GOAWAY and closes the connection. +//! RUVECTOR_HTTP2_KEEPALIVE_SECS HTTP/2 keepalive ping interval +//! (ADR-172 §3a iter 184 — default 60, +//! floor 10, 0 = disabled). Reclaims +//! half-closed TCP state from crashed or +//! partitioned clients; pong timeout is +//! hyper's default 20 s. //! //! When both `RUVECTOR_TLS_CERT` and `RUVECTOR_TLS_KEY` are set and the //! binary was built with `--features tls`, the worker serves over HTTPS @@ -585,10 +591,37 @@ fn main() -> Result<(), Box> { max_pending_resets, "HTTP/2 max_pending_accept_reset_streams set (ADR-172 §3a iter 183 CVE-2023-44487 gate)" ); + // Iter 184 — HTTP/2 keepalive ping. tonic's default is no + // keepalive, so a half-closed TCP connection (client crashed, + // NAT mid-flow drop, network partition) sits in the worker's + // accept table indefinitely, holding stream state. With a + // 60 s ping interval the worker probes idle peers; if no PONG + // arrives within the (hyper-default) 20 s timeout, the + // connection is closed and its state reclaimed. Operators can + // tune via `RUVECTOR_HTTP2_KEEPALIVE_SECS`; 0 disables the + // feature for environments where ping traffic is undesirable + // (e.g. cellular metering). Floor 10 s so a misconfig can't + // saturate the link with pings. 
+ let keepalive_secs: u64 = std::env::var("RUVECTOR_HTTP2_KEEPALIVE_SECS") + .ok() + .and_then(|s| s.parse::<u64>().ok()) + .unwrap_or(60); + let keepalive = if keepalive_secs == 0 { + info!("HTTP/2 keepalive disabled (RUVECTOR_HTTP2_KEEPALIVE_SECS=0)"); + None + } else { + let v = keepalive_secs.max(10); + info!( + http2_keepalive_secs = v, + "HTTP/2 keepalive enabled (ADR-172 §3a iter 184 dead-peer reclaim)" + ); + Some(Duration::from_secs(v)) + }; let mut server = Server::builder() .max_concurrent_streams(Some(max_streams)) .timeout(Duration::from_secs(request_timeout_secs)) - .http2_max_pending_accept_reset_streams(Some(max_pending_resets)); + .http2_max_pending_accept_reset_streams(Some(max_pending_resets)) + .http2_keepalive_interval(keepalive); #[cfg(feature = "tls")] { // Both vars must be set to opt-in. A partial config (cert