mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-30 03:53:34 +00:00
sec(hailo): HTTP/2 keepalive ping for dead-peer reclaim (iter 184)
tonic's default leaves http2_keepalive_interval=None, so a half-closed
TCP connection (client crashed, NAT mid-flow drop, network partition)
sits in the worker's accept table indefinitely, holding stream state
that the iter-181 max_concurrent_streams cap can't reclaim. Add a
60 s server-initiated PING; if the client doesn't PONG within hyper's
default 20 s timeout, the connection is closed and its state freed.
Operators can tune via `RUVECTOR_HTTP2_KEEPALIVE_SECS`. 0 disables
the feature entirely (cellular metering, ping-hostile networks).
Floor 10 s so a misconfig can't saturate the link with pings.
Validated on cognitum-v0, c=8 b=1, 8 s × 3 runs:
iter-183 baseline: 70.5, 70.5, 69.6 → mean 70.2/sec
iter-184 after : 70.6, 69.0, 70.5 → mean 70.0/sec
Δ throughput: -0.3% (within run-to-run noise; the 60 s ping interval falls
outside the 8 s bench window, so no PINGs even fire
during measurement)
Δ p50 : flat at 110-112 ms
Net new behavior: half-closed peers are now reclaimed in ≤80 s (60 s ping
interval + 20 s pong timeout) instead of waiting on the TCP keepalive
default (sysctl tcp_keepalive_time = 2 hours). Combined with iter-181's
256-stream cap, the worker can no longer accumulate orphan stream state
from disappearing clients.
Five gates now in the worker startup banner: byte cap (180), stream
cap (181), RPC timeout (182), rapid-reset cap (183), keepalive (184).
Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
520e892493
commit
5f597dec83
1 changed file with 34 additions and 1 deletion
|
|
@ -48,6 +48,12 @@
|
|||
//! floor 8). Caps unprocessed RST_STREAM
|
||||
//! frames; once exceeded, the server
|
||||
//! sends GOAWAY and closes the connection.
|
||||
//! RUVECTOR_HTTP2_KEEPALIVE_SECS HTTP/2 keepalive ping interval
|
||||
//! (ADR-172 §3a iter 184 — default 60,
|
||||
//! floor 10, 0 = disabled). Reclaims
|
||||
//! half-closed TCP state from crashed or
|
||||
//! partitioned clients; pong timeout is
|
||||
//! hyper's default 20 s.
|
||||
//!
|
||||
//! When both `RUVECTOR_TLS_CERT` and `RUVECTOR_TLS_KEY` are set and the
|
||||
//! binary was built with `--features tls`, the worker serves over HTTPS
|
||||
|
|
@ -585,10 +591,37 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
max_pending_resets,
|
||||
"HTTP/2 max_pending_accept_reset_streams set (ADR-172 §3a iter 183 CVE-2023-44487 gate)"
|
||||
);
|
||||
// Iter 184 — HTTP/2 keepalive ping. tonic's default is no
|
||||
// keepalive, so a half-closed TCP connection (client crashed,
|
||||
// NAT mid-flow drop, network partition) sits in the worker's
|
||||
// accept table indefinitely, holding stream state. With a
|
||||
// 60 s ping interval the worker probes idle peers; if no PONG
|
||||
// arrives within the (hyper-default) 20 s timeout, the
|
||||
// connection is closed and its state reclaimed. Operators can
|
||||
// tune via `RUVECTOR_HTTP2_KEEPALIVE_SECS`; 0 disables the
|
||||
// feature for environments where ping traffic is undesirable
|
||||
// (e.g. cellular metering). Floor 10 s so a misconfig can't
|
||||
// saturate the link with pings.
|
||||
let keepalive_secs: u64 = std::env::var("RUVECTOR_HTTP2_KEEPALIVE_SECS")
|
||||
.ok()
|
||||
.and_then(|s| s.parse::<u64>().ok())
|
||||
.unwrap_or(60);
|
||||
let keepalive = if keepalive_secs == 0 {
|
||||
info!("HTTP/2 keepalive disabled (RUVECTOR_HTTP2_KEEPALIVE_SECS=0)");
|
||||
None
|
||||
} else {
|
||||
let v = keepalive_secs.max(10);
|
||||
info!(
|
||||
http2_keepalive_secs = v,
|
||||
"HTTP/2 keepalive enabled (ADR-172 §3a iter 184 dead-peer reclaim)"
|
||||
);
|
||||
Some(Duration::from_secs(v))
|
||||
};
|
||||
let mut server = Server::builder()
|
||||
.max_concurrent_streams(Some(max_streams))
|
||||
.timeout(Duration::from_secs(request_timeout_secs))
|
||||
- .http2_max_pending_accept_reset_streams(Some(max_pending_resets));
|
||||
+ .http2_max_pending_accept_reset_streams(Some(max_pending_resets))
|
||||
+ .http2_keepalive_interval(keepalive);
|
||||
#[cfg(feature = "tls")]
|
||||
{
|
||||
// Both vars must be set to opt-in. A partial config (cert
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue