mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-05-20 01:11:18 +00:00
test(e2e): add openclaw gateway kill/restart resilience test
Verifies that the openclaw gateway auto-restarts after being killed with SIGKILL, validating the systemd Restart=always supervision.

The test runs as part of verify_openclaw:
1. Confirms gateway is listening on :18789
2. Kills it with SIGKILL (simulates a hard crash)
3. Waits up to 30s for systemd to auto-restart it
4. Verifies port 18789 comes back online

If the gateway isn't running (e.g. non-systemd env), the test is skipped gracefully. On failure, dumps systemd status and gateway logs for diagnostics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ed98a59318
commit
39b79d5c12
1 changed files with 68 additions and 0 deletions
|
|
@@ -348,9 +348,77 @@ verify_openclaw() {
|
|||
failures=$((failures + 1))
|
||||
fi
|
||||
|
||||
# Gateway resilience: kill the gateway and verify it auto-restarts
|
||||
_openclaw_verify_gateway_resilience "${app}" || failures=$((failures + 1))
|
||||
|
||||
return "${failures}"
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _openclaw_verify_gateway_resilience APP_NAME
|
||||
#
|
||||
# Tests that the openclaw gateway auto-restarts after being killed:
|
||||
# 1. Verify gateway is running on :18789
|
||||
# 2. Kill it with SIGKILL (simulates a crash)
|
||||
# 3. Wait for systemd Restart=always to bring it back (~5-10s)
|
||||
# 4. Verify port 18789 is listening again
|
||||
# Returns 0 on success (gateway recovered), 1 on failure.
|
||||
# ---------------------------------------------------------------------------
|
||||
_openclaw_verify_gateway_resilience() {
  # Kill the openclaw gateway on the remote host and check that something
  # brings it back on port 18789.
  #
  # Arguments: $1 - app name, passed through to cloud_exec
  # Returns:   0 if the gateway recovered, or if the test was skipped because
  #              the gateway was not running / could not be killed
  #            1 if the port did not come back within 30s
  local app="$1"

  # Remote-side probe: succeeds when something is listening on :18789.
  # Kept as a string because it is spliced into the command sent via cloud_exec;
  # it tries ss, then bash's /dev/tcp, then nc, whichever the host has.
  local port_check='ss -tln 2>/dev/null | grep -q ":18789 " || (echo >/dev/tcp/127.0.0.1/18789) 2>/dev/null || nc -z 127.0.0.1 18789 2>/dev/null'

  # Remote-side environment prelude shared by the steps below. Single-quoted so
  # $HOME / $PATH are expanded on the remote host, not locally.
  local remote_env='source ~/.spawnrc 2>/dev/null; export PATH=$HOME/.npm-global/bin:$HOME/.bun/bin:$HOME/.local/bin:$PATH;'

  # Step 1: Confirm gateway is currently running
  log_step "Gateway resilience: checking gateway is running..."
  if ! cloud_exec "${app}" "${remote_env} ${port_check}" >/dev/null 2>&1; then
    log_warn "Gateway not running — skipping resilience test"
    return 0
  fi
  log_ok "Gateway resilience: gateway confirmed running on :18789"

  # Step 2: Kill the gateway with SIGKILL (simulate hard crash)
  #
  # - lsof is limited to LISTEN sockets so that clients merely connected to
  #   :18789 are not killed along with the gateway.
  # - The PID list is left whitespace-separated and expanded unquoted on the
  #   remote side on purpose, so several PIDs become several kill arguments.
  # - The remote command reports what happened (gw_kill=ok|nopid|failed) so a
  #   kill that never took place is not later mistaken for a restart.
  log_step "Gateway resilience: killing gateway (SIGKILL)..."
  local kill_result
  kill_result=$(cloud_exec "${app}" "${remote_env} \
    _gw_pid=\$(lsof -ti tcp:18789 -sTCP:LISTEN 2>/dev/null || fuser 18789/tcp 2>/dev/null); \
    if [ -z \"\$_gw_pid\" ]; then echo gw_kill=nopid; \
    elif kill -9 \$_gw_pid 2>/dev/null; then echo gw_kill=ok; \
    else echo gw_kill=failed; fi" 2>/dev/null) || true

  # Brief pause to let the process die
  sleep 2

  # Confirm it's actually down
  if cloud_exec "${app}" "${port_check}" >/dev/null 2>&1; then
    case "${kill_result}" in
      *gw_kill=ok*)
        # The kill was delivered; the port being open again may simply mean
        # the supervisor restarted the gateway within the pause.
        log_warn "Gateway resilience: port still open after kill — process may not have died"
        ;;
      *)
        # Nothing was killed and the port never closed: waiting for a
        # "restart" would pass trivially, so report the test as skipped.
        log_warn "Gateway resilience: could not kill gateway (no PID found or kill failed) — skipping resilience test"
        return 0
        ;;
    esac
  else
    log_ok "Gateway resilience: gateway confirmed dead"
  fi

  # Step 3: Wait for auto-restart (systemd Restart=always, RestartSec=5)
  # Allow up to 30s for systemd to detect the crash and restart the process.
  # The polling loop runs entirely on the remote host (one cloud_exec call).
  log_step "Gateway resilience: waiting for auto-restart (up to 30s)..."
  local recovered
  recovered=$(cloud_exec "${app}" "${remote_env} \
    elapsed=0; while [ \$elapsed -lt 30 ]; do \
      if ${port_check}; then echo 'recovered'; exit 0; fi; \
      sleep 1; elapsed=\$((elapsed + 1)); \
    done; echo 'timeout'" 2>&1) || true

  # Step 4: Check result
  case "${recovered}" in
    *recovered*)
      log_ok "Gateway resilience: gateway auto-restarted successfully"
      return 0
      ;;
  esac

  log_err "Gateway resilience: gateway did NOT restart within 30s"
  # Dump systemd status and the tail of the gateway log for diagnostics
  cloud_exec "${app}" "systemctl status openclaw-gateway 2>/dev/null || true; \
    tail -10 /tmp/openclaw-gateway.log 2>/dev/null || true" 2>&1 | tail -15 >&2
  return 1
}
|
||||
|
||||
verify_zeroclaw() {
|
||||
local app="$1"
|
||||
local failures=0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue