fix(e2e): add per-agent timeout to prevent silent hangs in E2E runs (#2720)

The E2E framework's run_single_agent function had no overall timeout.
When provision/verify/input_test steps hung (e.g. cloud_exec blocking
on sprite-zeroclaw or digitalocean-opencode), the process would stall
indefinitely without writing a .result file, causing silent test failures.

Add a per-agent wall-clock timeout (default 1800s, 2400s for junie) that
wraps the core provision/verify/input_test logic in a killable subshell.
If the timeout expires, the subshell is killed and a "fail" result is
written, ensuring E2E batches always complete.

Fixes #2714

Agent: code-health

Co-authored-by: B <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
A 2026-03-17 13:16:09 -07:00 committed by GitHub
parent ce91953649
commit 3630c07c70
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 96 additions and 6 deletions

View file

@ -192,16 +192,68 @@ run_single_agent() {
local status="fail"
# Provision -> Verify -> Input Test
if provision_agent "${agent}" "${app_name}" "${LOG_DIR}"; then
if verify_agent "${agent}" "${app_name}"; then
if run_input_test "${agent}" "${app_name}"; then
status="pass"
# ---------------------------------------------------------------------------
# Per-agent timeout: run provision/verify/input_test in a subshell with a
# wall-clock timeout. This prevents any single step from hanging indefinitely
# and ensures a result file is always written (pass or fail; a timeout is recorded as fail).
# Fixes #2714: sprite-zeroclaw and digitalocean-opencode stalling with no result.
# ---------------------------------------------------------------------------
local effective_agent_timeout
effective_agent_timeout=$(get_agent_timeout "${agent}")
log_info "Agent timeout: ${effective_agent_timeout}s"
local status_file="${LOG_DIR}/${app_name}.agent-status"
rm -f "${status_file}"
# Run core logic in a subshell so we can kill it on timeout
(
local _inner_status="fail"
if provision_agent "${agent}" "${app_name}" "${LOG_DIR}"; then
if verify_agent "${agent}" "${app_name}"; then
if run_input_test "${agent}" "${app_name}"; then
_inner_status="pass"
fi
fi
fi
printf '%s' "${_inner_status}" > "${status_file}"
) &
local agent_pid=$!
# Poll for completion or timeout (bash 3.2 compatible — no wait -n)
local agent_waited=0
while [ "${agent_waited}" -lt "${effective_agent_timeout}" ]; do
if [ -f "${status_file}" ]; then
break
fi
# Also break if the subshell exited without writing (crash/error)
if ! kill -0 "${agent_pid}" 2>/dev/null; then
break
fi
sleep 5
agent_waited=$((agent_waited + 5))
done
# Collect result or handle timeout
if [ -f "${status_file}" ]; then
status=$(cat "${status_file}")
wait "${agent_pid}" 2>/dev/null || true
elif kill -0 "${agent_pid}" 2>/dev/null; then
# Timed out — kill the subshell and its children
log_err "${agent} timed out after ${effective_agent_timeout}s — killing"
pkill -P "${agent_pid}" 2>/dev/null || true
kill "${agent_pid}" 2>/dev/null || true
wait "${agent_pid}" 2>/dev/null || true
status="fail"
else
# Subshell exited without writing status file (unexpected error)
log_err "${agent} subshell exited without writing status"
wait "${agent_pid}" 2>/dev/null || true
status="fail"
fi
# Teardown (always attempt)
rm -f "${status_file}"
# Teardown (always attempt, even after timeout)
teardown_agent "${app_name}" || log_warn "Teardown failed for ${app_name}"
local agent_end

View file

@ -9,11 +9,15 @@ ALL_AGENTS="claude openclaw zeroclaw codex opencode kilocode hermes junie"
# Timeout and wait settings, in seconds. Each value is taken from the
# environment only when it is a non-empty string of digits; otherwise the
# default shown here is used.
# These numeric env vars get interpolated into remote command strings, so a
# non-numeric value could lead to shell injection via SSH commands.
case "${PROVISION_TIMEOUT:-}" in
  '' | *[!0-9]*) PROVISION_TIMEOUT=720 ;;
esac
case "${INSTALL_WAIT:-}" in
  '' | *[!0-9]*) INSTALL_WAIT=600 ;;
esac
case "${INPUT_TEST_TIMEOUT:-}" in
  '' | *[!0-9]*) INPUT_TEST_TIMEOUT=120 ;;
esac
# Per-agent overall timeout: max wall-clock time for provision + verify +
# input test. Ensures a result file is always written even if a step hangs
# indefinitely.
case "${AGENT_TIMEOUT:-}" in
  '' | *[!0-9]*) AGENT_TIMEOUT=1800 ;;
esac
# ---------------------------------------------------------------------------
# OpenRouter API key fallback
@ -142,6 +146,7 @@ cloud_install_wait() {
# 3. Global PROVISION_TIMEOUT
# ---------------------------------------------------------------------------
# Built-in per-agent defaults for the "junie" agent (seconds).
# Presumably read by get_provision_timeout via the _PROVISION_TIMEOUT_<agent>
# naming scheme — its lookup is not fully visible here; confirm.
_PROVISION_TIMEOUT_junie=1200
# Read by get_agent_timeout, which looks up _AGENT_TIMEOUT_<agent> by name
# before falling back to the global AGENT_TIMEOUT.
_AGENT_TIMEOUT_junie=2400
get_provision_timeout() {
local agent="$1"
@ -169,6 +174,39 @@ get_provision_timeout() {
printf '%s' "${PROVISION_TIMEOUT}"
}
# ---------------------------------------------------------------------------
# get_agent_timeout AGENT
#
# Returns the overall wall-clock timeout (seconds) for a single agent run
# (provision + verify + input test). Same override precedence as above:
# 1. AGENT_TIMEOUT_<agent> env var
# 2. Built-in per-agent default (_AGENT_TIMEOUT_<agent>)
# 3. Global AGENT_TIMEOUT
# ---------------------------------------------------------------------------
get_agent_timeout() {
  # Prints (without a trailing newline) the overall wall-clock timeout, in
  # seconds, to use for AGENT.
  #
  # Arguments: $1 - agent name
  # Globals:   AGENT_TIMEOUT_<agent>, _AGENT_TIMEOUT_<agent>, AGENT_TIMEOUT (read)
  # Outputs:   the selected timeout on stdout
  local agent="$1"

  # Turn the agent name into a valid variable-name suffix: every character
  # outside [A-Za-z0-9_] becomes "_" (e.g. "my-agent" -> "my_agent").
  local safe_agent="${agent//[!A-Za-z0-9_]/_}"

  # Candidate variables in precedence order: env override first, then the
  # built-in per-agent default. They are read with indirect expansion
  # instead of eval; the name is safe because safe_agent is sanitized above.
  local source_var candidate
  for source_var in "AGENT_TIMEOUT_${safe_agent}" "_AGENT_TIMEOUT_${safe_agent}"; do
    candidate="${!source_var:-}"
    case "${candidate}" in
      '' | *[!0-9]*)
        # Unset, empty, or not purely digits: ignore and try the next source.
        # The result feeds a numeric comparison in the caller, so a malformed
        # value must never be returned.
        ;;
      *)
        printf '%s' "${candidate}"
        return
        ;;
    esac
  done

  # Fall back to global
  printf '%s' "${AGENT_TIMEOUT}"
}
# ---------------------------------------------------------------------------
# require_common_env
#