mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-04-28 03:49:31 +00:00
fix(e2e): add per-agent timeout to prevent silent hangs in E2E runs (#2720)
The E2E framework's run_single_agent function had no overall timeout. When provision/verify/input_test steps hung (e.g. cloud_exec blocking on sprite-zeroclaw or digitalocean-opencode), the process would stall indefinitely without writing a .result file, causing silent test failures. Add a per-agent wall-clock timeout (default 1800s, 2400s for junie) that wraps the core provision/verify/input_test logic in a killable subshell. If the timeout expires, the subshell is killed and a "fail" result is written, ensuring E2E batches always complete. Fixes #2714 Agent: code-health Co-authored-by: B <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ce91953649
commit
3630c07c70
2 changed files with 96 additions and 6 deletions
|
|
@ -192,16 +192,68 @@ run_single_agent() {
|
|||
|
||||
local status="fail"
|
||||
|
||||
# Provision -> Verify -> Input Test
|
||||
if provision_agent "${agent}" "${app_name}" "${LOG_DIR}"; then
|
||||
if verify_agent "${agent}" "${app_name}"; then
|
||||
if run_input_test "${agent}" "${app_name}"; then
|
||||
status="pass"
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-agent timeout: run provision/verify/input_test in a subshell with a
|
||||
# wall-clock timeout. This prevents any single step from hanging indefinitely
|
||||
# and ensures a result file is always written (pass or fail; a timeout is recorded as fail).
|
||||
# Fixes #2714: sprite-zeroclaw and digitalocean-opencode stalling with no result.
|
||||
# ---------------------------------------------------------------------------
|
||||
local effective_agent_timeout
|
||||
effective_agent_timeout=$(get_agent_timeout "${agent}")
|
||||
log_info "Agent timeout: ${effective_agent_timeout}s"
|
||||
|
||||
local status_file="${LOG_DIR}/${app_name}.agent-status"
|
||||
rm -f "${status_file}"
|
||||
|
||||
# Run core logic in a subshell so we can kill it on timeout
|
||||
(
|
||||
local _inner_status="fail"
|
||||
if provision_agent "${agent}" "${app_name}" "${LOG_DIR}"; then
|
||||
if verify_agent "${agent}" "${app_name}"; then
|
||||
if run_input_test "${agent}" "${app_name}"; then
|
||||
_inner_status="pass"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
printf '%s' "${_inner_status}" > "${status_file}"
|
||||
) &
|
||||
local agent_pid=$!
|
||||
|
||||
# Poll for completion or timeout (bash 3.2 compatible — no wait -n)
|
||||
local agent_waited=0
|
||||
while [ "${agent_waited}" -lt "${effective_agent_timeout}" ]; do
|
||||
if [ -f "${status_file}" ]; then
|
||||
break
|
||||
fi
|
||||
# Also break if the subshell exited without writing (crash/error)
|
||||
if ! kill -0 "${agent_pid}" 2>/dev/null; then
|
||||
break
|
||||
fi
|
||||
sleep 5
|
||||
agent_waited=$((agent_waited + 5))
|
||||
done
|
||||
|
||||
# Collect result or handle timeout
|
||||
if [ -f "${status_file}" ]; then
|
||||
status=$(cat "${status_file}")
|
||||
wait "${agent_pid}" 2>/dev/null || true
|
||||
elif kill -0 "${agent_pid}" 2>/dev/null; then
|
||||
# Timed out — kill the subshell and its children
|
||||
log_err "${agent} timed out after ${effective_agent_timeout}s — killing"
|
||||
pkill -P "${agent_pid}" 2>/dev/null || true
|
||||
kill "${agent_pid}" 2>/dev/null || true
|
||||
wait "${agent_pid}" 2>/dev/null || true
|
||||
status="fail"
|
||||
else
|
||||
# Subshell exited without writing status file (unexpected error)
|
||||
log_err "${agent} subshell exited without writing status"
|
||||
wait "${agent_pid}" 2>/dev/null || true
|
||||
status="fail"
|
||||
fi
|
||||
|
||||
# Teardown (always attempt)
|
||||
rm -f "${status_file}"
|
||||
|
||||
# Teardown (always attempt, even after timeout)
|
||||
teardown_agent "${app_name}" || log_warn "Teardown failed for ${app_name}"
|
||||
|
||||
local agent_end
|
||||
|
|
|
|||
|
|
@ -9,11 +9,15 @@ ALL_AGENTS="claude openclaw zeroclaw codex opencode kilocode hermes junie"
|
|||
PROVISION_TIMEOUT="${PROVISION_TIMEOUT:-720}"
|
||||
INSTALL_WAIT="${INSTALL_WAIT:-600}"
|
||||
INPUT_TEST_TIMEOUT="${INPUT_TEST_TIMEOUT:-120}"
|
||||
# Per-agent overall timeout: max wall-clock time for provision + verify + input test.
|
||||
# Ensures a result file is always written even if a step hangs indefinitely.
|
||||
AGENT_TIMEOUT="${AGENT_TIMEOUT:-1800}"
|
||||
# Validate numeric env vars that get interpolated into remote command strings.
|
||||
# A non-numeric value here could lead to shell injection via SSH commands.
|
||||
case "${PROVISION_TIMEOUT}" in ''|*[!0-9]*) PROVISION_TIMEOUT=720 ;; esac
|
||||
case "${INSTALL_WAIT}" in ''|*[!0-9]*) INSTALL_WAIT=600 ;; esac
|
||||
case "${INPUT_TEST_TIMEOUT}" in ''|*[!0-9]*) INPUT_TEST_TIMEOUT=120 ;; esac
|
||||
case "${AGENT_TIMEOUT}" in ''|*[!0-9]*) AGENT_TIMEOUT=1800 ;; esac
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OpenRouter API key fallback
|
||||
|
|
@ -142,6 +146,7 @@ cloud_install_wait() {
|
|||
# 3. Global PROVISION_TIMEOUT
|
||||
# ---------------------------------------------------------------------------
|
||||
_PROVISION_TIMEOUT_junie=1200
|
||||
_AGENT_TIMEOUT_junie=2400
|
||||
|
||||
get_provision_timeout() {
|
||||
local agent="$1"
|
||||
|
|
@ -169,6 +174,39 @@ get_provision_timeout() {
|
|||
printf '%s' "${PROVISION_TIMEOUT}"
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# get_agent_timeout AGENT
#
# Prints (without a trailing newline) the overall wall-clock timeout, in
# seconds, for a single agent run (provision + verify + input test).
#
# Lookup order -- the first candidate that is a non-empty string of decimal
# digits wins:
#   1. AGENT_TIMEOUT_<agent>    env var override
#   2. _AGENT_TIMEOUT_<agent>   built-in per-agent default
#   3. AGENT_TIMEOUT            global default
#
# <agent> is AGENT with every character outside [A-Za-z0-9_] replaced by "_",
# so that it can be used as a variable-name suffix (e.g. "my-agent" looks up
# AGENT_TIMEOUT_my_agent).
#
# Arguments: $1 - agent name
# Globals:   AGENT_TIMEOUT, AGENT_TIMEOUT_<agent>, _AGENT_TIMEOUT_<agent> (read)
# Outputs:   the selected timeout on stdout
# ---------------------------------------------------------------------------
get_agent_timeout() {
  local agent="$1"
  local safe_agent
  safe_agent=$(printf '%s' "${agent}" | sed 's/[^A-Za-z0-9_]/_/g')

  # Walk the per-agent candidates in precedence order. Indirect expansion
  # (${!name}) reads the variable whose name is stored in var_name without
  # resorting to eval; it is available in bash 3.2.
  local var_name candidate
  for var_name in "AGENT_TIMEOUT_${safe_agent}" "_AGENT_TIMEOUT_${safe_agent}"; do
    candidate="${!var_name:-}"
    case "${candidate}" in
      # Unset, empty, or containing a non-digit: ignore and try the next one.
      # The value ends up in numeric comparisons, so only plain digits pass.
      ''|*[!0-9]*) ;;
      *)
        printf '%s' "${candidate}"
        return
        ;;
    esac
  done

  # Fall back to the global default.
  printf '%s' "${AGENT_TIMEOUT}"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# require_common_env
|
||||
#
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue