fix(e2e): add per-agent timeout to prevent silent hangs in E2E runs (#2720)

The E2E framework's run_single_agent function had no overall timeout. When the provision/verify/input_test steps hung (e.g. cloud_exec blocking on sprite-zeroclaw or digitalocean-opencode), the process would stall indefinitely without writing a .result file, causing silent test failures.

Add a per-agent wall-clock timeout (default 1800s, 2400s for junie) that wraps the core provision/verify/input_test logic in a killable subshell. If the timeout expires, the subshell is killed and a "fail" result is written, ensuring E2E batches always complete.

Fixes #2714

Agent: code-health
Co-authored-by: B <6723574+louisgv@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-28 03:49:31 +00:00 · 2026-03-17 13:16:09 -07:00 · 2026-03-17 13:16:09 -07:00 · 3630c07c70
commit 3630c07c70
parent ce91953649
2 changed files with 96 additions and 6 deletions
--- a/sh/e2e/e2e.sh
+++ b/sh/e2e/e2e.sh
@ -192,16 +192,68 @@ run_single_agent() {

  local status="fail"

-  # Provision -> Verify -> Input Test
-  if provision_agent "${agent}" "${app_name}" "${LOG_DIR}"; then
-    if verify_agent "${agent}" "${app_name}"; then
-      if run_input_test "${agent}" "${app_name}"; then
-        status="pass"
+  # ---------------------------------------------------------------------------
+  # Per-agent timeout: run provision/verify/input_test in a subshell with a
+  # wall-clock timeout. This prevents any single step from hanging indefinitely
+  # and ensures a result file is always written (pass or fail; a timeout is recorded as fail).
+  # Fixes #2714: sprite-zeroclaw and digitalocean-opencode stalling with no result.
+  # ---------------------------------------------------------------------------
+  local effective_agent_timeout
+  effective_agent_timeout=$(get_agent_timeout "${agent}")
+  log_info "Agent timeout: ${effective_agent_timeout}s"
+
+  local status_file="${LOG_DIR}/${app_name}.agent-status"
+  rm -f "${status_file}"
+
+  # Run core logic in a subshell so we can kill it on timeout
+  (
+    local _inner_status="fail"
+    if provision_agent "${agent}" "${app_name}" "${LOG_DIR}"; then
+      if verify_agent "${agent}" "${app_name}"; then
+        if run_input_test "${agent}" "${app_name}"; then
+          _inner_status="pass"
+        fi
      fi
    fi
+    printf '%s' "${_inner_status}" > "${status_file}"
+  ) &
+  local agent_pid=$!
+
+  # Poll for completion or timeout (bash 3.2 compatible — no wait -n)
+  local agent_waited=0
+  while [ "${agent_waited}" -lt "${effective_agent_timeout}" ]; do
+    if [ -f "${status_file}" ]; then
+      break
+    fi
+    # Also break if the subshell exited without writing (crash/error)
+    if ! kill -0 "${agent_pid}" 2>/dev/null; then
+      break
+    fi
+    sleep 5
+    agent_waited=$((agent_waited + 5))
+  done
+
+  # Collect result or handle timeout
+  if [ -f "${status_file}" ]; then
+    status=$(cat "${status_file}")
+    wait "${agent_pid}" 2>/dev/null || true
+  elif kill -0 "${agent_pid}" 2>/dev/null; then
+    # Timed out — kill the subshell and its direct children (pkill -P does not reach grandchildren)
+    log_err "${agent} timed out after ${effective_agent_timeout}s — killing"
+    pkill -P "${agent_pid}" 2>/dev/null || true
+    kill "${agent_pid}" 2>/dev/null || true
+    wait "${agent_pid}" 2>/dev/null || true
+    status="fail"
+  else
+    # Subshell exited without writing status file (unexpected error)
+    log_err "${agent} subshell exited without writing status"
+    wait "${agent_pid}" 2>/dev/null || true
+    status="fail"
  fi

-  # Teardown (always attempt)
+  rm -f "${status_file}"
+
+  # Teardown (always attempt, even after timeout)
  teardown_agent "${app_name}" || log_warn "Teardown failed for ${app_name}"

  local agent_end
--- a/sh/e2e/lib/common.sh
+++ b/sh/e2e/lib/common.sh
@ -9,11 +9,15 @@ ALL_AGENTS="claude openclaw zeroclaw codex opencode kilocode hermes junie"
 PROVISION_TIMEOUT="${PROVISION_TIMEOUT:-720}"
 INSTALL_WAIT="${INSTALL_WAIT:-600}"
 INPUT_TEST_TIMEOUT="${INPUT_TEST_TIMEOUT:-120}"
+# Per-agent overall timeout: max wall-clock time for provision + verify + input test.
+# Ensures a result file is always written even if a step hangs indefinitely.
+AGENT_TIMEOUT="${AGENT_TIMEOUT:-1800}"
 # Validate numeric env vars that get interpolated into remote command strings.
 # A non-numeric value here could lead to shell injection via SSH commands.
 case "${PROVISION_TIMEOUT}" in ''|*[!0-9]*) PROVISION_TIMEOUT=720 ;; esac
 case "${INSTALL_WAIT}" in ''|*[!0-9]*) INSTALL_WAIT=600 ;; esac
 case "${INPUT_TEST_TIMEOUT}" in ''|*[!0-9]*) INPUT_TEST_TIMEOUT=120 ;; esac
+case "${AGENT_TIMEOUT}" in ''|*[!0-9]*) AGENT_TIMEOUT=1800 ;; esac

 # ---------------------------------------------------------------------------
 # OpenRouter API key fallback
@ -142,6 +146,7 @@ cloud_install_wait() {
 #   3. Global PROVISION_TIMEOUT
 # ---------------------------------------------------------------------------
 _PROVISION_TIMEOUT_junie=1200
+_AGENT_TIMEOUT_junie=2400

 get_provision_timeout() {
  local agent="$1"
@ -169,6 +174,39 @@ get_provision_timeout() {
  printf '%s' "${PROVISION_TIMEOUT}"
 }

+# ---------------------------------------------------------------------------
+# get_agent_timeout AGENT
+#
+# Returns the overall wall-clock timeout (seconds) for a single agent run
+# (provision + verify + input test). Same override precedence as above:
+#   1. AGENT_TIMEOUT_<agent> env var
+#   2. Built-in per-agent default (_AGENT_TIMEOUT_<agent>)
+#   3. Global AGENT_TIMEOUT
+# ---------------------------------------------------------------------------
+get_agent_timeout() {  # Prints the selected timeout on stdout via printf '%s' (no trailing newline).
+  local agent="$1"
+  local safe_agent
+  safe_agent=$(printf '%s' "${agent}" | sed 's/[^A-Za-z0-9_]/_/g')  # every char outside [A-Za-z0-9_] becomes "_" so the result can be spliced into a variable name
+
+  # Tier 1: env var override AGENT_TIMEOUT_<agent>. Used only when non-empty and all digits.
+  local env_var="AGENT_TIMEOUT_${safe_agent}"
+  eval "local env_val=\${${env_var}:-}"  # indirect read of the variable named by env_var; empty when unset
+  if [ -n "${env_val}" ]; then
+    case "${env_val}" in ''|*[!0-9]*) ;; *) printf '%s' "${env_val}"; return ;; esac  # a non-numeric override is silently ignored and falls through to tier 2
+  fi
+
+  # Tier 2: built-in per-agent default _AGENT_TIMEOUT_<agent>. Printed as-is when non-empty (no digit check here, unlike tier 1).
+  local builtin_var="_AGENT_TIMEOUT_${safe_agent}"
+  eval "local builtin_val=\${${builtin_var}:-}"  # same indirect read, for the built-in variable
+  if [ -n "${builtin_val}" ]; then
+    printf '%s' "${builtin_val}"
+    return
+  fi
+
+  # Tier 3: no per-agent value found, so fall back to the global AGENT_TIMEOUT.
+  printf '%s' "${AGENT_TIMEOUT}"
+}
+
 # ---------------------------------------------------------------------------
 # require_common_env
 #