From 070d58f13145ee379ae9598faf37fac27f2080ae Mon Sep 17 00:00:00 2001 From: A <258483684+la14-1@users.noreply.github.com> Date: Tue, 10 Feb 2026 17:29:45 -0800 Subject: [PATCH] fix: Add activity watchdog to detect and kill hung API calls (#310) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The team lead's claude process can hang indefinitely when an API call doesn't return (observed: pre-flight check hung for 30+ min while 6 agents were orphaned). The hard timeout waits the full 40 min. Now monitors log file growth every 10s. If no output for 3 minutes (IDLE_TIMEOUT=180s), the process is killed immediately. The next 5-minute cron trigger starts a fresh cycle — no wasted time. Co-authored-by: A <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 (1M context) --- .../skills/setup-trigger-service/refactor.sh | 56 +++++++++++++++++-- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/.claude/skills/setup-trigger-service/refactor.sh b/.claude/skills/setup-trigger-service/refactor.sh index a71495c6..2974751d 100755 --- a/.claude/skills/setup-trigger-service/refactor.sh +++ b/.claude/skills/setup-trigger-service/refactor.sh @@ -488,10 +488,51 @@ log "Hard timeout: ${HARD_TIMEOUT}s" # NOTE: VM keep-alive is handled by the trigger server streaming output back # to the GitHub Actions runner. The long-lived HTTP response keeps Sprite alive. -# Run Claude Code with the prompt file, enforcing a hard timeout -CLAUDE_EXIT=0 -timeout --signal=TERM --kill-after=60 "${HARD_TIMEOUT}" \ - claude -p "$(cat "${PROMPT_FILE}")" --output-format stream-json --verbose 2>&1 | tee -a "${LOG_FILE}" || CLAUDE_EXIT=$? +# Activity watchdog: kill claude if no output for IDLE_TIMEOUT seconds. +# This catches hung API calls (pre-flight check hangs, network issues) much +# faster than the hard timeout. The next cron trigger starts a fresh cycle. +IDLE_TIMEOUT=180 # 3 minutes of silence = hung + +# Run claude in background so we can monitor output activity +claude -p "$(cat "${PROMPT_FILE}")" --output-format stream-json --verbose 2>&1 | tee -a "${LOG_FILE}" & +PIPE_PID=$! + +# Watchdog loop: check log file growth every 10 seconds +LAST_SIZE=$(wc -c < "${LOG_FILE}" 2>/dev/null || echo 0) +IDLE_SECONDS=0 +WALL_START=$(date +%s) + +while kill -0 "${PIPE_PID}" 2>/dev/null; do + sleep 10 + CURR_SIZE=$(wc -c < "${LOG_FILE}" 2>/dev/null || echo 0) + WALL_ELAPSED=$(( $(date +%s) - WALL_START )) + + if [[ "${CURR_SIZE}" -eq "${LAST_SIZE}" ]]; then + IDLE_SECONDS=$((IDLE_SECONDS + 10)) + if [[ "${IDLE_SECONDS}" -ge "${IDLE_TIMEOUT}" ]]; then + log "Watchdog: no output for ${IDLE_SECONDS}s — killing hung process" + # Kill the entire process group spawned by the pipe + kill -- -"${PIPE_PID}" 2>/dev/null || kill "${PIPE_PID}" 2>/dev/null || true + # Also kill any claude processes we spawned + pkill -P "${PIPE_PID}" 2>/dev/null || true + break + fi + else + IDLE_SECONDS=0 + LAST_SIZE="${CURR_SIZE}" + fi + + # Hard wall-clock timeout as final safety net + if [[ "${WALL_ELAPSED}" -ge "${HARD_TIMEOUT}" ]]; then + log "Hard timeout: ${WALL_ELAPSED}s elapsed — killing process" + kill -- -"${PIPE_PID}" 2>/dev/null || kill "${PIPE_PID}" 2>/dev/null || true + pkill -P "${PIPE_PID}" 2>/dev/null || true + break + fi +done + +wait "${PIPE_PID}" 2>/dev/null +CLAUDE_EXIT=$? if [[ "${CLAUDE_EXIT}" -eq 0 ]]; then log "Cycle completed successfully" @@ -513,10 +554,15 @@ Co-Authored-By: Claude Sonnet 4.5 " 2>&1 | tee -a "${LOG_ # Create checkpoint log "Creating checkpoint..." sprite-env checkpoint create --comment "${RUN_MODE} cycle complete" 2>&1 | tee -a "${LOG_FILE}" || true +elif [[ "${IDLE_SECONDS}" -ge "${IDLE_TIMEOUT}" ]]; then + log "Cycle killed by activity watchdog (no output for ${IDLE_TIMEOUT}s)" + + # Still checkpoint partial work + log "Creating checkpoint for partial work..." + sprite-env checkpoint create --comment "${RUN_MODE} cycle hung (watchdog kill)" 2>&1 | tee -a "${LOG_FILE}" || true elif [[ "${CLAUDE_EXIT}" -eq 124 ]]; then log "Cycle timed out after ${HARD_TIMEOUT}s — killed by hard timeout" - # Still create checkpoint for any partial work that was merged log "Creating checkpoint for partial work..." sprite-env checkpoint create --comment "${RUN_MODE} cycle timed out (partial)" 2>&1 | tee -a "${LOG_FILE}" || true else