fix: Use Sprite public URL for keep-alive instead of localhost

Localhost pings (curl http://localhost:8080/health) bypass the Sprite
proxy entirely and don't register as "actively servicing HTTP requests."
Per Sprite lifecycle rules, VMs pause when there's no inbound HTTP
through the proxy and no detachable session output — so the old
keep-alive was doing nothing.

Now both discovery.sh and refactor.sh resolve the Sprite's public URL
via `sprite-env info` and ping that instead. The request routes through
the Sprite proxy, which counts as real activity and prevents pause.

Also adds keep-alive to discovery.sh (previously had none at all).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
B 2026-02-10 17:58:38 +00:00
parent b20a172ba4
commit 4d15470f8f
2 changed files with 66 additions and 6 deletions

View file

@ -39,11 +39,46 @@ log_info() { printf "${GREEN}[discovery]${NC} %s\n" "$1"; echo "[$(date +'%Y-%m
# Emit a warning: colored "[discovery]" line on stdout, plus a timestamped
# "WARN:" entry appended to ${LOG_FILE}.
# Arguments: $1 - message text
log_warn() {
  # Console copy (color codes come from the YELLOW / NC globals)
  printf "${YELLOW}[discovery]${NC} %s\n" "$1"
  # Persistent copy with a wall-clock timestamp
  printf '[%s] [discovery] WARN: %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$1" \
    >> "${LOG_FILE}"
}
# Emit an error: colored "[discovery]" line on stdout, plus a timestamped
# "ERROR:" entry appended to ${LOG_FILE}.
# Arguments: $1 - message text
log_error() {
  # Console copy (color codes come from the RED / NC globals)
  printf "${RED}[discovery]${NC} %s\n" "$1"
  # Persistent copy with a wall-clock timestamp
  printf '[%s] [discovery] ERROR: %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$1" \
    >> "${LOG_FILE}"
}
# --- Keep-alive: ping the Sprite's PUBLIC URL to prevent VM pause ---
# Sprite only counts inbound HTTP requests through its proxy as "active."
# Localhost requests bypass the proxy and do NOT prevent the VM from pausing.
# We must hit the public URL so the request routes through the Sprite proxy.
# PID of the background keep-alive loop; empty while no loop is recorded.
KEEPALIVE_PID=""
# Base URL the keep-alive loop pings; assigned inside start_keepalive.
SPRITE_PUBLIC_URL=""
# Start the background keep-alive loop.
#
# Resolves the Sprite's public URL from `sprite-env info` (JSON key
# `sprite_url`) and requests "<url>/health" every 30 seconds from a
# background subshell. When the URL cannot be resolved, falls back to
# http://localhost:8080 and logs a warning.
#
# Globals:
#   SPRITE_PUBLIC_URL (written) - base URL that was resolved (or the fallback)
#   KEEPALIVE_PID     (written) - PID of the background loop
# Arguments: none
start_keepalive() {
  local health_url

  # A second start must not orphan the previous loop: stop it before the
  # PID is overwritten below.
  if [[ -n "${KEEPALIVE_PID:-}" ]]; then
    kill "${KEEPALIVE_PID}" 2>/dev/null || true
    wait "${KEEPALIVE_PID}" 2>/dev/null || true
    KEEPALIVE_PID=""
  fi

  SPRITE_PUBLIC_URL=$(sprite-env info 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin)['sprite_url'])" 2>/dev/null) || SPRITE_PUBLIC_URL=""
  if [[ -z "${SPRITE_PUBLIC_URL}" ]]; then
    log_warn "Could not resolve Sprite public URL — keep-alive will use localhost (may not prevent pause)"
    SPRITE_PUBLIC_URL="http://localhost:8080"
    health_url="${SPRITE_PUBLIC_URL}/health"
  else
    # Drop one trailing slash so the request path is /health, not //health.
    health_url="${SPRITE_PUBLIC_URL%/}/health"
    log_info "Keep-alive will ping: ${health_url}"
  fi

  (
    while true; do
      # Bounded timeouts: a request that never completes would otherwise
      # block this loop forever and silently end the keep-alive.
      # Failures are ignored on purpose; the next iteration retries.
      curl -sf --connect-timeout 5 --max-time 10 "${health_url}" >/dev/null 2>&1 || true
      sleep 30
    done
  ) &
  KEEPALIVE_PID=$!
}
# Stop the background keep-alive loop, if one is recorded.
#
# Globals:   KEEPALIVE_PID (read, then cleared)
# Arguments: none
# Returns:   0 always; a loop that has already exited is not an error.
stop_keepalive() {
  local loop_pid="${KEEPALIVE_PID}"

  # No recorded loop: nothing to stop.
  if [[ -z "${loop_pid}" ]]; then
    return 0
  fi

  # Signal the loop, then reap it; both steps tolerate an already-dead PID.
  kill "${loop_pid}" 2>/dev/null || :
  wait "${loop_pid}" 2>/dev/null || :
  KEEPALIVE_PID=""
}
# --- Cleanup trap (from refactor.sh) ---
cleanup() {
local exit_code=$?
log_info "Running cleanup (exit_code=${exit_code})..."
# Stop keep-alive loop
stop_keepalive
cd "${REPO_ROOT}" 2>/dev/null || true
# Prune worktrees and clean up only OUR worktree base
@ -543,6 +578,10 @@ run_team_cycle() {
# Substitute WORKTREE_BASE_PLACEHOLDER with actual worktree path
sed -i "s|WORKTREE_BASE_PLACEHOLDER|${WORKTREE_BASE}|g" "${PROMPT_FILE}"
# Start keep-alive before launching claude (prevents Sprite from pausing the VM)
start_keepalive
log_info "Keep-alive started (pid=${KEEPALIVE_PID})"
log_info "Launching agent team..."
log_info "Worktree base: ${WORKTREE_BASE}"
log_info "Cycle timeout: ${CYCLE_TIMEOUT}s"
@ -558,6 +597,9 @@ run_team_cycle() {
claude -p "$(cat "${PROMPT_FILE}")" --dangerously-skip-permissions --model sonnet \
2>&1 | tee -a "${LOG_FILE}" || CLAUDE_EXIT=$?
# Stop keep-alive now that the cycle is done
stop_keepalive
if [[ "${CLAUDE_EXIT}" -eq 0 ]]; then
log_info "Cycle completed successfully"
@ -594,6 +636,10 @@ run_single_cycle() {
PROMPT_FILE=$(mktemp /tmp/discovery-prompt-XXXXXX.md)
build_single_prompt > "${PROMPT_FILE}"
# Start keep-alive before launching claude
start_keepalive
log_info "Keep-alive started (pid=${KEEPALIVE_PID})"
log_info "Launching single agent..."
log_info "Cycle timeout: ${SINGLE_TIMEOUT}s"
echo ""
@ -606,6 +652,8 @@ run_single_cycle() {
claude --print -p "$(cat "${PROMPT_FILE}")" --model sonnet \
2>&1 | tee -a "${LOG_FILE}" || CLAUDE_EXIT=$?
stop_keepalive
if [[ "${CLAUDE_EXIT}" -eq 0 ]]; then
log_info "Single cycle completed successfully"
sprite-env checkpoint create --comment "discovery single cycle complete" 2>&1 | tee -a "${LOG_FILE}" || true

View file

@ -60,16 +60,28 @@ cleanup() {
trap cleanup EXIT SIGTERM SIGINT
# --- Keep-alive: ping the trigger server's /health endpoint periodically ---
# Sprite pauses/stops VMs that have no HTTP activity. When claude is running
# (waiting on API calls), there may be no inbound requests for long stretches,
# causing Sprite to freeze the VM mid-cycle. This background loop ensures
# continuous HTTP activity so Sprite keeps the VM alive.
# --- Keep-alive: ping the Sprite's PUBLIC URL to prevent VM pause ---
# Sprite only counts inbound HTTP requests through its proxy as "active."
# Localhost requests (curl http://localhost:8080/health) bypass the proxy
# entirely and do NOT prevent the VM from pausing. We must hit the public
# URL so the request routes through the Sprite proxy infrastructure.
KEEPALIVE_PID=""
SPRITE_PUBLIC_URL=""
start_keepalive() {
# Resolve the Sprite's public URL from sprite-env info
SPRITE_PUBLIC_URL=$(sprite-env info 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin)['sprite_url'])" 2>/dev/null) || SPRITE_PUBLIC_URL=""
if [[ -z "${SPRITE_PUBLIC_URL}" ]]; then
log "WARNING: Could not resolve Sprite public URL — keep-alive will use localhost (may not prevent pause)"
SPRITE_PUBLIC_URL="http://localhost:8080"
else
log "Keep-alive will ping: ${SPRITE_PUBLIC_URL}/health"
fi
(
while true; do
curl -sf http://localhost:8080/health >/dev/null 2>&1 || true
# Ping via public URL (routes through Sprite proxy, counts as active HTTP)
curl -sf "${SPRITE_PUBLIC_URL}/health" >/dev/null 2>&1 || true
sleep 30
done
) &