mirror of
https://github.com/OpenRouterTeam/spawn.git
synced 2026-05-19 08:01:17 +00:00
fix: Prevent duplicate work, add graceful shutdown, and enforce team lifecycle (#86)
- Change trigger-server MAX_CONCURRENT default from 3 to 1 to prevent overlapping cycles that duplicate GitHub issue comments - Add SIGTERM/SIGINT handling to trigger-server so running scripts finish gracefully on service restart instead of being killed mid-flight - Add cleanup trap to refactor.sh for worktree/tempfile cleanup on exit - Add pre-cycle cleanup of stale worktrees, merged branches, and abandoned PRs from previously interrupted cycles - Add mandatory Lifecycle Management section to team lead prompt requiring shutdown_request to all teammates before exiting - Add dedup checks to community-coordinator: check existing comments before posting to prevent duplicate acknowledgments/resolutions - Pass issue number in workflow trigger reason for better logging Co-authored-by: A <6723574+louisgv@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
89ed02fe55
commit
ab343d26a2
3 changed files with 170 additions and 25 deletions
|
|
@ -11,6 +11,7 @@ cd "${REPO_ROOT}"
|
|||
|
||||
LOG_FILE="/home/sprite/spawn/.docs/refactor.log"
|
||||
TEAM_NAME="spawn-refactor"
|
||||
PROMPT_FILE=""
|
||||
|
||||
# Ensure .docs directory exists
|
||||
mkdir -p "$(dirname "${LOG_FILE}")"
|
||||
|
|
@ -19,6 +20,26 @@ log() {
|
|||
echo "[$(date +'%Y-%m-%d %H:%M:%S')] $*" | tee -a "${LOG_FILE}"
|
||||
}
|
||||
|
||||
# Cleanup function — runs on normal exit, SIGTERM, and SIGINT
|
||||
cleanup() {
|
||||
local exit_code=$?
|
||||
log "Running cleanup (exit_code=${exit_code})..."
|
||||
|
||||
cd "${REPO_ROOT}" 2>/dev/null || true
|
||||
|
||||
# Prune worktrees
|
||||
git worktree prune 2>/dev/null || true
|
||||
rm -rf /tmp/spawn-worktrees 2>/dev/null || true
|
||||
|
||||
# Clean up prompt file
|
||||
rm -f "${PROMPT_FILE:-}" 2>/dev/null || true
|
||||
|
||||
log "=== Refactoring Cycle Done (exit_code=${exit_code}) ==="
|
||||
exit $exit_code
|
||||
}
|
||||
|
||||
trap cleanup EXIT SIGTERM SIGINT
|
||||
|
||||
log "=== Starting Refactoring Cycle ==="
|
||||
log "Working directory: ${REPO_ROOT}"
|
||||
log "Team name: ${TEAM_NAME}"
|
||||
|
|
@ -29,6 +50,38 @@ log "Syncing with origin/main..."
|
|||
git fetch --prune origin 2>&1 | tee -a "${LOG_FILE}" || true
|
||||
git reset --hard origin/main 2>&1 | tee -a "${LOG_FILE}" || true
|
||||
|
||||
# Pre-cycle cleanup: stale worktrees, merged branches, abandoned PRs
|
||||
log "Pre-cycle cleanup: stale worktrees..."
|
||||
git worktree prune 2>&1 | tee -a "${LOG_FILE}" || true
|
||||
if [[ -d /tmp/spawn-worktrees ]]; then
|
||||
rm -rf /tmp/spawn-worktrees 2>&1 | tee -a "${LOG_FILE}" || true
|
||||
log "Removed stale /tmp/spawn-worktrees directory"
|
||||
fi
|
||||
|
||||
log "Pre-cycle cleanup: merged remote branches..."
|
||||
MERGED_BRANCHES=$(git branch -r --merged origin/main | grep -v 'main' | grep 'origin/' | sed 's|origin/||' | tr -d ' ') || true
|
||||
for branch in $MERGED_BRANCHES; do
|
||||
if [[ -n "$branch" && "$branch" != "main" ]]; then
|
||||
git push origin --delete "$branch" 2>&1 | tee -a "${LOG_FILE}" && log "Deleted merged branch: $branch" || true
|
||||
fi
|
||||
done
|
||||
|
||||
log "Pre-cycle cleanup: stale open PRs..."
|
||||
STALE_PRS=$(gh pr list --repo OpenRouterTeam/spawn --state open --json number,updatedAt --jq '.[] | select(.updatedAt < (now - 7200 | todate)) | .number' 2>/dev/null) || true
|
||||
for pr_num in $STALE_PRS; do
|
||||
if [[ -n "$pr_num" ]]; then
|
||||
log "Found stale PR #${pr_num}, checking if mergeable..."
|
||||
PR_MERGEABLE=$(gh pr view "$pr_num" --repo OpenRouterTeam/spawn --json mergeable --jq '.mergeable' 2>/dev/null) || PR_MERGEABLE="UNKNOWN"
|
||||
if [[ "$PR_MERGEABLE" == "MERGEABLE" ]]; then
|
||||
log "Merging stale PR #${pr_num}..."
|
||||
gh pr merge "$pr_num" --repo OpenRouterTeam/spawn --squash --delete-branch 2>&1 | tee -a "${LOG_FILE}" || true
|
||||
else
|
||||
log "Closing unmergeable stale PR #${pr_num}..."
|
||||
gh pr close "$pr_num" --repo OpenRouterTeam/spawn --comment "Auto-closing: stale PR from a previous interrupted cycle. Please reopen if still needed." 2>&1 | tee -a "${LOG_FILE}" || true
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# Launch Claude Code with team instructions
|
||||
log "Launching refactoring team..."
|
||||
|
||||
|
|
@ -82,41 +135,50 @@ Create these teammates:
|
|||
|
||||
6. **community-coordinator** (Sonnet)
|
||||
- FIRST TASK: Run `gh issue list --repo OpenRouterTeam/spawn --state open --json number,title,body,labels,createdAt`
|
||||
- For each issue, post a brief, casual comment thanking them for flagging it (e.g. "Thanks for flagging this!" or "Appreciate the report!") — keep it short and natural, not corporate
|
||||
- DEDUP CHECK (MANDATORY before ANY comment): For each issue, FIRST check existing comments:
|
||||
`gh issue view NUMBER --repo OpenRouterTeam/spawn --json comments --jq '.comments[].author.login'`
|
||||
If the issue already has a comment from your bot account (check for "la14-1" or any automated commenter), SKIP posting an acknowledgment — the issue has already been engaged.
|
||||
Only post an acknowledgment if the issue has ZERO comments from automated accounts.
|
||||
- For issues that need acknowledgment, post a brief, casual comment thanking them for flagging it (e.g. "Thanks for flagging this!" or "Appreciate the report!") — keep it short and natural, not corporate
|
||||
- Before posting ANY comment (acknowledgment, interim update, or resolution), ALWAYS check existing comments first:
|
||||
`gh issue view NUMBER --repo OpenRouterTeam/spawn --json comments --jq '.comments[-1].body'`
|
||||
If the last comment already contains similar content (e.g., already has a "Thanks for flagging" or already has a resolution with a PR link), do NOT post again. Never duplicate information.
|
||||
- Categorize each issue (bug, feature request, question, already-fixed)
|
||||
- For bugs: message the relevant teammate to investigate
|
||||
* Security-related → message security-auditor
|
||||
* UX/error messages → message ux-engineer
|
||||
* Test failures → message test-engineer
|
||||
* Code quality → message complexity-hunter
|
||||
- Post interim updates on issues as teammates report findings:
|
||||
- Post interim updates on issues as teammates report findings (only if no similar update exists):
|
||||
gh issue comment NUMBER --body "Update: We've identified the root cause — [summary]. Working on a fix now."
|
||||
- When a fix PR is merged, post the final resolution:
|
||||
- When a fix PR is merged, post the final resolution (only if no resolution comment exists yet):
|
||||
gh issue comment NUMBER --body "This has been fixed in PR_URL. [Brief explanation of what was changed and why]. The fix is live on main — please try updating and let us know if you still see the issue."
|
||||
- Then close: gh issue close NUMBER
|
||||
- For feature requests: comment acknowledging the request, label as enhancement, and close with a note pointing to discussions
|
||||
- For questions: answer directly in a comment, then close
|
||||
- GOAL: Every issue reporter should feel heard and informed. No cold trails.
|
||||
- EVERY open issue must be engaged by end of cycle. No dangling issues.
|
||||
- NEVER post duplicate comments. One acknowledgment per issue. One resolution per issue.
|
||||
|
||||
## Issue Fix Workflow (CRITICAL follow exactly)
|
||||
|
||||
When fixing a bug reported in a GitHub issue:
|
||||
|
||||
1. Community-coordinator posts acknowledgment comment on the issue
|
||||
2. Community-coordinator messages the relevant teammate to investigate
|
||||
3. Create a worktree for the fix:
|
||||
1. Community-coordinator checks for existing comments (dedup) before posting acknowledgment
|
||||
2. Community-coordinator posts acknowledgment comment on the issue (only if not already acknowledged)
|
||||
3. Community-coordinator messages the relevant teammate to investigate
|
||||
4. Create a worktree for the fix:
|
||||
git worktree add /tmp/spawn-worktrees/fix/issue-NUMBER -b fix/issue-NUMBER origin/main
|
||||
4. Work inside the worktree: cd /tmp/spawn-worktrees/fix/issue-NUMBER
|
||||
5. Implement the fix and commit (include Agent: marker)
|
||||
6. Community-coordinator posts interim update on the issue with root cause summary
|
||||
7. Push the branch: git push -u origin fix/issue-NUMBER
|
||||
8. Create a PR that references the issue:
|
||||
5. Work inside the worktree: cd /tmp/spawn-worktrees/fix/issue-NUMBER
|
||||
6. Implement the fix and commit (include Agent: marker)
|
||||
7. Community-coordinator posts interim update on the issue with root cause summary (only if no similar update exists)
|
||||
8. Push the branch: git push -u origin fix/issue-NUMBER
|
||||
9. Create a PR that references the issue:
|
||||
gh pr create --title "Fix: description" --body "Fixes #NUMBER"
|
||||
9. Merge the PR immediately: gh pr merge --squash --delete-branch
|
||||
10. Clean up: git worktree remove /tmp/spawn-worktrees/fix/issue-NUMBER
|
||||
11. Community-coordinator posts final resolution comment with PR link and explanation
|
||||
12. Close the issue: gh issue close NUMBER
|
||||
10. Merge the PR immediately: gh pr merge --squash --delete-branch
|
||||
11. Clean up: git worktree remove /tmp/spawn-worktrees/fix/issue-NUMBER
|
||||
12. Community-coordinator posts final resolution comment with PR link and explanation (only if no resolution exists)
|
||||
13. Close the issue: gh issue close NUMBER
|
||||
|
||||
NEVER leave an issue open after the fix is merged. NEVER leave a PR unmerged.
|
||||
If a PR cannot be merged (conflicts, superseded, etc.), close it WITH a comment explaining why.
|
||||
|
|
@ -224,7 +286,31 @@ git worktree remove /tmp/spawn-worktrees/BRANCH-NAME
|
|||
13. Community-coordinator posts final resolutions on all issues, closes them
|
||||
14. Branch-cleaner runs final pass to catch any branches created during the cycle
|
||||
15. Team lead runs: git worktree prune to clean stale worktree entries
|
||||
16. When cycle completes, verify: every issue engaged, all PRs merged, zero stale branches, summarize what was fixed/improved
|
||||
16. When all work is done, execute the Lifecycle Management shutdown sequence (below) — send shutdown_request to every teammate, wait for confirmations, clean up worktrees, then exit
|
||||
|
||||
## Lifecycle Management (MANDATORY — DO NOT EXIT EARLY)
|
||||
|
||||
You MUST remain active until ALL of the following are true:
|
||||
|
||||
1. **All tasks are completed**: Run TaskList and confirm every task has status "completed"
|
||||
2. **All PRs are resolved**: Run `gh pr list --repo OpenRouterTeam/spawn --state open --author @me` and confirm zero open PRs from this cycle. Every PR must be either merged or closed with a comment.
|
||||
3. **All issues are engaged**: Run `gh issue list --repo OpenRouterTeam/spawn --state open` and confirm every open issue has been commented on (or was already engaged).
|
||||
4. **All worktrees are cleaned**: Run `git worktree list` and confirm only the main worktree exists. Run `rm -rf /tmp/spawn-worktrees` and `git worktree prune`.
|
||||
5. **All teammates are shut down**: Send `shutdown_request` to EVERY teammate. Wait for each to confirm. Do NOT exit while any teammate is still active.
|
||||
|
||||
### Shutdown Sequence (execute in this exact order):
|
||||
|
||||
1. Check TaskList — if any tasks are still in_progress or pending, wait and check again (poll every 30 seconds, up to 10 minutes)
|
||||
2. Verify all PRs merged or closed: `gh pr list --repo OpenRouterTeam/spawn --state open`
|
||||
3. Verify all issues engaged: `gh issue list --repo OpenRouterTeam/spawn --state open`
|
||||
4. For each teammate, send a `shutdown_request` via SendMessage
|
||||
5. Wait for all `shutdown_response` confirmations
|
||||
6. Run final cleanup: `git worktree prune && rm -rf /tmp/spawn-worktrees`
|
||||
7. Create checkpoint: `sprite-env checkpoint create --comment 'Refactor cycle complete'`
|
||||
8. Print final summary of what was accomplished
|
||||
9. ONLY THEN may the session end
|
||||
|
||||
### CRITICAL: If you exit before completing this sequence, running agents will be orphaned and the cycle will be incomplete. This has caused real problems in the past (PR #83 was left unmerged, issues got duplicate comments from overlapping cycles). You MUST wait for all teammates to shut down before exiting.
|
||||
|
||||
## Safety Rules
|
||||
|
||||
|
|
@ -244,7 +330,7 @@ Score tasks: (Impact x Confidence) / Risk
|
|||
|
||||
Target autonomous score: >30
|
||||
|
||||
Begin now. Spawn the team and start working.
|
||||
Begin now. Spawn the team and start working. DO NOT EXIT until all teammates are shut down and all cleanup is complete per the Lifecycle Management section above.
|
||||
PROMPT_EOF
|
||||
|
||||
# Run Claude Code with the prompt file
|
||||
|
|
@ -270,7 +356,4 @@ else
|
|||
log "Cycle failed"
|
||||
fi
|
||||
|
||||
# Clean up prompt file
|
||||
rm -f "${PROMPT_FILE}"
|
||||
|
||||
log "=== Refactoring Cycle Done ==="
|
||||
# Note: cleanup (worktree prune, prompt file removal, final log) handled by trap
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@
|
|||
const PORT = 8080;
|
||||
const TRIGGER_SECRET = process.env.TRIGGER_SECRET ?? "";
|
||||
const TARGET_SCRIPT = process.env.TARGET_SCRIPT ?? "";
|
||||
const MAX_CONCURRENT = parseInt(process.env.MAX_CONCURRENT ?? "3", 10);
|
||||
const MAX_CONCURRENT = parseInt(process.env.MAX_CONCURRENT ?? "1", 10);
|
||||
|
||||
if (!TRIGGER_SECRET) {
|
||||
console.error("ERROR: TRIGGER_SECRET env var is required");
|
||||
|
|
@ -26,6 +26,56 @@ if (!TARGET_SCRIPT) {
|
|||
}
|
||||
|
||||
let runningCount = 0;
|
||||
let shuttingDown = false;
|
||||
const runningProcs = new Set<ReturnType<typeof Bun.spawn>>();
|
||||
|
||||
function gracefulShutdown(signal: string) {
|
||||
if (shuttingDown) return;
|
||||
shuttingDown = true;
|
||||
console.log(`[trigger] Received ${signal}, shutting down gracefully...`);
|
||||
console.log(
|
||||
`[trigger] Waiting for ${runningProcs.size} running script(s) to finish...`
|
||||
);
|
||||
|
||||
// Stop accepting new connections
|
||||
server.stop();
|
||||
|
||||
if (runningProcs.size === 0) {
|
||||
console.log(`[trigger] No running scripts, exiting immediately`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Hard deadline: 15 minutes max wait, then force kill
|
||||
const HARD_TIMEOUT_MS = 15 * 60 * 1000;
|
||||
const forceKillTimer = setTimeout(() => {
|
||||
console.error(
|
||||
`[trigger] Hard timeout reached (${HARD_TIMEOUT_MS / 1000}s), force killing remaining processes`
|
||||
);
|
||||
for (const proc of runningProcs) {
|
||||
try {
|
||||
proc.kill(9);
|
||||
} catch {}
|
||||
}
|
||||
process.exit(1);
|
||||
}, HARD_TIMEOUT_MS);
|
||||
forceKillTimer.unref?.();
|
||||
|
||||
// Wait for all running scripts to finish
|
||||
Promise.all(Array.from(runningProcs).map((proc) => proc.exited))
|
||||
.then(() => {
|
||||
console.log(`[trigger] All scripts finished, exiting`);
|
||||
clearTimeout(forceKillTimer);
|
||||
process.exit(0);
|
||||
})
|
||||
.catch((e) => {
|
||||
console.error(`[trigger] Error waiting for scripts:`, e);
|
||||
clearTimeout(forceKillTimer);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
|
||||
process.on("SIGINT", () => gracefulShutdown("SIGINT"));
|
||||
|
||||
async function runScript(reason: string) {
|
||||
runningCount++;
|
||||
|
|
@ -34,11 +84,16 @@ async function runScript(reason: string) {
|
|||
);
|
||||
try {
|
||||
const proc = Bun.spawn(["bash", TARGET_SCRIPT], {
|
||||
cwd: process.env.REPO_ROOT || TARGET_SCRIPT.substring(0, TARGET_SCRIPT.lastIndexOf("/")) || ".",
|
||||
cwd:
|
||||
process.env.REPO_ROOT ||
|
||||
TARGET_SCRIPT.substring(0, TARGET_SCRIPT.lastIndexOf("/")) ||
|
||||
".",
|
||||
stdout: "inherit",
|
||||
stderr: "inherit",
|
||||
});
|
||||
runningProcs.add(proc);
|
||||
await proc.exited;
|
||||
runningProcs.delete(proc);
|
||||
console.log(
|
||||
`[trigger] ${TARGET_SCRIPT} finished (exit=${proc.exitCode}, concurrent=${runningCount - 1}/${MAX_CONCURRENT})`
|
||||
);
|
||||
|
|
@ -55,10 +110,17 @@ const server = Bun.serve({
|
|||
const url = new URL(req.url);
|
||||
|
||||
if (req.method === "GET" && url.pathname === "/health") {
|
||||
return Response.json({ status: "ok" });
|
||||
return Response.json({ status: "ok", running: runningCount, shuttingDown });
|
||||
}
|
||||
|
||||
if (req.method === "POST" && url.pathname === "/trigger") {
|
||||
if (shuttingDown) {
|
||||
return Response.json(
|
||||
{ error: "server is shutting down" },
|
||||
{ status: 503 }
|
||||
);
|
||||
}
|
||||
|
||||
const auth = req.headers.get("Authorization") ?? "";
|
||||
if (auth !== `Bearer ${TRIGGER_SECRET}`) {
|
||||
return Response.json({ error: "unauthorized" }, { status: 401 });
|
||||
|
|
|
|||
2
.github/workflows/refactor.yml
vendored
2
.github/workflows/refactor.yml
vendored
|
|
@ -20,5 +20,5 @@ jobs:
|
|||
SPRITE_URL: ${{ secrets.REFACTOR_SPRITE_URL }}
|
||||
TRIGGER_SECRET: ${{ secrets.REFACTOR_TRIGGER_SECRET }}
|
||||
run: |
|
||||
curl -sf -X POST "${SPRITE_URL}/trigger?reason=${{ github.event_name }}" \
|
||||
curl -sf -X POST "${SPRITE_URL}/trigger?reason=${{ github.event_name }}&issue=${{ github.event.issue.number || '' }}" \
|
||||
-H "Authorization: Bearer ${TRIGGER_SECRET}"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue