From c6d0cb218e3bbcef3bcb9bdfe3697e38c91e1050 Mon Sep 17 00:00:00 2001 From: Ahmed Abushagur Date: Fri, 13 Feb 2026 17:07:54 -0800 Subject: [PATCH] improve: make QA bot more effective with structured failures and verification (#1034) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 5 improvements to the QA cycle: 1. Fix agents now get structured failure context — categorized failures (exit_code, missing_api_call, missing_env, no_fixture) instead of raw 500-line test output, plus a passing agent for comparison 2. Fix agent changes are verified before committing — re-runs mock tests after the agent finishes and only commits if results actually improved, discarding bad fixes that would create noise PRs 3. Test results now include failure categories — mock.sh records cloud/agent:fail:reason instead of just cloud/agent:fail, enabling smarter failure routing 4. Mock curl logs NO_FIXTURE warnings when no fixture matches a GET request, surfacing false-confidence gaps where tests pass with synthetic fallback data 5. 
Phase 3 (code fix) failures now escalate to GitHub issues after 3 consecutive cycles, matching the Phase 1 escalation pattern Co-authored-by: Claude Opus 4.6 --- .claude/skills/setup-agent-team/qa-cycle.sh | 183 +++++++++++++++----- test/mock.sh | 54 +++++- 2 files changed, 188 insertions(+), 49 deletions(-) diff --git a/.claude/skills/setup-agent-team/qa-cycle.sh b/.claude/skills/setup-agent-team/qa-cycle.sh index 054b163a..19cb03b4 100644 --- a/.claude/skills/setup-agent-team/qa-cycle.sh +++ b/.claude/skills/setup-agent-team/qa-cycle.sh @@ -587,11 +587,15 @@ if [[ "${FAIL_COUNT:-0}" -eq 0 ]]; then log "Phase 3: No failures to fix" else # Collect failures grouped by cloud (one teammate per cloud, not per script) + # Results format: cloud/agent:fail[:reason] where reason is exit_code|missing_api_call|missing_env|no_fixture FAILURES="" FAILED_CLOUDS="" + FAILURE_DETAILS="" if [[ -f "${RESULTS_PHASE2}" ]]; then - FAILURES=$(grep ':fail$' "${RESULTS_PHASE2}" | sed 's/:fail$//' || true) - FAILED_CLOUDS=$(grep ':fail$' "${RESULTS_PHASE2}" | sed 's/:fail$//' | cut -d/ -f1 | sort -u || true) + FAILURES=$(grep ':fail' "${RESULTS_PHASE2}" | sed 's/:fail.*$//' || true) + FAILED_CLOUDS=$(grep ':fail' "${RESULTS_PHASE2}" | sed 's/:fail.*$//' | cut -d/ -f1 | sort -u || true) + # Keep full failure lines for structured context + FAILURE_DETAILS=$(grep ':fail' "${RESULTS_PHASE2}" || true) fi # Capture full mock test output per-cloud for richer agent context @@ -623,10 +627,31 @@ else failing_scripts=$(printf '%s' "$failing_scripts" | sed 's/^ //') failing_agents=$(printf '%s' "$failing_agents" | sed 's/^ //') - # Use full mock test output as error context (not just 10 lines from log) + # Build structured failure summary for this cloud + structured_failures="" + for combo in $cloud_failures; do + agent=$(printf '%s' "$combo" | cut -d/ -f2) + reason=$(printf '%s\n' "$FAILURE_DETAILS" | grep "^${combo}:fail" | head -1 | sed -n 's/^.*:fail:\(.*\)$/\1/p' || true) + if [[ -z "$reason" ]]; 
then reason="unknown"; fi + structured_failures="${structured_failures} - ${cloud}/${agent}.sh: ${reason}\n" + done + + # Find a passing agent on the same cloud for comparison + passing_agent="" + if [[ -f "${RESULTS_PHASE2}" ]]; then + passing_agent=$(grep "^${cloud}/.*:pass$" "${RESULTS_PHASE2}" | head -1 | sed 's/:pass$//' | cut -d/ -f2 || true) + fi + + # Extract only the assertion failure lines from mock output (not full log) + error_summary="" + if [[ -f "${MOCK_OUTPUT_DIR}/${cloud}.log" ]]; then + error_summary=$(grep -E '(✗|NO_FIXTURE:|BODY_ERROR:|--- output|exit code)' "${MOCK_OUTPUT_DIR}/${cloud}.log" | head -60 || true) + fi + + # Keep full output available but prioritize the structured summary error_context="" if [[ -f "${MOCK_OUTPUT_DIR}/${cloud}.log" ]]; then - error_context=$(cat "${MOCK_OUTPUT_DIR}/${cloud}.log") + error_context=$(tail -200 "${MOCK_OUTPUT_DIR}/${cloud}.log") fi fail_count=$(printf '%s\n' $cloud_failures | wc -l | tr -d ' ') @@ -641,56 +666,43 @@ else } # Spawn ONE Claude teammate per cloud to fix all its failing scripts (15 min timeout) + passing_ref="" + if [[ -n "${passing_agent}" ]]; then + passing_ref=" +## Reference: A PASSING agent on this cloud +${cloud}/${passing_agent}.sh passes all tests. Compare it with the failing scripts to find what's different." + fi + ( cd "${worktree}" run_with_timeout 900 \ claude -p "Fix the failing mock tests for cloud '${cloud}' in the spawn codebase. -Failing scripts: ${failing_scripts} +## Failure Summary (structured) +$(printf '%b' "${structured_failures}") +## Assertion Failures & Warnings +${error_summary} +${passing_ref} -Error context from test run: +## Full test output (last 200 lines) ${error_context} -## Investigation & Fix Process (be thorough): +## Fix Process: -1. 
**Understand the test infrastructure:** - - Read test/mock.sh to see how mocking works (curl interception, fixture matching) - - Read ${cloud}/lib/common.sh to understand the cloud's API primitives - - Check test/fixtures/${cloud}/ to see what API responses are mocked +1. **Read the failure summary above first.** Each failure has a category: + - **exit_code** — script crashed or exited non-zero. Read the script and check what command fails. + - **missing_api_call** — script didn't call expected cloud API. Check if API endpoint URL changed. + - **missing_env** — OPENROUTER_API_KEY not injected. Check env var setup in the script. + - **no_fixture** — script calls an API endpoint with no test fixture. Add the fixture file. + - **missing_ssh** — script didn't use SSH. Check if connectivity section is missing. -2. **For EACH failing script, investigate the root cause:** - - Read the failing script (${cloud}/.sh) - - Identify which API calls are being made - - Check if the script is making API calls that aren't mocked in test/fixtures/${cloud}/ - - Look for missing fixtures, incorrect API endpoint URLs, or changed function signatures +2. **For no_fixture failures:** Check test/fixtures/${cloud}/ for what fixtures exist. Add missing ones by copying the format from an existing fixture in the same directory. -3. **Check the cloud provider's current API (if needed):** - - If the script seems correct but fixtures seem outdated, check the provider's API docs - - Compare fixture responses with current API documentation - - Look for API changes: new required parameters, different response formats, endpoint deprecations +3. **For exit_code failures:** Read the failing script and the last 10 lines of its output. Compare with ${passing_agent:+the passing ${cloud}/${passing_agent}.sh}${passing_agent:-another agent script on this cloud}. -4. 
**Common failure patterns to check:** - - Missing test fixtures (script calls an API that has no mock response) - - Wrong API endpoint format (e.g., /v2/servers vs /servers) - - Missing authentication setup (API token not set in mock environment) - - Incorrect assumptions about SSH connectivity in mock mode - - Scripts calling commands that don't work in mock mode (ssh, scp without proper mocking) +4. **Test each fix:** Run: RESULTS_FILE=/tmp/fix-test.txt bash test/mock.sh ${cloud} -5. **Fix the issues:** - - Update scripts to work properly with the mock infrastructure - - Add missing fixture files if needed (test/fixtures/${cloud}/.json) - - Fix API calls to match current provider API - - Ensure proper error handling for mock environment - -6. **Test each fix incrementally:** - - After fixing each script, run: RESULTS_FILE=/tmp/fix-test.txt bash test/mock.sh ${cloud} - - Verify the specific script now passes - - Check for regressions in other scripts - -7. **Syntax check and commit:** - - Run: bash -n on each modified script - - Test final state: RESULTS_FILE=/tmp/fix-test.txt bash test/mock.sh ${cloud} - - Commit all fixes with a message listing what was fixed and why +5. **Syntax check and commit:** Run bash -n on each modified script before committing. You can modify: scripts in ${cloud}/, test/fixtures/${cloud}/, and test/mock.sh if infrastructure updates are needed." 
\ 2>&1 | tee -a "${LOG_FILE}" || true @@ -707,13 +719,33 @@ You can modify: scripts in ${cloud}/, test/fixtures/${cloud}/, and test/mock.sh # Stage any uncommitted changes the teammate left behind if [[ "$syntax_ok" == "true" ]] && [[ -n "$(git status --porcelain)" ]]; then git add ${failing_scripts} "${cloud}/lib/common.sh" "test/fixtures/${cloud}/" "test/mock.sh" 2>/dev/null || true - git commit -m "$(cat <&1 || true) + local verify_pass=0 + local verify_fail=0 + if [[ -f "/tmp/qa-verify-${cloud}.txt" ]]; then + verify_pass=$(grep -c ':pass' "/tmp/qa-verify-${cloud}.txt" || true) + verify_fail=$(grep -c ':fail' "/tmp/qa-verify-${cloud}.txt" || true) + fi + rm -f "/tmp/qa-verify-${cloud}.txt" + + if [[ "$verify_fail" -lt "$fail_count" ]] && [[ "$verify_pass" -gt 0 ]]; then + log "Phase 3: Fix verified for ${cloud} (${verify_pass} pass, ${verify_fail} fail, was ${fail_count} fail)" + git commit -m "$(cat < FIXEOF - )" || true + )" || true + else + log "Phase 3: Fix did NOT improve results for ${cloud} (still ${verify_fail} fail) — discarding" + git checkout -- . 2>/dev/null || true + fi fi # Push, PR, and merge with retry on stale main @@ -739,6 +771,71 @@ FIXEOF # Clean up per-cloud mock output rm -rf "${MOCK_OUTPUT_DIR}" 2>/dev/null || true + # Track consecutive Phase 3 failures for escalation + MOCK_FAILURES_FILE="${REPO_ROOT}/.docs/qa-mock-failures.json" + if [[ -f "${RESULTS_PHASE2}" ]]; then + MOCK_FAILED_CLOUDS=$(grep ':fail' "${RESULTS_PHASE2}" | sed 's/:fail.*$//' | cut -d/ -f1 | sort -u || true) + MOCK_PASSED_CLOUDS=$(grep ':pass$' "${RESULTS_PHASE2}" | cut -d/ -f1 | sort -u || true) + + # Initialize tracker if missing + if [[ ! 
-f "${MOCK_FAILURES_FILE}" ]]; then + printf '{}' > "${MOCK_FAILURES_FILE}" + fi + + python3 -c " +import json, sys + +tracker_path = sys.argv[1] +failed = sys.argv[2].split() if sys.argv[2] else [] +succeeded = sys.argv[3].split() if sys.argv[3] else [] + +try: + with open(tracker_path) as f: + tracker = json.load(f) +except (json.JSONDecodeError, FileNotFoundError): + tracker = {} + +for cloud in failed: + tracker[cloud] = tracker.get(cloud, 0) + 1 +for cloud in succeeded: + tracker[cloud] = 0 + +with open(tracker_path, 'w') as f: + json.dump(tracker, f, indent=2, sort_keys=True) + +escalate = [c for c, count in tracker.items() if count >= 3] +if escalate: + print(' '.join(escalate)) +" "${MOCK_FAILURES_FILE}" "${MOCK_FAILED_CLOUDS}" "${MOCK_PASSED_CLOUDS}" > /tmp/spawn-qa-mock-escalate.txt 2>/dev/null || true + + MOCK_ESCALATE=$(cat /tmp/spawn-qa-mock-escalate.txt 2>/dev/null || true) + rm -f /tmp/spawn-qa-mock-escalate.txt + + if [[ -n "${MOCK_ESCALATE}" ]]; then + for cloud in ${MOCK_ESCALATE}; do + consecutive=$(python3 -c "import json, sys; print(json.load(open(sys.argv[1])).get(sys.argv[2], 0))" "${MOCK_FAILURES_FILE}" "${cloud}" 2>/dev/null || printf "3+") + log "Phase 3: ESCALATION — ${cloud} mock tests failing for ${consecutive} consecutive cycles" + + existing_issue=$(gh issue list --repo OpenRouterTeam/spawn --state open \ + --search "mock tests failing ${cloud}" \ + --json number --jq '.[0].number' 2>/dev/null) || existing_issue="" + + if [[ -z "${existing_issue}" ]]; then + # Get failure categories for this cloud + cloud_reasons=$(grep "^${cloud}/.*:fail" "${RESULTS_PHASE2}" | sed 's/.*:fail://' | sort | uniq -c | sort -rn || true) + gh issue create --repo OpenRouterTeam/spawn \ + --title "QA: ${cloud} mock tests failing for ${consecutive} consecutive cycles" \ + --body "$(printf 'The automated QA cycle has detected that mock tests for **%s** have failed for **%s consecutive cycles**, despite automated fix attempts.\n\n## Failure 
breakdown\n```\n%s\n```\n\n## What to check\n- Run `bash test/mock.sh %s` locally to reproduce\n- Check `test/fixtures/%s/` for missing or outdated fixtures\n- Check `%s/lib/common.sh` for API changes\n- If failures are `no_fixture`, run `bash test/record.sh %s` to record fresh fixtures\n\n## Auto-generated\nThis issue was created automatically by the QA cycle (`qa-cycle.sh`).\n\n-- qa/cycle' "${cloud}" "${consecutive}" "${cloud_reasons}" "${cloud}" "${cloud}" "${cloud}" "${cloud}")" \ + --label "bug" \ + 2>&1 | tee -a "${LOG_FILE}" || true + log "Phase 3: Created GitHub issue for ${cloud} persistent mock test failure" + else + log "Phase 3: Issue #${existing_issue} already open for ${cloud}, skipping" + fi + done + fi + fi + log "Phase 3: Fix teammates complete" fi diff --git a/test/mock.sh b/test/mock.sh index 7487b73a..8190d413 100644 --- a/test/mock.sh +++ b/test/mock.sh @@ -323,8 +323,12 @@ _respond_get() { elif [ "$HAS_ID_SUFFIX" = "false" ]; then local FIXTURE_NAME_BASE FIXTURE_NAME_BASE=$(echo "$FIXTURE_NAME" | sed 's|_[0-9a-f-]*$||') - _try_fixture "$FIXTURE_NAME_BASE" || printf '{}' + if ! _try_fixture "$FIXTURE_NAME_BASE"; then + echo "NO_FIXTURE:GET:${EP_CLEAN}:${FIXTURE_NAME}" >> "${MOCK_LOG}" + printf '{}' + fi else + # ID-suffixed GET (e.g., /servers/12345) — use synthetic for status polling _synthetic_active_response fi } @@ -338,6 +342,7 @@ _respond_post() { if _try_fixture "create_server"; then : else + echo "NO_FIXTURE:POST:${EP_CLEAN}:create_server" >> "${MOCK_LOG}" case "$MOCK_CLOUD" in hetzner) printf '{"server":{"id":99999,"name":"test-srv","public_net":{"ipv4":{"ip":"10.0.0.1"}}},"action":{"id":1,"status":"running"}}' ;; digitalocean) printf '{"droplet":{"id":12345678,"name":"test-srv","status":"new","networks":{"v4":[{"ip_address":"10.0.0.1","type":"public"}]}}}' ;; @@ -632,13 +637,20 @@ assert_cloud_api_calls() { } # Write pass/fail result to RESULTS_FILE if set. 
-# Args: cloud agent result ("pass" or "fail", or "auto" to compute from _pre_failed) +# Args: cloud agent result [reason] +# Result format: cloud/agent:pass or cloud/agent:fail[:reason] +# Reasons: exit_code, missing_api_call, missing_env, no_fixture record_test_result() { local cloud="$1" local agent="$2" local result="$3" + local reason="${4:-}" [[ -n "${RESULTS_FILE:-}" ]] || return 0 - printf '%s/%s:%s\n' "${cloud}" "${agent}" "${result}" >> "${RESULTS_FILE}" + if [[ -n "$reason" ]]; then + printf '%s/%s:%s:%s\n' "${cloud}" "${agent}" "${result}" "${reason}" >> "${RESULTS_FILE}" + else + printf '%s/%s:%s\n' "${cloud}" "${agent}" "${result}" >> "${RESULTS_FILE}" + fi } # ============================================================ @@ -679,11 +691,22 @@ run_test() { return 0 fi - # Normal mode: run standard assertions + # Normal mode: run standard assertions and track which ones fail + local _fail_before_exit=$FAILED assert_exit_code "${exit_code}" 0 "exits successfully" + local _exit_failed=$(( FAILED - _fail_before_exit )) + + local _fail_before_api=$FAILED assert_cloud_api_calls "$cloud" + local _api_failed=$(( FAILED - _fail_before_api )) + + local _fail_before_ssh=$FAILED assert_log_contains "ssh " "uses SSH" + local _ssh_failed=$(( FAILED - _fail_before_ssh )) + + local _fail_before_env=$FAILED assert_env_injected "OPENROUTER_API_KEY" + local _env_failed=$(( FAILED - _fail_before_env )) if [[ "${MOCK_VALIDATE_BODY:-}" == "1" ]]; then assert_no_body_errors @@ -692,10 +715,29 @@ run_test() { assert_server_cleaned_up "${state_file}" fi - # Record result + # Check for missing fixtures + local _has_no_fixture=0 + if grep -q "NO_FIXTURE:" "${MOCK_LOG}" 2>/dev/null; then + _has_no_fixture=1 + fi + + # Record result with failure category local pre_fail=$((FAILED - _pre_failed)) if [[ "$pre_fail" -gt 0 ]]; then - record_test_result "${cloud}" "${agent}" "fail" + # Determine primary failure reason (priority order) + local _reason="unknown" + if [[ "$_has_no_fixture" 
-gt 0 ]]; then + _reason="no_fixture" + elif [[ "$_exit_failed" -gt 0 ]]; then + _reason="exit_code" + elif [[ "$_api_failed" -gt 0 ]]; then + _reason="missing_api_call" + elif [[ "$_env_failed" -gt 0 ]]; then + _reason="missing_env" + elif [[ "$_ssh_failed" -gt 0 ]]; then + _reason="missing_ssh" + fi + record_test_result "${cloud}" "${agent}" "fail" "${_reason}" else record_test_result "${cloud}" "${agent}" "pass" fi