fix(ci): deflake smoke tests for Google models (#7344)
Some checks are pending
Canary / Prepare Version (push) Waiting to run
Canary / build-cli (push) Blocked by required conditions
Canary / Upload Install Script (push) Blocked by required conditions
Canary / bundle-desktop (push) Blocked by required conditions
Canary / bundle-desktop-linux (push) Blocked by required conditions
Canary / bundle-desktop-windows (push) Blocked by required conditions
Canary / Release (push) Blocked by required conditions
CI / changes (push) Waiting to run
CI / Check Rust Code Format (push) Blocked by required conditions
CI / Build and Test Rust Project (push) Blocked by required conditions
CI / Lint Rust Code (push) Blocked by required conditions
CI / Check OpenAPI Schema is Up-to-Date (push) Blocked by required conditions
CI / Test and Lint Electron Desktop App (push) Blocked by required conditions
Live Provider Tests / check-fork (push) Waiting to run
Live Provider Tests / changes (push) Blocked by required conditions
Live Provider Tests / Build Binary (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (Code Execution) (push) Blocked by required conditions
Live Provider Tests / Compaction Tests (push) Blocked by required conditions
Live Provider Tests / goose server HTTP integration tests (push) Blocked by required conditions
Publish Docker Image / docker (push) Waiting to run
Scorecard supply-chain security / Scorecard analysis (push) Waiting to run

This commit is contained in:
Michael Neale 2026-02-19 20:06:48 +11:00 committed by GitHub
parent 19964ca867
commit c324cd3a3b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -23,8 +23,17 @@ run_test() {
cp "$TEST_FILE" "$testdir/test-content.txt"
prompt="read ./test-content.txt and output its contents exactly"
else
echo "$TEST_CONTENT" > "$testdir/input.txt"
prompt="Use the text_editor view command to read ./input.txt, then output this file's contents in UPPERCASE. Do NOT use any other tool in Developer"
# Write two files with unique random tokens. Validation checks that text_editor
# was used and that both tokens appear in the output, proving the model actually
# read the files (random tokens can't be guessed or hallucinated).
local token_a="smoke-alpha-$RANDOM"
local token_b="smoke-bravo-$RANDOM"
echo "$token_a" > "$testdir/part-a.txt"
echo "$token_b" > "$testdir/part-b.txt"
# Store tokens so validation can check them
echo "$token_a" > "$testdir/.token_a"
echo "$token_b" > "$testdir/.token_b"
prompt="Use the text_editor view command to read ./part-a.txt and ./part-b.txt, then reply with ONLY the contents of both files, one per line, nothing else. Do NOT use any other tool in Developer."
fi
(
@@ -40,12 +49,17 @@ run_test() {
echo "failure|test content not found by model" > "$result_file"
fi
else
local token_a token_b
token_a=$(cat "$testdir/.token_a")
token_b=$(cat "$testdir/.token_b")
if ! grep -qE "(text_editor \| developer)|(▸.*text_editor.*developer)" "$output_file"; then
echo "failure|model did not use text_editor tool" > "$result_file"
elif ! grep -q "TEST-CONTENT-ABC123" "$output_file"; then
echo "failure|model did not return uppercased file content" > "$result_file"
elif ! grep -q "$token_a" "$output_file"; then
echo "failure|model did not return contents of part-a.txt ($token_a)" > "$result_file"
elif ! grep -q "$token_b" "$output_file"; then
echo "failure|model did not return contents of part-b.txt ($token_b)" > "$result_file"
else
echo "success|model read and uppercased file content" > "$result_file"
echo "success|model read and returned both file contents" > "$result_file"
fi
fi