mirror of
https://github.com/block/goose.git
synced 2026-04-28 03:29:36 +00:00
fix(ci): deflake smoke tests for Google models (#7344)
Some checks are pending
Canary / Prepare Version (push) Waiting to run
Canary / build-cli (push) Blocked by required conditions
Canary / Upload Install Script (push) Blocked by required conditions
Canary / bundle-desktop (push) Blocked by required conditions
Canary / bundle-desktop-linux (push) Blocked by required conditions
Canary / bundle-desktop-windows (push) Blocked by required conditions
Canary / Release (push) Blocked by required conditions
CI / changes (push) Waiting to run
CI / Check Rust Code Format (push) Blocked by required conditions
CI / Build and Test Rust Project (push) Blocked by required conditions
CI / Lint Rust Code (push) Blocked by required conditions
CI / Check OpenAPI Schema is Up-to-Date (push) Blocked by required conditions
CI / Test and Lint Electron Desktop App (push) Blocked by required conditions
Live Provider Tests / check-fork (push) Waiting to run
Live Provider Tests / changes (push) Blocked by required conditions
Live Provider Tests / Build Binary (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (Code Execution) (push) Blocked by required conditions
Live Provider Tests / Compaction Tests (push) Blocked by required conditions
Live Provider Tests / goose server HTTP integration tests (push) Blocked by required conditions
Publish Docker Image / docker (push) Waiting to run
Scorecard supply-chain security / Scorecard analysis (push) Waiting to run
Some checks are pending
Canary / Prepare Version (push) Waiting to run
Canary / build-cli (push) Blocked by required conditions
Canary / Upload Install Script (push) Blocked by required conditions
Canary / bundle-desktop (push) Blocked by required conditions
Canary / bundle-desktop-linux (push) Blocked by required conditions
Canary / bundle-desktop-windows (push) Blocked by required conditions
Canary / Release (push) Blocked by required conditions
CI / changes (push) Waiting to run
CI / Check Rust Code Format (push) Blocked by required conditions
CI / Build and Test Rust Project (push) Blocked by required conditions
CI / Lint Rust Code (push) Blocked by required conditions
CI / Check OpenAPI Schema is Up-to-Date (push) Blocked by required conditions
CI / Test and Lint Electron Desktop App (push) Blocked by required conditions
Live Provider Tests / check-fork (push) Waiting to run
Live Provider Tests / changes (push) Blocked by required conditions
Live Provider Tests / Build Binary (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (Code Execution) (push) Blocked by required conditions
Live Provider Tests / Compaction Tests (push) Blocked by required conditions
Live Provider Tests / goose server HTTP integration tests (push) Blocked by required conditions
Publish Docker Image / docker (push) Waiting to run
Scorecard supply-chain security / Scorecard analysis (push) Waiting to run
This commit is contained in:
parent
19964ca867
commit
c324cd3a3b
1 changed file with 19 additions and 5 deletions
|
|
@@ -23,8 +23,17 @@ run_test() {
|
|||
cp "$TEST_FILE" "$testdir/test-content.txt"
|
||||
prompt="read ./test-content.txt and output its contents exactly"
|
||||
else
|
||||
echo "$TEST_CONTENT" > "$testdir/input.txt"
|
||||
prompt="Use the text_editor view command to read ./input.txt, then output this file's contents in UPPERCASE. Do NOT use any other tool in Developer"
|
||||
# Write two files with unique random tokens. Validation checks that text_editor
|
||||
# was used and that both tokens appear in the output, proving the model actually
|
||||
# read the files (random tokens can't be guessed or hallucinated).
|
||||
local token_a="smoke-alpha-$RANDOM"
|
||||
local token_b="smoke-bravo-$RANDOM"
|
||||
echo "$token_a" > "$testdir/part-a.txt"
|
||||
echo "$token_b" > "$testdir/part-b.txt"
|
||||
# Store tokens so validation can check them
|
||||
echo "$token_a" > "$testdir/.token_a"
|
||||
echo "$token_b" > "$testdir/.token_b"
|
||||
prompt="Use the text_editor view command to read ./part-a.txt and ./part-b.txt, then reply with ONLY the contents of both files, one per line, nothing else. Do NOT use any other tool in Developer."
|
||||
fi
|
||||
|
||||
(
|
||||
|
|
@@ -40,12 +49,17 @@ run_test() {
|
|||
echo "failure|test content not found by model" > "$result_file"
|
||||
fi
|
||||
else
|
||||
local token_a token_b
|
||||
token_a=$(cat "$testdir/.token_a")
|
||||
token_b=$(cat "$testdir/.token_b")
|
||||
if ! grep -qE "(text_editor \| developer)|(▸.*text_editor.*developer)" "$output_file"; then
|
||||
echo "failure|model did not use text_editor tool" > "$result_file"
|
||||
elif ! grep -q "TEST-CONTENT-ABC123" "$output_file"; then
|
||||
echo "failure|model did not return uppercased file content" > "$result_file"
|
||||
elif ! grep -q "$token_a" "$output_file"; then
|
||||
echo "failure|model did not return contents of part-a.txt ($token_a)" > "$result_file"
|
||||
elif ! grep -q "$token_b" "$output_file"; then
|
||||
echo "failure|model did not return contents of part-b.txt ($token_b)" > "$result_file"
|
||||
else
|
||||
echo "success|model read and uppercased file content" > "$result_file"
|
||||
echo "success|model read and returned both file contents" > "$result_file"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue