mirror of
https://github.com/block/goose.git
synced 2026-04-26 10:40:45 +00:00
Port provider tests to typescript (#8237)
Some checks failed
Canary / Prepare Version (push) Waiting to run
Canary / build-cli (push) Blocked by required conditions
Canary / Upload Install Script (push) Blocked by required conditions
Canary / bundle-desktop (push) Blocked by required conditions
Canary / bundle-desktop-intel (push) Blocked by required conditions
Canary / bundle-desktop-linux (push) Blocked by required conditions
Canary / bundle-desktop-windows (push) Blocked by required conditions
Canary / Release (push) Blocked by required conditions
CI / Build Rust Project on Windows (push) Waiting to run
CI / changes (push) Waiting to run
CI / Check Rust Code Format (push) Blocked by required conditions
CI / Build and Test Rust Project (push) Blocked by required conditions
CI / Check MSRV (push) Blocked by required conditions
CI / Lint Rust Code (push) Blocked by required conditions
CI / Check Generated Schemas are Up-to-Date (push) Blocked by required conditions
CI / Test and Lint Electron Desktop App (push) Blocked by required conditions
Goose 2 CI / Lint & Format (push) Waiting to run
Goose 2 CI / Unit Tests (push) Waiting to run
Goose 2 CI / Desktop Build & E2E (push) Waiting to run
Goose 2 CI / Rust Lint (push) Waiting to run
Live Provider Tests / Smoke Tests (Code Execution) (push) Blocked by required conditions
Live Provider Tests / check-fork (push) Waiting to run
Live Provider Tests / changes (push) Blocked by required conditions
Live Provider Tests / Build Binary (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (push) Blocked by required conditions
Live Provider Tests / Compaction Tests (push) Blocked by required conditions
Live Provider Tests / goose server HTTP integration tests (push) Blocked by required conditions
Publish Docker Image / docker (push) Waiting to run
Scorecard supply-chain security / Scorecard analysis (push) Waiting to run
Unused Dependencies / machete (push) Has been cancelled
Some checks failed
Canary / Prepare Version (push) Waiting to run
Canary / build-cli (push) Blocked by required conditions
Canary / Upload Install Script (push) Blocked by required conditions
Canary / bundle-desktop (push) Blocked by required conditions
Canary / bundle-desktop-intel (push) Blocked by required conditions
Canary / bundle-desktop-linux (push) Blocked by required conditions
Canary / bundle-desktop-windows (push) Blocked by required conditions
Canary / Release (push) Blocked by required conditions
CI / Build Rust Project on Windows (push) Waiting to run
CI / changes (push) Waiting to run
CI / Check Rust Code Format (push) Blocked by required conditions
CI / Build and Test Rust Project (push) Blocked by required conditions
CI / Check MSRV (push) Blocked by required conditions
CI / Lint Rust Code (push) Blocked by required conditions
CI / Check Generated Schemas are Up-to-Date (push) Blocked by required conditions
CI / Test and Lint Electron Desktop App (push) Blocked by required conditions
Goose 2 CI / Lint & Format (push) Waiting to run
Goose 2 CI / Unit Tests (push) Waiting to run
Goose 2 CI / Desktop Build & E2E (push) Waiting to run
Goose 2 CI / Rust Lint (push) Waiting to run
Live Provider Tests / Smoke Tests (Code Execution) (push) Blocked by required conditions
Live Provider Tests / check-fork (push) Waiting to run
Live Provider Tests / changes (push) Blocked by required conditions
Live Provider Tests / Build Binary (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (push) Blocked by required conditions
Live Provider Tests / Compaction Tests (push) Blocked by required conditions
Live Provider Tests / goose server HTTP integration tests (push) Blocked by required conditions
Publish Docker Image / docker (push) Waiting to run
Scorecard supply-chain security / Scorecard analysis (push) Waiting to run
Unused Dependencies / machete (push) Has been cancelled
Signed-off-by: Douwe Osinga <douwe@squareup.com> Co-authored-by: Douwe Osinga <douwe@squareup.com>
This commit is contained in:
parent
eb60770c81
commit
c6755d3259
10 changed files with 549 additions and 369 deletions
22
.github/workflows/pr-smoke-test.yml
vendored
22
.github/workflows/pr-smoke-test.yml
vendored
|
|
@ -110,7 +110,11 @@ jobs:
|
|||
- name: Install agentic providers
|
||||
run: npm install -g @anthropic-ai/claude-code @zed-industries/claude-agent-acp @zed-industries/codex-acp
|
||||
|
||||
- name: Run Smoke Tests with Provider Script
|
||||
- name: Install Node.js Dependencies
|
||||
run: source ../../bin/activate-hermit && pnpm install --frozen-lockfile
|
||||
working-directory: ui/desktop
|
||||
|
||||
- name: Run Smoke Tests (Normal Mode)
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
|
|
@ -127,12 +131,10 @@ jobs:
|
|||
SKIP_BUILD: 1
|
||||
SKIP_PROVIDERS: ${{ vars.SKIP_PROVIDERS || '' }}
|
||||
run: |
|
||||
# Ensure the HOME directory structure exists
|
||||
mkdir -p $HOME/.local/share/goose/sessions
|
||||
mkdir -p $HOME/.config/goose
|
||||
|
||||
# Run the provider test script (binary already built and downloaded)
|
||||
bash scripts/test_providers.sh
|
||||
source ../../bin/activate-hermit && pnpm run test:integration:providers
|
||||
working-directory: ui/desktop
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
|
|
@ -188,6 +190,10 @@ jobs:
|
|||
- name: Make Binary Executable
|
||||
run: chmod +x target/debug/goose
|
||||
|
||||
- name: Install Node.js Dependencies
|
||||
run: source ../../bin/activate-hermit && pnpm install --frozen-lockfile
|
||||
working-directory: ui/desktop
|
||||
|
||||
- name: Run Provider Tests (Code Execution Mode)
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
|
|
@ -205,7 +211,8 @@ jobs:
|
|||
run: |
|
||||
mkdir -p $HOME/.local/share/goose/sessions
|
||||
mkdir -p $HOME/.config/goose
|
||||
bash scripts/test_providers_code_exec.sh
|
||||
source ../../bin/activate-hermit && pnpm run test:integration:providers-code-exec
|
||||
working-directory: ui/desktop
|
||||
|
||||
compaction-tests:
|
||||
name: Compaction Tests
|
||||
|
|
@ -277,7 +284,8 @@ jobs:
|
|||
GOOSE_PROVIDER: anthropic
|
||||
GOOSE_MODEL: claude-sonnet-4-5-20250929
|
||||
SHELL: /bin/bash
|
||||
SKIP_BUILD: 1
|
||||
run: |
|
||||
echo 'export PATH=/some/fake/path:$PATH' >> $HOME/.bash_profile
|
||||
source ../../bin/activate-hermit && pnpm run test:integration:debug
|
||||
source ../../bin/activate-hermit && pnpm run test:integration:goosed
|
||||
working-directory: ui/desktop
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ Make a copy of this document for each version and check off as steps are verifie
|
|||
|
||||
### Provider Testing
|
||||
|
||||
- [ ] Run `./scripts/test_providers.sh` locally from the release branch and verify all providers/models work
|
||||
- [ ] Run `cd ui/desktop && pnpm run test:integration:providers` locally from the release branch and verify all providers/models work
|
||||
- [ ] Launch goose, click reset providers, choose databricks and a model
|
||||
|
||||
### Starting Conversations
|
||||
|
|
|
|||
|
|
@ -1,71 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
LIB_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
source "$LIB_DIR/test_providers_lib.sh"
|
||||
|
||||
echo "Mode: normal (direct tool calls)"
|
||||
echo ""
|
||||
|
||||
GOOSE_BIN=$(build_goose)
|
||||
BUILTINS="developer"
|
||||
|
||||
mkdir -p target
|
||||
TEST_CONTENT="test-content-abc123"
|
||||
TEST_FILE="./target/test-content.txt"
|
||||
echo "$TEST_CONTENT" > "$TEST_FILE"
|
||||
|
||||
run_test() {
|
||||
local provider="$1" model="$2" result_file="$3" output_file="$4"
|
||||
local testdir=$(mktemp -d)
|
||||
|
||||
local prompt
|
||||
if is_agentic_provider "$provider"; then
|
||||
cp "$TEST_FILE" "$testdir/test-content.txt"
|
||||
prompt="read ./test-content.txt and output its contents exactly"
|
||||
else
|
||||
# Write two files with unique random tokens. Validation checks that the shell
|
||||
# tool was used and that both tokens appear in the output, proving the model
|
||||
# actually read the files (random tokens can't be guessed or hallucinated).
|
||||
local token_a="smoke-alpha-$RANDOM"
|
||||
local token_b="smoke-bravo-$RANDOM"
|
||||
echo "$token_a" > "$testdir/part-a.txt"
|
||||
echo "$token_b" > "$testdir/part-b.txt"
|
||||
# Store tokens so validation can check them
|
||||
echo "$token_a" > "$testdir/.token_a"
|
||||
echo "$token_b" > "$testdir/.token_b"
|
||||
prompt="Use the shell tool to cat ./part-a.txt and ./part-b.txt, then reply with ONLY the contents of both files, one per line, nothing else."
|
||||
fi
|
||||
|
||||
(
|
||||
export GOOSE_PROVIDER="$provider"
|
||||
export GOOSE_MODEL="$model"
|
||||
cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1
|
||||
) > "$output_file" 2>&1
|
||||
|
||||
if is_agentic_provider "$provider"; then
|
||||
if grep -qi "$TEST_CONTENT" "$output_file"; then
|
||||
echo "success|test content found by model" > "$result_file"
|
||||
else
|
||||
echo "failure|test content not found by model" > "$result_file"
|
||||
fi
|
||||
else
|
||||
local token_a token_b
|
||||
token_a=$(cat "$testdir/.token_a")
|
||||
token_b=$(cat "$testdir/.token_b")
|
||||
if ! grep -qE "(shell \| developer)|(▸.*shell)" "$output_file"; then
|
||||
echo "failure|model did not use shell tool" > "$result_file"
|
||||
elif ! grep -q "$token_a" "$output_file"; then
|
||||
echo "failure|model did not return contents of part-a.txt ($token_a)" > "$result_file"
|
||||
elif ! grep -q "$token_b" "$output_file"; then
|
||||
echo "failure|model did not return contents of part-b.txt ($token_b)" > "$result_file"
|
||||
else
|
||||
echo "success|model read and returned both file contents" > "$result_file"
|
||||
fi
|
||||
fi
|
||||
|
||||
rm -rf "$testdir"
|
||||
}
|
||||
|
||||
build_test_cases
|
||||
run_test_cases run_test
|
||||
report_results
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Provider smoke tests - code execution mode (JS batching)
|
||||
|
||||
LIB_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
source "$LIB_DIR/test_providers_lib.sh"
|
||||
|
||||
echo "Mode: code_execution (JS batching)"
|
||||
echo ""
|
||||
|
||||
# --- Setup ---
|
||||
|
||||
GOOSE_BIN=$(build_goose)
|
||||
BUILTINS="memory,code_execution"
|
||||
|
||||
# --- Test case ---
|
||||
|
||||
run_test() {
|
||||
local provider="$1" model="$2" result_file="$3" output_file="$4"
|
||||
local testdir=$(mktemp -d)
|
||||
|
||||
local prompt="Store a memory with category 'test' and data 'hello world', then retrieve all memories from category 'test'."
|
||||
|
||||
# Run goose
|
||||
(
|
||||
export GOOSE_PROVIDER="$provider"
|
||||
export GOOSE_MODEL="$model"
|
||||
cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1
|
||||
) > "$output_file" 2>&1
|
||||
|
||||
# Matches: "execute_typescript | code_execution", "get_function_details | code_execution",
|
||||
# "tool call | execute", "tool calls | execute" (old format)
|
||||
# "▸ execute N tool call" (new format with tool_graph)
|
||||
# "▸ execute_typescript" (plain tool name in output)
|
||||
if grep -qE "(execute_typescript \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)|(▸.*execute.*tool call)|(▸ execute_typescript)" "$output_file"; then
|
||||
echo "success|code_execution tool called" > "$result_file"
|
||||
else
|
||||
echo "failure|no code_execution tool calls found" > "$result_file"
|
||||
fi
|
||||
|
||||
rm -rf "$testdir"
|
||||
}
|
||||
|
||||
build_test_cases --skip-agentic
|
||||
run_test_cases run_test
|
||||
report_results
|
||||
|
|
@ -1,240 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
PROVIDER_CONFIG="
|
||||
openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b
|
||||
xai -> grok-3
|
||||
openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5
|
||||
anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-5-20251101
|
||||
google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview
|
||||
tetrate -> claude-sonnet-4-20250514
|
||||
databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o
|
||||
azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}
|
||||
aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0
|
||||
gcp_vertex_ai -> gemini-2.5-pro
|
||||
snowflake -> claude-sonnet-4-5
|
||||
venice -> llama-3.3-70b
|
||||
litellm -> gpt-4o-mini
|
||||
sagemaker_tgi -> sagemaker-tgi-endpoint
|
||||
github_copilot -> gpt-4.1
|
||||
chatgpt_codex -> gpt-5.4
|
||||
claude-code -> default
|
||||
cursor-agent -> auto
|
||||
ollama -> qwen3
|
||||
"
|
||||
|
||||
# Flaky models allowed to fail without blocking PRs.
|
||||
ALLOWED_FAILURES=(
|
||||
"google:gemini-2.5-flash"
|
||||
"google:gemini-3-pro-preview"
|
||||
"openrouter:nvidia/nemotron-3-nano-30b-a3b"
|
||||
"openrouter:qwen/qwen3-coder:exacto"
|
||||
"openai:gpt-3.5-turbo"
|
||||
)
|
||||
|
||||
AGENTIC_PROVIDERS=("claude-code" "cursor-agent")
|
||||
|
||||
if [ -f .env ]; then
|
||||
export $(grep -v '^#' .env | xargs)
|
||||
fi
|
||||
|
||||
build_goose() {
|
||||
if [ -z "$SKIP_BUILD" ]; then
|
||||
echo "Building goose..." >&2
|
||||
cargo build --bin goose >&2
|
||||
echo "" >&2
|
||||
else
|
||||
echo "Skipping build (SKIP_BUILD is set)..." >&2
|
||||
echo "" >&2
|
||||
fi
|
||||
|
||||
echo "$(pwd)/target/debug/goose"
|
||||
}
|
||||
|
||||
has_env() { [ -n "${!1}" ]; }
|
||||
has_cmd() { command -v "$1" &>/dev/null; }
|
||||
has_file() { [ -f "$1" ]; }
|
||||
|
||||
is_provider_available() {
|
||||
case "$1" in
|
||||
openrouter) has_env OPENROUTER_API_KEY ;;
|
||||
xai) has_env XAI_API_KEY ;;
|
||||
openai) has_env OPENAI_API_KEY ;;
|
||||
anthropic) has_env ANTHROPIC_API_KEY ;;
|
||||
google) has_env GOOGLE_API_KEY ;;
|
||||
tetrate) has_env TETRATE_API_KEY ;;
|
||||
databricks) has_env DATABRICKS_HOST && has_env DATABRICKS_TOKEN ;;
|
||||
azure_openai) has_env AZURE_OPENAI_ENDPOINT && has_env AZURE_OPENAI_DEPLOYMENT_NAME ;;
|
||||
aws_bedrock) has_env AWS_REGION && { has_env AWS_PROFILE || has_env AWS_ACCESS_KEY_ID; } ;;
|
||||
gcp_vertex_ai) has_env GCP_PROJECT_ID ;;
|
||||
snowflake) has_env SNOWFLAKE_HOST && has_env SNOWFLAKE_TOKEN ;;
|
||||
venice) has_env VENICE_API_KEY ;;
|
||||
litellm) has_env LITELLM_API_KEY ;;
|
||||
sagemaker_tgi) has_env SAGEMAKER_ENDPOINT_NAME && has_env AWS_REGION ;;
|
||||
github_copilot) has_env GITHUB_COPILOT_TOKEN || has_file "$HOME/.config/goose/github_copilot_token.json" ;;
|
||||
chatgpt_codex) has_env CHATGPT_CODEX_TOKEN || has_file "$HOME/.config/goose/chatgpt_codex/tokens.json" ;;
|
||||
ollama) has_env OLLAMA_HOST || has_cmd ollama ;;
|
||||
claude-code) has_cmd claude ;;
|
||||
cursor-agent) has_cmd cursor-agent ;;
|
||||
*) return 0 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
is_allowed_failure() {
|
||||
local key="${1}:${2}"
|
||||
for allowed in "${ALLOWED_FAILURES[@]}"; do
|
||||
[ "$allowed" = "$key" ] && return 0
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
should_skip_provider() {
|
||||
[ -z "$SKIP_PROVIDERS" ] && return 1
|
||||
IFS=',' read -ra SKIP_LIST <<< "$SKIP_PROVIDERS"
|
||||
for skip in "${SKIP_LIST[@]}"; do
|
||||
skip=$(echo "$skip" | xargs)
|
||||
[ "$skip" = "$1" ] && return 0
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
is_agentic_provider() {
|
||||
for agentic in "${AGENTIC_PROVIDERS[@]}"; do
|
||||
[ "$agentic" = "$1" ] && return 0
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
# build_test_cases [--skip-agentic]
|
||||
build_test_cases() {
|
||||
local skip_agentic=false
|
||||
[ "$1" = "--skip-agentic" ] && skip_agentic=true
|
||||
|
||||
local providers=()
|
||||
while IFS= read -r line; do
|
||||
[[ "$line" =~ ^#.*$ || -z "$line" ]] && continue
|
||||
local provider="${line%% -> *}"
|
||||
if is_provider_available "$provider"; then
|
||||
providers+=("$line")
|
||||
echo "✓ Including $provider"
|
||||
else
|
||||
echo "⚠️ Skipping $provider (prerequisites not met)"
|
||||
fi
|
||||
done <<< "$PROVIDER_CONFIG"
|
||||
echo ""
|
||||
|
||||
TEST_CASES=()
|
||||
local job_index=0
|
||||
for provider_config in "${providers[@]}"; do
|
||||
local provider="${provider_config%% -> *}"
|
||||
local models_str="${provider_config#* -> }"
|
||||
|
||||
if should_skip_provider "$provider"; then
|
||||
echo "⊘ Skipping provider: ${provider} (SKIP_PROVIDERS)"
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ "$skip_agentic" = true ] && is_agentic_provider "$provider"; then
|
||||
echo "⊘ Skipping agentic provider: ${provider}"
|
||||
continue
|
||||
fi
|
||||
|
||||
IFS='|' read -ra models <<< "$models_str"
|
||||
for model in "${models[@]}"; do
|
||||
TEST_CASES+=("$provider|$model|$job_index")
|
||||
((job_index++))
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
# run_test_cases <test_fn>
|
||||
run_test_cases() {
|
||||
local test_fn="$1"
|
||||
|
||||
RESULTS_DIR=$(mktemp -d)
|
||||
trap 'if [ -n "${RESULTS_DIR:-}" ]; then rm -rf -- "$RESULTS_DIR"; fi; if [ -n "${CLEANUP_DIR:-}" ]; then rm -rf -- "$CLEANUP_DIR"; fi' EXIT
|
||||
MAX_PARALLEL=${MAX_PARALLEL:-$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 8)}
|
||||
echo "Running ${#TEST_CASES[@]} tests (max $MAX_PARALLEL parallel)"
|
||||
echo ""
|
||||
|
||||
local running=0
|
||||
for ((i=0; i<${#TEST_CASES[@]}; i++)); do
|
||||
IFS='|' read -r provider model idx <<< "${TEST_CASES[$i]}"
|
||||
|
||||
if [ $i -eq 0 ]; then
|
||||
# First test runs sequentially to catch early failures
|
||||
"$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx"
|
||||
else
|
||||
"$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx" &
|
||||
((running++))
|
||||
if [ $running -ge $MAX_PARALLEL ]; then
|
||||
wait -n 2>/dev/null || wait
|
||||
((running--))
|
||||
fi
|
||||
fi
|
||||
done
|
||||
wait
|
||||
}
|
||||
|
||||
report_results() {
|
||||
echo ""
|
||||
echo "=== Test Results ==="
|
||||
echo ""
|
||||
|
||||
RESULTS=()
|
||||
HARD_FAILURES=()
|
||||
|
||||
for job in "${TEST_CASES[@]}"; do
|
||||
IFS='|' read -r provider model idx <<< "$job"
|
||||
|
||||
echo "Provider: $provider"
|
||||
echo "Model: $model"
|
||||
echo ""
|
||||
cat "$RESULTS_DIR/output_$idx"
|
||||
echo ""
|
||||
|
||||
local result_line=""
|
||||
[ -f "$RESULTS_DIR/result_$idx" ] && result_line=$(cat "$RESULTS_DIR/result_$idx")
|
||||
local status="${result_line%%|*}"
|
||||
local msg="${result_line#*|}"
|
||||
|
||||
if [ "$status" = "success" ]; then
|
||||
echo "✓ SUCCESS: $msg"
|
||||
RESULTS+=("✓ ${provider}: ${model}")
|
||||
else
|
||||
if is_allowed_failure "$provider" "$model"; then
|
||||
echo "⚠ FLAKY: $msg"
|
||||
RESULTS+=("⚠ ${provider}: ${model} (flaky)")
|
||||
else
|
||||
echo "✗ FAILED: $msg"
|
||||
RESULTS+=("✗ ${provider}: ${model}")
|
||||
HARD_FAILURES+=("${provider}: ${model}")
|
||||
fi
|
||||
fi
|
||||
echo "---"
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=== Test Summary ==="
|
||||
for result in "${RESULTS[@]}"; do
|
||||
echo "$result"
|
||||
done
|
||||
|
||||
if [ ${#HARD_FAILURES[@]} -gt 0 ]; then
|
||||
echo ""
|
||||
echo "Hard failures (${#HARD_FAILURES[@]}):"
|
||||
for failure in "${HARD_FAILURES[@]}"; do
|
||||
echo " - $failure"
|
||||
done
|
||||
echo ""
|
||||
echo "Some tests failed!"
|
||||
exit 1
|
||||
else
|
||||
if echo "${RESULTS[@]}" | grep -q "⚠"; then
|
||||
echo ""
|
||||
echo "All required tests passed! (some flaky tests failed but are allowed)"
|
||||
else
|
||||
echo ""
|
||||
echo "All tests passed!"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
|
@ -35,6 +35,9 @@
|
|||
"test:ui": "vitest --ui",
|
||||
"test:coverage": "vitest run --coverage",
|
||||
"test:integration": "vitest run --config vitest.integration.config.ts",
|
||||
"test:integration:goosed": "vitest run --config vitest.integration.config.ts tests/integration/goosed.test.ts",
|
||||
"test:integration:providers": "vitest run --config vitest.integration.config.ts tests/integration/test_providers.test.ts",
|
||||
"test:integration:providers-code-exec": "vitest run --config vitest.integration.config.ts tests/integration/test_providers_code_exec.test.ts",
|
||||
"test:integration:watch": "vitest --config vitest.integration.config.ts",
|
||||
"test:integration:debug": "DEBUG=1 vitest run --config vitest.integration.config.ts",
|
||||
"i18n:extract": "formatjs extract 'src/**/*.{ts,tsx}' --out-file src/i18n/messages/en.json --flatten && pnpm run i18n:compile",
|
||||
|
|
|
|||
86
ui/desktop/tests/integration/test_providers.test.ts
Normal file
86
ui/desktop/tests/integration/test_providers.test.ts
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
/**
|
||||
* Provider smoke tests — normal mode (direct tool calls).
|
||||
*
|
||||
* Each available provider/model pair gets its own test that spawns `goose run`
|
||||
* with the developer builtin, asks the model to read files via the shell tool,
|
||||
* and validates the output.
|
||||
*/
|
||||
|
||||
import { expect, beforeAll } from 'vitest';
|
||||
import fs from 'node:fs';
|
||||
import os from 'node:os';
|
||||
import path from 'node:path';
|
||||
import { buildGoose, discoverTestCases, runGoose, providerTest } from './test_providers_lib';
|
||||
|
||||
const BUILTINS = 'developer';
|
||||
const TEST_CONTENT = 'test-content-abc123';
|
||||
|
||||
let gooseBin: string;
|
||||
let testFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
gooseBin = buildGoose();
|
||||
|
||||
const targetDir = path.resolve(process.cwd(), '..', '..', 'target');
|
||||
fs.mkdirSync(targetDir, { recursive: true });
|
||||
testFile = path.join(targetDir, 'test-content.txt');
|
||||
fs.writeFileSync(testFile, TEST_CONTENT + '\n');
|
||||
});
|
||||
|
||||
const { testAgentic, testNonAgentic } = providerTest(discoverTestCases());
|
||||
|
||||
testNonAgentic('reads files via shell tool', async (tc) => {
|
||||
const testdir = fs.mkdtempSync(path.join(os.tmpdir(), 'goose-test-'));
|
||||
try {
|
||||
const tokenA = `smoke-alpha-${Math.floor(Math.random() * 32768)}`;
|
||||
const tokenB = `smoke-bravo-${Math.floor(Math.random() * 32768)}`;
|
||||
fs.writeFileSync(path.join(testdir, 'part-a.txt'), tokenA + '\n');
|
||||
fs.writeFileSync(path.join(testdir, 'part-b.txt'), tokenB + '\n');
|
||||
|
||||
const output = await runGoose(
|
||||
gooseBin,
|
||||
testdir,
|
||||
'Use the shell tool to cat ./part-a.txt and ./part-b.txt, then reply with ONLY the contents of both files, one per line, nothing else.',
|
||||
BUILTINS,
|
||||
{ GOOSE_PROVIDER: tc.provider, GOOSE_MODEL: tc.model }
|
||||
);
|
||||
|
||||
const shellToolPattern = /(shell \| developer)|(▸.*shell)/;
|
||||
expect(
|
||||
shellToolPattern.test(output),
|
||||
`Expected model to use shell tool\n\nFull output:\n${output}`
|
||||
).toBe(true);
|
||||
expect(
|
||||
output,
|
||||
`Expected output to contain token from part-a.txt (${tokenA})\n\nFull output:\n${output}`
|
||||
).toContain(tokenA);
|
||||
expect(
|
||||
output,
|
||||
`Expected output to contain token from part-b.txt (${tokenB})\n\nFull output:\n${output}`
|
||||
).toContain(tokenB);
|
||||
} finally {
|
||||
fs.rmSync(testdir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
testAgentic('reads file contents', async (tc) => {
|
||||
const testdir = fs.mkdtempSync(path.join(os.tmpdir(), 'goose-test-'));
|
||||
try {
|
||||
fs.copyFileSync(testFile, path.join(testdir, 'test-content.txt'));
|
||||
|
||||
const output = await runGoose(
|
||||
gooseBin,
|
||||
testdir,
|
||||
'read ./test-content.txt and output its contents exactly',
|
||||
BUILTINS,
|
||||
{ GOOSE_PROVIDER: tc.provider, GOOSE_MODEL: tc.model }
|
||||
);
|
||||
|
||||
expect(
|
||||
output.toLowerCase(),
|
||||
`Expected model output to contain "${TEST_CONTENT}"\n\nFull output:\n${output}`
|
||||
).toContain(TEST_CONTENT.toLowerCase());
|
||||
} finally {
|
||||
fs.rmSync(testdir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
/**
|
||||
* Provider smoke tests — code execution mode (JS batching).
|
||||
*
|
||||
* Each available (non-agentic) provider/model pair gets its own test that
|
||||
* spawns `goose run` with the memory + code_execution builtins and validates
|
||||
* that the code_execution tool was invoked.
|
||||
*/
|
||||
|
||||
import { expect, beforeAll } from 'vitest';
|
||||
import fs from 'node:fs';
|
||||
import os from 'node:os';
|
||||
import path from 'node:path';
|
||||
import { buildGoose, discoverTestCases, runGoose, providerTest } from './test_providers_lib';
|
||||
|
||||
const BUILTINS = 'memory,code_execution';
|
||||
|
||||
let gooseBin: string;
|
||||
|
||||
beforeAll(() => {
|
||||
gooseBin = buildGoose();
|
||||
});
|
||||
|
||||
const { testAll } = providerTest(discoverTestCases({ skipAgentic: true }));
|
||||
|
||||
testAll('invokes code_execution tool', async (tc) => {
|
||||
const testdir = fs.mkdtempSync(path.join(os.tmpdir(), 'goose-codeexec-'));
|
||||
try {
|
||||
const output = await runGoose(
|
||||
gooseBin,
|
||||
testdir,
|
||||
"Store a memory with category 'test' and data 'hello world', then retrieve all memories from category 'test'.",
|
||||
BUILTINS,
|
||||
{ GOOSE_PROVIDER: tc.provider, GOOSE_MODEL: tc.model }
|
||||
);
|
||||
|
||||
// Matches: "execute_typescript | code_execution", "get_function_details | code_execution",
|
||||
// "tool call | execute", "tool calls | execute" (old format)
|
||||
// "▸ execute N tool call" (new format with tool_graph)
|
||||
// "▸ execute_typescript" (plain tool name in output)
|
||||
const codeExecPattern =
|
||||
/(execute_typescript \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)|(▸.*execute.*tool call)|(▸ execute_typescript)/;
|
||||
|
||||
expect(
|
||||
codeExecPattern.test(output),
|
||||
`Expected code_execution tool to be called\n\nFull output:\n${output}`
|
||||
).toBe(true);
|
||||
} finally {
|
||||
fs.rmSync(testdir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
389
ui/desktop/tests/integration/test_providers_lib.ts
Normal file
389
ui/desktop/tests/integration/test_providers_lib.ts
Normal file
|
|
@ -0,0 +1,389 @@
|
|||
/**
|
||||
* Shared library for provider smoke tests.
|
||||
*
|
||||
* Ported from scripts/test_providers_lib.sh — keeps the same provider config,
|
||||
* allowed-failure list, agentic-provider list, and environment detection.
|
||||
*/
|
||||
|
||||
import { test } from 'vitest';
|
||||
import { execSync, spawn, type ChildProcess } from 'node:child_process';
|
||||
import fs from 'node:fs';
|
||||
import os from 'node:os';
|
||||
import path from 'node:path';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Provider configuration
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type ModelEntry = string | { name: string; flaky: true };
|
||||
|
||||
interface ProviderConfig {
|
||||
provider: string;
|
||||
models: ModelEntry[];
|
||||
agentic?: boolean;
|
||||
available: () => boolean;
|
||||
}
|
||||
|
||||
function modelName(entry: ModelEntry): string {
|
||||
return typeof entry === 'string' ? entry : entry.name;
|
||||
}
|
||||
|
||||
function modelFlaky(entry: ModelEntry): boolean {
|
||||
return typeof entry !== 'string' && entry.flaky;
|
||||
}
|
||||
|
||||
function hasEnv(name: string): boolean {
|
||||
return !!process.env[name];
|
||||
}
|
||||
|
||||
function hasCmd(name: string): boolean {
|
||||
try {
|
||||
execSync(`command -v ${name}`, { stdio: 'ignore' });
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function hasFile(p: string): boolean {
|
||||
return fs.existsSync(p);
|
||||
}
|
||||
|
||||
function getProviders(): ProviderConfig[] {
|
||||
return [
|
||||
{
|
||||
provider: 'openrouter',
|
||||
models: [
|
||||
'google/gemini-2.5-pro',
|
||||
'anthropic/claude-sonnet-4.5',
|
||||
{ name: 'qwen/qwen3-coder:exacto', flaky: true },
|
||||
'z-ai/glm-4.6:exacto',
|
||||
{ name: 'nvidia/nemotron-3-nano-30b-a3b', flaky: true },
|
||||
],
|
||||
available: () => hasEnv('OPENROUTER_API_KEY'),
|
||||
},
|
||||
{
|
||||
provider: 'xai',
|
||||
models: ['grok-3'],
|
||||
available: () => hasEnv('XAI_API_KEY'),
|
||||
},
|
||||
{
|
||||
provider: 'openai',
|
||||
models: ['gpt-4o', 'gpt-4o-mini', { name: 'gpt-3.5-turbo', flaky: true }, 'gpt-5'],
|
||||
available: () => hasEnv('OPENAI_API_KEY'),
|
||||
},
|
||||
{
|
||||
provider: 'anthropic',
|
||||
models: ['claude-sonnet-4-5-20250929', 'claude-opus-4-5-20251101'],
|
||||
available: () => hasEnv('ANTHROPIC_API_KEY'),
|
||||
},
|
||||
{
|
||||
provider: 'google',
|
||||
models: [
|
||||
'gemini-2.5-pro',
|
||||
{ name: 'gemini-2.5-flash', flaky: true },
|
||||
{ name: 'gemini-3-pro-preview', flaky: true },
|
||||
'gemini-3-flash-preview',
|
||||
],
|
||||
available: () => hasEnv('GOOGLE_API_KEY'),
|
||||
},
|
||||
{
|
||||
provider: 'tetrate',
|
||||
models: ['claude-sonnet-4-20250514'],
|
||||
available: () => hasEnv('TETRATE_API_KEY'),
|
||||
},
|
||||
{
|
||||
provider: 'databricks',
|
||||
models: ['databricks-claude-sonnet-4', 'gemini-2-5-flash', 'gpt-4o'],
|
||||
available: () => hasEnv('DATABRICKS_HOST') && hasEnv('DATABRICKS_TOKEN'),
|
||||
},
|
||||
{
|
||||
provider: 'azure_openai',
|
||||
models: [process.env.AZURE_OPENAI_DEPLOYMENT_NAME ?? ''],
|
||||
available: () => hasEnv('AZURE_OPENAI_ENDPOINT') && hasEnv('AZURE_OPENAI_DEPLOYMENT_NAME'),
|
||||
},
|
||||
{
|
||||
provider: 'aws_bedrock',
|
||||
models: ['us.anthropic.claude-sonnet-4-5-20250929-v1:0'],
|
||||
available: () =>
|
||||
hasEnv('AWS_REGION') && (hasEnv('AWS_PROFILE') || hasEnv('AWS_ACCESS_KEY_ID')),
|
||||
},
|
||||
{
|
||||
provider: 'gcp_vertex_ai',
|
||||
models: ['gemini-2.5-pro'],
|
||||
available: () => hasEnv('GCP_PROJECT_ID'),
|
||||
},
|
||||
{
|
||||
provider: 'snowflake',
|
||||
models: ['claude-sonnet-4-5'],
|
||||
available: () => hasEnv('SNOWFLAKE_HOST') && hasEnv('SNOWFLAKE_TOKEN'),
|
||||
},
|
||||
{
|
||||
provider: 'venice',
|
||||
models: ['llama-3.3-70b'],
|
||||
available: () => hasEnv('VENICE_API_KEY'),
|
||||
},
|
||||
{
|
||||
provider: 'litellm',
|
||||
models: ['gpt-4o-mini'],
|
||||
available: () => hasEnv('LITELLM_API_KEY'),
|
||||
},
|
||||
{
|
||||
provider: 'sagemaker_tgi',
|
||||
models: ['sagemaker-tgi-endpoint'],
|
||||
available: () => hasEnv('SAGEMAKER_ENDPOINT_NAME') && hasEnv('AWS_REGION'),
|
||||
},
|
||||
{
|
||||
provider: 'github_copilot',
|
||||
models: ['gpt-4.1'],
|
||||
available: () =>
|
||||
hasEnv('GITHUB_COPILOT_TOKEN') ||
|
||||
hasFile(path.join(os.homedir(), '.config/goose/github_copilot_token.json')),
|
||||
},
|
||||
{
|
||||
provider: 'chatgpt_codex',
|
||||
models: ['gpt-5.4'],
|
||||
available: () =>
|
||||
hasEnv('CHATGPT_CODEX_TOKEN') ||
|
||||
hasFile(path.join(os.homedir(), '.config/goose/chatgpt_codex/tokens.json')),
|
||||
},
|
||||
{
|
||||
provider: 'claude-code',
|
||||
models: ['default'],
|
||||
agentic: true,
|
||||
available: () => hasCmd('claude'),
|
||||
},
|
||||
{
|
||||
provider: 'cursor-agent',
|
||||
models: ['auto'],
|
||||
agentic: true,
|
||||
available: () => hasCmd('cursor-agent'),
|
||||
},
|
||||
{
|
||||
provider: 'ollama',
|
||||
models: ['qwen3'],
|
||||
available: () => hasEnv('OLLAMA_HOST') || hasCmd('ollama'),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function stripQuotes(s: string): string {
|
||||
if (
|
||||
s.length >= 2 &&
|
||||
((s.startsWith('"') && s.endsWith('"')) || (s.startsWith("'") && s.endsWith("'")))
|
||||
) {
|
||||
return s.slice(1, -1);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
function loadDotenv(): void {
|
||||
// Resolve .env from the repository root (two levels up from ui/desktop).
|
||||
const repoRoot = path.resolve(__dirname, '..', '..', '..', '..');
|
||||
const envPath = path.join(repoRoot, '.env');
|
||||
if (!fs.existsSync(envPath)) return;
|
||||
const lines = fs.readFileSync(envPath, 'utf-8').split('\n');
|
||||
for (const line of lines) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed || trimmed.startsWith('#')) continue;
|
||||
const eqIdx = trimmed.indexOf('=');
|
||||
if (eqIdx === -1) continue;
|
||||
const key = trimmed.slice(0, eqIdx);
|
||||
const value = stripQuotes(trimmed.slice(eqIdx + 1));
|
||||
if (!(key in process.env)) {
|
||||
process.env[key] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function shouldSkipProvider(provider: string): boolean {
|
||||
const skip = process.env.SKIP_PROVIDERS;
|
||||
if (!skip) return false;
|
||||
return skip
|
||||
.split(',')
|
||||
.map((s) => s.trim())
|
||||
.includes(provider);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Build goose binary
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function buildGoose(): string {
|
||||
if (!process.env.SKIP_BUILD) {
|
||||
console.error('Building goose...');
|
||||
execSync('cargo build --bin goose', { stdio: 'inherit' });
|
||||
console.error('');
|
||||
} else {
|
||||
console.error('Skipping build (SKIP_BUILD is set)...');
|
||||
console.error('');
|
||||
}
|
||||
return path.resolve(process.cwd(), '..', '..', 'target/debug/goose');
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test case discovery
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * One provider/model combination discovered for the live test matrix.
 */
export interface TestCase {
  /** Provider id (e.g. 'ollama', 'litellm'). */
  provider: string;
  /** Model name to run against the provider. */
  model: string;
  /** True when prerequisites are met and the case should actually run. */
  available: boolean;
  /** Flaky cases still run, but their failures are tolerated. */
  flaky: boolean;
  /** Mirrors the provider config's `agentic` flag (e.g. claude-code, cursor-agent). */
  agentic: boolean;
  /** Set only when `available` is false; explains why the case is skipped. */
  skippedReason?: string;
}
|
||||
|
||||
export function discoverTestCases(options?: { skipAgentic?: boolean }): TestCase[] {
|
||||
loadDotenv();
|
||||
const skipAgentic = options?.skipAgentic ?? false;
|
||||
const providers = getProviders();
|
||||
|
||||
const testCases: TestCase[] = [];
|
||||
|
||||
for (const pc of providers) {
|
||||
const providerAvailable = pc.available();
|
||||
const agentic = pc.agentic ?? false;
|
||||
|
||||
for (const entry of pc.models) {
|
||||
const model = modelName(entry);
|
||||
const flaky = modelFlaky(entry);
|
||||
|
||||
if (!providerAvailable) {
|
||||
testCases.push({
|
||||
provider: pc.provider,
|
||||
model,
|
||||
available: false,
|
||||
flaky,
|
||||
agentic,
|
||||
skippedReason: 'prerequisites not met',
|
||||
});
|
||||
} else if (shouldSkipProvider(pc.provider)) {
|
||||
testCases.push({
|
||||
provider: pc.provider,
|
||||
model,
|
||||
available: false,
|
||||
flaky,
|
||||
agentic,
|
||||
skippedReason: 'SKIP_PROVIDERS',
|
||||
});
|
||||
} else if (skipAgentic && agentic) {
|
||||
testCases.push({
|
||||
provider: pc.provider,
|
||||
model,
|
||||
available: false,
|
||||
flaky,
|
||||
agentic,
|
||||
skippedReason: 'agentic provider skipped in this mode',
|
||||
});
|
||||
} else {
|
||||
testCases.push({
|
||||
provider: pc.provider,
|
||||
model,
|
||||
available: true,
|
||||
flaky,
|
||||
agentic,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return testCases;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test registration helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Body of a single provider test; receives the discovered case to run against.
type ProviderTestFn = (tc: TestCase) => Promise<void>;
|
||||
|
||||
function registerTests(label: string, cases: TestCase[], fn: ProviderTestFn): void {
|
||||
const available = cases.filter((tc) => tc.available && !tc.flaky);
|
||||
const flaky = cases.filter((tc) => tc.available && tc.flaky);
|
||||
const skipped = cases.filter((tc) => !tc.available);
|
||||
|
||||
if (available.length > 0) {
|
||||
test.each(available)(`${label} — $provider / $model`, async (tc) => {
|
||||
await fn(tc);
|
||||
});
|
||||
}
|
||||
|
||||
if (flaky.length > 0) {
|
||||
test.each(flaky)(`${label} — $provider / $model (flaky)`, async (tc) => {
|
||||
try {
|
||||
await fn(tc);
|
||||
} catch (err) {
|
||||
console.warn(`Flaky test ${tc.provider}/${tc.model} failed (allowed): ${err}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (skipped.length > 0) {
|
||||
test.skip.each(skipped)(`${label} — $provider / $model — $skippedReason`, () => {});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Build decorator-style test registrars from a set of discovered test cases.
|
||||
*
|
||||
* Usage:
|
||||
* const { testAll, testAgentic, testNonAgentic } = providerTest(cases);
|
||||
*
|
||||
* testAll('reads a file', async (tc) => { ... });
|
||||
* testAgentic('delegates work', async (tc) => { ... });
|
||||
* testNonAgentic('uses shell tool', async (tc) => { ... });
|
||||
*/
|
||||
export function providerTest(cases: TestCase[]) {
|
||||
const agentic = cases.filter((tc) => tc.agentic);
|
||||
const nonAgentic = cases.filter((tc) => !tc.agentic);
|
||||
|
||||
return {
|
||||
testAll: (label: string, fn: ProviderTestFn) => registerTests(label, cases, fn),
|
||||
testAgentic: (label: string, fn: ProviderTestFn) => registerTests(label, agentic, fn),
|
||||
testNonAgentic: (label: string, fn: ProviderTestFn) => registerTests(label, nonAgentic, fn),
|
||||
};
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Utility: run goose binary and capture output
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function runGoose(
|
||||
gooseBin: string,
|
||||
cwd: string,
|
||||
prompt: string,
|
||||
builtins: string,
|
||||
env: Record<string, string>
|
||||
): Promise<string> {
|
||||
return new Promise((resolve) => {
|
||||
const child: ChildProcess = spawn(
|
||||
gooseBin,
|
||||
['run', '--text', prompt, '--with-builtin', builtins],
|
||||
{
|
||||
cwd,
|
||||
env: { ...process.env, ...env },
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
}
|
||||
);
|
||||
|
||||
let output = '';
|
||||
child.stdout?.on('data', (d) => {
|
||||
output += String(d);
|
||||
});
|
||||
child.stderr?.on('data', (d) => {
|
||||
output += String(d);
|
||||
});
|
||||
|
||||
child.on('close', () => {
|
||||
resolve(output);
|
||||
});
|
||||
|
||||
child.on('error', (err) => {
|
||||
resolve(`spawn error: ${err.message}`);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
/// <reference types="vitest" />
|
||||
import { defineConfig } from 'vitest/config'
|
||||
import react from '@vitejs/plugin-react'
|
||||
import { resolve } from 'node:path'
|
||||
import { defineConfig } from 'vitest/config';
|
||||
import react from '@vitejs/plugin-react';
|
||||
import { resolve } from 'node:path';
|
||||
|
||||
const cfg = {
|
||||
plugins: [react()],
|
||||
|
|
@ -17,6 +17,6 @@ const cfg = {
|
|||
css: true,
|
||||
include: ['src/**/*.{test,spec}.{js,jsx,ts,tsx}'],
|
||||
},
|
||||
} satisfies Record<string, any>
|
||||
} satisfies Record<string, any>;
|
||||
|
||||
export default defineConfig(cfg as any)
|
||||
export default defineConfig(cfg as any);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue