mirror of
https://github.com/block/goose.git
synced 2026-05-01 21:10:54 +00:00
Some checks failed
Canary / build-cli (push) Blocked by required conditions
Canary / Upload Install Script (push) Blocked by required conditions
Canary / bundle-desktop (push) Blocked by required conditions
Canary / bundle-desktop-linux (push) Blocked by required conditions
Canary / bundle-desktop-windows (push) Blocked by required conditions
Canary / Release (push) Blocked by required conditions
Canary / Prepare Version (push) Waiting to run
CI / Test and Lint Electron Desktop App (push) Blocked by required conditions
CI / changes (push) Waiting to run
CI / Check Rust Code Format (push) Blocked by required conditions
CI / Build and Test Rust Project (push) Blocked by required conditions
CI / Lint Rust Code (push) Blocked by required conditions
CI / Check OpenAPI Schema is Up-to-Date (push) Blocked by required conditions
Live Provider Tests / check-fork (push) Waiting to run
Live Provider Tests / changes (push) Blocked by required conditions
Live Provider Tests / Build Binary (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (push) Blocked by required conditions
Live Provider Tests / Smoke Tests (Code Execution) (push) Blocked by required conditions
Live Provider Tests / Compaction Tests (push) Blocked by required conditions
Publish Docker Image / docker (push) Waiting to run
Scorecard supply-chain security / Scorecard analysis (push) Waiting to run
Deploy Documentation / deploy (push) Has been cancelled
Publish Ask AI Bot Docker Image / docker (push) Has been cancelled
Signed-off-by: Adrian Cole <adrian@tetrate.io>
435 lines
13 KiB
Bash
Executable file
435 lines
13 KiB
Bash
Executable file
#!/bin/bash
# Test providers with optional code_execution mode
#
# Usage:
#   ./test_providers.sh              # Normal mode (direct tool calls)
#   ./test_providers.sh --code-exec  # Code execution mode (JS batching)
#
# Environment variables:
#   SKIP_PROVIDERS  Comma-separated list of providers to skip (e.g., "tetrate,xai")
#   SKIP_BUILD      Skip the cargo build step if set
#   MAX_PARALLEL    Maximum number of tests run concurrently (default: CPU count)

# Parse command-line flags into CODE_EXEC_MODE.
# Globals:   CODE_EXEC_MODE (written: "true" or "false")
# Arguments: the script's command-line arguments
# Outputs:   a warning on stderr for every argument that is not recognized
parse_args() {
  local arg
  CODE_EXEC_MODE=false
  for arg in "$@"; do
    case "$arg" in
      --code-exec)
        CODE_EXEC_MODE=true
        ;;
      *)
        # Previously ignored silently, which hid typos such as --code_exec.
        echo "warning: ignoring unrecognized argument: $arg" >&2
        ;;
    esac
  done
}
parse_args "$@"

# Flaky models that are allowed to fail without failing the entire test run.
# These are typically preview/experimental models with inconsistent tool-calling behavior.
# Failures are still reported but don't block PRs.
# Entries are "provider:model" keys, matched exactly by is_allowed_failure.
ALLOWED_FAILURES=(
  "google:gemini-3-pro-preview"
  "openrouter:nvidia/nemotron-3-nano-30b-a3b"
)

# Agentic providers handle tools internally and return text results.
# They can't produce the normal tool-call log patterns (e.g. "shell | developer").
AGENTIC_PROVIDERS=("claude-code" "codex" "gemini-cli" "cursor-agent")

# Export KEY=VALUE pairs from a dotenv-style file into the environment.
#
# Each line is read literally and is never evaluated as shell code. Blank
# lines, lines starting with '#', lines without '=', and lines whose key is
# not a valid shell identifier are skipped. An optional leading "export " is
# dropped and one layer of matching single or double quotes around the value
# is removed. Values may contain spaces and '=' characters.
#
# Arguments: $1 - path to the env file (a missing file is not an error)
# Returns:   0
load_env_file() {
  local env_file="$1"
  local line key value

  [ -f "$env_file" ] || return 0

  # '|| [ -n "$line" ]' keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [ -n "$line" ]; do
    line="${line%$'\r'}"                         # tolerate CRLF files
    line="${line#"${line%%[![:space:]]*}"}"      # strip leading whitespace
    case "$line" in
      '' | '#'*) continue ;;
    esac
    line="${line#export }"
    case "$line" in
      *=*) ;;
      *) continue ;;
    esac
    key="${line%%=*}"
    value="${line#*=}"
    [[ "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue
    if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
      value="${value:1:${#value}-2}"
    fi
    export "$key=$value"
  done < "$env_file"
  return 0
}

load_env_file .env

# Build the goose binary unless SKIP_BUILD is set to a non-empty value.
# A failed build aborts the run: continuing would test whatever binary
# happens to be in target/debug (or none at all).
if [ -z "$SKIP_BUILD" ]; then
  echo "Building goose..."
  if ! cargo build --bin goose; then
    echo "cargo build failed; aborting provider tests" >&2
    exit 1
  fi
  echo ""
else
  echo "Skipping build (SKIP_BUILD is set)..."
  echo ""
fi

# Directory the script was started from; the goose binary is later resolved
# as "$SCRIPT_DIR/target/debug/goose".
SCRIPT_DIR=$(pwd)

# Create a test file with known content in the current directory
# This cannot be /tmp as some agents cannot work outside the PWD
TEST_CONTENT="test-content-abc123"
TEST_FILE="./target/test-content.txt"
if ! mkdir -p target || ! echo "$TEST_CONTENT" > "$TEST_FILE"; then
  echo "Failed to create test fixture $TEST_FILE" >&2
  exit 1
fi

# Format: "provider -> model1|model2|model3"
# Base providers that are always tested (with appropriate env vars)
PROVIDERS=(
  "openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b"
  "xai -> grok-3"
  "openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5"
  "anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-1-20250805"
  "google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview"
  "tetrate -> claude-sonnet-4-20250514"
)

# Conditionally add providers based on environment variables

# Announce an optional provider and append its spec to PROVIDERS.
# Arguments: $1 - human-readable provider label
#            $2 - spec in "provider -> model1|model2" format
include_provider() {
  local label="$1"
  local spec="$2"
  echo "✓ Including ${label} tests"
  PROVIDERS+=("$spec")
}

# Announce that an optional provider is not being tested.
# Arguments: $1 - human-readable provider label
#            $2 - what would be needed to enable it
skip_optional_provider() {
  local label="$1"
  local reason="$2"
  echo "⚠️ Skipping ${label} tests (${reason})"
}

# Databricks: requires DATABRICKS_HOST and DATABRICKS_TOKEN
if [ -n "$DATABRICKS_HOST" ] && [ -n "$DATABRICKS_TOKEN" ]; then
  include_provider "Databricks" "databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o"
else
  skip_optional_provider "Databricks" "DATABRICKS_HOST and DATABRICKS_TOKEN required"
fi

# Azure OpenAI: requires AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME
if [ -n "$AZURE_OPENAI_ENDPOINT" ] && [ -n "$AZURE_OPENAI_DEPLOYMENT_NAME" ]; then
  include_provider "Azure OpenAI" "azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}"
else
  skip_optional_provider "Azure OpenAI" "AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_DEPLOYMENT_NAME required"
fi

# AWS Bedrock: requires AWS credentials (profile or keys) and AWS_REGION
if [ -n "$AWS_REGION" ] && { [ -n "$AWS_PROFILE" ] || [ -n "$AWS_ACCESS_KEY_ID" ]; }; then
  include_provider "AWS Bedrock" "aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0"
else
  skip_optional_provider "AWS Bedrock" "AWS_REGION and AWS_PROFILE or AWS credentials required"
fi

# GCP Vertex AI: requires GCP_PROJECT_ID
if [ -n "$GCP_PROJECT_ID" ]; then
  include_provider "GCP Vertex AI" "gcp_vertex_ai -> gemini-2.5-pro"
else
  skip_optional_provider "GCP Vertex AI" "GCP_PROJECT_ID required"
fi

# Snowflake: requires SNOWFLAKE_HOST and SNOWFLAKE_TOKEN
if [ -n "$SNOWFLAKE_HOST" ] && [ -n "$SNOWFLAKE_TOKEN" ]; then
  include_provider "Snowflake" "snowflake -> claude-sonnet-4-5"
else
  skip_optional_provider "Snowflake" "SNOWFLAKE_HOST and SNOWFLAKE_TOKEN required"
fi

# Venice: requires VENICE_API_KEY
if [ -n "$VENICE_API_KEY" ]; then
  include_provider "Venice" "venice -> llama-3.3-70b"
else
  skip_optional_provider "Venice" "VENICE_API_KEY required"
fi

# LiteLLM: requires LITELLM_API_KEY (and optionally LITELLM_HOST)
if [ -n "$LITELLM_API_KEY" ]; then
  include_provider "LiteLLM" "litellm -> gpt-4o-mini"
else
  skip_optional_provider "LiteLLM" "LITELLM_API_KEY required"
fi

# Ollama: enabled when OLLAMA_HOST is set or the 'ollama' binary is on PATH
if [ -n "$OLLAMA_HOST" ] || command -v ollama > /dev/null 2>&1; then
  include_provider "Ollama" "ollama -> qwen3"
else
  skip_optional_provider "Ollama" "OLLAMA_HOST required or ollama must be installed"
fi

# SageMaker TGI: requires SAGEMAKER_ENDPOINT_NAME and AWS_REGION
# NOTE(review): unlike the Bedrock check, AWS credentials are not verified here.
if [ -n "$SAGEMAKER_ENDPOINT_NAME" ] && [ -n "$AWS_REGION" ]; then
  include_provider "SageMaker TGI" "sagemaker_tgi -> sagemaker-tgi-endpoint"
else
  skip_optional_provider "SageMaker TGI" "SAGEMAKER_ENDPOINT_NAME and AWS_REGION required"
fi

# GitHub Copilot: requires OAuth setup (check for cached token)
if [ -n "$GITHUB_COPILOT_TOKEN" ] || [ -f "$HOME/.config/goose/github_copilot_token.json" ]; then
  include_provider "GitHub Copilot" "github_copilot -> gpt-4.1"
else
  skip_optional_provider "GitHub Copilot" "OAuth setup required - run 'goose configure' first"
fi

# ChatGPT Codex: requires OAuth setup
if [ -n "$CHATGPT_CODEX_TOKEN" ] || [ -f "$HOME/.config/goose/chatgpt_codex_token.json" ]; then
  include_provider "ChatGPT Codex" "chatgpt_codex -> gpt-5.1-codex"
else
  skip_optional_provider "ChatGPT Codex" "OAuth setup required - run 'goose configure' first"
fi

# CLI-based providers (require the CLI tool to be installed)

# Claude Code CLI: requires 'claude' CLI tool
if command -v claude > /dev/null 2>&1; then
  include_provider "Claude Code CLI" "claude-code -> claude-sonnet-4-20250514"
else
  skip_optional_provider "Claude Code CLI" "'claude' CLI tool required"
fi

# Codex CLI: requires 'codex' CLI tool
if command -v codex > /dev/null 2>&1; then
  include_provider "Codex CLI" "codex -> gpt-5.2-codex"
else
  skip_optional_provider "Codex CLI" "'codex' CLI tool required"
fi

# Gemini CLI: requires 'gemini' CLI tool
if command -v gemini > /dev/null 2>&1; then
  include_provider "Gemini CLI" "gemini-cli -> gemini-2.5-pro"
else
  skip_optional_provider "Gemini CLI" "'gemini' CLI tool required"
fi

# Cursor Agent: requires 'cursor-agent' CLI tool
if command -v cursor-agent > /dev/null 2>&1; then
  include_provider "Cursor Agent" "cursor-agent -> auto"
else
  skip_optional_provider "Cursor Agent" "'cursor-agent' CLI tool required"
fi

echo ""

# Configure mode-specific settings
# Globals:   CODE_EXEC_MODE (read)
#            BUILTINS, SUCCESS_PATTERN, SUCCESS_MSG, FAILURE_MSG (written)
# Outputs:   one line on stdout naming the selected mode
configure_mode() {
  if [ "$CODE_EXEC_MODE" = true ]; then
    echo "Mode: code_execution (JS batching)"
    BUILTINS="developer,code_execution"
    # Match code_execution tool usage:
    # - "execute | code_execution" or "get_function_details | code_execution" (fallback format)
    # - "tool call | execute" or "tool calls | execute" (new format with tool_graph)
    SUCCESS_PATTERN="(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)"
    SUCCESS_MSG="code_execution tool called"
    FAILURE_MSG="no code_execution tools called"
  else
    echo "Mode: normal (direct tool calls)"
    BUILTINS="developer,autovisualiser,computercontroller,tutorial,todo,extensionmanager"
    # Extended regex (used with grep -E) matched against goose output.
    SUCCESS_PATTERN="shell \| developer"
    SUCCESS_MSG="developer tool called"
    FAILURE_MSG="no developer tools called"
  fi
}
configure_mode
echo ""

# Report whether a provider/model pair is listed as an allowed (flaky) failure.
# Globals:   ALLOWED_FAILURES (read) - array of "provider:model" keys
# Arguments: $1 - provider name
#            $2 - model name
# Returns:   0 if "provider:model" exactly matches an entry, 1 otherwise
is_allowed_failure() {
  local provider="$1"
  local model="$2"
  local key="${provider}:${model}"
  local allowed
  for allowed in "${ALLOWED_FAILURES[@]}"; do
    if [ "$allowed" = "$key" ]; then
      return 0
    fi
  done
  return 1
}

# Report whether a provider is named in the SKIP_PROVIDERS list.
# Globals:   SKIP_PROVIDERS (read) - comma-separated provider names; whitespace
#            around each name is ignored
# Arguments: $1 - provider name
# Returns:   0 if the provider should be skipped, 1 otherwise (including when
#            SKIP_PROVIDERS is empty or unset)
should_skip_provider() {
  local provider="$1"
  local -a skip_list
  local skip

  if [ -z "$SKIP_PROVIDERS" ]; then
    return 1
  fi

  # IFS is overridden for this read only, so the global IFS is untouched.
  IFS=',' read -ra skip_list <<< "$SKIP_PROVIDERS"
  for skip in "${skip_list[@]}"; do
    # Trim leading, then trailing, whitespace without forking a process.
    skip="${skip#"${skip%%[![:space:]]*}"}"
    skip="${skip%"${skip##*[![:space:]]}"}"
    if [ "$skip" = "$provider" ]; then
      return 0
    fi
  done
  return 1
}

# Report whether a provider is listed in AGENTIC_PROVIDERS.
# Globals:   AGENTIC_PROVIDERS (read) - array of provider names
# Arguments: $1 - provider name
# Returns:   0 if the name exactly matches an entry, 1 otherwise
is_agentic_provider() {
  local provider="$1"
  local agentic
  for agentic in "${AGENTIC_PROVIDERS[@]}"; do
    if [ "$agentic" = "$provider" ]; then
      return 0
    fi
  done
  return 1
}

# Create temp directory for results
RESULTS_DIR=$(mktemp -d) || {
  echo "Failed to create temporary results directory" >&2
  exit 1
}
# Single quotes defer expansion until the trap fires, and the inner double
# quotes keep a path containing spaces or glob characters intact.
trap 'rm -rf -- "$RESULTS_DIR"' EXIT

# Maximum parallel jobs (default: number of CPU cores, or override with MAX_PARALLEL)
# Tries sysctl first, then nproc, then falls back to 8.
MAX_PARALLEL=${MAX_PARALLEL:-$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 8)}
echo "Running tests with up to $MAX_PARALLEL parallel jobs"
echo ""

# Function to run a single test
#
# Runs the goose binary once for a provider/model pair inside a fresh scratch
# directory and records the verdict.
#
# Globals:   SCRIPT_DIR, BUILTINS, SUCCESS_PATTERN, TEST_FILE, TEST_CONTENT (read)
# Arguments: $1 - provider name (exported as GOOSE_PROVIDER for the run)
#            $2 - model name (exported as GOOSE_MODEL for the run)
#            $3 - file that receives the single word "success" or "failure"
#            $4 - file that receives goose's combined stdout and stderr
# Returns:   1 if the scratch directory could not be created (a "failure"
#            verdict is still written); otherwise the status of the cleanup
run_test() {
  local provider="$1"
  local model="$2"
  local result_file="$3"
  local output_file="$4"

  # Declared separately so a mktemp failure is not masked by 'local'.
  local testdir
  if ! testdir=$(mktemp -d); then
    echo "run_test: could not create scratch directory" > "$output_file"
    echo "failure" > "$result_file"
    return 1
  fi

  # Classify once; the prompt and the success check both depend on it.
  local agentic=false
  if is_agentic_provider "$provider"; then
    agentic=true
  fi

  # Agentic providers use a file-read prompt with known content marker;
  # regular providers use the shell prompt that produces tool-call logs.
  local prompt
  if [ "$agentic" = true ]; then
    cp "$TEST_FILE" "$testdir/test-content.txt"
    prompt="read ./test-content.txt and output its contents exactly"
  else
    echo "hello" > "$testdir/hello.txt"
    prompt="Immediately use the shell tool to run 'ls'. Do not ask for confirmation."
  fi

  # Run the test and capture output. The subshell keeps the exports and the
  # directory change from affecting the caller.
  (
    export GOOSE_PROVIDER="$provider"
    export GOOSE_MODEL="$model"
    cd "$testdir" && "$SCRIPT_DIR/target/debug/goose" run --text "$prompt" --with-builtin "$BUILTINS"
  ) > "$output_file" 2>&1

  # Check result: agentic providers return text containing the test content
  # instead of producing tool-call log patterns
  local verdict="failure"
  if [ "$agentic" = true ]; then
    if grep -qiF -- "$TEST_CONTENT" "$output_file"; then
      verdict="success"
    fi
  elif grep -qE -- "$SUCCESS_PATTERN" "$output_file"; then
    verdict="success"
  fi
  echo "$verdict" > "$result_file"

  rm -rf -- "${testdir:?}"
}

# Build list of all provider/model combinations
#
# Globals:   PROVIDERS, CODE_EXEC_MODE (read)
#            JOBS (written) - array of "provider|model|index" entries
#            job_index (written) - number of entries added
# Outputs:   one line on stdout for every provider that is skipped
build_jobs() {
  local provider_config provider models_str model
  local -a models

  JOBS=()
  job_index=0
  for provider_config in "${PROVIDERS[@]}"; do
    provider="${provider_config%% -> *}"
    models_str="${provider_config#* -> }"

    # Skip provider if it's in SKIP_PROVIDERS
    if should_skip_provider "$provider"; then
      echo "⊘ Skipping provider: ${provider} (SKIP_PROVIDERS)"
      continue
    fi

    # Agentic providers don't use goose's code_execution system
    if [ "$CODE_EXEC_MODE" = true ] && is_agentic_provider "$provider"; then
      echo "⊘ Skipping agentic provider in code_exec mode: ${provider}"
      continue
    fi

    IFS='|' read -ra models <<< "$models_str"
    for model in "${models[@]}"; do
      JOBS+=("$provider|$model|$job_index")
      # Plain assignment: ((job_index++)) returns status 1 when the old value is 0.
      job_index=$((job_index + 1))
    done
  done
}
build_jobs

total_jobs=${#JOBS[@]}
echo "Starting $total_jobs tests..."
echo ""

# Run first test sequentially if any jobs exist
# NOTE(review): presumably this lets one-time setup finish before the parallel
# fan-out; the reason is not stated here, so confirm before changing it.
if (( total_jobs > 0 )); then
  echo "Running first test sequentially..."
  first_job="${JOBS[0]}"
  IFS='|' read -r provider model idx <<< "$first_job"

  result_file="$RESULTS_DIR/result_$idx"
  output_file="$RESULTS_DIR/output_$idx"
  meta_file="$RESULTS_DIR/meta_$idx"
  echo "$provider|$model" > "$meta_file"

  # Run first test and wait for it to complete
  run_test "$provider" "$model" "$result_file" "$output_file"
  echo "First test completed."
  echo ""
fi

# Run remaining tests in parallel
if (( total_jobs > 1 )); then
  echo "Running remaining tests in parallel..."

  # 'wait -n' (reap one finished job) exists only in bash 4.3 and newer.
  can_wait_for_one=false
  if (( BASH_VERSINFO[0] > 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 3) )); then
    can_wait_for_one=true
  fi

  running_jobs=0
  for ((i = 1; i < total_jobs; i++)); do
    job="${JOBS[$i]}"
    IFS='|' read -r provider model idx <<< "$job"

    result_file="$RESULTS_DIR/result_$idx"
    output_file="$RESULTS_DIR/output_$idx"
    meta_file="$RESULTS_DIR/meta_$idx"
    echo "$provider|$model" > "$meta_file"

    # Run test in background
    run_test "$provider" "$model" "$result_file" "$output_file" &
    running_jobs=$((running_jobs + 1))

    # Wait if we've hit the parallel limit
    if (( running_jobs >= MAX_PARALLEL )); then
      if [ "$can_wait_for_one" = true ]; then
        wait -n
        running_jobs=$((running_jobs - 1))
      else
        # A plain 'wait' drains every background job, so the counter restarts
        # at zero; decrementing by one would serialize all later jobs.
        wait
        running_jobs=0
      fi
    fi
  done

  # Wait for all remaining jobs
  wait
fi

echo ""
echo "=== Test Results ==="
echo ""

# Collect results
# RESULTS holds one summary line per job; HARD_FAILURES holds the failures
# that are not covered by the allowed-failures list and fail the run.
RESULTS=()
HARD_FAILURES=()
flaky_failures=0

for job in "${JOBS[@]}"; do
  IFS='|' read -r provider model idx <<< "$job"

  result_file="$RESULTS_DIR/result_$idx"
  output_file="$RESULTS_DIR/output_$idx"

  echo "Provider: $provider"
  echo "Model: $model"
  echo ""
  # The output file is absent if the test never got as far as running goose.
  if [ -f "$output_file" ]; then
    cat "$output_file"
  else
    echo "(no output captured)"
  fi
  echo ""

  # A missing result file counts as a failure.
  if [ -f "$result_file" ] && [ "$(cat "$result_file")" = "success" ]; then
    echo "✓ SUCCESS: Test passed - $SUCCESS_MSG"
    RESULTS+=("✓ ${provider}: ${model}")
  elif is_allowed_failure "$provider" "$model"; then
    echo "⚠ FLAKY: Test failed but model is in allowed failures list - $FAILURE_MSG"
    RESULTS+=("⚠ ${provider}: ${model} (flaky)")
    flaky_failures=$((flaky_failures + 1))
  else
    echo "✗ FAILED: Test failed - $FAILURE_MSG"
    RESULTS+=("✗ ${provider}: ${model}")
    HARD_FAILURES+=("${provider}: ${model}")
  fi
  echo "---"
done

echo ""
echo "=== Test Summary ==="
for result in "${RESULTS[@]}"; do
  echo "$result"
done

# Exit status: 1 if any hard failure occurred, otherwise the script falls
# through to its normal (zero) exit.
if (( ${#HARD_FAILURES[@]} > 0 )); then
  echo ""
  echo "Hard failures (${#HARD_FAILURES[@]}):"
  for failure in "${HARD_FAILURES[@]}"; do
    echo " - $failure"
  done
  echo ""
  echo "Some tests failed!"
  exit 1
fi

echo ""
if (( flaky_failures > 0 )); then
  echo "All required tests passed! (some flaky tests failed but are allowed)"
else
  echo "All tests passed!"
fi