// File: Pulse/internal/ai/eval/runner_additional_test.go
// Additional runner test coverage: SSE parsing, approval decisions, retry
// logic, env helpers, report writing, and preflight checks.

package eval
import (
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"strings"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestRunner_ParseSSEStream_Comprehensive tests the parsing of various SSE events and scenarios
// TestRunner_ParseSSEStream_Comprehensive exercises parseSSEStream across a range
// of SSE event sequences: plain content, tool calls, approvals, error events,
// raw-string errors, and malformed JSON lines.
func TestRunner_ParseSSEStream_Comprehensive(t *testing.T) {
	r := &Runner{config: DefaultConfig()}

	cases := []struct {
		name          string
		body          string
		wantContent   string
		wantTools     int
		wantApprovals int
		wantErr       bool
		errSubstr     string
	}{
		{
			name: "Standard flow with content and done",
			body: `data: {"type":"content","data":{"text":"Hello"}}
data: {"type":"content","data":{"text":" World"}}
data: {"type":"done","data":{"session_id":"sess-1","input_tokens":10,"output_tokens":5}}
`,
			wantContent: "Hello World",
		},
		{
			name: "Tool call flow",
			body: `data: {"type":"tool_start","data":{"id":"call-1","name":"test_tool","input":"{\"arg\":\"val\"}"}}
data: {"type":"tool_end","data":{"id":"call-1","name":"test_tool","output":"result","success":true}}
data: {"type":"done","data":{}}
`,
			wantTools: 1,
		},
		{
			name: "Interleaved content and tools",
			body: `data: {"type":"content","data":{"text":"Using tool..."}}
data: {"type":"tool_start","data":{"id":"call-1","name":"t1","input":""}}
data: {"type":"tool_end","data":{"id":"call-1","name":"t1","output":"ok","success":true}}
data: {"type":"content","data":{"text":" Done."}}
data: {"type":"done","data":{}}
`,
			wantContent: "Using tool... Done.",
			wantTools:   1,
		},
		{
			name: "Approval needed event",
			body: `data: {"type":"approval_needed","data":{"approval_id":"app-1","tool_id":"call-2","tool_name":"dangerous_tool","risk":"high"}}
data: {"type":"done","data":{}}
`,
			wantApprovals: 1,
		},
		{
			name: "Stream error event",
			body: `data: {"type":"error","data":{"message":"something went wrong"}}
`,
			wantErr:   true,
			errSubstr: "something went wrong",
		},
		{
			name: "Raw string error event",
			body: `data: {"type":"error","data":"raw error message"}
`,
			wantErr:   true,
			errSubstr: "raw error message",
		},
		{
			name: "Malformed JSON (should be ignored/handled gracefully)",
			body: `data: {invalid-json}
data: {"type":"content","data":{"text":"Still working"}}
data: {"type":"done","data":{}}
`,
			wantContent: "Still working",
		},
		{
			name: "Multiple tool calls",
			body: `data: {"type":"tool_start","data":{"id":"1","name":"t1","input":"i1"}}
data: {"type":"tool_start","data":{"id":"2","name":"t2","input":"i2"}}
data: {"type":"tool_end","data":{"id":"1","output":"o1","success":true}}
data: {"type":"tool_end","data":{"id":"2","output":"o2","success":false}}
data: {"type":"done","data":{}}
`,
			wantTools: 2,
		},
	}

	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			_, tools, approvals, content, _, _, _, _, err := r.parseSSEStream(
				strings.NewReader(tt.body),
				ApprovalNone,
				"",
			)
			if tt.wantErr {
				require.Error(t, err)
				if tt.errSubstr != "" {
					assert.Contains(t, err.Error(), tt.errSubstr)
				}
				return
			}
			require.NoError(t, err)
			assert.Equal(t, tt.wantContent, content)
			assert.Len(t, tools, tt.wantTools)
			assert.Len(t, approvals, tt.wantApprovals)
		})
	}
}
// TestRunner_HandleApprovalDecision verifies approve/deny/ignore handling against
// a mock approvals API, including basic-auth enforcement, the deny reason
// payload, and propagation of server errors.
func TestRunner_HandleApprovalDecision(t *testing.T) {
	// Mock server emulating the approvals API endpoints.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The runner authenticates with basic auth; reject anything else.
		u, p, ok := r.BasicAuth()
		if !ok || u != "admin" || p != "admin" {
			w.WriteHeader(http.StatusUnauthorized)
			return
		}
		if strings.HasSuffix(r.URL.Path, "/approve") {
			assert.Equal(t, http.MethodPost, r.Method)
			w.WriteHeader(http.StatusOK)
			return
		}
		if strings.HasSuffix(r.URL.Path, "/deny") {
			assert.Equal(t, http.MethodPost, r.Method)
			// The deny request must carry the caller-supplied reason. The previous
			// handler returned 200 regardless, so a missing or wrong reason was
			// silently accepted; now the payload is actually asserted.
			// (assert, not require: FailNow must not be called off the test goroutine.)
			var payload map[string]string
			if assert.NoError(t, json.NewDecoder(r.Body).Decode(&payload)) {
				assert.Equal(t, "unsafe", payload["reason"])
			}
			w.WriteHeader(http.StatusOK)
			return
		}
		w.WriteHeader(http.StatusNotFound)
	}))
	defer server.Close()

	cfg := DefaultConfig()
	cfg.BaseURL = server.URL
	runner := NewRunner(cfg)

	// Approve succeeds against the mock endpoint.
	err := runner.handleApprovalDecision(ApprovalApprove, "app-123", "")
	require.NoError(t, err)
	// Deny succeeds; the handler above asserts the reason payload.
	err = runner.handleApprovalDecision(ApprovalDeny, "app-456", "unsafe")
	require.NoError(t, err)
	// None is a no-op and must not error.
	err = runner.handleApprovalDecision(ApprovalNone, "app-789", "")
	require.NoError(t, err)

	// A 5xx response must surface as an error that mentions the status code.
	failServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
		_, _ = w.Write([]byte("oops")) // best-effort body; error irrelevant in a mock
	}))
	defer failServer.Close()

	failRunner := NewRunner(DefaultConfig())
	failRunner.config.BaseURL = failServer.URL
	err = failRunner.handleApprovalDecision(ApprovalApprove, "app-fail", "")
	require.Error(t, err)
	assert.Contains(t, err.Error(), "status 500")
}
// TestRunner_RetryLogic_Advanced drives shouldRetryStep through the retry
// decision matrix: rate limits, stream failures, phantom responses, explicit
// tool requests, and retryable vs. non-retryable tool errors.
func TestRunner_RetryLogic_Advanced(t *testing.T) {
	r := &Runner{config: DefaultConfig()}
	// Turn every retry category on so each branch is reachable.
	r.config.RetryOnRateLimit = true
	r.config.RetryOnPhantom = true
	r.config.RetryOnToolErrors = true
	r.config.RetryOnExplicitTool = true

	cases := []struct {
		name       string
		result     *StepResult
		step       Step
		wantRetry  bool
		wantReason string
	}{
		{
			name:       "Rate Limit Error",
			result:     &StepResult{Error: fmt.Errorf("HTTP 429 Too Many Requests")},
			wantRetry:  true,
			wantReason: "rate_limit",
		},
		{
			name:       "Stream Parse Error",
			result:     &StepResult{Error: fmt.Errorf("failed to parse SSE stream: unexpected EOF")},
			wantRetry:  true,
			wantReason: "stream_error",
		},
		{
			name: "Phantom Detection (Phantom content, no successful tools)",
			result: &StepResult{
				Content:   "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request",
				ToolCalls: []ToolCallEvent{{Success: false}},
			},
			wantRetry:  true,
			wantReason: "phantom_detection",
		},
		{
			name: "Phantom Detection (Phantom content, BUT has successful tools - No Retry)",
			result: &StepResult{
				Content:   "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request",
				ToolCalls: []ToolCallEvent{{Success: true}},
			},
			wantRetry: false,
		},
		{
			name: "Explicit Tool Requested but Missing",
			step: Step{Prompt: "Please use pulse_read to check files"},
			result: &StepResult{
				Content:   "some content",
				ToolCalls: []ToolCallEvent{},
			},
			wantRetry:  true,
			wantReason: "no_tool_calls_for_explicit_tool",
		},
		{
			name: "Tool Error (Retryable: timeout)",
			result: &StepResult{
				Content:   "some content",
				ToolCalls: []ToolCallEvent{{Success: false, Output: "context deadline exceeded"}},
			},
			wantRetry:  true,
			wantReason: "tool_error",
		},
		{
			name: "Tool Error (Non-Retryable: routing mismatch)",
			result: &StepResult{
				Content:   "some content",
				ToolCalls: []ToolCallEvent{{Success: false, Output: "routing_mismatch detected"}},
			},
			wantRetry: false,
		},
	}

	for _, tt := range cases {
		t.Run(tt.name, func(t *testing.T) {
			gotRetry, gotReason := r.shouldRetryStep(tt.result, tt.step)
			assert.Equal(t, tt.wantRetry, gotRetry)
			if tt.wantRetry {
				assert.Equal(t, tt.wantReason, gotReason)
			}
		})
	}
}
// TestIsRateLimitError checks classification of rate-limit-shaped error strings.
func TestIsRateLimitError(t *testing.T) {
	cases := map[string]bool{
		"429 Too Many Requests": true,
		"Quota exceeded":        true,
		"Retry-After: 30":       true,
		"Generic error":         false,
	}
	for msg, want := range cases {
		assert.Equal(t, want, isRateLimitError(msg), msg)
	}
}
func TestHasRetryableToolError(t *testing.T) {
// Retryable
assert.True(t, hasRetryableToolError([]ToolCallEvent{
{Success: false, Output: "connection refused"},
}))
assert.True(t, hasRetryableToolError([]ToolCallEvent{
{Success: false, Output: "502 Bad Gateway"},
}))
// Non-retryable has precedence
assert.False(t, hasRetryableToolError([]ToolCallEvent{
{Success: false, Output: "read_only_violation"},
}))
// Success means no retry needed for that tool
assert.False(t, hasRetryableToolError([]ToolCallEvent{
{Success: true, Output: "success"},
}))
}
// TestRunner_ExecuteStep_FullFlow runs a single step against a mock SSE server
// and checks content, session ID, and success are captured. Verbose mode is on
// to cover the logging paths.
func TestRunner_ExecuteStep_FullFlow(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")
		fmt.Fprint(w, "data: {\"type\":\"content\",\"data\":{\"text\":\"Step done\"}}\n\n")
		fmt.Fprint(w, "data: {\"type\":\"done\",\"data\":{\"session_id\":\"s-1\"}}\n\n")
	}))
	defer server.Close()

	r := NewRunner(DefaultConfig())
	r.config.BaseURL = server.URL
	r.config.Verbose = true // exercise verbose logging paths

	got := r.executeStep(Step{Name: "Test Step", Prompt: "Run"}, "")

	assert.True(t, got.Success)
	assert.Equal(t, "Step done", got.Content)
	assert.Equal(t, "s-1", got.SessionID)
}
// TestRunner_ExecuteStep_RetryFlow forces a 429 on the first request and
// verifies executeStep retries, succeeds, and records the rate_limit retry.
func TestRunner_ExecuteStep_RetryFlow(t *testing.T) {
	var attempts int
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		attempts++
		switch attempts {
		case 1:
			// First attempt: rate limited.
			w.WriteHeader(http.StatusTooManyRequests)
			w.Write([]byte("Rate limit exceeded"))
		default:
			// Subsequent attempts succeed.
			w.Header().Set("Content-Type", "text/event-stream")
			fmt.Fprintf(w, "data: {\"type\":\"content\",\"data\":{\"text\":\"Success\"}}\n\n")
			fmt.Fprintf(w, "data: {\"type\":\"done\",\"data\":{}}\n\n")
		}
	}))
	defer server.Close()

	r := NewRunner(DefaultConfig())
	r.config.BaseURL = server.URL
	r.config.StepRetries = 2
	r.config.RetryOnRateLimit = true
	r.config.RateLimitCooldown = 1 * time.Millisecond // keep the test fast

	got := r.executeStep(Step{Name: "Retry Step", Prompt: "Go"}, "")

	// Eventually succeeds, with the retry recorded and attributed.
	assert.True(t, got.Success)
	assert.Greater(t, got.Retries, 0)
	assert.Contains(t, got.RetryNotes, "rate_limit")
}
// TestRunner_WriteReport writes a scenario report into a temp dir and checks
// the path naming convention and that the report contains the scenario name.
func TestRunner_WriteReport(t *testing.T) {
	dir := t.TempDir()

	cfg := DefaultConfig()
	cfg.ReportDir = dir
	cfg.Model = "test-model"
	r := NewRunner(cfg)

	scenario := ScenarioResult{
		ScenarioName: "Test Report Scenario",
		Passed:       true,
		Steps:        []StepResult{{StepName: "Step 1", Success: true}},
	}

	path, err := r.writeReport(scenario)
	require.NoError(t, err)

	// Path is under the report dir and follows the slug naming scheme.
	assert.Contains(t, path, dir)
	assert.Contains(t, path, "eval-test-report-scenario-test-model")
	assert.FileExists(t, path)

	// The written report mentions the scenario by name.
	data, err := os.ReadFile(path)
	require.NoError(t, err)
	assert.Contains(t, string(data), "Test Report Scenario")
}
func TestEnvHelpers(t *testing.T) {
// Bool
os.Setenv("TEST_BOOL_TRUE", "true")
os.Setenv("TEST_BOOL_1", "1")
os.Setenv("TEST_BOOL_FALSE", "false")
defer os.Unsetenv("TEST_BOOL_TRUE")
defer os.Unsetenv("TEST_BOOL_1")
defer os.Unsetenv("TEST_BOOL_FALSE")
val, ok := envBool("TEST_BOOL_TRUE")
assert.True(t, ok)
assert.True(t, val)
val, ok = envBool("TEST_BOOL_1")
assert.True(t, ok)
assert.True(t, val)
val, ok = envBool("TEST_BOOL_FALSE")
assert.True(t, ok)
assert.False(t, val)
_, ok = envBool("NON_EXISTENT")
assert.False(t, ok)
// Int
os.Setenv("TEST_INT", "123")
os.Setenv("TEST_INT_BAD", "abc")
defer os.Unsetenv("TEST_INT")
defer os.Unsetenv("TEST_INT_BAD")
iVal, ok := envInt("TEST_INT")
assert.True(t, ok)
assert.Equal(t, 123, iVal)
_, ok = envInt("TEST_INT_BAD")
assert.False(t, ok)
// Float
os.Setenv("TEST_FLOAT", "123.456")
defer os.Unsetenv("TEST_FLOAT")
fVal, ok := envFloat("TEST_FLOAT")
assert.True(t, ok)
assert.InDelta(t, 123.456, fVal, 0.001)
// String
os.Setenv("TEST_STRING", "val")
defer os.Unsetenv("TEST_STRING")
sVal, ok := envString("TEST_STRING")
assert.True(t, ok)
assert.Equal(t, "val", sVal)
}
func TestApplyEvalEnvOverrides_Comprehensive(t *testing.T) {
// Set all env vars
envVars := map[string]string{
"EVAL_HTTP_TIMEOUT": "60",
"EVAL_STEP_RETRIES": "3",
"EVAL_RETRY_ON_PHANTOM": "true",
"EVAL_RETRY_ON_EXPLICIT_TOOL": "true",
"EVAL_RETRY_ON_STREAM_FAILURE": "false",
"EVAL_RETRY_ON_EMPTY_RESPONSE": "false",
"EVAL_RETRY_ON_TOOL_ERRORS": "true",
"EVAL_RETRY_ON_RATE_LIMIT": "true",
"EVAL_RATE_LIMIT_COOLDOWN": "5",
"EVAL_PREFLIGHT": "true",
"EVAL_PREFLIGHT_TIMEOUT": "10",
"EVAL_MODEL": "gpt-4",
"EVAL_REPORT_DIR": "/tmp/reports",
}
for k, v := range envVars {
os.Setenv(k, v)
defer os.Unsetenv(k)
}
cfg := Config{} // Empty config
applyEvalEnvOverrides(&cfg)
assert.Equal(t, 60*time.Second, cfg.RequestTimeout)
assert.Equal(t, 3, cfg.StepRetries)
assert.True(t, cfg.RetryOnPhantom)
assert.True(t, cfg.RetryOnExplicitTool)
assert.False(t, cfg.RetryOnStreamFailure)
assert.False(t, cfg.RetryOnEmptyResponse)
assert.True(t, cfg.RetryOnToolErrors)
assert.True(t, cfg.RetryOnRateLimit)
assert.Equal(t, 5*time.Second, cfg.RateLimitCooldown)
assert.True(t, cfg.Preflight)
assert.Equal(t, 10*time.Second, cfg.PreflightTimeout)
assert.Equal(t, "gpt-4", cfg.Model)
assert.Equal(t, "/tmp/reports", cfg.ReportDir)
}
// TestRunner_RunPreflight verifies a preflight against a responsive mock
// server succeeds and captures non-empty content.
func TestRunner_RunPreflight(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/event-stream")
		fmt.Fprint(w, "data: {\"type\":\"content\",\"data\":{\"text\":\"Hello\"}}\n\n")
		fmt.Fprint(w, "data: {\"type\":\"done\",\"data\":{}}\n\n")
	}))
	defer server.Close()

	cfg := DefaultConfig()
	cfg.BaseURL = server.URL
	cfg.PreflightTimeout = 1 * time.Second

	got := NewRunner(cfg).runPreflight()

	assert.True(t, got.Success)
	assert.Equal(t, "Preflight", got.StepName)
	assert.NotEmpty(t, got.Content)
}
// TestRunner_RunPreflight_Fail verifies that a stream which completes without
// any content is treated as a preflight failure.
func TestRunner_RunPreflight_Fail(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Well-formed stream, but no content events: preflight must reject it.
		w.Header().Set("Content-Type", "text/event-stream")
		fmt.Fprint(w, "data: {\"type\":\"done\",\"data\":{}}\n\n")
	}))
	defer server.Close()

	cfg := DefaultConfig()
	cfg.BaseURL = server.URL

	got := NewRunner(cfg).runPreflight()

	assert.False(t, got.Success)
	assert.Error(t, got.Error)
	assert.Contains(t, got.Error.Error(), "preflight returned empty response")
}