// Source: Pulse/internal/ai/eval/eval.go (1169 lines, 30 KiB, Go)

// Package eval provides an evaluation framework for testing Pulse Assistant
// behavior end-to-end. It sends prompts to the live API and captures the
// full trace of tool calls, FSM transitions, and responses for verification.
package eval
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
)
// Config holds eval runner configuration.
//
// NewRunner passes the value through applyEvalEnvOverrides, so EVAL_*
// environment variables can override several of these fields.
type Config struct {
// BaseURL is the scheme and host of the Pulse API; request paths are appended to it.
BaseURL string // e.g., "http://127.0.0.1:7655"
// Username and Password are sent as HTTP basic auth on every request.
Username string
Password string
// Verbose enables progress and per-step result printing on stdout.
Verbose bool
// Model, when non-blank, is sent as the "model" field of each chat request.
Model string
// HTTP timeout for each chat request.
RequestTimeout time.Duration
// Retry behavior for transient eval failures.
// StepRetries is the maximum number of extra attempts per step; negative
// values are treated as 0 by executeStep.
StepRetries int
// The RetryOn* flags select which conditions shouldRetryStep treats as retryable.
RetryOnPhantom bool
RetryOnExplicitTool bool
RetryOnStreamFailure bool
RetryOnEmptyResponse bool
RetryOnToolErrors bool
RetryOnRateLimit bool
// RateLimitCooldown, when positive, is slept before retrying a rate-limited step.
RateLimitCooldown time.Duration
// Optional preflight to fail fast when SSE hangs.
Preflight bool
PreflightTimeout time.Duration
// Optional report output directory (JSON per scenario).
// An empty value disables report writing.
ReportDir string
}
// DefaultConfig returns the configuration used for local development: a Pulse
// instance on 127.0.0.1:7655 with admin/admin credentials, verbose output,
// two retries per step and every retry trigger except rate limiting enabled.
func DefaultConfig() Config {
	var cfg Config

	// Target and credentials.
	cfg.BaseURL = "http://127.0.0.1:7655"
	cfg.Username = "admin"
	cfg.Password = "admin"
	cfg.Verbose = true

	// Timeouts.
	cfg.RequestTimeout = 5 * time.Minute
	cfg.PreflightTimeout = 15 * time.Second

	// Retry policy.
	cfg.StepRetries = 2
	cfg.RetryOnPhantom = true
	cfg.RetryOnExplicitTool = true
	cfg.RetryOnStreamFailure = true
	cfg.RetryOnEmptyResponse = true
	cfg.RetryOnToolErrors = true

	// Rate-limit retries and the preflight probe stay off by default.
	cfg.RetryOnRateLimit = false
	cfg.RateLimitCooldown = 0
	cfg.Preflight = false

	return cfg
}
// Runner executes eval scenarios against the Pulse API.
type Runner struct {
// config is the effective configuration after environment overrides were applied by NewRunner.
config Config
// client is the HTTP client used for chat, settings and approval requests.
client *http.Client
}
// NewRunner builds a Runner from config after applying EVAL_* environment
// overrides. The HTTP client timeout is config.RequestTimeout, or five
// minutes when that value is not positive.
func NewRunner(config Config) *Runner {
	applyEvalEnvOverrides(&config)

	// Long timeout for AI responses.
	httpClient := &http.Client{Timeout: config.RequestTimeout}
	if httpClient.Timeout <= 0 {
		httpClient.Timeout = 5 * time.Minute
	}

	return &Runner{config: config, client: httpClient}
}
// StepResult captures the result of a single eval step.
//
// NOTE(review): Error is an interface value. When a result is serialized by
// writeReport, encoding/json encodes errors built with fmt.Errorf as {} because
// they expose no exported fields, so the message is not preserved in reports.
type StepResult struct {
// StepName and Prompt are copied from the Step that produced this result.
StepName string
Prompt string
// SessionID is the chat session reported by the stream; RunScenario overwrites
// it with the scenario-wide session ID.
SessionID string
// Model and the token counts are taken from the stream's "complete"/"done" events.
Model string
InputTokens int
OutputTokens int
// Success is false when the request failed, Error is set, or any assertion failed.
Success bool
Error error
Duration time.Duration
// Retries is the number of recorded retry reasons; RetryNotes lists them in order.
Retries int
RetryNotes []string
ToolCalls []ToolCallEvent
Approvals []ApprovalEvent
// Content is the concatenation of all "content" event texts.
Content string
// RawEvents holds every SSE event that decoded as a JSON envelope.
RawEvents []SSEEvent
Assertions []AssertionResult
}
// ToolCallEvent represents a tool call captured during execution.
// It is assembled from "tool_start" and "tool_end" stream events.
type ToolCallEvent struct {
// ID is the key used to pair a tool_start with its tool_end.
ID string
Name string
Input string
// Output and Success are taken from the tool_end event.
Output string
Success bool
}
// ApprovalEvent represents an approval request captured during execution.
// Fields mirror the payload of an "approval_needed" stream event.
type ApprovalEvent struct {
// ApprovalID identifies the request when approving or denying it.
ApprovalID string
ToolID string
ToolName string
Command string
Risk string
Description string
}
// SSEEvent represents a raw SSE event from the stream.
type SSEEvent struct {
// Type is the envelope's "type" field.
Type string
// Data is the undecoded "data" payload; for a "complete" event without a
// data field it holds the whole envelope instead.
Data json.RawMessage
}
// AssertionResult captures the result of a single assertion.
type AssertionResult struct {
// Name and Message are printed in step results and failure summaries.
Name string
// Passed being false marks the owning step and scenario as failed.
Passed bool
Message string
}
// ScenarioResult captures the result of a full scenario.
type ScenarioResult struct {
ScenarioName string
// Steps holds the executed steps in order, preceded by the preflight step
// when preflight is enabled. Steps after the first failure are not run.
Steps []StepResult
Passed bool
Duration time.Duration
// ReportPath is the JSON report location, or empty when no report was written.
ReportPath string
}
// StepMention represents a structured mention attached to a step.
// When present, these are sent alongside the prompt so the backend
// can resolve resources without discovery tool calls.
// The values are serialized as the "mentions" array of the chat request body.
type StepMention struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
// Node is omitted from the JSON when empty.
Node string `json:"node,omitempty"`
}
// Step defines a single step in an eval scenario.
type Step struct {
Name string
// Prompt is the text sent as the "prompt" field of the chat request.
Prompt string
Mentions []StepMention // optional structured mentions
// Assertions run in order after the step executes; each receives the step result.
Assertions []Assertion
// ApprovalDecision selects the automatic response to approval requests
// raised during this step; ApprovalReason is sent along with a denial.
ApprovalDecision ApprovalDecision
ApprovalReason string
}
// ApprovalDecision controls how eval handles approval requests during a step.
type ApprovalDecision string
const (
// ApprovalNone leaves approval requests unanswered (they are still recorded).
ApprovalNone ApprovalDecision = ""
// ApprovalApprove posts to the approval's /approve endpoint.
ApprovalApprove ApprovalDecision = "approve"
// ApprovalDeny posts to the approval's /deny endpoint, with the reason when one is set.
ApprovalDeny ApprovalDecision = "deny"
)
// Assertion defines a check to run after a step.
// It receives a pointer to the step's result and reports whether the check passed.
type Assertion func(result *StepResult) AssertionResult
// Scenario defines a multi-step eval scenario.
type Scenario struct {
// Name labels the scenario in summaries and report filenames.
Name string
Description string
// Steps run in order and share the session established by the first step.
Steps []Step
}
// RunScenario executes the scenario's steps in order and returns the combined
// result.
//
// When preflight is enabled a probe request runs first and is recorded as the
// first step; if it fails the scenario stops there. The session ID reported by
// the first step that yields one is reused for every later step. After each
// step its assertions are evaluated, and execution stops at the first step
// whose Success flag is false. A report is written when a report directory is
// configured; a failure to write it is ignored and leaves ReportPath empty.
func (r *Runner) RunScenario(scenario Scenario) ScenarioResult {
	began := time.Now()
	outcome := ScenarioResult{ScenarioName: scenario.Name, Passed: true}

	// finalize stamps the elapsed time and attaches the report path (best effort).
	finalize := func() ScenarioResult {
		outcome.Duration = time.Since(began)
		if path, err := r.writeReport(outcome); err == nil {
			outcome.ReportPath = path
		}
		return outcome
	}

	if r.config.Preflight {
		probe := r.runPreflight()
		outcome.Steps = append(outcome.Steps, probe)
		if !probe.Success {
			outcome.Passed = false
			return finalize()
		}
	}

	activeSession := ""
	for idx, step := range scenario.Steps {
		if r.config.Verbose {
			fmt.Printf("\n=== Step %d: %s ===\n", idx+1, step.Name)
			fmt.Printf("Prompt: %s\n", step.Prompt)
		}

		current := r.executeStep(step, activeSession)

		// The first session ID seen becomes the session for the rest of the scenario.
		if activeSession == "" && current.SessionID != "" {
			activeSession = current.SessionID
		}
		current.SessionID = activeSession

		for _, check := range step.Assertions {
			verdict := check(&current)
			current.Assertions = append(current.Assertions, verdict)
			if verdict.Passed {
				continue
			}
			current.Success = false
			outcome.Passed = false
		}

		if current.Error != nil {
			current.Success = false
			outcome.Passed = false
		}

		if r.config.Verbose {
			r.printStepResult(&current)
		}
		outcome.Steps = append(outcome.Steps, current)

		// Later steps depend on earlier ones, so stop at the first failure.
		if !current.Success {
			break
		}
	}

	return finalize()
}
// executeStep runs step with the configured retry budget, treating a negative
// StepRetries as zero.
func (r *Runner) executeStep(step Step, sessionID string) StepResult {
	if budget := r.config.StepRetries; budget > 0 {
		return r.executeStepWithRetry(step, sessionID, budget)
	}
	return r.executeStepWithRetry(step, sessionID, 0)
}
// executeStepWithRetry runs step up to retries+1 times and returns the result
// of the last attempt made.
//
// After each attempt shouldRetryStep decides whether the failure looks
// transient. The loop ends when it does not, or when the retry budget is used
// up. The returned result carries the reasons for every retry in RetryNotes
// and their count in Retries. A "rate_limit" retry first sleeps for the
// configured cooldown when one is set.
func (r *Runner) executeStepWithRetry(step Step, sessionID string, retries int) StepResult {
	if retries < 0 {
		retries = 0
	}

	var notes []string
	for attempt := 0; ; attempt++ {
		outcome := r.executeStepOnce(step, sessionID)

		retry, why := r.shouldRetryStep(&outcome, step)
		if attempt == retries || !retry {
			outcome.Retries = len(notes)
			outcome.RetryNotes = notes
			return outcome
		}

		if why != "" {
			notes = append(notes, why)
		}

		if cooldown := r.config.RateLimitCooldown; why == "rate_limit" && cooldown > 0 {
			if r.config.Verbose {
				fmt.Printf("\n--- Rate limit cooldown (%s) before retry ---\n", r.config.RateLimitCooldown)
			}
			time.Sleep(cooldown)
		}

		if r.config.Verbose {
			fmt.Printf("\n--- Retrying step '%s' (attempt %d/%d) due to transient failure ---\n",
				step.Name, attempt+1, retries)
		}
	}
}
// executeStepOnce performs a single attempt of step using the runner's own
// HTTP client. It does not retry; see executeStepWithRetry for that.
func (r *Runner) executeStepOnce(step Step, sessionID string) StepResult {
return r.executeStepOnceWithClient(step, sessionID, r.client)
}
// executeStepOnceWithClient performs one chat request for step and folds the
// streamed response into a StepResult.
//
// The request is a POST to /api/ai/chat with basic auth. The body carries the
// prompt and, when set, the session ID, the configured model and the step's
// mentions. client is used for the request; the runner's own client is used
// when it is nil.
//
// Failures are reported through result.Error with Success set to false rather
// than as a separate error value. Duration is recorded on every exit path,
// including failures. The "failed to parse SSE stream" prefix is matched by
// shouldRetryStep and must stay stable.
func (r *Runner) executeStepOnceWithClient(step Step, sessionID string, client *http.Client) (result StepResult) {
	startTime := time.Now()
	result = StepResult{
		StepName:  step.Name,
		Prompt:    step.Prompt,
		SessionID: sessionID,
		Success:   true,
	}
	// result is a named return so the elapsed time is also visible to the
	// caller when the step fails early.
	defer func() {
		result.Duration = time.Since(startTime)
	}()

	// Build request
	reqBody := map[string]interface{}{
		"prompt": step.Prompt,
	}
	if sessionID != "" {
		reqBody["session_id"] = sessionID
	}
	if strings.TrimSpace(r.config.Model) != "" {
		reqBody["model"] = r.config.Model
	}
	if len(step.Mentions) > 0 {
		reqBody["mentions"] = step.Mentions
	}

	bodyBytes, err := json.Marshal(reqBody)
	if err != nil {
		result.Error = fmt.Errorf("failed to encode request body: %w", err)
		result.Success = false
		return result
	}

	req, err := http.NewRequest("POST", r.config.BaseURL+"/api/ai/chat", bytes.NewReader(bodyBytes))
	if err != nil {
		result.Error = fmt.Errorf("failed to create request: %w", err)
		result.Success = false
		return result
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "text/event-stream")
	req.SetBasicAuth(r.config.Username, r.config.Password)

	// Execute request
	if client == nil {
		client = r.client
	}
	resp, err := client.Do(req)
	if err != nil {
		result.Error = fmt.Errorf("request failed: %w", err)
		result.Success = false
		return result
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// The body is only used to enrich the message; a read error leaves it empty.
		body, _ := io.ReadAll(resp.Body)
		result.Error = fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body))
		result.Success = false
		return result
	}

	// Parse SSE stream. Partial data collected before a stream error is kept
	// on the result so it can still be inspected.
	result.RawEvents, result.ToolCalls, result.Approvals, result.Content, result.SessionID, result.Model, result.InputTokens, result.OutputTokens, err = r.parseSSEStream(resp.Body, step.ApprovalDecision, step.ApprovalReason)
	if err != nil {
		result.Error = fmt.Errorf("failed to parse SSE stream: %w", err)
		result.Success = false
		return result
	}

	return result
}
// runPreflight sends a trivial prompt with a dedicated client whose timeout is
// PreflightTimeout, so a hanging stream is detected quickly. A response with
// neither text content nor tool calls is reported as a failure.
func (r *Runner) runPreflight() StepResult {
	probe := r.executeStepOnceWithClient(
		Step{Name: "Preflight", Prompt: "Say hello."},
		"",
		&http.Client{Timeout: r.config.PreflightTimeout},
	)
	probe.StepName = "Preflight"

	// A request-level failure is already recorded on the result.
	if probe.Error != nil {
		return probe
	}
	if len(probe.ToolCalls) != 0 || strings.TrimSpace(probe.Content) != "" {
		return probe
	}

	probe.Error = fmt.Errorf("preflight returned empty response")
	probe.Success = false
	return probe
}
// shouldRetryStep reports whether result looks like a transient failure worth
// retrying, together with a short reason tag.
//
// Checks run in this order and the first match wins:
//  1. rate-limit errors: retried only when RetryOnRateLimit is set, otherwise
//     never retried, whatever the other flags say;
//  2. stream errors, when RetryOnStreamFailure is set;
//  3. the canned "wasn't able to access the infrastructure tools" reply with
//     no successful tool call, when RetryOnPhantom is set;
//  4. blank content with no successful tool call, when RetryOnEmptyResponse is set;
//  5. a prompt naming an explicit tool that produced no tool calls, when
//     RetryOnExplicitTool is set;
//  6. tool calls that all failed with a retryable-looking error, when
//     RetryOnToolErrors is set.
func (r *Runner) shouldRetryStep(result *StepResult, step Step) (bool, string) {
	if result == nil {
		return false, ""
	}

	if result.Error != nil {
		errText := result.Error.Error()
		if isRateLimitError(errText) {
			if r.config.RetryOnRateLimit {
				return true, "rate_limit"
			}
			return false, ""
		}
		// Retry on known transient errors (stream parse or phantom detection).
		if r.config.RetryOnStreamFailure {
			for _, marker := range []string{"token too long", "failed to parse SSE stream", "stream error"} {
				if strings.Contains(errText, marker) {
					return true, "stream_error"
				}
			}
		}
	}

	const phantomMessage = "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request"
	noCalls := len(result.ToolCalls) == 0
	anySucceeded := hasSuccessfulToolCallRetry(result.ToolCalls)

	switch {
	case r.config.RetryOnPhantom && !anySucceeded && strings.Contains(result.Content, phantomMessage):
		return true, "phantom_detection"
	case r.config.RetryOnEmptyResponse && strings.TrimSpace(result.Content) == "" && !anySucceeded:
		// Successful tool calls make empty content acceptable.
		return true, "empty_response"
	case r.config.RetryOnExplicitTool && noCalls && requiresExplicitTool(step.Prompt):
		return true, "no_tool_calls_for_explicit_tool"
	case r.config.RetryOnToolErrors && !noCalls && !anySucceeded && hasRetryableToolError(result.ToolCalls):
		return true, "tool_error"
	}
	return false, ""
}
// requiresExplicitTool reports whether prompt, compared case-insensitively,
// names one of the pulse_* tools or uses one of the generic phrases
// ("read-only tool", "read only tool", "control tool", "query tool").
func requiresExplicitTool(prompt string) bool {
	lowered := strings.ToLower(prompt)

	needles := [...]string{
		// Tool identifiers.
		"pulse_read",
		"pulse_control",
		"pulse_query",
		"pulse_discovery",
		"pulse_docker",
		"pulse_kubernetes",
		"pulse_metrics",
		"pulse_storage",
		// Generic phrases.
		"read-only tool",
		"read only tool",
		"control tool",
		"query tool",
	}
	for i := range needles {
		if strings.Contains(lowered, needles[i]) {
			return true
		}
	}
	return false
}
// applyEvalEnvOverrides mutates config in place from EVAL_* environment
// variables and fills in defaults. A nil config is a no-op.
//
// Behaviour worth knowing:
//   - The five retry flags other than RetryOnRateLimit end up true unless the
//     matching environment variable parses as a boolean; a false value set in
//     Config alone is therefore overridden.
//   - StepRetries of 0 becomes 1 unless EVAL_STEP_RETRIES is set.
//   - Timeouts and cooldowns are read as whole seconds and must be positive.
//   - EVAL_MODEL is only used when config.Model is blank.
func applyEvalEnvOverrides(config *Config) {
	if config == nil {
		return
	}

	// positiveSeconds reads key as a strictly positive number of seconds.
	positiveSeconds := func(key string) (time.Duration, bool) {
		n, ok := envInt(key)
		if !ok || n <= 0 {
			return 0, false
		}
		return time.Duration(n) * time.Second, true
	}
	// onUnlessEnv yields the parsed boolean for key, or true when it is unset or unparseable.
	onUnlessEnv := func(key string) bool {
		if v, ok := envBool(key); ok {
			return v
		}
		return true
	}

	if d, ok := positiveSeconds("EVAL_HTTP_TIMEOUT"); ok {
		config.RequestTimeout = d
	}

	if n, ok := envInt("EVAL_STEP_RETRIES"); ok {
		config.StepRetries = n
	} else if config.StepRetries == 0 {
		config.StepRetries = 1
	}

	config.RetryOnPhantom = onUnlessEnv("EVAL_RETRY_ON_PHANTOM")
	config.RetryOnExplicitTool = onUnlessEnv("EVAL_RETRY_ON_EXPLICIT_TOOL")
	config.RetryOnStreamFailure = onUnlessEnv("EVAL_RETRY_ON_STREAM_FAILURE")
	config.RetryOnEmptyResponse = onUnlessEnv("EVAL_RETRY_ON_EMPTY_RESPONSE")
	config.RetryOnToolErrors = onUnlessEnv("EVAL_RETRY_ON_TOOL_ERRORS")

	if v, ok := envBool("EVAL_RETRY_ON_RATE_LIMIT"); ok {
		config.RetryOnRateLimit = v
	}
	if d, ok := positiveSeconds("EVAL_RATE_LIMIT_COOLDOWN"); ok {
		config.RateLimitCooldown = d
	}

	if v, ok := envBool("EVAL_PREFLIGHT"); ok {
		config.Preflight = v
	}
	if d, ok := positiveSeconds("EVAL_PREFLIGHT_TIMEOUT"); ok {
		config.PreflightTimeout = d
	} else if config.PreflightTimeout == 0 {
		config.PreflightTimeout = 15 * time.Second
	}

	if strings.TrimSpace(config.Model) == "" {
		if model, ok := envString("EVAL_MODEL"); ok {
			config.Model = model
		}
	}
	if dir, ok := envString("EVAL_REPORT_DIR"); ok {
		config.ReportDir = dir
	}
}
// writeReport serializes result as indented JSON into the configured report
// directory and returns the path of the file written.
//
// It returns ("", nil) when the runner is nil or no report directory is
// configured. The directory is created with mode 0700 and the file with mode
// 0600. The filename has the form eval-<scenario>[-<model>]-<timestamp>.json,
// with the timestamp at one-second resolution, so two reports for the same
// scenario within the same second share a name and the later one overwrites
// the earlier.
func (r *Runner) writeReport(result ScenarioResult) (string, error) {
	if r == nil || r.config.ReportDir == "" {
		return "", nil
	}
	if err := os.MkdirAll(r.config.ReportDir, 0700); err != nil {
		return "", err
	}

	// One clock read feeds both the filename and generated_at so the two can
	// never disagree across a second boundary.
	now := time.Now()

	nameParts := []string{sanitizeFilename(result.ScenarioName)}
	if model := strings.TrimSpace(r.config.Model); model != "" {
		nameParts = append(nameParts, sanitizeFilename(model))
	}
	filename := fmt.Sprintf("eval-%s-%s.json", strings.Join(nameParts, "-"), now.Format("20060102-150405"))
	path := filepath.Join(r.config.ReportDir, filename)

	report := struct {
		GeneratedAt time.Time      `json:"generated_at"`
		BaseURL     string         `json:"base_url"`
		Username    string         `json:"username"`
		Model       string         `json:"model,omitempty"`
		Result      ScenarioResult `json:"result"`
	}{
		GeneratedAt: now,
		BaseURL:     r.config.BaseURL,
		Username:    r.config.Username,
		Model:       r.config.Model,
		Result:      result,
	}

	data, err := json.MarshalIndent(report, "", " ")
	if err != nil {
		return "", err
	}
	if err := os.WriteFile(path, data, 0600); err != nil {
		return "", err
	}
	return path, nil
}
// sanitizeFilename lower-cases and trims name, then replaces spaces, forward
// and back slashes and colons with hyphens so the result can be embedded in a
// file name. All other characters are passed through unchanged.
func sanitizeFilename(name string) string {
	hyphenate := strings.NewReplacer(
		" ", "-",
		"/", "-",
		"\\", "-",
		":", "-",
	)
	return hyphenate.Replace(strings.ToLower(strings.TrimSpace(name)))
}
// envBool reads key from the environment as a boolean.
//
// The second result is false when the variable is unset or its trimmed,
// lower-cased value is not one of the recognised spellings: 1/true/yes/y/on
// for true and 0/false/no/n/off for false.
func envBool(key string) (bool, bool) {
	raw, present := os.LookupEnv(key)
	if !present {
		return false, false
	}

	normalized := strings.ToLower(strings.TrimSpace(raw))
	for _, truthy := range [...]string{"1", "true", "yes", "y", "on"} {
		if normalized == truthy {
			return true, true
		}
	}
	for _, falsy := range [...]string{"0", "false", "no", "n", "off"} {
		if normalized == falsy {
			return false, true
		}
	}
	return false, false
}
// envInt reads key from the environment as a base-10 integer.
//
// The second result is false when the variable is unset or its trimmed value
// is not exactly one integer. Values with trailing characters such as "30s",
// "5m" or "1.5" are rejected rather than silently truncated to their leading
// digits.
func envInt(key string) (int, bool) {
	value, ok := os.LookupEnv(key)
	if !ok {
		return 0, false
	}

	var parsed int
	var trailing string
	// The extra %s verb consumes anything left after the integer, so exactly
	// one converted item means the whole value was a number. The error is
	// deliberately ignored: a clean parse ends with an unexpected-EOF error
	// from the unmatched %s, and the item count alone carries the answer.
	if n, _ := fmt.Sscanf(strings.TrimSpace(value), "%d%s", &parsed, &trailing); n != 1 {
		return 0, false
	}
	return parsed, true
}
// envFloat reads key from the environment as a floating-point number.
//
// The second result is false when the variable is unset or its trimmed value
// is not exactly one number. Values with trailing characters such as "1.5x"
// are rejected rather than silently truncated to their numeric prefix.
func envFloat(key string) (float64, bool) {
	value, ok := os.LookupEnv(key)
	if !ok {
		return 0, false
	}

	var parsed float64
	var trailing string
	// The extra %s verb consumes anything left after the number, so exactly
	// one converted item means the whole value was numeric. The error is
	// deliberately ignored: a clean parse ends with an unexpected-EOF error
	// from the unmatched %s, and the item count alone carries the answer.
	if n, _ := fmt.Sscanf(strings.TrimSpace(value), "%f%s", &parsed, &trailing); n != 1 {
		return 0, false
	}
	return parsed, true
}
// envString reads key from the environment and returns its value with
// surrounding whitespace removed. The second result is false when the
// variable is unset or blank.
func envString(key string) (string, bool) {
	if raw, present := os.LookupEnv(key); present {
		if cleaned := strings.TrimSpace(raw); cleaned != "" {
			return cleaned, true
		}
	}
	return "", false
}
// aiSettingsResponse is the subset of the GET /api/settings/ai response that
// the eval runner reads.
type aiSettingsResponse struct {
PatrolModel string `json:"patrol_model"`
}
// aiSettingsUpdateRequest is the body sent to PUT /api/settings/ai/update.
// PatrolModel is a pointer so a nil value omits the field, while a pointer to
// an empty string still sends it.
type aiSettingsUpdateRequest struct {
PatrolModel *string `json:"patrol_model,omitempty"`
}
// applyPatrolModelOverride switches the server's patrol model to the override
// chosen by patrolModelOverride and returns a function that restores the
// previous value.
//
// The returned function is nil (with a nil error) when there is nothing to
// undo: the runner is nil, no override is configured, or the server already
// uses the requested model. Callers must nil-check before calling it. The
// restore call uses a background context and discards its error.
func (r *Runner) applyPatrolModelOverride(ctx context.Context) (func(), error) {
	if r == nil {
		return nil, nil
	}

	wanted := r.patrolModelOverride()
	if wanted == "" {
		return nil, nil
	}

	settings, err := r.getAISettings(ctx)
	if err != nil {
		return nil, err
	}
	previous := settings.PatrolModel
	if previous == wanted {
		return nil, nil
	}

	err = r.updateAISettings(ctx, aiSettingsUpdateRequest{PatrolModel: &wanted})
	if err != nil {
		return nil, err
	}

	undo := func() {
		// Best effort: there is no caller left to report a failed restore to.
		_ = r.updateAISettings(context.Background(), aiSettingsUpdateRequest{PatrolModel: &previous})
	}
	return undo, nil
}
// patrolModelOverride returns the normalized patrol model to force during an
// eval run: EVAL_PATROL_MODEL when set and non-blank, otherwise the runner's
// configured model. The result is empty when neither is set.
func (r *Runner) patrolModelOverride() string {
	candidate, fromEnv := envString("EVAL_PATROL_MODEL")
	if !fromEnv {
		candidate = strings.TrimSpace(r.config.Model)
	}
	return normalizeModelString(candidate)
}
// normalizeModelString trims model and rewrites it as "provider:name" using
// config.ParseModelString. When parsing yields an empty provider or name the
// trimmed input is returned unchanged; blank input yields "".
func normalizeModelString(model string) string {
	trimmed := strings.TrimSpace(model)
	if trimmed == "" {
		return ""
	}
	if provider, name := config.ParseModelString(trimmed); provider != "" && name != "" {
		return provider + ":" + name
	}
	return trimmed
}
// getAISettings fetches the current AI settings from GET /api/settings/ai
// using basic auth. A non-200 status is returned as an error that includes the
// status code and the trimmed response body.
func (r *Runner) getAISettings(ctx context.Context) (*aiSettingsResponse, error) {
	if r == nil {
		return nil, fmt.Errorf("nil runner")
	}

	endpoint := r.config.BaseURL + "/api/settings/ai"
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, err
	}
	req.SetBasicAuth(r.config.Username, r.config.Password)

	resp, err := r.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// The body only enriches the message; a read error leaves it empty.
		raw, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("get AI settings failed (%d): %s", resp.StatusCode, strings.TrimSpace(string(raw)))
	}

	settings := new(aiSettingsResponse)
	if err := json.NewDecoder(resp.Body).Decode(settings); err != nil {
		return nil, err
	}
	return settings, nil
}
// updateAISettings sends update as JSON to PUT /api/settings/ai/update using
// basic auth. A non-200 status is returned as an error that includes the
// status code and the trimmed response body.
func (r *Runner) updateAISettings(ctx context.Context, update aiSettingsUpdateRequest) error {
	if r == nil {
		return fmt.Errorf("nil runner")
	}

	encoded, err := json.Marshal(update)
	if err != nil {
		return err
	}

	endpoint := r.config.BaseURL + "/api/settings/ai/update"
	req, err := http.NewRequestWithContext(ctx, http.MethodPut, endpoint, bytes.NewReader(encoded))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.SetBasicAuth(r.config.Username, r.config.Password)

	resp, err := r.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		return nil
	}
	// The body only enriches the message; a read error leaves it empty.
	raw, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("update AI settings failed (%d): %s", resp.StatusCode, strings.TrimSpace(string(raw)))
}
// handleApprovalDecision answers a pending approval request by POSTing to its
// /approve or /deny endpoint with basic auth.
//
// Nothing is sent, and nil is returned, when the runner is nil, approvalID is
// empty, or decision is neither ApprovalApprove nor ApprovalDeny. A denial
// carries {"reason": reason} as its JSON body when reason is non-empty;
// approvals are sent without a body. Any status other than 200 is an error.
func (r *Runner) handleApprovalDecision(decision ApprovalDecision, approvalID, reason string) error {
	if r == nil || approvalID == "" {
		return nil
	}

	var endpoint string
	if decision == ApprovalApprove {
		endpoint = "/api/ai/approvals/" + approvalID + "/approve"
	} else if decision == ApprovalDeny {
		endpoint = "/api/ai/approvals/" + approvalID + "/deny"
	} else {
		return nil
	}

	// payload stays a nil interface unless a denial reason was encoded.
	var payload io.Reader
	if reason != "" && decision == ApprovalDeny {
		encoded, err := json.Marshal(map[string]string{"reason": reason})
		if err == nil {
			payload = bytes.NewReader(encoded)
		}
	}

	req, err := http.NewRequest("POST", r.config.BaseURL+endpoint, payload)
	if err != nil {
		return fmt.Errorf("failed to create approval request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.SetBasicAuth(r.config.Username, r.config.Password)

	resp, err := r.client.Do(req)
	if err != nil {
		return fmt.Errorf("approval %s request failed: %w", decision, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		return nil
	}
	// The body only enriches the message; a read error leaves it empty.
	raw, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("approval %s returned status %d: %s", decision, resp.StatusCode, strings.TrimSpace(string(raw)))
}
// hasSuccessfulToolCallRetry reports whether at least one of toolCalls
// succeeded. It is false for an empty or nil slice.
func hasSuccessfulToolCallRetry(toolCalls []ToolCallEvent) bool {
	for i := range toolCalls {
		if toolCalls[i].Success {
			return true
		}
	}
	return false
}
// hasRetryableToolError reports whether any failed tool call has output that
// looks like a transient failure.
//
// Matching is a case-insensitive substring search on each failed call's
// output. A call whose output mentions a policy rejection
// (read_only_violation, strict_resolution, routing_mismatch) is never treated
// as retryable, even if it also contains a transient indicator. Successful
// calls and calls with empty output are skipped.
func hasRetryableToolError(toolCalls []ToolCallEvent) bool {
	transient := []string{
		"timeout",
		"timed out",
		"context deadline exceeded",
		"connection refused",
		"connection reset",
		"network is unreachable",
		"no such host",
		"i/o timeout",
		"server error",
		"502",
		"503",
		"504",
		"eof",
		"dial tcp",
		"temporarily",
		"query is required",
	}
	permanent := []string{
		"read_only_violation",
		"strict_resolution",
		"routing_mismatch",
	}

	// mentionsAny reports whether text contains at least one of needles.
	mentionsAny := func(text string, needles []string) bool {
		for _, needle := range needles {
			if strings.Contains(text, needle) {
				return true
			}
		}
		return false
	}

	for i := range toolCalls {
		call := &toolCalls[i]
		if call.Success {
			continue
		}
		output := strings.ToLower(call.Output)
		if output == "" || mentionsAny(output, permanent) {
			continue
		}
		if mentionsAny(output, transient) {
			return true
		}
	}
	return false
}
// isRateLimitError reports whether message, compared case-insensitively,
// contains a phrase that indicates rate limiting or quota exhaustion. This is
// a plain substring search, so any message containing "429" also matches.
func isRateLimitError(message string) bool {
	haystack := strings.ToLower(message)
	markers := [...]string{
		"rate limit",
		"rate-limit",
		"retry-after",
		"too many requests",
		"429",
		"quota",
		"resource has been exhausted",
	}
	for i := 0; i < len(markers); i++ {
		marker := markers[i]
		if marker == "" {
			continue
		}
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
// parseSSEStream reads an SSE response body line by line and folds the events
// into the pieces of a StepResult.
//
// Only lines that start with "data: " are considered, and each must hold a JSON
// envelope with "type" and "data" fields; anything else is skipped silently.
// Every decoded envelope is appended to the returned event list, then handled
// by type:
//
//   - session: records the session ID.
//   - done: updates the session ID when present and the token counts when
//     either is positive.
//   - content: appends the text to the accumulated content.
//   - tool_start / tool_end: paired by tool ID into ToolCallEvents. A tool_end
//     without a matching start is recorded on its own. A tool_start that never
//     receives a tool_end is not included in the returned tool calls.
//   - approval_needed: recorded, and answered once per approval ID through
//     handleApprovalDecision when approvalDecision is not ApprovalNone.
//   - complete: model and token counts are read from the top level of the
//     envelope; the token counts are overwritten unconditionally.
//   - error: ends parsing with a "stream error: ..." error when a message can
//     be extracted; otherwise the event is ignored.
//
// Return values, in order: raw events, tool calls, approvals, content, session
// ID, model, input tokens, output tokens, error. On error the values collected
// so far are still returned.
func (r *Runner) parseSSEStream(body io.Reader, approvalDecision ApprovalDecision, approvalReason string) ([]SSEEvent, []ToolCallEvent, []ApprovalEvent, string, string, string, int, int, error) {
var events []SSEEvent
var toolCalls []ToolCallEvent
var approvals []ApprovalEvent
var contentBuilder strings.Builder
var sessionID string
var model string
var inputTokens int
var outputTokens int
// Approval IDs already answered, so a repeated event is not answered twice.
handledApprovals := make(map[string]struct{})
// Track tool calls in progress
toolCallsInProgress := make(map[string]*ToolCallEvent)
scanner := bufio.NewScanner(body)
// Allow large SSE events (tool results can be big).
// A line longer than this makes the scanner stop with a "token too long" error.
const maxSSEEventSize = 8 * 1024 * 1024
scanner.Buffer(make([]byte, 0, 64*1024), maxSSEEventSize)
for scanner.Scan() {
line := scanner.Text()
// Non-data lines (blank separators and other SSE fields) are skipped.
if !strings.HasPrefix(line, "data: ") {
continue
}
data := strings.TrimPrefix(line, "data: ")
if data == "" {
continue
}
// Parse the event
var event struct {
Type string `json:"type"`
Data json.RawMessage `json:"data"`
}
if err := json.Unmarshal([]byte(data), &event); err != nil {
// Not a JSON envelope: the line is dropped without being recorded.
continue
}
eventData := event.Data
// A "complete" event carries its fields at the top level, so keep the whole
// envelope as the raw payload when there is no nested data field.
if event.Type == "complete" && len(event.Data) == 0 {
eventData = json.RawMessage([]byte(data))
}
events = append(events, SSEEvent{
Type: event.Type,
Data: eventData,
})
switch event.Type {
case "session":
var sessionData struct {
ID string `json:"id"`
}
if err := json.Unmarshal(event.Data, &sessionData); err == nil {
sessionID = sessionData.ID
}
case "done":
var doneData struct {
SessionID string `json:"session_id"`
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
}
if err := json.Unmarshal(event.Data, &doneData); err == nil {
if doneData.SessionID != "" {
sessionID = doneData.SessionID
}
// Only overwrite the counts when the event actually reports usage.
if doneData.InputTokens > 0 || doneData.OutputTokens > 0 {
inputTokens = doneData.InputTokens
outputTokens = doneData.OutputTokens
}
}
case "content":
var contentData struct {
Text string `json:"text"`
}
if err := json.Unmarshal(event.Data, &contentData); err == nil {
contentBuilder.WriteString(contentData.Text)
}
case "tool_start":
var toolData struct {
ID string `json:"id"`
Name string `json:"name"`
Input string `json:"input"`
}
if err := json.Unmarshal(event.Data, &toolData); err == nil {
// Held back until the matching tool_end arrives.
toolCallsInProgress[toolData.ID] = &ToolCallEvent{
ID: toolData.ID,
Name: toolData.Name,
Input: toolData.Input,
}
}
case "tool_end":
var toolData struct {
ID string `json:"id"`
Name string `json:"name"`
Input string `json:"input"`
Output string `json:"output"`
Success bool `json:"success"`
}
if err := json.Unmarshal(event.Data, &toolData); err == nil {
if tc, ok := toolCallsInProgress[toolData.ID]; ok {
// Prefer the input reported at the end, keeping the start's input otherwise.
if toolData.Input != "" {
tc.Input = toolData.Input
}
tc.Output = toolData.Output
tc.Success = toolData.Success
toolCalls = append(toolCalls, *tc)
delete(toolCallsInProgress, toolData.ID)
} else {
// Tool end without start
toolCalls = append(toolCalls, ToolCallEvent{
ID: toolData.ID,
Name: toolData.Name,
Input: toolData.Input,
Output: toolData.Output,
Success: toolData.Success,
})
}
}
case "approval_needed":
var approvalData struct {
ApprovalID string `json:"approval_id"`
ToolID string `json:"tool_id"`
ToolName string `json:"tool_name"`
Command string `json:"command"`
Risk string `json:"risk"`
Description string `json:"description"`
}
if err := json.Unmarshal(event.Data, &approvalData); err == nil {
approvals = append(approvals, ApprovalEvent{
ApprovalID: approvalData.ApprovalID,
ToolID: approvalData.ToolID,
ToolName: approvalData.ToolName,
Command: approvalData.Command,
Risk: approvalData.Risk,
Description: approvalData.Description,
})
// Answer the request inline; this issues an HTTP call while the stream is still open.
if approvalDecision != ApprovalNone && approvalData.ApprovalID != "" {
if _, ok := handledApprovals[approvalData.ApprovalID]; !ok {
handledApprovals[approvalData.ApprovalID] = struct{}{}
if err := r.handleApprovalDecision(approvalDecision, approvalData.ApprovalID, approvalReason); err != nil {
return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, err
}
}
}
}
case "complete":
var completeData struct {
Model string `json:"model"`
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
}
// Decoded from the whole envelope, not from event.Data.
if err := json.Unmarshal([]byte(data), &completeData); err == nil {
if completeData.Model != "" {
model = completeData.Model
}
// NOTE(review): unlike "done", the counts are overwritten even when both
// are zero, which can clear values set by an earlier event.
inputTokens = completeData.InputTokens
outputTokens = completeData.OutputTokens
}
case "error":
// The payload is tried first as an object with a "message" field...
var errorData struct {
Message string `json:"message"`
}
if err := json.Unmarshal(event.Data, &errorData); err == nil && strings.TrimSpace(errorData.Message) != "" {
return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, fmt.Errorf("stream error: %s", errorData.Message)
}
// ...then as a bare JSON string. If neither yields text the event is ignored.
var rawMsg string
if err := json.Unmarshal(event.Data, &rawMsg); err == nil && strings.TrimSpace(rawMsg) != "" {
return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, fmt.Errorf("stream error: %s", rawMsg)
}
}
}
// Surface read errors, including an over-long line, after the loop ends.
if err := scanner.Err(); err != nil {
return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, err
}
return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, nil
}
// printStepResult writes a human-readable summary of one step to stdout:
// duration, session, retries, approvals, error, tool calls, a truncated
// assistant response and the assertion outcomes. Empty sections are omitted.
func (r *Runner) printStepResult(result *StepResult) {
	// pick returns whenTrue or whenFalse depending on ok.
	pick := func(ok bool, whenTrue, whenFalse string) string {
		if ok {
			return whenTrue
		}
		return whenFalse
	}

	fmt.Printf("\n--- Result ---\n")
	fmt.Printf("Duration: %v\n", result.Duration)
	fmt.Printf("Session: %s\n", result.SessionID)

	if result.Retries > 0 {
		fmt.Printf("Retries: %d", result.Retries)
		if notes := result.RetryNotes; len(notes) > 0 {
			fmt.Printf(" (%s)", strings.Join(notes, ", "))
		}
		fmt.Printf("\n")
	}

	if count := len(result.Approvals); count > 0 {
		fmt.Printf("Approvals: %d\n", count)
	}

	if err := result.Error; err != nil {
		fmt.Printf("ERROR: %v\n", err)
	}

	if len(result.ToolCalls) > 0 {
		fmt.Printf("\nTool Calls:\n")
		for i := range result.ToolCalls {
			call := &result.ToolCalls[i]
			fmt.Printf(" - %s [%s]: %s\n", call.Name, pick(call.Success, "OK", "FAILED"), truncate(call.Input, 80))
			// Output is always shown for failures, and for successes in verbose mode.
			if r.config.Verbose || !call.Success {
				fmt.Printf(" Output: %s\n", truncate(call.Output, 200))
			}
		}
	}

	if result.Content != "" {
		fmt.Printf("\nAssistant Response:\n%s\n", truncate(result.Content, 500))
	}

	if len(result.Assertions) > 0 {
		fmt.Printf("\nAssertions:\n")
		for i := range result.Assertions {
			check := &result.Assertions[i]
			fmt.Printf(" [%s] %s: %s\n", pick(check.Passed, "PASS", "FAIL"), check.Name, check.Message)
		}
	}
}
// truncate shortens s to at most max bytes and appends "..." when anything
// was cut off. Strings that already fit are returned unchanged.
//
// The cut never lands inside a multi-byte UTF-8 sequence: it backs up to the
// start of the rune that would otherwise be split, so the result may be
// slightly shorter than max bytes. A negative max is treated as 0.
func truncate(s string, max int) string {
	if max < 0 {
		max = 0
	}
	if len(s) <= max {
		return s
	}

	cut := max
	// UTF-8 continuation bytes have the bit pattern 10xxxxxx; step back until
	// the byte at the cut position starts a rune.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut] + "..."
}
// PrintSummary writes a banner-framed summary of a scenario run to stdout:
// duration, passed/total step counts, per-step retry details, the report path
// when one was written, the overall verdict and, on failure, each failed step
// with its error and failed assertions.
func (r *Runner) PrintSummary(result ScenarioResult) {
	const rule = "========================================\n"

	fmt.Print("\n")
	fmt.Print(rule)
	fmt.Printf("SCENARIO: %s\n", result.ScenarioName)
	fmt.Print(rule)
	fmt.Printf("Duration: %v\n", result.Duration)

	var passedSteps, totalRetries int
	for i := range result.Steps {
		if result.Steps[i].Success {
			passedSteps++
		}
		totalRetries += result.Steps[i].Retries
	}
	fmt.Printf("Steps: %d/%d passed\n", passedSteps, len(result.Steps))

	if totalRetries > 0 {
		fmt.Printf("Retries: %d\n", totalRetries)
		for i := range result.Steps {
			step := &result.Steps[i]
			if step.Retries <= 0 {
				continue
			}
			note := ""
			if len(step.RetryNotes) > 0 {
				note = fmt.Sprintf(" (%s)", strings.Join(step.RetryNotes, ", "))
			}
			fmt.Printf(" - %s: %d%s\n", step.StepName, step.Retries, note)
		}
	}

	if result.ReportPath != "" {
		fmt.Printf("Report: %s\n", result.ReportPath)
	}

	if result.Passed {
		fmt.Printf("Result: PASSED\n")
		fmt.Print(rule)
		return
	}

	fmt.Printf("Result: FAILED\n")
	fmt.Printf("\nFailures:\n")
	for i := range result.Steps {
		step := &result.Steps[i]
		if step.Success {
			continue
		}
		fmt.Printf(" - %s\n", step.StepName)
		if step.Error != nil {
			fmt.Printf(" Error: %v\n", step.Error)
		}
		for j := range step.Assertions {
			if check := &step.Assertions[j]; !check.Passed {
				fmt.Printf(" Assertion '%s': %s\n", check.Name, check.Message)
			}
		}
	}
	fmt.Print(rule)
}