Pulse/internal/ai/eval/eval.go

// Package eval provides an evaluation framework for testing Pulse Assistant
// behavior end-to-end. It sends prompts to the live API and captures the
// full trace of tool calls, FSM transitions, and responses for verification.
package eval

import (
	"bufio"
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/config"
)

// Config holds eval runner configuration.
//
// NewRunner passes the config through applyEvalEnvOverrides, so several
// fields can be overridden (or defaulted) by EVAL_* environment variables.
//
// Fields:
//   - BaseURL, Username, Password: API root and basic-auth credentials used
//     for every request the runner sends.
//   - Verbose: print step prompts, retries and results to stdout.
//   - Model: when non-blank, sent as the "model" field of each chat request
//     and included in report file names.
//   - RequestTimeout: timeout of the runner's HTTP client; NewRunner falls
//     back to five minutes when it is not positive.
//   - StepRetries: maximum number of retries per step (negative is treated
//     as zero by executeStep).
//   - RetryOn*: enable the individual retry reasons checked by
//     shouldRetryStep.
//   - RateLimitCooldown: sleep applied before retrying after a rate-limit
//     error, when positive.
//   - Preflight, PreflightTimeout: run a short "Say hello." request before
//     the scenario, using a dedicated client with this timeout.
//   - ReportDir: directory for JSON reports; empty disables report writing.
type Config struct {
	BaseURL  string // e.g., "http://127.0.0.1:7655"
	Username string
	Password string
	Verbose  bool
	Model    string
	// HTTP timeout for each chat request.
	RequestTimeout time.Duration
	// Retry behavior for transient eval failures.
	StepRetries          int
	RetryOnPhantom       bool
	RetryOnExplicitTool  bool
	RetryOnStreamFailure bool
	RetryOnEmptyResponse bool
	RetryOnToolErrors    bool
	RetryOnRateLimit     bool
	RateLimitCooldown    time.Duration
	// Optional preflight to fail fast when SSE hangs.
	Preflight        bool
	PreflightTimeout time.Duration
	// Optional report output directory (JSON per scenario).
	ReportDir string
}

// DefaultConfig returns the configuration used for local development:
// a Pulse instance on 127.0.0.1:7655 with admin/admin credentials, verbose
// output, two retries per step, and every retry reason except rate limiting
// switched on. Model, ReportDir, RateLimitCooldown and Preflight are left at
// their zero values.
func DefaultConfig() Config {
	var cfg Config

	// Target instance and credentials.
	cfg.BaseURL = "http://127.0.0.1:7655"
	cfg.Username = "admin"
	cfg.Password = "admin"
	cfg.Verbose = true

	// Timeouts.
	cfg.RequestTimeout = 5 * time.Minute
	cfg.PreflightTimeout = 15 * time.Second

	// Retry policy.
	cfg.StepRetries = 2
	cfg.RetryOnPhantom = true
	cfg.RetryOnExplicitTool = true
	cfg.RetryOnStreamFailure = true
	cfg.RetryOnEmptyResponse = true
	cfg.RetryOnToolErrors = true

	return cfg
}

// Runner executes eval scenarios against the Pulse API.
//
// config is the effective configuration (after environment overrides when
// built via NewRunner); client is the shared HTTP client used for chat,
// settings and approval requests.
type Runner struct {
	config Config
	client *http.Client
}

// NewRunner builds a Runner from config after applying EVAL_* environment
// overrides to it. The HTTP client timeout is config.RequestTimeout, or five
// minutes when that value is not positive.
func NewRunner(config Config) *Runner {
	applyEvalEnvOverrides(&config)

	// Long timeout for AI responses; replaced below when one is configured.
	httpClient := &http.Client{Timeout: 5 * time.Minute}
	if config.RequestTimeout > 0 {
		httpClient.Timeout = config.RequestTimeout
	}

	return &Runner{config: config, client: httpClient}
}

// StepResult captures the result of a single eval step.
//
// Fields:
//   - StepName, Prompt: copied from the Step that was executed.
//   - SessionID: session reported by the stream; RunScenario overwrites it
//     with the scenario-wide session ID.
//   - Model, InputTokens, OutputTokens: values extracted from the SSE stream.
//   - Success: starts true and is cleared on any error or failed assertion.
//   - Error: request, status or stream error for the step, if any.
//   - Duration: elapsed time measured by executeStepOnceWithClient.
//   - Retries, RetryNotes: number of retries performed and the reason
//     recorded for each one.
//   - ToolCalls, Approvals, Content, RawEvents: data collected while parsing
//     the SSE stream.
//   - Assertions: results of the step's assertions, appended by RunScenario.
type StepResult struct {
	StepName     string
	Prompt       string
	SessionID    string
	Model        string
	InputTokens  int
	OutputTokens int
	Success      bool
	Error        error
	Duration     time.Duration
	Retries      int
	RetryNotes   []string
	ToolCalls    []ToolCallEvent
	Approvals    []ApprovalEvent
	Content      string
	RawEvents    []SSEEvent
	Assertions   []AssertionResult
}

// ToolCallEvent represents a tool call captured during execution.
//
// ID, Name and Input come from the "tool_start" event (Input is replaced by
// the "tool_end" value when that one is non-empty); Output and Success come
// from the matching "tool_end" event.
type ToolCallEvent struct {
	ID      string
	Name    string
	Input   string
	Output  string
	Success bool
}

// ApprovalEvent represents an approval request captured during execution.
// Its fields mirror the payload of an "approval_needed" SSE event
// (approval_id, tool_id, tool_name, command, risk, description).
type ApprovalEvent struct {
	ApprovalID  string
	ToolID      string
	ToolName    string
	Command     string
	Risk        string
	Description string
}

// SSEEvent represents a raw SSE event from the stream.
//
// Type is the event's "type" field and Data its undecoded "data" field. For
// a "complete" event that carries no "data" field, Data holds the entire
// event payload instead.
type SSEEvent struct {
	Type string
	Data json.RawMessage
}

// AssertionResult captures the result of a single assertion: the assertion's
// Name, whether it Passed, and a human-readable Message that is printed next
// to the name in step output and failure summaries.
type AssertionResult struct {
	Name    string
	Passed  bool
	Message string
}

// ScenarioResult captures the result of a full scenario.
//
// Steps holds one entry per executed step (preceded by the preflight step
// when preflight is enabled); execution stops at the first failing step, so
// later steps may be absent. Passed is false as soon as any step fails.
// ReportPath is the path of the written JSON report, or empty when no report
// was written.
type ScenarioResult struct {
	ScenarioName string
	Steps        []StepResult
	Passed       bool
	Duration     time.Duration
	ReportPath   string
}

// StepMention represents a structured mention attached to a step.
// When present, these are sent alongside the prompt (as the "mentions" field
// of the chat request) so the backend can resolve resources without
// discovery tool calls. Node is omitted from the JSON when empty.
type StepMention struct {
	ID   string `json:"id"`
	Name string `json:"name"`
	Type string `json:"type"`
	Node string `json:"node,omitempty"`
}

// Step defines a single step in an eval scenario.
//
// Prompt is sent to the chat endpoint, Assertions are evaluated against the
// resulting StepResult, and ApprovalDecision/ApprovalReason control how
// approval requests raised during the step are answered (the reason is only
// sent with a deny decision).
type Step struct {
	Name             string
	Prompt           string
	Mentions         []StepMention // optional structured mentions
	Assertions       []Assertion
	ApprovalDecision ApprovalDecision
	ApprovalReason   string
}

// ApprovalDecision controls how eval handles approval requests during a step.
type ApprovalDecision string

// Supported approval decisions. With ApprovalNone, approval requests are
// recorded but no decision is submitted; ApprovalApprove and ApprovalDeny
// post to the approval's "approve" or "deny" endpoint respectively.
const (
	ApprovalNone    ApprovalDecision = ""
	ApprovalApprove ApprovalDecision = "approve"
	ApprovalDeny    ApprovalDecision = "deny"
)

// Assertion defines a check to run after a step. It receives a pointer to
// the step's result and reports the outcome as an AssertionResult; a result
// with Passed == false fails both the step and the scenario.
type Assertion func(result *StepResult) AssertionResult

// Scenario defines a multi-step eval scenario. Steps run in order and share
// the session established by the first step that reports one; Name is used
// in output and in report file names.
type Scenario struct {
	Name        string
	Description string
	Steps       []Step
}

// RunScenario executes a scenario and returns the results.
//
// When preflight is enabled it runs first and is recorded as the first step;
// a failed preflight ends the scenario immediately. Steps then run in order,
// reusing the session ID reported by the first step that returns one, and
// execution stops at the first step that errors or fails an assertion.
//
// A JSON report is written when a report directory is configured. A failure
// to write it does not affect the scenario outcome, but is reported on
// stderr so that a missing report is not silent.
func (r *Runner) RunScenario(scenario Scenario) ScenarioResult {
	startTime := time.Now()
	result := ScenarioResult{
		ScenarioName: scenario.Name,
		Passed:       true,
	}

	// finish stamps the total duration, persists the report (best effort)
	// and returns the final result. Shared by every exit path.
	finish := func() ScenarioResult {
		result.Duration = time.Since(startTime)
		reportPath, err := r.writeReport(result)
		if err != nil {
			fmt.Fprintf(os.Stderr, "eval: failed to write report for scenario %q: %v\n", scenario.Name, err)
			return result
		}
		result.ReportPath = reportPath
		return result
	}

	if r.config.Preflight {
		preflight := r.runPreflight()
		result.Steps = append(result.Steps, preflight)
		if !preflight.Success {
			result.Passed = false
			return finish()
		}
	}

	var sessionID string
	for i, step := range scenario.Steps {
		if r.config.Verbose {
			fmt.Printf("\n=== Step %d: %s ===\n", i+1, step.Name)
			fmt.Printf("Prompt: %s\n", step.Prompt)
		}

		stepResult := r.executeStep(step, sessionID)

		// Use session from first step for subsequent steps.
		if sessionID == "" && stepResult.SessionID != "" {
			sessionID = stepResult.SessionID
		}
		stepResult.SessionID = sessionID

		// Assertions run even when the step errored, so their messages are
		// available in the output and the report.
		for _, assertion := range step.Assertions {
			assertResult := assertion(&stepResult)
			stepResult.Assertions = append(stepResult.Assertions, assertResult)
			if !assertResult.Passed {
				stepResult.Success = false
				result.Passed = false
			}
		}

		if stepResult.Error != nil {
			stepResult.Success = false
			result.Passed = false
		}

		if r.config.Verbose {
			r.printStepResult(&stepResult)
		}

		result.Steps = append(result.Steps, stepResult)

		// Stop on failure: later steps usually depend on earlier ones.
		if !stepResult.Success {
			break
		}
	}

	return finish()
}

// executeStep runs step with the configured number of retries
// (config.StepRetries, with negative values treated as zero).
func (r *Runner) executeStep(step Step, sessionID string) StepResult {
	if configured := r.config.StepRetries; configured > 0 {
		return r.executeStepWithRetry(step, sessionID, configured)
	}
	return r.executeStepWithRetry(step, sessionID, 0)
}

// executeStepWithRetry runs step up to retries+1 times. After each attempt
// shouldRetryStep decides whether the outcome looks transient; the first
// attempt that should not be retried, or the last permitted attempt, is
// returned with Retries/RetryNotes describing the retries that preceded it.
// A rate-limit retry first sleeps for config.RateLimitCooldown when that is
// positive. Negative retries are treated as zero.
func (r *Runner) executeStepWithRetry(step Step, sessionID string, retries int) StepResult {
	maxRetries := retries
	if maxRetries < 0 {
		maxRetries = 0
	}

	var notes []string
	// The loop always terminates through the return below, at the latest
	// when attempt reaches maxRetries.
	for attempt := 0; ; attempt++ {
		outcome := r.executeStepOnce(step, sessionID)
		retry, why := r.shouldRetryStep(&outcome, step)

		if exhausted := attempt == maxRetries; exhausted || !retry {
			outcome.Retries = len(notes)
			outcome.RetryNotes = notes
			return outcome
		}

		if why != "" {
			notes = append(notes, why)
		}

		if cooldown := r.config.RateLimitCooldown; why == "rate_limit" && cooldown > 0 {
			if r.config.Verbose {
				fmt.Printf("\n--- Rate limit cooldown (%s) before retry ---\n", cooldown)
			}
			time.Sleep(cooldown)
		}

		if r.config.Verbose {
			fmt.Printf("\n--- Retrying step '%s' (attempt %d/%d) due to transient failure ---\n",
				step.Name, attempt+1, maxRetries)
		}
	}
}

// executeStepOnce performs a single attempt of step using the runner's
// shared HTTP client. It is a convenience wrapper around
// executeStepOnceWithClient.
func (r *Runner) executeStepOnce(step Step, sessionID string) StepResult {
	return r.executeStepOnceWithClient(step, sessionID, r.client)
}

// executeStepOnceWithClient sends one chat request for step and parses the
// resulting SSE stream into a StepResult.
//
// The request body carries the prompt plus, when set, the session ID, the
// configured model and the step's mentions. A nil client falls back to the
// runner's shared client. Any failure (encoding, request creation, transport,
// non-200 status, stream parsing) is recorded in result.Error with Success
// cleared; Duration is recorded on every path, including failures.
//
// NOTE: shouldRetryStep matches on the text of the errors produced here
// (e.g. "failed to parse SSE stream"), so the wording must stay stable.
func (r *Runner) executeStepOnceWithClient(step Step, sessionID string, client *http.Client) StepResult {
	startTime := time.Now()
	result := StepResult{
		StepName:  step.Name,
		Prompt:    step.Prompt,
		SessionID: sessionID,
		Success:   true,
	}

	// fail marks the result as failed with err and stamps the elapsed time,
	// so failed attempts report how long they took.
	fail := func(err error) StepResult {
		result.Error = err
		result.Success = false
		result.Duration = time.Since(startTime)
		return result
	}

	// Build request.
	reqBody := map[string]interface{}{
		"prompt": step.Prompt,
	}
	if sessionID != "" {
		reqBody["session_id"] = sessionID
	}
	if strings.TrimSpace(r.config.Model) != "" {
		reqBody["model"] = r.config.Model
	}
	if len(step.Mentions) > 0 {
		reqBody["mentions"] = step.Mentions
	}

	bodyBytes, err := json.Marshal(reqBody)
	if err != nil {
		return fail(fmt.Errorf("failed to encode request: %w", err))
	}

	req, err := http.NewRequest("POST", r.config.BaseURL+"/api/ai/chat", bytes.NewReader(bodyBytes))
	if err != nil {
		return fail(fmt.Errorf("failed to create request: %w", err))
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "text/event-stream")
	req.SetBasicAuth(r.config.Username, r.config.Password)

	// Execute request.
	if client == nil {
		client = r.client
	}
	resp, err := client.Do(req)
	if err != nil {
		return fail(fmt.Errorf("request failed: %w", err))
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message.
		body, _ := io.ReadAll(resp.Body)
		return fail(fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body)))
	}

	// Parse SSE stream. Partial data collected before a stream error is
	// kept on the result for diagnostics.
	result.RawEvents, result.ToolCalls, result.Approvals, result.Content, result.SessionID, result.Model, result.InputTokens, result.OutputTokens, err = r.parseSSEStream(resp.Body, step.ApprovalDecision, step.ApprovalReason)
	if err != nil {
		return fail(fmt.Errorf("failed to parse SSE stream: %w", err))
	}

	result.Duration = time.Since(startTime)
	return result
}

// runPreflight sends a trivial "Say hello." prompt through a dedicated HTTP
// client bounded by config.PreflightTimeout. The returned step is named
// "Preflight" and is marked failed when the request errors or when it yields
// neither text content nor tool calls.
func (r *Runner) runPreflight() StepResult {
	probe := Step{Name: "Preflight", Prompt: "Say hello."}
	shortClient := &http.Client{Timeout: r.config.PreflightTimeout}

	result := r.executeStepOnceWithClient(probe, "", shortClient)
	result.StepName = "Preflight"

	if result.Error != nil {
		return result
	}

	gotContent := strings.TrimSpace(result.Content) != ""
	gotToolCalls := len(result.ToolCalls) > 0
	if !gotContent && !gotToolCalls {
		result.Error = fmt.Errorf("preflight returned empty response")
		result.Success = false
	}
	return result
}

// shouldRetryStep reports whether result looks like a transient failure that
// is worth retrying, together with a short reason tag. The checks run in
// this order, each gated by its config flag:
//
//  1. rate-limit errors ("rate_limit"); when rate-limit retries are disabled
//     such an error is never retried for any other reason,
//  2. stream/parse errors ("stream_error"),
//  3. the canned "wasn't able to access the infrastructure tools" reply
//     without any successful tool call ("phantom_detection"),
//  4. blank content without any successful tool call ("empty_response"),
//  5. a prompt that names a tool explicitly but produced no tool calls
//     ("no_tool_calls_for_explicit_tool"),
//  6. tool calls that all failed, at least one with a retryable error
//     ("tool_error").
func (r *Runner) shouldRetryStep(result *StepResult, step Step) (bool, string) {
	if result == nil {
		return false, ""
	}

	if result.Error != nil {
		errMsg := result.Error.Error()

		if isRateLimitError(errMsg) {
			if !r.config.RetryOnRateLimit {
				return false, ""
			}
			return true, "rate_limit"
		}

		// Retry on known transient errors (stream parse failures).
		if r.config.RetryOnStreamFailure {
			for _, marker := range []string{"token too long", "failed to parse SSE stream", "stream error"} {
				if strings.Contains(errMsg, marker) {
					return true, "stream_error"
				}
			}
		}
	}

	const phantomMessage = "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request"

	cfg := &r.config
	toolCallCount := len(result.ToolCalls)
	anyToolSucceeded := hasSuccessfulToolCallRetry(result.ToolCalls)

	switch {
	case cfg.RetryOnPhantom && !anyToolSucceeded && strings.Contains(result.Content, phantomMessage):
		return true, "phantom_detection"

	// Successful tool calls make an empty reply acceptable.
	case cfg.RetryOnEmptyResponse && !anyToolSucceeded && strings.TrimSpace(result.Content) == "":
		return true, "empty_response"

	// An explicit tool was requested but no tool calls occurred.
	case cfg.RetryOnExplicitTool && toolCallCount == 0 && requiresExplicitTool(step.Prompt):
		return true, "no_tool_calls_for_explicit_tool"

	case cfg.RetryOnToolErrors && toolCallCount > 0 && !anyToolSucceeded && hasRetryableToolError(result.ToolCalls):
		return true, "tool_error"
	}

	return false, ""
}

// requiresExplicitTool reports whether prompt (compared case-insensitively)
// names one of the pulse_* tools or uses one of the generic phrases
// "read-only tool", "read only tool", "control tool" or "query tool".
func requiresExplicitTool(prompt string) bool {
	lowered := strings.ToLower(prompt)

	needles := [...]string{
		// Tool names.
		"pulse_read",
		"pulse_control",
		"pulse_query",
		"pulse_discovery",
		"pulse_docker",
		"pulse_kubernetes",
		"pulse_metrics",
		"pulse_storage",
		// Generic phrasings.
		"read-only tool",
		"read only tool",
		"control tool",
		"query tool",
	}

	for i := range needles {
		if strings.Contains(lowered, needles[i]) {
			return true
		}
	}
	return false
}

// applyEvalEnvOverrides mutates config in place from EVAL_* environment
// variables and fills in defaults. A nil config is ignored.
//
//   - EVAL_HTTP_TIMEOUT, EVAL_RATE_LIMIT_COOLDOWN, EVAL_PREFLIGHT_TIMEOUT are
//     positive whole seconds; the preflight timeout defaults to 15s when
//     still zero.
//   - EVAL_STEP_RETRIES replaces StepRetries; without it a zero StepRetries
//     becomes 1.
//   - EVAL_RETRY_ON_PHANTOM / _EXPLICIT_TOOL / _STREAM_FAILURE /
//     _EMPTY_RESPONSE / _TOOL_ERRORS set the matching flag; when the variable
//     is absent or unparseable the flag is forced to true.
//   - EVAL_RETRY_ON_RATE_LIMIT and EVAL_PREFLIGHT only apply when set.
//   - EVAL_MODEL applies only when config.Model is blank.
//   - EVAL_REPORT_DIR replaces ReportDir when set to a non-blank value.
func applyEvalEnvOverrides(config *Config) {
	if config == nil {
		return
	}

	// seconds reads a strictly positive number of seconds from key.
	seconds := func(key string) (time.Duration, bool) {
		n, ok := envInt(key)
		if !ok || n <= 0 {
			return 0, false
		}
		return time.Duration(n) * time.Second, true
	}

	// retryFlag takes the flag from the environment when present and
	// otherwise switches it on.
	retryFlag := func(key string, flag *bool) {
		if fromEnv, ok := envBool(key); ok {
			*flag = fromEnv
			return
		}
		*flag = true
	}

	if timeout, ok := seconds("EVAL_HTTP_TIMEOUT"); ok {
		config.RequestTimeout = timeout
	}

	stepRetries, haveRetries := envInt("EVAL_STEP_RETRIES")
	switch {
	case haveRetries:
		config.StepRetries = stepRetries
	case config.StepRetries == 0:
		config.StepRetries = 1
	}

	retryFlag("EVAL_RETRY_ON_PHANTOM", &config.RetryOnPhantom)
	retryFlag("EVAL_RETRY_ON_EXPLICIT_TOOL", &config.RetryOnExplicitTool)
	retryFlag("EVAL_RETRY_ON_STREAM_FAILURE", &config.RetryOnStreamFailure)
	retryFlag("EVAL_RETRY_ON_EMPTY_RESPONSE", &config.RetryOnEmptyResponse)
	retryFlag("EVAL_RETRY_ON_TOOL_ERRORS", &config.RetryOnToolErrors)

	if retryRateLimit, ok := envBool("EVAL_RETRY_ON_RATE_LIMIT"); ok {
		config.RetryOnRateLimit = retryRateLimit
	}

	if cooldown, ok := seconds("EVAL_RATE_LIMIT_COOLDOWN"); ok {
		config.RateLimitCooldown = cooldown
	}

	if preflight, ok := envBool("EVAL_PREFLIGHT"); ok {
		config.Preflight = preflight
	}

	if preflightTimeout, ok := seconds("EVAL_PREFLIGHT_TIMEOUT"); ok {
		config.PreflightTimeout = preflightTimeout
	} else if config.PreflightTimeout == 0 {
		config.PreflightTimeout = 15 * time.Second
	}

	if model, ok := envString("EVAL_MODEL"); ok && strings.TrimSpace(config.Model) == "" {
		config.Model = model
	}

	if reportDir, ok := envString("EVAL_REPORT_DIR"); ok {
		config.ReportDir = reportDir
	}
}

// writeReport serialises result as an indented JSON file in
// config.ReportDir and returns the file's path. It returns ("", nil) when
// the runner is nil or no report directory is configured.
//
// The directory is created with mode 0700 and the file with mode 0600. The
// file name is "eval-<scenario>[-<model>]-<timestamp>.json", where the
// timestamp has one-second resolution, so two reports for the same scenario
// and model written within the same second share a name.
//
// Step errors are written as their message strings: encoding/json would
// otherwise serialise the error interface value as its concrete struct,
// which for typical error types has no exported fields and encodes as "{}".
func (r *Runner) writeReport(result ScenarioResult) (string, error) {
	if r == nil || r.config.ReportDir == "" {
		return "", nil
	}

	if err := os.MkdirAll(r.config.ReportDir, 0700); err != nil {
		return "", err
	}

	// One clock reading for both the file name and generated_at.
	now := time.Now()

	nameParts := []string{sanitizeFilename(result.ScenarioName)}
	if model := strings.TrimSpace(r.config.Model); model != "" {
		nameParts = append(nameParts, sanitizeFilename(model))
	}
	filename := fmt.Sprintf("eval-%s-%s.json", strings.Join(nameParts, "-"), now.Format("20060102-150405"))
	path := filepath.Join(r.config.ReportDir, filename)

	// reportStep shadows StepResult.Error with a JSON-friendly value: the
	// shallower field wins during marshalling, so "Error" is emitted as the
	// message string (or null when the step had no error).
	type reportStep struct {
		StepResult
		Error interface{}
	}
	// reportResult mirrors ScenarioResult with the step type swapped.
	type reportResult struct {
		ScenarioName string
		Steps        []reportStep
		Passed       bool
		Duration     time.Duration
		ReportPath   string
	}

	var steps []reportStep
	for _, step := range result.Steps {
		entry := reportStep{StepResult: step}
		if step.Error != nil {
			entry.Error = step.Error.Error()
		}
		steps = append(steps, entry)
	}

	report := struct {
		GeneratedAt time.Time    `json:"generated_at"`
		BaseURL     string       `json:"base_url"`
		Username    string       `json:"username"`
		Model       string       `json:"model,omitempty"`
		Result      reportResult `json:"result"`
	}{
		GeneratedAt: now,
		BaseURL:     r.config.BaseURL,
		Username:    r.config.Username,
		Model:       r.config.Model,
		Result: reportResult{
			ScenarioName: result.ScenarioName,
			Steps:        steps,
			Passed:       result.Passed,
			Duration:     result.Duration,
			ReportPath:   result.ReportPath,
		},
	}

	data, err := json.MarshalIndent(report, "", "  ")
	if err != nil {
		return "", err
	}

	if err := os.WriteFile(path, data, 0600); err != nil {
		return "", err
	}

	return path, nil
}

// filenameReplacer maps the characters sanitizeFilename rewrites to dashes.
var filenameReplacer = strings.NewReplacer(
	" ", "-",
	"/", "-",
	"\\", "-",
	":", "-",
)

// sanitizeFilename turns name into a file-name fragment: surrounding
// whitespace is trimmed, the text is lower-cased, and spaces, forward
// slashes, backslashes and colons are each replaced with a dash. All other
// characters are kept as they are.
func sanitizeFilename(name string) string {
	cleaned := strings.ToLower(strings.TrimSpace(name))
	return filenameReplacer.Replace(cleaned)
}

// envBool reads a boolean from environment variable key. The second result
// is false when the variable is unset or its value is not recognised.
// Accepted values (case-insensitive, surrounding whitespace ignored) are
// 1/true/yes/y/on and 0/false/no/n/off.
func envBool(key string) (bool, bool) {
	raw, present := os.LookupEnv(key)
	if !present {
		return false, false
	}

	normalized := strings.ToLower(strings.TrimSpace(raw))
	for _, truthy := range []string{"1", "true", "yes", "y", "on"} {
		if normalized == truthy {
			return true, true
		}
	}
	for _, falsy := range []string{"0", "false", "no", "n", "off"} {
		if normalized == falsy {
			return false, true
		}
	}
	return false, false
}

// envInt reads a decimal integer from environment variable key. The second
// result is false when the variable is unset or no integer can be scanned
// from its (whitespace-trimmed) value.
//
// NOTE(review): parsing uses fmt.Sscanf with "%d", which presumably stops at
// the first non-digit and therefore accepts values with trailing text —
// confirm whether stricter parsing is wanted.
func envInt(key string) (int, bool) {
	raw, present := os.LookupEnv(key)
	if !present {
		return 0, false
	}

	var number int
	_, scanErr := fmt.Sscanf(strings.TrimSpace(raw), "%d", &number)
	if scanErr != nil {
		return 0, false
	}
	return number, true
}

// envFloat reads a floating-point number from environment variable key. The
// second result is false when the variable is unset or no number can be
// scanned from its (whitespace-trimmed) value.
func envFloat(key string) (float64, bool) {
	raw, present := os.LookupEnv(key)
	if !present {
		return 0, false
	}

	var number float64
	_, scanErr := fmt.Sscanf(strings.TrimSpace(raw), "%f", &number)
	if scanErr != nil {
		return 0, false
	}
	return number, true
}

// envString reads environment variable key with surrounding whitespace
// removed. The second result is false when the variable is unset or blank.
func envString(key string) (string, bool) {
	raw, present := os.LookupEnv(key)
	if !present {
		return "", false
	}
	if trimmed := strings.TrimSpace(raw); trimmed != "" {
		return trimmed, true
	}
	return "", false
}

// aiSettingsResponse is the subset of the GET /api/settings/ai response body
// that the runner decodes: only the "patrol_model" field.
type aiSettingsResponse struct {
	PatrolModel string `json:"patrol_model"`
}

// aiSettingsUpdateRequest is the JSON body sent to
// PUT /api/settings/ai/update. PatrolModel is a pointer so that a nil value
// is omitted from the payload while a pointer to an empty string is still
// sent.
type aiSettingsUpdateRequest struct {
	PatrolModel *string `json:"patrol_model,omitempty"`
}

// applyPatrolModelOverride switches the server's patrol model to the
// override returned by patrolModelOverride and returns a function that
// restores the previous model.
//
// The returned function is nil (with a nil error) when nothing was changed:
// the runner is nil, no override is configured, or the server already uses
// the requested model. Callers must therefore nil-check it before calling.
// The restore call uses a background context and ignores its own error.
func (r *Runner) applyPatrolModelOverride(ctx context.Context) (func(), error) {
	if r == nil {
		return nil, nil
	}

	wanted := r.patrolModelOverride()
	if wanted == "" {
		return nil, nil
	}

	settings, err := r.getAISettings(ctx)
	if err != nil {
		return nil, err
	}

	previous := settings.PatrolModel
	if previous == wanted {
		return nil, nil
	}

	err = r.updateAISettings(ctx, aiSettingsUpdateRequest{PatrolModel: &wanted})
	if err != nil {
		return nil, err
	}

	return func() {
		// Best effort: there is nobody to report a failed restore to.
		_ = r.updateAISettings(context.Background(), aiSettingsUpdateRequest{PatrolModel: &previous})
	}, nil
}

// patrolModelOverride returns the normalised patrol model to apply:
// EVAL_PATROL_MODEL when it is set to a non-blank value, otherwise the
// runner's configured Model. The result is empty when neither is set.
func (r *Runner) patrolModelOverride() string {
	candidate, fromEnv := envString("EVAL_PATROL_MODEL")
	if !fromEnv {
		candidate = strings.TrimSpace(r.config.Model)
	}
	return normalizeModelString(candidate)
}

// normalizeModelString trims model and, when config.ParseModelString yields
// both a provider and a name for it, rewrites it as "provider:name".
// A blank input yields "", and a model that does not split into both parts
// is returned trimmed but otherwise unchanged.
func normalizeModelString(model string) string {
	trimmed := strings.TrimSpace(model)
	if trimmed == "" {
		return ""
	}

	if provider, name := config.ParseModelString(trimmed); provider != "" && name != "" {
		return provider + ":" + name
	}
	return trimmed
}

// getAISettings fetches the AI settings from GET /api/settings/ai using
// basic auth and decodes them. It returns an error for a nil runner, a
// transport failure, a non-200 status (including the trimmed response body
// in the message) or an undecodable body.
func (r *Runner) getAISettings(ctx context.Context) (*aiSettingsResponse, error) {
	if r == nil {
		return nil, fmt.Errorf("nil runner")
	}

	endpoint := r.config.BaseURL + "/api/settings/ai"
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, err
	}
	req.SetBasicAuth(r.config.Username, r.config.Password)

	resp, err := r.client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		settings := new(aiSettingsResponse)
		if decodeErr := json.NewDecoder(resp.Body).Decode(settings); decodeErr != nil {
			return nil, decodeErr
		}
		return settings, nil
	}

	// The body is only used to enrich the error message.
	body, _ := io.ReadAll(resp.Body)
	return nil, fmt.Errorf("get AI settings failed (%d): %s", resp.StatusCode, strings.TrimSpace(string(body)))
}

// updateAISettings sends update as JSON to PUT /api/settings/ai/update using
// basic auth. It returns an error for a nil runner, an encoding or transport
// failure, or a non-200 status (including the trimmed response body in the
// message).
func (r *Runner) updateAISettings(ctx context.Context, update aiSettingsUpdateRequest) error {
	if r == nil {
		return fmt.Errorf("nil runner")
	}

	payload, err := json.Marshal(update)
	if err != nil {
		return err
	}

	endpoint := r.config.BaseURL + "/api/settings/ai/update"
	req, err := http.NewRequestWithContext(ctx, http.MethodPut, endpoint, bytes.NewReader(payload))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.SetBasicAuth(r.config.Username, r.config.Password)

	resp, err := r.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		return nil
	}

	// The body is only used to enrich the error message.
	respBody, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("update AI settings failed (%d): %s", resp.StatusCode, strings.TrimSpace(string(respBody)))
}

// handleApprovalDecision answers the approval identified by approvalID by
// POSTing to /api/ai/approvals/<id>/approve or .../deny.
//
// It is a no-op (nil error) for a nil runner, an empty approvalID, or any
// decision other than approve/deny. A JSON body {"reason": ...} is attached
// only for a deny decision with a non-empty reason; otherwise the request
// has no body. Any status other than 200 is returned as an error that
// includes the trimmed response body.
func (r *Runner) handleApprovalDecision(decision ApprovalDecision, approvalID, reason string) error {
	if r == nil || approvalID == "" {
		return nil
	}

	var action string
	switch decision {
	case ApprovalApprove:
		action = "approve"
	case ApprovalDeny:
		action = "deny"
	default:
		return nil
	}
	endpoint := r.config.BaseURL + "/api/ai/approvals/" + approvalID + "/" + action

	// body stays a nil interface unless a deny reason could be encoded.
	var body io.Reader
	if decision == ApprovalDeny && reason != "" {
		encoded, encodeErr := json.Marshal(map[string]string{"reason": reason})
		if encodeErr == nil {
			body = bytes.NewReader(encoded)
		}
	}

	req, err := http.NewRequest("POST", endpoint, body)
	if err != nil {
		return fmt.Errorf("failed to create approval request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.SetBasicAuth(r.config.Username, r.config.Password)

	resp, err := r.client.Do(req)
	if err != nil {
		return fmt.Errorf("approval %s request failed: %w", decision, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		return nil
	}

	respBody, _ := io.ReadAll(resp.Body)
	return fmt.Errorf("approval %s returned status %d: %s", decision, resp.StatusCode, strings.TrimSpace(string(respBody)))
}

// hasSuccessfulToolCallRetry reports whether at least one of toolCalls
// completed successfully. It is false for an empty or nil slice.
func hasSuccessfulToolCallRetry(toolCalls []ToolCallEvent) bool {
	for i := 0; i < len(toolCalls); i++ {
		if toolCalls[i].Success {
			return true
		}
	}
	return false
}

// hasRetryableToolError reports whether any failed tool call produced output
// that looks like a transient failure.
//
// Successful calls and failed calls with empty output are ignored. A failed
// call whose lower-cased output mentions a policy violation
// (read_only_violation, strict_resolution, routing_mismatch) is never
// considered retryable, even if it also contains a transient marker;
// otherwise any of the transient markers below makes it retryable.
func hasRetryableToolError(toolCalls []ToolCallEvent) bool {
	transientMarkers := []string{
		"timeout",
		"timed out",
		"context deadline exceeded",
		"connection refused",
		"connection reset",
		"network is unreachable",
		"no such host",
		"i/o timeout",
		"server error",
		"502",
		"503",
		"504",
		"eof",
		"dial tcp",
		"temporarily",
		"query is required",
	}

	policyMarkers := []string{
		"read_only_violation",
		"strict_resolution",
		"routing_mismatch",
	}

	// containsAny reports whether text contains at least one of markers.
	containsAny := func(text string, markers []string) bool {
		for _, marker := range markers {
			if strings.Contains(text, marker) {
				return true
			}
		}
		return false
	}

	for i := range toolCalls {
		call := &toolCalls[i]
		if call.Success {
			continue
		}
		output := strings.ToLower(call.Output)
		if output == "" || containsAny(output, policyMarkers) {
			continue
		}
		if containsAny(output, transientMarkers) {
			return true
		}
	}

	return false
}

// isRateLimitError reports whether message (compared case-insensitively)
// contains one of the substrings this runner treats as a rate-limit or quota
// signal: "rate limit", "rate-limit", "retry-after", "too many requests",
// "429", "quota" or "resource has been exhausted".
func isRateLimitError(message string) bool {
	lowered := strings.ToLower(message)

	switch {
	case strings.Contains(lowered, "rate limit"),
		strings.Contains(lowered, "rate-limit"),
		strings.Contains(lowered, "retry-after"),
		strings.Contains(lowered, "too many requests"),
		strings.Contains(lowered, "429"),
		strings.Contains(lowered, "quota"),
		strings.Contains(lowered, "resource has been exhausted"):
		return true
	}
	return false
}

// parseSSEStream reads a chat SSE response line by line and collects what
// the eval needs from it.
//
// Only lines starting with "data: " are considered; each payload is decoded
// as a JSON object with "type" and "data" fields, and lines that fail to
// decode are skipped. Recognised event types are "session", "done",
// "content", "tool_start", "tool_end", "approval_needed", "complete" and
// "error"; events of any other type are still recorded as raw events.
//
// When approvalDecision is not ApprovalNone, each distinct approval ID seen
// in an "approval_needed" event is answered once via handleApprovalDecision
// (approvalReason is forwarded to it).
//
// Results, in order: raw events, completed tool calls, approval requests,
// concatenated content text, session ID, model, input tokens, output tokens
// and an error. The error is non-nil when an approval decision could not be
// submitted, when the stream carries an "error" event with a non-blank
// message ("stream error: ..."), or when the scanner fails (for example on a
// line longer than 8 MiB). In those cases the data collected so far is
// returned alongside the error.
func (r *Runner) parseSSEStream(body io.Reader, approvalDecision ApprovalDecision, approvalReason string) ([]SSEEvent, []ToolCallEvent, []ApprovalEvent, string, string, string, int, int, error) {
	var events []SSEEvent
	var toolCalls []ToolCallEvent
	var approvals []ApprovalEvent
	var contentBuilder strings.Builder
	var sessionID string
	var model string
	var inputTokens int
	var outputTokens int
	// Approval IDs that have already been answered, so a repeated
	// "approval_needed" event does not trigger a second decision.
	handledApprovals := make(map[string]struct{})

	// Track tool calls in progress, keyed by tool call ID. A call is only
	// added to toolCalls once its "tool_end" event arrives; calls that never
	// end are not reported.
	toolCallsInProgress := make(map[string]*ToolCallEvent)

	scanner := bufio.NewScanner(body)
	// Allow large SSE events (tool results can be big).
	const maxSSEEventSize = 8 * 1024 * 1024
	scanner.Buffer(make([]byte, 0, 64*1024), maxSSEEventSize)
	for scanner.Scan() {
		line := scanner.Text()

		// Ignore everything that is not a data line (blank separators,
		// comments, other SSE fields).
		if !strings.HasPrefix(line, "data: ") {
			continue
		}

		data := strings.TrimPrefix(line, "data: ")
		if data == "" {
			continue
		}

		// Parse the event
		var event struct {
			Type string          `json:"type"`
			Data json.RawMessage `json:"data"`
		}
		if err := json.Unmarshal([]byte(data), &event); err != nil {
			// Not a JSON event envelope; skip the line.
			continue
		}

		// "complete" events may carry their fields at the top level instead
		// of under "data"; keep the whole payload as the raw data then.
		eventData := event.Data
		if event.Type == "complete" && len(event.Data) == 0 {
			eventData = json.RawMessage([]byte(data))
		}
		events = append(events, SSEEvent{
			Type: event.Type,
			Data: eventData,
		})

		switch event.Type {
		case "session":
			var sessionData struct {
				ID string `json:"id"`
			}
			if err := json.Unmarshal(event.Data, &sessionData); err == nil {
				sessionID = sessionData.ID
			}
		case "done":
			var doneData struct {
				SessionID    string `json:"session_id"`
				InputTokens  int    `json:"input_tokens"`
				OutputTokens int    `json:"output_tokens"`
			}
			if err := json.Unmarshal(event.Data, &doneData); err == nil {
				if doneData.SessionID != "" {
					sessionID = doneData.SessionID
				}
				// Only take the token counts when at least one is reported,
				// so an empty "done" does not reset earlier values.
				if doneData.InputTokens > 0 || doneData.OutputTokens > 0 {
					inputTokens = doneData.InputTokens
					outputTokens = doneData.OutputTokens
				}
			}

		case "content":
			var contentData struct {
				Text string `json:"text"`
			}
			if err := json.Unmarshal(event.Data, &contentData); err == nil {
				contentBuilder.WriteString(contentData.Text)
			}

		case "tool_start":
			var toolData struct {
				ID    string `json:"id"`
				Name  string `json:"name"`
				Input string `json:"input"`
			}
			if err := json.Unmarshal(event.Data, &toolData); err == nil {
				toolCallsInProgress[toolData.ID] = &ToolCallEvent{
					ID:    toolData.ID,
					Name:  toolData.Name,
					Input: toolData.Input,
				}
			}

		case "tool_end":
			var toolData struct {
				ID      string `json:"id"`
				Name    string `json:"name"`
				Input   string `json:"input"`
				Output  string `json:"output"`
				Success bool   `json:"success"`
			}
			if err := json.Unmarshal(event.Data, &toolData); err == nil {
				if tc, ok := toolCallsInProgress[toolData.ID]; ok {
					// Prefer the input reported at the end when present;
					// the name from "tool_start" is kept.
					if toolData.Input != "" {
						tc.Input = toolData.Input
					}
					tc.Output = toolData.Output
					tc.Success = toolData.Success
					toolCalls = append(toolCalls, *tc)
					delete(toolCallsInProgress, toolData.ID)
				} else {
					// Tool end without start
					toolCalls = append(toolCalls, ToolCallEvent{
						ID:      toolData.ID,
						Name:    toolData.Name,
						Input:   toolData.Input,
						Output:  toolData.Output,
						Success: toolData.Success,
					})
				}
			}

		case "approval_needed":
			var approvalData struct {
				ApprovalID  string `json:"approval_id"`
				ToolID      string `json:"tool_id"`
				ToolName    string `json:"tool_name"`
				Command     string `json:"command"`
				Risk        string `json:"risk"`
				Description string `json:"description"`
			}
			if err := json.Unmarshal(event.Data, &approvalData); err == nil {
				approvals = append(approvals, ApprovalEvent{
					ApprovalID:  approvalData.ApprovalID,
					ToolID:      approvalData.ToolID,
					ToolName:    approvalData.ToolName,
					Command:     approvalData.Command,
					Risk:        approvalData.Risk,
					Description: approvalData.Description,
				})
				// Answer the approval while the stream is still open; a
				// failure to do so aborts parsing with the partial results.
				if approvalDecision != ApprovalNone && approvalData.ApprovalID != "" {
					if _, ok := handledApprovals[approvalData.ApprovalID]; !ok {
						handledApprovals[approvalData.ApprovalID] = struct{}{}
						if err := r.handleApprovalDecision(approvalDecision, approvalData.ApprovalID, approvalReason); err != nil {
							return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, err
						}
					}
				}
			}

		case "complete":
			var completeData struct {
				Model        string `json:"model"`
				InputTokens  int    `json:"input_tokens"`
				OutputTokens int    `json:"output_tokens"`
			}
			// Decoded from the whole payload (not event.Data): the fields
			// are read from the top level of the event. Unlike "done", the
			// token counts are overwritten unconditionally.
			if err := json.Unmarshal([]byte(data), &completeData); err == nil {
				if completeData.Model != "" {
					model = completeData.Model
				}
				inputTokens = completeData.InputTokens
				outputTokens = completeData.OutputTokens
			}

		case "error":
			// The error payload is accepted either as an object with a
			// "message" field or as a bare JSON string; an error event with
			// neither (or a blank message) is ignored.
			var errorData struct {
				Message string `json:"message"`
			}
			if err := json.Unmarshal(event.Data, &errorData); err == nil && strings.TrimSpace(errorData.Message) != "" {
				return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, fmt.Errorf("stream error: %s", errorData.Message)
			}
			var rawMsg string
			if err := json.Unmarshal(event.Data, &rawMsg); err == nil && strings.TrimSpace(rawMsg) != "" {
				return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, fmt.Errorf("stream error: %s", rawMsg)
			}
		}
	}

	// Surface read failures, including lines that exceed maxSSEEventSize.
	if err := scanner.Err(); err != nil {
		return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, err
	}

	return events, toolCalls, approvals, contentBuilder.String(), sessionID, model, inputTokens, outputTokens, nil
}

// printStepResult writes a human-readable summary of one step to stdout:
// duration, session, retries (with reasons), approval count, error, tool
// calls, the assistant's reply and assertion outcomes. Tool inputs are cut
// to 80 bytes, tool outputs to 200 and the reply to 500. Tool output is
// shown for failed calls, and for every call when Verbose is set.
func (r *Runner) printStepResult(result *StepResult) {
	fmt.Printf("\n--- Result ---\n")
	fmt.Printf("Duration: %v\n", result.Duration)
	fmt.Printf("Session: %s\n", result.SessionID)

	if result.Retries > 0 {
		fmt.Printf("Retries: %d", result.Retries)
		if notes := result.RetryNotes; len(notes) > 0 {
			fmt.Printf(" (%s)", strings.Join(notes, ", "))
		}
		fmt.Printf("\n")
	}

	if approvalCount := len(result.Approvals); approvalCount > 0 {
		fmt.Printf("Approvals: %d\n", approvalCount)
	}

	if result.Error != nil {
		fmt.Printf("ERROR: %v\n", result.Error)
	}

	if len(result.ToolCalls) > 0 {
		showEveryOutput := r.config.Verbose
		fmt.Printf("\nTool Calls:\n")
		for i := range result.ToolCalls {
			call := &result.ToolCalls[i]
			status := "FAILED"
			if call.Success {
				status = "OK"
			}
			fmt.Printf("  - %s [%s]: %s\n", call.Name, status, truncate(call.Input, 80))
			if showEveryOutput || !call.Success {
				fmt.Printf("    Output: %s\n", truncate(call.Output, 200))
			}
		}
	}

	if result.Content != "" {
		fmt.Printf("\nAssistant Response:\n%s\n", truncate(result.Content, 500))
	}

	if len(result.Assertions) > 0 {
		fmt.Printf("\nAssertions:\n")
		for i := range result.Assertions {
			assertion := &result.Assertions[i]
			status := "FAIL"
			if assertion.Passed {
				status = "PASS"
			}
			fmt.Printf("  [%s] %s: %s\n", status, assertion.Name, assertion.Message)
		}
	}
}

// truncate shortens s to at most max bytes and appends "..." when anything
// was cut off. Strings that already fit are returned unchanged.
//
// The cut is moved back to the nearest rune boundary at or before max, so a
// multi-byte UTF-8 character is never split in half. A negative max is
// treated as zero instead of panicking.
func truncate(s string, max int) string {
	if len(s) <= max {
		return s
	}
	if max < 0 {
		max = 0
	}

	// Ranging over a string yields the byte offset at which each rune
	// starts; keep the last such offset that does not exceed max.
	cut := 0
	for i := range s {
		if i > max {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}

// PrintSummary prints a summary of the scenario result to stdout: name,
// duration, how many steps passed, retry counts per step (with reasons),
// the report path when one was written, and the overall verdict. For a
// failed scenario it also lists each failed step with its error and failed
// assertions.
func (r *Runner) PrintSummary(result ScenarioResult) {
	const rule = "========================================\n"

	fmt.Print("\n")
	fmt.Print(rule)
	fmt.Printf("SCENARIO: %s\n", result.ScenarioName)
	fmt.Print(rule)
	fmt.Printf("Duration: %v\n", result.Duration)

	var passedSteps, totalRetries int
	for i := range result.Steps {
		if result.Steps[i].Success {
			passedSteps++
		}
		totalRetries += result.Steps[i].Retries
	}
	fmt.Printf("Steps: %d/%d passed\n", passedSteps, len(result.Steps))

	if totalRetries > 0 {
		fmt.Printf("Retries: %d\n", totalRetries)
		for i := range result.Steps {
			step := &result.Steps[i]
			if step.Retries <= 0 {
				continue
			}
			var note string
			if len(step.RetryNotes) > 0 {
				note = fmt.Sprintf(" (%s)", strings.Join(step.RetryNotes, ", "))
			}
			fmt.Printf("  - %s: %d%s\n", step.StepName, step.Retries, note)
		}
	}

	if result.ReportPath != "" {
		fmt.Printf("Report: %s\n", result.ReportPath)
	}

	if result.Passed {
		fmt.Printf("Result: PASSED\n")
		fmt.Print(rule)
		return
	}

	fmt.Printf("Result: FAILED\n")
	fmt.Printf("\nFailures:\n")
	for i := range result.Steps {
		step := &result.Steps[i]
		if step.Success {
			continue
		}
		fmt.Printf("  - %s\n", step.StepName)
		if step.Error != nil {
			fmt.Printf("    Error: %v\n", step.Error)
		}
		for j := range step.Assertions {
			if failed := &step.Assertions[j]; !failed.Passed {
				fmt.Printf("    Assertion '%s': %s\n", failed.Name, failed.Message)
			}
		}
	}
	fmt.Print(rule)
}