Pulse/internal/ai/eval/assertions.go

package eval

import (
	"fmt"
	"strings"
)

// === Common Assertions ===

// AssertToolUsed checks that a specific tool was called
func AssertToolUsed(toolName string) Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if tc.Name == toolName {
				return AssertionResult{
					Name:    fmt.Sprintf("tool_used:%s", toolName),
					Passed:  true,
					Message: fmt.Sprintf("Tool '%s' was called", toolName),
				}
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("tool_used:%s", toolName),
			Passed:  false,
			Message: fmt.Sprintf("Tool '%s' was NOT called. Tools used: %v", toolName, getToolNames(result.ToolCalls)),
		}
	}
}

// AssertToolNotUsed checks that a specific tool was NOT called
func AssertToolNotUsed(toolName string) Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if tc.Name == toolName {
				return AssertionResult{
					Name:    fmt.Sprintf("tool_not_used:%s", toolName),
					Passed:  false,
					Message: fmt.Sprintf("Tool '%s' was called", toolName),
				}
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("tool_not_used:%s", toolName),
			Passed:  true,
			Message: fmt.Sprintf("Tool '%s' was not called", toolName),
		}
	}
}

// AssertAnyToolUsed checks that at least one tool was called
func AssertAnyToolUsed() Assertion {
	return func(result *StepResult) AssertionResult {
		if len(result.ToolCalls) > 0 {
			return AssertionResult{
				Name:    "any_tool_used",
				Passed:  true,
				Message: fmt.Sprintf("%d tool(s) called: %v", len(result.ToolCalls), getToolNames(result.ToolCalls)),
			}
		}
		return AssertionResult{
			Name:    "any_tool_used",
			Passed:  false,
			Message: "No tools were called",
		}
	}
}

// AssertNoToolErrors checks that all tool calls succeeded
func AssertNoToolErrors() Assertion {
	return func(result *StepResult) AssertionResult {
		var failures []string
		for _, tc := range result.ToolCalls {
			if !tc.Success {
				failures = append(failures, fmt.Sprintf("%s: %s", tc.Name, truncate(tc.Output, 100)))
			}
		}
		if len(failures) == 0 {
			return AssertionResult{
				Name:    "no_tool_errors",
				Passed:  true,
				Message: "All tool calls succeeded",
			}
		}
		return AssertionResult{
			Name:    "no_tool_errors",
			Passed:  false,
			Message: fmt.Sprintf("Tool failures: %v", failures),
		}
	}
}

// AssertContentContains checks that the response contains a substring
func AssertContentContains(substring string) Assertion {
	return func(result *StepResult) AssertionResult {
		if strings.Contains(strings.ToLower(result.Content), strings.ToLower(substring)) {
			return AssertionResult{
				Name:    fmt.Sprintf("content_contains:%s", truncate(substring, 20)),
				Passed:  true,
				Message: fmt.Sprintf("Content contains '%s'", substring),
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("content_contains:%s", truncate(substring, 20)),
			Passed:  false,
			Message: fmt.Sprintf("Content does NOT contain '%s'", substring),
		}
	}
}

// AssertContentContainsAny checks that the response contains at least one substring.
func AssertContentContainsAny(substrings ...string) Assertion {
	return func(result *StepResult) AssertionResult {
		for _, substring := range substrings {
			if strings.Contains(strings.ToLower(result.Content), strings.ToLower(substring)) {
				return AssertionResult{
					Name:    "content_contains_any",
					Passed:  true,
					Message: fmt.Sprintf("Content contains '%s'", substring),
				}
			}
		}
		return AssertionResult{
			Name:    "content_contains_any",
			Passed:  false,
			Message: fmt.Sprintf("Content does NOT contain any of: %v", substrings),
		}
	}
}

// AssertContentNotContains checks that the response does NOT contain a substring
func AssertContentNotContains(substring string) Assertion {
	return func(result *StepResult) AssertionResult {
		if !strings.Contains(strings.ToLower(result.Content), strings.ToLower(substring)) {
			return AssertionResult{
				Name:    fmt.Sprintf("content_not_contains:%s", truncate(substring, 20)),
				Passed:  true,
				Message: fmt.Sprintf("Content does not contain '%s'", substring),
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("content_not_contains:%s", truncate(substring, 20)),
			Passed:  false,
			Message: fmt.Sprintf("Content SHOULD NOT contain '%s' but does", substring),
		}
	}
}

// AssertNoPhantomDetection checks that phantom detection did not trigger
func AssertNoPhantomDetection() Assertion {
	return func(result *StepResult) AssertionResult {
		// The exact phantom detection message from agentic.go
		phantomMessage := "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request"
		if strings.Contains(result.Content, phantomMessage) {
			if hasSuccessfulToolCall(result.ToolCalls) {
				return AssertionResult{
					Name:    "no_phantom_detection",
					Passed:  true,
					Message: "Phantom detection phrase present, but tool calls succeeded",
				}
			}
			// Find where in the content it appears
			idx := strings.Index(result.Content, phantomMessage)
			context := result.Content[max(0, idx-50):min(len(result.Content), idx+100)]
			return AssertionResult{
				Name:    "no_phantom_detection",
				Passed:  false,
				Message: fmt.Sprintf("Phantom detection triggered, found at: ...%s...", context),
			}
		}
		return AssertionResult{
			Name:    "no_phantom_detection",
			Passed:  true,
			Message: "No phantom detection",
		}
	}
}

// AssertToolOutputContains checks that a specific tool's output contains a substring
func AssertToolOutputContains(toolName, substring string) Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if tc.Name == toolName {
				if strings.Contains(strings.ToLower(tc.Output), strings.ToLower(substring)) {
					return AssertionResult{
						Name:    fmt.Sprintf("tool_output:%s_contains:%s", toolName, truncate(substring, 20)),
						Passed:  true,
						Message: fmt.Sprintf("Tool '%s' output contains '%s'", toolName, substring),
					}
				}
				return AssertionResult{
					Name:    fmt.Sprintf("tool_output:%s_contains:%s", toolName, truncate(substring, 20)),
					Passed:  false,
					Message: fmt.Sprintf("Tool '%s' output does NOT contain '%s'", toolName, substring),
				}
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("tool_output:%s_contains:%s", toolName, truncate(substring, 20)),
			Passed:  false,
			Message: fmt.Sprintf("Tool '%s' was not called", toolName),
		}
	}
}

// AssertNoError checks that no execution error occurred
func AssertNoError() Assertion {
	return func(result *StepResult) AssertionResult {
		if result.Error == nil {
			return AssertionResult{
				Name:    "no_error",
				Passed:  true,
				Message: "No execution error",
			}
		}
		return AssertionResult{
			Name:    "no_error",
			Passed:  false,
			Message: fmt.Sprintf("Execution error: %v", result.Error),
		}
	}
}

// AssertDurationUnder checks that the step completed within a time limit
func AssertDurationUnder(maxDuration string) Assertion {
	return func(result *StepResult) AssertionResult {
		// Parse duration - simplified, just handle seconds for now
		var maxSec float64
		fmt.Sscanf(maxDuration, "%fs", &maxSec)
		if maxSec == 0 {
			fmt.Sscanf(maxDuration, "%f", &maxSec)
		}

		actualSec := result.Duration.Seconds()
		if actualSec <= maxSec {
			return AssertionResult{
				Name:    fmt.Sprintf("duration_under:%s", maxDuration),
				Passed:  true,
				Message: fmt.Sprintf("Completed in %.1fs (max: %.1fs)", actualSec, maxSec),
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("duration_under:%s", maxDuration),
			Passed:  false,
			Message: fmt.Sprintf("Took %.1fs which exceeds max of %.1fs", actualSec, maxSec),
		}
	}
}

// AssertToolNotBlocked checks that no tools were blocked
func AssertToolNotBlocked() Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if strings.Contains(tc.Output, `"blocked":true`) ||
				strings.Contains(tc.Output, "ROUTING_MISMATCH") ||
				strings.Contains(tc.Output, "FSM_BLOCKED") ||
				strings.Contains(tc.Output, "READ_ONLY_VIOLATION") ||
				strings.Contains(tc.Output, "STRICT_RESOLUTION") {
				return AssertionResult{
					Name:    "tool_not_blocked",
					Passed:  false,
					Message: fmt.Sprintf("Tool '%s' was blocked: %s", tc.Name, truncate(tc.Output, 100)),
				}
			}
		}
		return AssertionResult{
			Name:    "tool_not_blocked",
			Passed:  true,
			Message: "No tools were blocked",
		}
	}
}

// AssertEventualSuccess checks that at least one tool succeeded (allows intermediate failures)
// This is useful for complex workflows where some tools may be blocked but the model recovers.
func AssertEventualSuccess() Assertion {
	return func(result *StepResult) AssertionResult {
		successCount := 0
		for _, tc := range result.ToolCalls {
			if tc.Success {
				successCount++
			}
		}
		if successCount > 0 {
			return AssertionResult{
				Name:    "eventual_success",
				Passed:  true,
				Message: fmt.Sprintf("%d/%d tool calls succeeded", successCount, len(result.ToolCalls)),
			}
		}
		return AssertionResult{
			Name:    "eventual_success",
			Passed:  false,
			Message: "No tool calls succeeded",
		}
	}
}

// AssertEventualSuccessOrApproval checks that a tool succeeded or an approval was requested.
func AssertEventualSuccessOrApproval() Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if tc.Success {
				return AssertionResult{
					Name:    "eventual_success_or_approval",
					Passed:  true,
					Message: fmt.Sprintf("Tool '%s' succeeded", tc.Name),
				}
			}
		}
		if len(result.Approvals) > 0 {
			return AssertionResult{
				Name:    "eventual_success_or_approval",
				Passed:  true,
				Message: fmt.Sprintf("Approval requests: %d", len(result.Approvals)),
			}
		}
		return AssertionResult{
			Name:    "eventual_success_or_approval",
			Passed:  false,
			Message: "No tool calls succeeded and no approvals were requested",
		}
	}
}

// AssertMinToolCalls checks that at least N tools were called
func AssertMinToolCalls(min int) Assertion {
	return func(result *StepResult) AssertionResult {
		if len(result.ToolCalls) >= min {
			return AssertionResult{
				Name:    fmt.Sprintf("min_tool_calls:%d", min),
				Passed:  true,
				Message: fmt.Sprintf("%d tool calls made (min: %d)", len(result.ToolCalls), min),
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("min_tool_calls:%d", min),
			Passed:  false,
			Message: fmt.Sprintf("Only %d tool calls made (expected at least %d)", len(result.ToolCalls), min),
		}
	}
}

// AssertMaxInputTokens checks that total input tokens stayed under a ceiling.
// This catches regressions where the model loops and burns excessive tokens.
func AssertMaxInputTokens(max int) Assertion {
	return func(result *StepResult) AssertionResult {
		if result.InputTokens <= max {
			return AssertionResult{
				Name:    fmt.Sprintf("max_input_tokens:%d", max),
				Passed:  true,
				Message: fmt.Sprintf("%d input tokens used (max: %d)", result.InputTokens, max),
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("max_input_tokens:%d", max),
			Passed:  false,
			Message: fmt.Sprintf("%d input tokens used (expected at most %d)", result.InputTokens, max),
		}
	}
}

// AssertMaxToolCalls checks that the assistant made at most N tool calls.
// This catches regressions where the model loops excessively on simple tasks.
func AssertMaxToolCalls(max int) Assertion {
	return func(result *StepResult) AssertionResult {
		if len(result.ToolCalls) <= max {
			return AssertionResult{
				Name:    fmt.Sprintf("max_tool_calls:%d", max),
				Passed:  true,
				Message: fmt.Sprintf("%d tool calls made (max: %d)", len(result.ToolCalls), max),
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("max_tool_calls:%d", max),
			Passed:  false,
			Message: fmt.Sprintf("%d tool calls made (expected at most %d). Tools: %v", len(result.ToolCalls), max, getToolNames(result.ToolCalls)),
		}
	}
}

// AssertHasContent checks that the assistant produced a non-empty response
func AssertHasContent() Assertion {
	return func(result *StepResult) AssertionResult {
		content := strings.TrimSpace(result.Content)
		if len(content) > 50 {
			return AssertionResult{
				Name:    "has_content",
				Passed:  true,
				Message: fmt.Sprintf("Response has %d characters", len(content)),
			}
		}
		return AssertionResult{
			Name:    "has_content",
			Passed:  false,
			Message: fmt.Sprintf("Response too short or empty (%d chars)", len(content)),
		}
	}
}

// AssertModelRecovered checks that if any tools were blocked, the model eventually succeeded
// with at least one tool call (indicating recovery from the block)
func AssertModelRecovered() Assertion {
	return func(result *StepResult) AssertionResult {
		blockedCount := 0
		successAfterBlock := false
		sawBlock := false

		for _, tc := range result.ToolCalls {
			if !tc.Success {
				blockedCount++
				sawBlock = true
			} else if sawBlock {
				successAfterBlock = true
			}
		}

		if blockedCount == 0 {
			return AssertionResult{
				Name:    "model_recovered",
				Passed:  true,
				Message: "No blocks to recover from",
			}
		}

		if successAfterBlock {
			return AssertionResult{
				Name:    "model_recovered",
				Passed:  true,
				Message: fmt.Sprintf("Model recovered from %d block(s)", blockedCount),
			}
		}

		return AssertionResult{
			Name:    "model_recovered",
			Passed:  false,
			Message: fmt.Sprintf("Model did not recover from %d block(s)", blockedCount),
		}
	}
}

// AssertToolSequence checks that tool calls occurred in the given order.
// The sequence does not need to be contiguous, but order must be preserved.
func AssertToolSequence(sequence []string) Assertion {
	return func(result *StepResult) AssertionResult {
		if len(sequence) == 0 {
			return AssertionResult{
				Name:    "tool_sequence",
				Passed:  true,
				Message: "No sequence required",
			}
		}

		seqIdx := 0
		for _, tc := range result.ToolCalls {
			if tc.Name == sequence[seqIdx] {
				seqIdx++
				if seqIdx == len(sequence) {
					return AssertionResult{
						Name:    "tool_sequence",
						Passed:  true,
						Message: fmt.Sprintf("Tool sequence matched: %v", sequence),
					}
				}
			}
		}

		return AssertionResult{
			Name:    "tool_sequence",
			Passed:  false,
			Message: fmt.Sprintf("Tool sequence not found. Expected: %v, got: %v", sequence, getToolNames(result.ToolCalls)),
		}
	}
}

// AssertToolInputContains checks that a tool's input contains a substring.
func AssertToolInputContains(toolName, substring string) Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if toolName != "" && tc.Name != toolName {
				continue
			}
			if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(substring)) {
				return AssertionResult{
					Name:    fmt.Sprintf("tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
					Passed:  true,
					Message: fmt.Sprintf("Tool '%s' input contains '%s'", tc.Name, substring),
				}
			}
			if toolName != "" {
				return AssertionResult{
					Name:    fmt.Sprintf("tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
					Passed:  false,
					Message: fmt.Sprintf("Tool '%s' input does NOT contain '%s'", toolName, substring),
				}
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
			Passed:  false,
			Message: fmt.Sprintf("Tool '%s' was not called", toolName),
		}
	}
}

// AssertAnyToolInputContains checks that any tool input contains a substring.
// If toolName is empty, any tool input is considered.
func AssertAnyToolInputContains(toolName, substring string) Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if toolName != "" && tc.Name != toolName {
				continue
			}
			if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(substring)) {
				return AssertionResult{
					Name:    fmt.Sprintf("any_tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
					Passed:  true,
					Message: fmt.Sprintf("Tool '%s' input contains '%s'", tc.Name, substring),
				}
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("any_tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
			Passed:  false,
			Message: fmt.Sprintf("No tool input matched '%s'", substring),
		}
	}
}

// AssertAnyToolInputContainsAny checks that any tool input contains any of the substrings.
// If toolName is empty, any tool input is considered.
func AssertAnyToolInputContainsAny(toolName string, substrings ...string) Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if toolName != "" && tc.Name != toolName {
				continue
			}
			for _, substring := range substrings {
				if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(substring)) {
					return AssertionResult{
						Name:    fmt.Sprintf("any_tool_input:%s_contains_any", toolName),
						Passed:  true,
						Message: fmt.Sprintf("Tool '%s' input contains '%s'", tc.Name, substring),
					}
				}
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("any_tool_input:%s_contains_any", toolName),
			Passed:  false,
			Message: fmt.Sprintf("No tool input matched any of: %v", substrings),
		}
	}
}

// AssertToolOutputContainsAny checks that a tool output contains any of the substrings.
// If toolName is empty, any tool output is considered.
func AssertToolOutputContainsAny(toolName string, substrings ...string) Assertion {
	return func(result *StepResult) AssertionResult {
		for _, tc := range result.ToolCalls {
			if toolName != "" && tc.Name != toolName {
				continue
			}
			for _, substring := range substrings {
				if strings.Contains(strings.ToLower(tc.Output), strings.ToLower(substring)) {
					return AssertionResult{
						Name:    fmt.Sprintf("tool_output:%s_contains_any", toolName),
						Passed:  true,
						Message: fmt.Sprintf("Tool '%s' output contains '%s'", tc.Name, substring),
					}
				}
			}
		}
		return AssertionResult{
			Name:    fmt.Sprintf("tool_output:%s_contains_any", toolName),
			Passed:  false,
			Message: fmt.Sprintf("No tool output matched any of: %v", substrings),
		}
	}
}

// AssertApprovalRequested checks that at least one approval request was emitted.
func AssertApprovalRequested() Assertion {
	return func(result *StepResult) AssertionResult {
		if len(result.Approvals) > 0 {
			return AssertionResult{
				Name:    "approval_requested",
				Passed:  true,
				Message: fmt.Sprintf("Approval requests: %d", len(result.Approvals)),
			}
		}
		return AssertionResult{
			Name:    "approval_requested",
			Passed:  false,
			Message: "No approval requests were captured",
		}
	}
}

// AssertOnlyToolsUsed checks that all tool calls are in the allow list.
func AssertOnlyToolsUsed(allowed ...string) Assertion {
	allowedSet := make(map[string]struct{}, len(allowed))
	for _, tool := range allowed {
		allowedSet[tool] = struct{}{}
	}
	return func(result *StepResult) AssertionResult {
		var unexpected []string
		for _, tc := range result.ToolCalls {
			if _, ok := allowedSet[tc.Name]; !ok {
				unexpected = append(unexpected, tc.Name)
			}
		}
		if len(unexpected) == 0 {
			return AssertionResult{
				Name:    "only_tools_used",
				Passed:  true,
				Message: fmt.Sprintf("Only allowed tools used: %v", allowed),
			}
		}
		return AssertionResult{
			Name:    "only_tools_used",
			Passed:  false,
			Message: fmt.Sprintf("Unexpected tools used: %v", unexpected),
		}
	}
}

// AssertRoutingMismatchRecovered verifies recovery if a routing mismatch occurs.
// If a routing mismatch is seen, the tool input must target the specific container.
// If no mismatch is seen, the tool input should still target the node.
func AssertRoutingMismatchRecovered(nodeName, containerName string) Assertion {
	return func(result *StepResult) AssertionResult {
		sawMismatch := false
		for _, tc := range result.ToolCalls {
			if strings.Contains(strings.ToLower(tc.Output), "routing_mismatch") {
				sawMismatch = true
				break
			}
		}

		if sawMismatch {
			for _, tc := range result.ToolCalls {
				if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(containerName)) {
					return AssertionResult{
						Name:    "routing_mismatch_recovered",
						Passed:  true,
						Message: fmt.Sprintf("Routing mismatch recovered by targeting '%s'", containerName),
					}
				}
			}
			return AssertionResult{
				Name:    "routing_mismatch_recovered",
				Passed:  false,
				Message: fmt.Sprintf("Routing mismatch seen, but no tool input targeted '%s'", containerName),
			}
		}

		for _, tc := range result.ToolCalls {
			if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(nodeName)) {
				return AssertionResult{
					Name:    "routing_mismatch_recovered",
					Passed:  true,
					Message: fmt.Sprintf("No routing mismatch; tool input targeted node '%s'", nodeName),
				}
			}
		}

		return AssertionResult{
			Name:    "routing_mismatch_recovered",
			Passed:  false,
			Message: fmt.Sprintf("No routing mismatch and no tool input targeted '%s'", nodeName),
		}
	}
}

// === Helper functions ===

func getToolNames(toolCalls []ToolCallEvent) []string {
	names := make([]string, len(toolCalls))
	for i, tc := range toolCalls {
		names[i] = tc.Name
	}
	return names
}

func hasSuccessfulToolCall(toolCalls []ToolCallEvent) bool {
	for _, tc := range toolCalls {
		if tc.Success {
			return true
		}
	}
	return false
}

func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}

func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}