mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 19:41:17 +00:00
706 lines
21 KiB
Go
706 lines
21 KiB
Go
package eval
|
|
|
|
import (
	"fmt"
	"strings"
	"time"
)
|
|
|
|
// === Common Assertions ===
|
|
|
|
// AssertToolUsed checks that a specific tool was called
|
|
func AssertToolUsed(toolName string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if tc.Name == toolName {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_used:%s", toolName),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool '%s' was called", toolName),
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_used:%s", toolName),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool '%s' was NOT called. Tools used: %v", toolName, getToolNames(result.ToolCalls)),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertToolNotUsed checks that a specific tool was NOT called
|
|
func AssertToolNotUsed(toolName string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if tc.Name == toolName {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_not_used:%s", toolName),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool '%s' was called", toolName),
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_not_used:%s", toolName),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool '%s' was not called", toolName),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertAnyToolUsed checks that at least one tool was called
|
|
func AssertAnyToolUsed() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if len(result.ToolCalls) > 0 {
|
|
return AssertionResult{
|
|
Name: "any_tool_used",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("%d tool(s) called: %v", len(result.ToolCalls), getToolNames(result.ToolCalls)),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "any_tool_used",
|
|
Passed: false,
|
|
Message: "No tools were called",
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertNoToolErrors checks that all tool calls succeeded
|
|
func AssertNoToolErrors() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
var failures []string
|
|
for _, tc := range result.ToolCalls {
|
|
if !tc.Success {
|
|
failures = append(failures, fmt.Sprintf("%s: %s", tc.Name, truncate(tc.Output, 100)))
|
|
}
|
|
}
|
|
if len(failures) == 0 {
|
|
return AssertionResult{
|
|
Name: "no_tool_errors",
|
|
Passed: true,
|
|
Message: "All tool calls succeeded",
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "no_tool_errors",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool failures: %v", failures),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertContentContains checks that the response contains a substring
|
|
func AssertContentContains(substring string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if strings.Contains(strings.ToLower(result.Content), strings.ToLower(substring)) {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("content_contains:%s", truncate(substring, 20)),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Content contains '%s'", substring),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("content_contains:%s", truncate(substring, 20)),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Content does NOT contain '%s'", substring),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertContentContainsAny checks that the response contains at least one substring.
|
|
func AssertContentContainsAny(substrings ...string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, substring := range substrings {
|
|
if strings.Contains(strings.ToLower(result.Content), strings.ToLower(substring)) {
|
|
return AssertionResult{
|
|
Name: "content_contains_any",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Content contains '%s'", substring),
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "content_contains_any",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Content does NOT contain any of: %v", substrings),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertContentNotContains checks that the response does NOT contain a substring
|
|
func AssertContentNotContains(substring string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if !strings.Contains(strings.ToLower(result.Content), strings.ToLower(substring)) {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("content_not_contains:%s", truncate(substring, 20)),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Content does not contain '%s'", substring),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("content_not_contains:%s", truncate(substring, 20)),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Content SHOULD NOT contain '%s' but does", substring),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertNoPhantomDetection checks that phantom detection did not trigger
|
|
func AssertNoPhantomDetection() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
// The exact phantom detection message from agentic.go
|
|
phantomMessage := "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request"
|
|
if strings.Contains(result.Content, phantomMessage) {
|
|
if hasSuccessfulToolCall(result.ToolCalls) {
|
|
return AssertionResult{
|
|
Name: "no_phantom_detection",
|
|
Passed: true,
|
|
Message: "Phantom detection phrase present, but tool calls succeeded",
|
|
}
|
|
}
|
|
// Find where in the content it appears
|
|
idx := strings.Index(result.Content, phantomMessage)
|
|
context := result.Content[max(0, idx-50):min(len(result.Content), idx+100)]
|
|
return AssertionResult{
|
|
Name: "no_phantom_detection",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Phantom detection triggered, found at: ...%s...", context),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "no_phantom_detection",
|
|
Passed: true,
|
|
Message: "No phantom detection",
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertToolOutputContains checks that a specific tool's output contains a substring
|
|
func AssertToolOutputContains(toolName, substring string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if tc.Name == toolName {
|
|
if strings.Contains(strings.ToLower(tc.Output), strings.ToLower(substring)) {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_output:%s_contains:%s", toolName, truncate(substring, 20)),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool '%s' output contains '%s'", toolName, substring),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_output:%s_contains:%s", toolName, truncate(substring, 20)),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool '%s' output does NOT contain '%s'", toolName, substring),
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_output:%s_contains:%s", toolName, truncate(substring, 20)),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool '%s' was not called", toolName),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertNoError checks that no execution error occurred
|
|
func AssertNoError() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if result.Error == nil {
|
|
return AssertionResult{
|
|
Name: "no_error",
|
|
Passed: true,
|
|
Message: "No execution error",
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "no_error",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Execution error: %v", result.Error),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertDurationUnder checks that the step completed within a time limit
|
|
func AssertDurationUnder(maxDuration string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
// Parse duration - simplified, just handle seconds for now
|
|
var maxSec float64
|
|
fmt.Sscanf(maxDuration, "%fs", &maxSec)
|
|
if maxSec == 0 {
|
|
fmt.Sscanf(maxDuration, "%f", &maxSec)
|
|
}
|
|
|
|
actualSec := result.Duration.Seconds()
|
|
if actualSec <= maxSec {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("duration_under:%s", maxDuration),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Completed in %.1fs (max: %.1fs)", actualSec, maxSec),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("duration_under:%s", maxDuration),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Took %.1fs which exceeds max of %.1fs", actualSec, maxSec),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertToolNotBlocked checks that no tools were blocked
|
|
func AssertToolNotBlocked() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if strings.Contains(tc.Output, `"blocked":true`) ||
|
|
strings.Contains(tc.Output, "ROUTING_MISMATCH") ||
|
|
strings.Contains(tc.Output, "FSM_BLOCKED") ||
|
|
strings.Contains(tc.Output, "READ_ONLY_VIOLATION") ||
|
|
strings.Contains(tc.Output, "STRICT_RESOLUTION") {
|
|
return AssertionResult{
|
|
Name: "tool_not_blocked",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool '%s' was blocked: %s", tc.Name, truncate(tc.Output, 100)),
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "tool_not_blocked",
|
|
Passed: true,
|
|
Message: "No tools were blocked",
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertEventualSuccess checks that at least one tool succeeded (allows intermediate failures)
|
|
// This is useful for complex workflows where some tools may be blocked but the model recovers.
|
|
func AssertEventualSuccess() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
successCount := 0
|
|
for _, tc := range result.ToolCalls {
|
|
if tc.Success {
|
|
successCount++
|
|
}
|
|
}
|
|
if successCount > 0 {
|
|
return AssertionResult{
|
|
Name: "eventual_success",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("%d/%d tool calls succeeded", successCount, len(result.ToolCalls)),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "eventual_success",
|
|
Passed: false,
|
|
Message: "No tool calls succeeded",
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertEventualSuccessOrApproval checks that a tool succeeded or an approval was requested.
|
|
func AssertEventualSuccessOrApproval() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if tc.Success {
|
|
return AssertionResult{
|
|
Name: "eventual_success_or_approval",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool '%s' succeeded", tc.Name),
|
|
}
|
|
}
|
|
}
|
|
if len(result.Approvals) > 0 {
|
|
return AssertionResult{
|
|
Name: "eventual_success_or_approval",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Approval requests: %d", len(result.Approvals)),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "eventual_success_or_approval",
|
|
Passed: false,
|
|
Message: "No tool calls succeeded and no approvals were requested",
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertMinToolCalls checks that at least N tools were called
|
|
func AssertMinToolCalls(min int) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if len(result.ToolCalls) >= min {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("min_tool_calls:%d", min),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("%d tool calls made (min: %d)", len(result.ToolCalls), min),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("min_tool_calls:%d", min),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Only %d tool calls made (expected at least %d)", len(result.ToolCalls), min),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertMaxInputTokens checks that total input tokens stayed under a ceiling.
|
|
// This catches regressions where the model loops and burns excessive tokens.
|
|
func AssertMaxInputTokens(max int) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if result.InputTokens <= max {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("max_input_tokens:%d", max),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("%d input tokens used (max: %d)", result.InputTokens, max),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("max_input_tokens:%d", max),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("%d input tokens used (expected at most %d)", result.InputTokens, max),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertMaxToolCalls checks that the assistant made at most N tool calls.
|
|
// This catches regressions where the model loops excessively on simple tasks.
|
|
func AssertMaxToolCalls(max int) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if len(result.ToolCalls) <= max {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("max_tool_calls:%d", max),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("%d tool calls made (max: %d)", len(result.ToolCalls), max),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("max_tool_calls:%d", max),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("%d tool calls made (expected at most %d). Tools: %v", len(result.ToolCalls), max, getToolNames(result.ToolCalls)),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertHasContent checks that the assistant produced a non-empty response
|
|
func AssertHasContent() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
content := strings.TrimSpace(result.Content)
|
|
if len(content) > 50 {
|
|
return AssertionResult{
|
|
Name: "has_content",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Response has %d characters", len(content)),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "has_content",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Response too short or empty (%d chars)", len(content)),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertModelRecovered checks that if any tools were blocked, the model eventually succeeded
|
|
// with at least one tool call (indicating recovery from the block)
|
|
func AssertModelRecovered() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
blockedCount := 0
|
|
successAfterBlock := false
|
|
sawBlock := false
|
|
|
|
for _, tc := range result.ToolCalls {
|
|
if !tc.Success {
|
|
blockedCount++
|
|
sawBlock = true
|
|
} else if sawBlock {
|
|
successAfterBlock = true
|
|
}
|
|
}
|
|
|
|
if blockedCount == 0 {
|
|
return AssertionResult{
|
|
Name: "model_recovered",
|
|
Passed: true,
|
|
Message: "No blocks to recover from",
|
|
}
|
|
}
|
|
|
|
if successAfterBlock {
|
|
return AssertionResult{
|
|
Name: "model_recovered",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Model recovered from %d block(s)", blockedCount),
|
|
}
|
|
}
|
|
|
|
return AssertionResult{
|
|
Name: "model_recovered",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Model did not recover from %d block(s)", blockedCount),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertToolSequence checks that tool calls occurred in the given order.
|
|
// The sequence does not need to be contiguous, but order must be preserved.
|
|
func AssertToolSequence(sequence []string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if len(sequence) == 0 {
|
|
return AssertionResult{
|
|
Name: "tool_sequence",
|
|
Passed: true,
|
|
Message: "No sequence required",
|
|
}
|
|
}
|
|
|
|
seqIdx := 0
|
|
for _, tc := range result.ToolCalls {
|
|
if tc.Name == sequence[seqIdx] {
|
|
seqIdx++
|
|
if seqIdx == len(sequence) {
|
|
return AssertionResult{
|
|
Name: "tool_sequence",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool sequence matched: %v", sequence),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return AssertionResult{
|
|
Name: "tool_sequence",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool sequence not found. Expected: %v, got: %v", sequence, getToolNames(result.ToolCalls)),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertToolInputContains checks that a tool's input contains a substring.
|
|
func AssertToolInputContains(toolName, substring string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if toolName != "" && tc.Name != toolName {
|
|
continue
|
|
}
|
|
if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(substring)) {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool '%s' input contains '%s'", tc.Name, substring),
|
|
}
|
|
}
|
|
if toolName != "" {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool '%s' input does NOT contain '%s'", toolName, substring),
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Tool '%s' was not called", toolName),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertAnyToolInputContains checks that any tool input contains a substring.
|
|
// If toolName is empty, any tool input is considered.
|
|
func AssertAnyToolInputContains(toolName, substring string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if toolName != "" && tc.Name != toolName {
|
|
continue
|
|
}
|
|
if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(substring)) {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("any_tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool '%s' input contains '%s'", tc.Name, substring),
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("any_tool_input:%s_contains:%s", toolName, truncate(substring, 20)),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("No tool input matched '%s'", substring),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertAnyToolInputContainsAny checks that any tool input contains any of the substrings.
|
|
// If toolName is empty, any tool input is considered.
|
|
func AssertAnyToolInputContainsAny(toolName string, substrings ...string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if toolName != "" && tc.Name != toolName {
|
|
continue
|
|
}
|
|
for _, substring := range substrings {
|
|
if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(substring)) {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("any_tool_input:%s_contains_any", toolName),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool '%s' input contains '%s'", tc.Name, substring),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("any_tool_input:%s_contains_any", toolName),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("No tool input matched any of: %v", substrings),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertToolOutputContainsAny checks that a tool output contains any of the substrings.
|
|
// If toolName is empty, any tool output is considered.
|
|
func AssertToolOutputContainsAny(toolName string, substrings ...string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
for _, tc := range result.ToolCalls {
|
|
if toolName != "" && tc.Name != toolName {
|
|
continue
|
|
}
|
|
for _, substring := range substrings {
|
|
if strings.Contains(strings.ToLower(tc.Output), strings.ToLower(substring)) {
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_output:%s_contains_any", toolName),
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Tool '%s' output contains '%s'", tc.Name, substring),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: fmt.Sprintf("tool_output:%s_contains_any", toolName),
|
|
Passed: false,
|
|
Message: fmt.Sprintf("No tool output matched any of: %v", substrings),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertApprovalRequested checks that at least one approval request was emitted.
|
|
func AssertApprovalRequested() Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
if len(result.Approvals) > 0 {
|
|
return AssertionResult{
|
|
Name: "approval_requested",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Approval requests: %d", len(result.Approvals)),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "approval_requested",
|
|
Passed: false,
|
|
Message: "No approval requests were captured",
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertOnlyToolsUsed checks that all tool calls are in the allow list.
|
|
func AssertOnlyToolsUsed(allowed ...string) Assertion {
|
|
allowedSet := make(map[string]struct{}, len(allowed))
|
|
for _, tool := range allowed {
|
|
allowedSet[tool] = struct{}{}
|
|
}
|
|
return func(result *StepResult) AssertionResult {
|
|
var unexpected []string
|
|
for _, tc := range result.ToolCalls {
|
|
if _, ok := allowedSet[tc.Name]; !ok {
|
|
unexpected = append(unexpected, tc.Name)
|
|
}
|
|
}
|
|
if len(unexpected) == 0 {
|
|
return AssertionResult{
|
|
Name: "only_tools_used",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Only allowed tools used: %v", allowed),
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "only_tools_used",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Unexpected tools used: %v", unexpected),
|
|
}
|
|
}
|
|
}
|
|
|
|
// AssertRoutingMismatchRecovered verifies recovery if a routing mismatch occurs.
|
|
// If a routing mismatch is seen, the tool input must target the specific container.
|
|
// If no mismatch is seen, the tool input should still target the node.
|
|
func AssertRoutingMismatchRecovered(nodeName, containerName string) Assertion {
|
|
return func(result *StepResult) AssertionResult {
|
|
sawMismatch := false
|
|
for _, tc := range result.ToolCalls {
|
|
if strings.Contains(strings.ToLower(tc.Output), "routing_mismatch") {
|
|
sawMismatch = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if sawMismatch {
|
|
for _, tc := range result.ToolCalls {
|
|
if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(containerName)) {
|
|
return AssertionResult{
|
|
Name: "routing_mismatch_recovered",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("Routing mismatch recovered by targeting '%s'", containerName),
|
|
}
|
|
}
|
|
}
|
|
return AssertionResult{
|
|
Name: "routing_mismatch_recovered",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("Routing mismatch seen, but no tool input targeted '%s'", containerName),
|
|
}
|
|
}
|
|
|
|
for _, tc := range result.ToolCalls {
|
|
if strings.Contains(strings.ToLower(tc.Input), strings.ToLower(nodeName)) {
|
|
return AssertionResult{
|
|
Name: "routing_mismatch_recovered",
|
|
Passed: true,
|
|
Message: fmt.Sprintf("No routing mismatch; tool input targeted node '%s'", nodeName),
|
|
}
|
|
}
|
|
}
|
|
|
|
return AssertionResult{
|
|
Name: "routing_mismatch_recovered",
|
|
Passed: false,
|
|
Message: fmt.Sprintf("No routing mismatch and no tool input targeted '%s'", nodeName),
|
|
}
|
|
}
|
|
}
|
|
|
|
// === Helper functions ===
|
|
|
|
func getToolNames(toolCalls []ToolCallEvent) []string {
|
|
names := make([]string, len(toolCalls))
|
|
for i, tc := range toolCalls {
|
|
names[i] = tc.Name
|
|
}
|
|
return names
|
|
}
|
|
|
|
func hasSuccessfulToolCall(toolCalls []ToolCallEvent) bool {
|
|
for _, tc := range toolCalls {
|
|
if tc.Success {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// max returns the larger of a and b.
func max(a, b int) int {
	larger := b
	if a > larger {
		larger = a
	}
	return larger
}
|
|
|
|
// min returns the smaller of a and b.
func min(a, b int) int {
	smaller := b
	if a < smaller {
		smaller = a
	}
	return smaller
}
|