mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-29 03:50:18 +00:00
507 lines
12 KiB
Go
507 lines
12 KiB
Go
package eval
|
|
|
|
import (
|
|
"fmt"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
)
|
|
|
|
func TestAssertions(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
assertion Assertion
|
|
result StepResult
|
|
passed bool
|
|
}{
|
|
// AssertToolUsed
|
|
{
|
|
name: "AssertToolUsed Pass",
|
|
assertion: AssertToolUsed("test_tool"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "test_tool"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertToolUsed Fail",
|
|
assertion: AssertToolUsed("test_tool"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "other_tool"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertToolNotUsed
|
|
{
|
|
name: "AssertToolNotUsed Pass",
|
|
assertion: AssertToolNotUsed("test_tool"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "other_tool"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertToolNotUsed Fail",
|
|
assertion: AssertToolNotUsed("test_tool"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "test_tool"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertAnyToolUsed
|
|
{
|
|
name: "AssertAnyToolUsed Pass",
|
|
assertion: AssertAnyToolUsed(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertAnyToolUsed Fail",
|
|
assertion: AssertAnyToolUsed(),
|
|
result: StepResult{},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertNoToolErrors
|
|
{
|
|
name: "AssertNoToolErrors Pass",
|
|
assertion: AssertNoToolErrors(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool", Success: true}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertNoToolErrors Fail",
|
|
assertion: AssertNoToolErrors(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool", Success: false, Output: "err"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertContentContains
|
|
{
|
|
name: "AssertContentContains Pass",
|
|
assertion: AssertContentContains("hello"),
|
|
result: StepResult{
|
|
Content: "Hello world",
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertContentContains Fail",
|
|
assertion: AssertContentContains("hello"),
|
|
result: StepResult{
|
|
Content: "Hi world",
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertContentContainsAny
|
|
{
|
|
name: "AssertContentContainsAny Pass",
|
|
assertion: AssertContentContainsAny("hello", "hi"),
|
|
result: StepResult{
|
|
Content: "Hi world",
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertContentContainsAny Fail",
|
|
assertion: AssertContentContainsAny("hello", "hi"),
|
|
result: StepResult{
|
|
Content: "Greetings world",
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertContentNotContains
|
|
{
|
|
name: "AssertContentNotContains Pass",
|
|
assertion: AssertContentNotContains("error"),
|
|
result: StepResult{
|
|
Content: "All good",
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertContentNotContains Fail",
|
|
assertion: AssertContentNotContains("error"),
|
|
result: StepResult{
|
|
Content: "An error occurred",
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertNoPhantomDetection
|
|
{
|
|
name: "AssertNoPhantomDetection Pass (No phantom)",
|
|
assertion: AssertNoPhantomDetection(),
|
|
result: StepResult{
|
|
Content: "Normal response",
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertNoPhantomDetection Pass (Phantom but recovered)",
|
|
assertion: AssertNoPhantomDetection(),
|
|
result: StepResult{
|
|
Content: "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request... Here is the data.",
|
|
ToolCalls: []ToolCallEvent{{Name: "tool", Success: true}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertNoPhantomDetection Fail",
|
|
assertion: AssertNoPhantomDetection(),
|
|
result: StepResult{
|
|
Content: "I apologize, but I wasn't able to access the infrastructure tools needed to complete that request",
|
|
ToolCalls: []ToolCallEvent{{Name: "tool", Success: false}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertToolOutputContains
|
|
{
|
|
name: "AssertToolOutputContains Pass",
|
|
assertion: AssertToolOutputContains("tool1", "success"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool1", Output: "Operation success"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertToolOutputContains Fail (Content mismatch)",
|
|
assertion: AssertToolOutputContains("tool1", "success"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool1", Output: "Operation failed"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
{
|
|
name: "AssertToolOutputContains Fail (Tool not called)",
|
|
assertion: AssertToolOutputContains("tool1", "success"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool2", Output: "Operation success"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertNoError
|
|
{
|
|
name: "AssertNoError Pass",
|
|
assertion: AssertNoError(),
|
|
result: StepResult{Error: nil},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertNoError Fail",
|
|
assertion: AssertNoError(),
|
|
result: StepResult{Error: fmt.Errorf("fail")},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertDurationUnder
|
|
{
|
|
name: "AssertDurationUnder Pass",
|
|
assertion: AssertDurationUnder("1s"),
|
|
result: StepResult{Duration: 500 * time.Millisecond},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertDurationUnder Fail",
|
|
assertion: AssertDurationUnder("1s"),
|
|
result: StepResult{Duration: 2 * time.Second},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertToolNotBlocked
|
|
{
|
|
name: "AssertToolNotBlocked Pass",
|
|
assertion: AssertToolNotBlocked(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool", Output: "ok"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertToolNotBlocked Fail",
|
|
assertion: AssertToolNotBlocked(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool", Output: `{"blocked":true}`}},
|
|
},
|
|
passed: false,
|
|
},
|
|
{
|
|
name: "AssertToolNotBlocked Fail (Routing Mismatch)",
|
|
assertion: AssertToolNotBlocked(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "tool", Output: "ROUTING_MISMATCH"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertEventualSuccess
|
|
{
|
|
name: "AssertEventualSuccess Pass",
|
|
assertion: AssertEventualSuccess(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Success: false}, {Success: true}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertEventualSuccess Fail",
|
|
assertion: AssertEventualSuccess(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Success: false}, {Success: false}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertEventualSuccessOrApproval
|
|
{
|
|
name: "AssertEventualSuccessOrApproval Pass (Success)",
|
|
assertion: AssertEventualSuccessOrApproval(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Success: true}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertEventualSuccessOrApproval Pass (Approval)",
|
|
assertion: AssertEventualSuccessOrApproval(),
|
|
result: StepResult{
|
|
Approvals: []ApprovalEvent{{ApprovalID: "1"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertEventualSuccessOrApproval Fail",
|
|
assertion: AssertEventualSuccessOrApproval(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Success: false}},
|
|
Approvals: []ApprovalEvent{},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertMinToolCalls
|
|
{
|
|
name: "AssertMinToolCalls Pass",
|
|
assertion: AssertMinToolCalls(2),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{}, {}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertMinToolCalls Fail",
|
|
assertion: AssertMinToolCalls(2),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertMaxInputTokens
|
|
{
|
|
name: "AssertMaxInputTokens Pass",
|
|
assertion: AssertMaxInputTokens(100),
|
|
result: StepResult{InputTokens: 50},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertMaxInputTokens Fail",
|
|
assertion: AssertMaxInputTokens(100),
|
|
result: StepResult{InputTokens: 150},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertMaxToolCalls
|
|
{
|
|
name: "AssertMaxToolCalls Pass",
|
|
assertion: AssertMaxToolCalls(2),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{}, {}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertMaxToolCalls Fail",
|
|
assertion: AssertMaxToolCalls(2),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{}, {}, {}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertHasContent
|
|
{
|
|
name: "AssertHasContent Pass",
|
|
assertion: AssertHasContent(),
|
|
result: StepResult{Content: "This assumes content length check is > 50 chars... so let's make it long enough to pass the check defined in assertions.go"},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertHasContent Fail",
|
|
assertion: AssertHasContent(),
|
|
result: StepResult{Content: "Too short"},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertModelRecovered
|
|
{
|
|
name: "AssertModelRecovered Pass (No blocks)",
|
|
assertion: AssertModelRecovered(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Success: true}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertModelRecovered Pass (Blocked then success)",
|
|
assertion: AssertModelRecovered(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Success: false}, {Success: true}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertModelRecovered Fail",
|
|
assertion: AssertModelRecovered(),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Success: false}, {Success: false}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertToolSequence
|
|
{
|
|
name: "AssertToolSequence Pass",
|
|
assertion: AssertToolSequence([]string{"t1", "t2"}),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "t1"}, {Name: "other"}, {Name: "t2"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertToolSequence Fail",
|
|
assertion: AssertToolSequence([]string{"t1", "t2"}),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "t2"}, {Name: "t1"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertToolInputContains
|
|
{
|
|
name: "AssertToolInputContains Pass",
|
|
assertion: AssertToolInputContains("t1", "val"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "t1", Input: "some val here"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertToolInputContains Fail",
|
|
assertion: AssertToolInputContains("t1", "val"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "t1", Input: "nothing"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertAnyToolInputContains
|
|
{
|
|
name: "AssertAnyToolInputContains Pass",
|
|
assertion: AssertAnyToolInputContains("", "val"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "t1", Input: "val"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
|
|
// AssertAnyToolInputContainsAny
|
|
{
|
|
name: "AssertAnyToolInputContainsAny Pass",
|
|
assertion: AssertAnyToolInputContainsAny("", "val", "foo"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "t1", Input: "foo"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
|
|
// AssertToolOutputContainsAny
|
|
{
|
|
name: "AssertToolOutputContainsAny Pass",
|
|
assertion: AssertToolOutputContainsAny("t1", "success", "ok"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "t1", Output: "status: ok"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
|
|
// AssertApprovalRequested
|
|
{
|
|
name: "AssertApprovalRequested Pass",
|
|
assertion: AssertApprovalRequested(),
|
|
result: StepResult{Approvals: []ApprovalEvent{{}}},
|
|
passed: true,
|
|
},
|
|
|
|
// AssertOnlyToolsUsed
|
|
{
|
|
name: "AssertOnlyToolsUsed Pass",
|
|
assertion: AssertOnlyToolsUsed("a", "b"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "a"}, {Name: "b"}},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertOnlyToolsUsed Fail",
|
|
assertion: AssertOnlyToolsUsed("a"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{{Name: "b"}},
|
|
},
|
|
passed: false,
|
|
},
|
|
|
|
// AssertRoutingMismatchRecovered
|
|
{
|
|
name: "AssertRoutingMismatchRecovered Pass (Mismatch -> Recovery)",
|
|
assertion: AssertRoutingMismatchRecovered("node1", "cont1"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{
|
|
{Output: "routing_mismatch"},
|
|
{Input: "target cont1"},
|
|
},
|
|
},
|
|
passed: true,
|
|
},
|
|
{
|
|
name: "AssertRoutingMismatchRecovered Pass (No Mismatch -> Target Node)",
|
|
assertion: AssertRoutingMismatchRecovered("node1", "cont1"),
|
|
result: StepResult{
|
|
ToolCalls: []ToolCallEvent{
|
|
{Input: "target node1"},
|
|
},
|
|
},
|
|
passed: true,
|
|
},
|
|
}
|
|
|
|
for _, tc := range tests {
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
res := tc.assertion(&tc.result)
|
|
assert.Equal(t, tc.passed, res.Passed, "Message: %s", res.Message)
|
|
})
|
|
}
|
|
}
|