Pulse/internal/ai/eval/eval_test.go
rcourtman c409e7a05e feat(eval): add patrol-specific eval scenarios and assertions
Add comprehensive patrol evaluation framework:

- patrol.go: Runner for patrol scenarios with streaming support
- patrol_assertions.go: Assertions for tool usage, findings, timing
- patrol_scenarios.go: Scenarios for basic, investigation, finding quality
- eval_test.go: Unit tests for patrol eval runner

Scenarios:
- patrol-basic: Verifies patrol completes with tools and findings
- patrol-investigation: Ensures investigation before reporting
- patrol-finding-quality: Validates finding structure and evidence

Run with: go run ./cmd/eval -scenario patrol
2026-01-28 23:19:11 +00:00

214 lines
5.7 KiB
Go

package eval
import (
"flag"
"os"
"testing"
)
// runLiveEval is the -live command-line flag. It defaults to false; the tests
// in this file skip themselves unless it is set.
var runLiveEval = flag.Bool(
	"live",
	false,
	"Run live eval against Pulse API (requires running Pulse)",
)
// TestMain parses the command-line flags before running the tests, then exits
// the process with the status code reported by m.Run.
func TestMain(m *testing.M) {
	flag.Parse()
	exitCode := m.Run()
	os.Exit(exitCode)
}
// TestQuickSmokeTest runs the scenario returned by QuickSmokeTest through a
// runner built from DefaultConfig, prints the summary, and fails when the
// result is not marked as passed. It is skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestQuickSmokeTest -live
func TestQuickSmokeTest(t *testing.T) {
	liveEnabled := *runLiveEval
	if !liveEnabled {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	r := NewRunner(DefaultConfig())
	sc := QuickSmokeTest()

	outcome := r.RunScenario(sc)
	r.PrintSummary(outcome)
	if outcome.Passed {
		return
	}
	t.Fatalf("Scenario '%s' failed", sc.Name)
}
// TestReadOnlyInfrastructure runs the scenario returned by
// ReadOnlyInfrastructureScenario through a runner built from DefaultConfig,
// prints the summary, and fails when the result is not marked as passed. It is
// skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestReadOnlyInfrastructure -live
func TestReadOnlyInfrastructure(t *testing.T) {
	liveEnabled := *runLiveEval
	if !liveEnabled {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	r := NewRunner(DefaultConfig())
	sc := ReadOnlyInfrastructureScenario()

	outcome := r.RunScenario(sc)
	r.PrintSummary(outcome)
	if outcome.Passed {
		return
	}
	t.Fatalf("Scenario '%s' failed", sc.Name)
}
// TestRoutingValidation runs the scenario returned by
// RoutingValidationScenario through a runner built from DefaultConfig, prints
// the summary, and fails when the result is not marked as passed. It is
// skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestRoutingValidation -live
func TestRoutingValidation(t *testing.T) {
	liveEnabled := *runLiveEval
	if !liveEnabled {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	r := NewRunner(DefaultConfig())
	sc := RoutingValidationScenario()

	outcome := r.RunScenario(sc)
	r.PrintSummary(outcome)
	if outcome.Passed {
		return
	}
	t.Fatalf("Scenario '%s' failed", sc.Name)
}
// TestLogTailing runs the scenario returned by LogTailingScenario through a
// runner built from DefaultConfig, prints the summary, and fails when the
// result is not marked as passed. It is skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestLogTailing -live
func TestLogTailing(t *testing.T) {
	liveEnabled := *runLiveEval
	if !liveEnabled {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	r := NewRunner(DefaultConfig())
	sc := LogTailingScenario()

	outcome := r.RunScenario(sc)
	r.PrintSummary(outcome)
	if outcome.Passed {
		return
	}
	t.Fatalf("Scenario '%s' failed", sc.Name)
}
// TestDiscovery runs the scenario returned by DiscoveryScenario through a
// runner built from DefaultConfig, prints the summary, and fails when the
// result is not marked as passed. It is skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestDiscovery -live
func TestDiscovery(t *testing.T) {
	liveEnabled := *runLiveEval
	if !liveEnabled {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	r := NewRunner(DefaultConfig())
	sc := DiscoveryScenario()

	outcome := r.RunScenario(sc)
	r.PrintSummary(outcome)
	if outcome.Passed {
		return
	}
	t.Fatalf("Scenario '%s' failed", sc.Name)
}
// TestAllScenarios runs every scenario listed below as its own subtest (named
// after the scenario) using a single shared runner, printing a summary for
// each. A failing scenario fails its subtest; after all subtests have run the
// parent test reports a final failure if any of them did not succeed. It is
// skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestAllScenarios -live
func TestAllScenarios(t *testing.T) {
	if !*runLiveEval {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	runner := NewRunner(DefaultConfig())
	scenarios := []Scenario{
		QuickSmokeTest(),
		ReadOnlyInfrastructureScenario(),
		RoutingValidationScenario(),
		LogTailingScenario(),
		DiscoveryScenario(),
	}

	allPassed := true
	for _, scenario := range scenarios {
		// t.Run reports whether the subtest succeeded, so the closure does not
		// need to mutate state shared with the enclosing test.
		ok := t.Run(scenario.Name, func(t *testing.T) {
			result := runner.RunScenario(scenario)
			runner.PrintSummary(result)
			if !result.Passed {
				t.Errorf("Scenario '%s' failed", scenario.Name)
			}
		})
		allPassed = allPassed && ok
	}

	if !allPassed {
		t.Fatal("One or more scenarios failed")
	}
}
// TestPatrolBasic runs the scenario returned by PatrolBasicScenario through a
// runner built from DefaultConfig, prints the patrol summary, and fails when
// the result does not report success. It is skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestPatrolBasic -live
func TestPatrolBasic(t *testing.T) {
	if !*runLiveEval {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	runner := NewRunner(DefaultConfig())
	// Hold the scenario in a local so the failure message reports its own Name
	// rather than a hard-coded copy that can drift from the definition.
	scenario := PatrolBasicScenario()
	result := runner.RunPatrolScenario(scenario)
	runner.PrintPatrolSummary(result)
	if !result.Success {
		t.Fatalf("Patrol scenario '%s' failed", scenario.Name)
	}
}
// TestPatrolInvestigation runs the scenario returned by
// PatrolInvestigationScenario through a runner built from DefaultConfig,
// prints the patrol summary, and fails when the result does not report
// success. It is skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestPatrolInvestigation -live
func TestPatrolInvestigation(t *testing.T) {
	if !*runLiveEval {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	runner := NewRunner(DefaultConfig())
	// Hold the scenario in a local so the failure message reports its own Name
	// rather than a hard-coded copy that can drift from the definition.
	scenario := PatrolInvestigationScenario()
	result := runner.RunPatrolScenario(scenario)
	runner.PrintPatrolSummary(result)
	if !result.Success {
		t.Fatalf("Patrol scenario '%s' failed", scenario.Name)
	}
}
// TestPatrolFindingQuality runs the scenario returned by
// PatrolFindingQualityScenario through a runner built from DefaultConfig,
// prints the patrol summary, and fails when the result does not report
// success. It is skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestPatrolFindingQuality -live
func TestPatrolFindingQuality(t *testing.T) {
	if !*runLiveEval {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	runner := NewRunner(DefaultConfig())
	// Hold the scenario in a local so the failure message reports its own Name
	// rather than a hard-coded copy that can drift from the definition.
	scenario := PatrolFindingQualityScenario()
	result := runner.RunPatrolScenario(scenario)
	runner.PrintPatrolSummary(result)
	if !result.Success {
		t.Fatalf("Patrol scenario '%s' failed", scenario.Name)
	}
}
// TestAllPatrolScenarios runs every scenario returned by AllPatrolScenarios as
// its own subtest (named after the scenario) using a single shared runner,
// printing a patrol summary for each. A failing scenario fails its subtest;
// after all subtests have run the parent test reports a final failure if any
// of them did not succeed. It is skipped unless -live is set.
// Run with: go test -v ./internal/ai/eval -run TestAllPatrolScenarios -live
func TestAllPatrolScenarios(t *testing.T) {
	if !*runLiveEval {
		t.Skip("Skipping live eval test. Use -live flag to run against live Pulse API")
	}

	runner := NewRunner(DefaultConfig())

	allPassed := true
	for _, scenario := range AllPatrolScenarios() {
		// t.Run reports whether the subtest succeeded, so the closure does not
		// need to mutate state shared with the enclosing test.
		ok := t.Run(scenario.Name, func(t *testing.T) {
			result := runner.RunPatrolScenario(scenario)
			runner.PrintPatrolSummary(result)
			if !result.Success {
				t.Errorf("Patrol scenario '%s' failed", scenario.Name)
			}
		})
		allPassed = allPassed && ok
	}

	if !allPassed {
		t.Fatal("One or more patrol scenarios failed")
	}
}