docs: update AI evaluation matrix and approval workflow documentation

2026-07-09 16:00:59 +00:00 · 2026-01-30 19:00:10 +00:00 · 2026-01-30 19:00:10 +00:00 · 17208cbf9d
commit 17208cbf9d
parent 10df3e4d95
10 changed files with 774 additions and 67 deletions
--- a/.github/workflows/eval-model-matrix.yml
+++ b/.github/workflows/eval-model-matrix.yml
@ -0,0 +1,63 @@
+name: Pulse AI Model Matrix
+
+on:
+  workflow_dispatch:
+    inputs:
+      scenario:
+        description: Scenario or collection to run (e.g. matrix, smoke, readonly, advanced)
+        required: true
+        default: matrix
+      models:
+        description: Comma-separated model list (e.g. gpt-4.1-mini,claude-3-5-sonnet,gemini-1.5-pro,ollama:llama3.1)
+        required: false
+        default: ""
+      providers:
+        description: Optional provider filter (e.g. openai,anthropic,gemini,ollama)
+        required: false
+        default: ""
+      base_url:
+        description: Pulse API base URL (e.g. http://127.0.0.1:7655)
+        required: true
+
+jobs:
+  eval:
+    name: Model Matrix Eval
+    runs-on: self-hosted
+    timeout-minutes: 60
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+
+      - name: Run eval matrix
+        env:
+          EVAL_REPORT_DIR: tmp/eval-reports
+          PULSE_EVAL_USER: ${{ secrets.PULSE_EVAL_USER || 'admin' }}
+          PULSE_EVAL_PASS: ${{ secrets.PULSE_EVAL_PASS || 'admin' }}
+          # Workflow inputs are passed through the environment rather than
+          # expanded inline in the script: `${{ ... }}` is substituted before
+          # the shell parses the script, so a crafted input value could
+          # otherwise inject arbitrary commands on the self-hosted runner.
+          MATRIX_SCENARIO: ${{ inputs.scenario }}
+          MATRIX_MODELS: ${{ inputs.models }}
+          MATRIX_PROVIDERS: ${{ inputs.providers }}
+          MATRIX_BASE_URL: ${{ inputs.base_url }}
+        run: |
+          # Default to auto-selection; an explicit model list overrides it.
+          MODEL_ARGS=("-auto-models")
+          if [ -n "${MATRIX_MODELS}" ]; then
+            MODEL_ARGS=("-models" "${MATRIX_MODELS}")
+          fi
+          # Only export the provider filter when one was supplied.
+          if [ -n "${MATRIX_PROVIDERS}" ]; then
+            export EVAL_MODEL_PROVIDERS="${MATRIX_PROVIDERS}"
+          fi
+          go run ./cmd/eval \
+            -scenario "${MATRIX_SCENARIO}" \
+            "${MODEL_ARGS[@]}" \
+            -url "${MATRIX_BASE_URL}" \
+            -user "${PULSE_EVAL_USER}" \
+            -pass "${PULSE_EVAL_PASS}"
+
+      - name: Upload eval reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-reports
+          path: tmp/eval-reports
+          retention-days: 14
--- a/.gitignore
+++ b/.gitignore
@ -206,6 +206,6 @@ scripts/safe-checkout.sh
 BACKUP_SYSTEM.md

 # Generated artifacts
-eval
+/eval
 test_output.txt
 coverage_summary.txt
--- a/analyze_coverage.py
+++ b/analyze_coverage.py
@ -0,0 +1,56 @@
+
+import sys
+import os
+
+def parse_coverage(filename):
+    if not os.path.exists(filename):
+        print(f"File {filename} not found")
+        return
+
+    package_stmts = {}
+    package_covered = {}
+
+    with open(filename, 'r') as f:
+        lines = f.readlines()
+
+    current_mode = ""
+    for line in lines:
+        if line.startswith("mode:"):
+            current_mode = line.split()[1]
+            continue
+        
+        parts = line.strip().split(':')
+        if len(parts) != 2:
+            continue
+        
+        file_path = parts[0]
+        # Package is directory of file_path
+        package_name = os.path.dirname(file_path)
+        
+        metrics = parts[1].split()
+        if len(metrics) != 3:
+            continue
+            
+        # start_end = metrics[0]
+        num_stmts = int(metrics[1])
+        count = int(metrics[2])
+        
+        package_stmts[package_name] = package_stmts.get(package_name, 0) + num_stmts
+        if count > 0:
+            package_covered[package_name] = package_covered.get(package_name, 0) + num_stmts
+
+    results = []
+    for pkg, total in package_stmts.items():
+        covered = package_covered.get(pkg, 0)
+        percent = (covered / total) * 100 if total > 0 else 0
+        results.append((pkg, percent, covered, total))
+
+    # Sort by percentage (ascending)
+    results.sort(key=lambda x: x[1])
+
+    print("Package Coverage Report (Bottom 20):")
+    for pkg, pct, cov, tot in results[:20]:
+        print(f"{pct:6.2f}% ({cov}/{tot}) {pkg}")
+
+if __name__ == "__main__":
+    parse_coverage("coverage.out")
--- a/cmd/eval/main.go
+++ b/cmd/eval/main.go
@ -9,27 +9,39 @@
 //
 // Options:
 //
-//	-scenario string  Scenario to run: smoke, readonly, enforce, routing, routing-recovery, logs, readonly-recovery, search-id, disambiguate, context-target, discovery, writeverify, strict, strict-block, strict-recovery, readonly-guardrails, noninteractive, approval, approval-approve, approval-deny, patrol, patrol-basic, patrol-investigation, patrol-finding-quality, all (default "smoke")
+//	-scenario string  Scenario to run: smoke, readonly, enforce, routing, routing-recovery, logs, readonly-recovery, search-id, disambiguate, context-target, discovery, writeverify, guest-control, guest-idempotent, guest-discovery, guest-natural, guest-multi, readonly-filtering, read-loop-recovery, ambiguous-intent, strict, strict-block, strict-recovery, readonly-guardrails, noninteractive, approval, approval-approve, approval-deny, approval-combo, patrol, patrol-basic, patrol-investigation, patrol-finding-quality, patrol-signal-coverage, matrix, all (default "smoke")
 //	-url string       Pulse API base URL (default "http://127.0.0.1:7655")
 //	-user string      Username for auth (default "admin")
 //	-pass string      Password for auth (default "admin")
+//	-model string     Model override for chat requests
+//	-models string    Comma-separated list of models to run (overrides -model)
+//	-auto-models      Auto-select latest models per provider
 //	-list             List available scenarios and exit
 //	-quiet            Only show summary, not step-by-step output
 package main

 import (
+	"encoding/json"
 	"flag"
 	"fmt"
+	"io"
+	"net/http"
 	"os"
+	"sort"
+	"strings"
+	"time"

 	"github.com/rcourtman/pulse-go-rewrite/internal/ai/eval"
 )

 func main() {
-	scenario := flag.String("scenario", "smoke", "Scenario to run: smoke, readonly, enforce, routing, routing-recovery, logs, readonly-recovery, search-id, disambiguate, context-target, discovery, writeverify, strict, strict-block, strict-recovery, readonly-guardrails, noninteractive, approval, approval-approve, approval-deny, patrol, patrol-basic, patrol-investigation, patrol-finding-quality, all")
+	scenario := flag.String("scenario", "smoke", "Scenario to run: smoke, readonly, enforce, routing, routing-recovery, logs, readonly-recovery, search-id, disambiguate, context-target, discovery, writeverify, guest-control, guest-idempotent, guest-discovery, guest-natural, guest-multi, readonly-filtering, read-loop-recovery, ambiguous-intent, strict, strict-block, strict-recovery, readonly-guardrails, noninteractive, approval, approval-approve, approval-deny, approval-combo, patrol, patrol-basic, patrol-investigation, patrol-finding-quality, patrol-signal-coverage, matrix, all")
 	url := flag.String("url", "http://127.0.0.1:7655", "Pulse API base URL")
 	user := flag.String("user", "admin", "Username for auth")
 	pass := flag.String("pass", "admin", "Password for auth")
+	model := flag.String("model", "", "Model override for chat requests")
+	models := flag.String("models", "", "Comma-separated list of models to run (overrides -model)")
+	autoModels := flag.Bool("auto-models", false, "Auto-select latest models per provider")
 	list := flag.Bool("list", false, "List available scenarios and exit")
 	quiet := flag.Bool("quiet", false, "Only show summary, not step-by-step output")

@ -40,58 +52,113 @@ func main() {
 		return
 	}

-	config := eval.Config{
-		BaseURL:  *url,
-		Username: *user,
-		Password: *pass,
-		Verbose:  !*quiet,
+	baseConfig := eval.DefaultConfig()
+	baseConfig.BaseURL = *url
+	baseConfig.Username = *user
+	baseConfig.Password = *pass
+	baseConfig.Verbose = !*quiet
+
+	if value, ok := envBool("EVAL_PREFLIGHT"); ok {
+		baseConfig.Preflight = value
+	}
+	if value, ok := envInt("EVAL_PREFLIGHT_TIMEOUT"); ok && value > 0 {
+		baseConfig.PreflightTimeout = time.Duration(value) * time.Second
+	} else if baseConfig.PreflightTimeout == 0 {
+		baseConfig.PreflightTimeout = 15 * time.Second
 	}

-	runner := eval.NewRunner(config)
-
-	// Check for patrol scenarios first
-	patrolScenarios := getPatrolScenarios(*scenario)
-	if len(patrolScenarios) > 0 {
-		allPassed := true
-		for _, ps := range patrolScenarios {
-			fmt.Printf("\n>>> Running patrol scenario: %s\n", ps.Name)
-			fmt.Printf(">>> %s\n", ps.Description)
-
-			result := runner.RunPatrolScenario(ps)
-			runner.PrintPatrolSummary(result)
-
-			if !result.Success {
-				allPassed = false
-			}
-		}
-
-		if allPassed {
-			fmt.Printf("\n>>> ALL PATROL SCENARIOS PASSED\n")
-			os.Exit(0)
-		} else {
-			fmt.Printf("\n>>> SOME PATROL SCENARIOS FAILED\n")
+	modelList := parseModelList(*models)
+	if len(modelList) == 0 && *autoModels {
+		autoList, details, stats, err := fetchAutoModels(baseConfig.BaseURL, baseConfig.Username, baseConfig.Password)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, "Failed to auto-select models: %v\n", err)
 			os.Exit(1)
 		}
-		return
+		modelList = autoList
+		fmt.Printf(">>> Auto-selected models: %s\n", strings.Join(modelList, ", "))
+		if len(stats) > 0 {
+			fmt.Println(">>> Auto-selection provider summary:")
+			providers := sortedProviders(stats)
+			for _, provider := range providers {
+				stat := stats[provider]
+				fmt.Printf("  - %s: %d models (%d notable)\n", provider, stat.Total, stat.Notable)
+			}
+		}
+		if len(details) > 0 {
+			fmt.Println(">>> Auto-selection details:")
+			for _, detail := range details {
+				meta := detail.Reason
+				if meta == "" {
+					meta = "selected"
+				}
+				fmt.Printf("  - %s: %s (%s)\n", detail.Provider, detail.ID, meta)
+			}
+		}
+	}
+	if len(modelList) == 0 {
+		modelList = []string{strings.TrimSpace(*model)}
+	}
+	if len(modelList) == 0 {
+		modelList = []string{""}
 	}

-	// Standard chat scenarios
+	patrolScenarios := getPatrolScenarios(*scenario)
 	scenarios := getScenarios(*scenario)
-	if len(scenarios) == 0 {
+	if len(patrolScenarios) == 0 && len(scenarios) == 0 {
 		fmt.Fprintf(os.Stderr, "Unknown scenario: %s\n", *scenario)
 		fmt.Fprintf(os.Stderr, "Use -list to see available scenarios\n")
 		os.Exit(1)
 	}

 	allPassed := true
-	for _, s := range scenarios {
-		fmt.Printf("\n>>> Running scenario: %s\n", s.Name)
-		fmt.Printf(">>> %s\n", s.Description)
+	for _, modelID := range modelList {
+		config := baseConfig
+		config.Model = strings.TrimSpace(modelID)

-		result := runner.RunScenario(s)
-		runner.PrintSummary(result)
+		if config.Model != "" {
+			fmt.Printf("\n>>> Using model: %s\n", config.Model)
+		}

-		if !result.Passed {
+		if config.Preflight {
+			fmt.Printf(">>> Preflight enabled (timeout %s)\n", config.PreflightTimeout)
+		}
+
+		runner := eval.NewRunner(config)
+
+		if len(patrolScenarios) > 0 {
+			modelPassed := true
+			for _, ps := range patrolScenarios {
+				fmt.Printf("\n>>> Running patrol scenario: %s\n", ps.Name)
+				fmt.Printf(">>> %s\n", ps.Description)
+
+				result := runner.RunPatrolScenario(ps)
+				runner.PrintPatrolSummary(result)
+
+				if !result.Success {
+					modelPassed = false
+				}
+			}
+
+			if !modelPassed {
+				allPassed = false
+			}
+			continue
+		}
+
+		modelPassed := true
+		for _, s := range scenarios {
+			fmt.Printf("\n>>> Running scenario: %s\n", s.Name)
+			fmt.Printf(">>> %s\n", s.Description)
+
+			result := runner.RunScenario(s)
+			runner.PrintSummary(result)
+
+			if !result.Passed {
+				modelPassed = false
+			}
+		}
+
+		if !modelPassed {
 			allPassed = false
 		}
 	}
@ -121,6 +188,18 @@ func listScenarios() {
 	fmt.Println("    context-target - Context target carryover (2 steps)")
 	fmt.Println("    discovery    - Infrastructure discovery test (2 steps)")
 	fmt.Println()
+	fmt.Println("  Guest Control:")
+	fmt.Println("    guest-control    - Stop + start a guest via @mentions (2 steps)")
+	fmt.Println("    guest-idempotent - Idempotent stop (stop twice + start, 3 steps)")
+	fmt.Println("    guest-discovery  - Stop without @mentions (discovery path, 2 steps)")
+	fmt.Println("    guest-natural    - Natural language variations (turn off, shut down, 4 steps)")
+	fmt.Println("    guest-multi      - Multi-mention status query (2 resources, 1 step)")
+	fmt.Println()
+	fmt.Println("  Safety & Filtering:")
+	fmt.Println("    readonly-filtering  - Control tools excluded from read-only queries (3 steps)")
+	fmt.Println("    read-loop-recovery  - Model produces text after budget blocks (2 steps)")
+	fmt.Println("    ambiguous-intent    - Ambiguous requests default to read-only (3 steps)")
+	fmt.Println()
 	fmt.Println("  Advanced:")
 	fmt.Println("    troubleshoot - Multi-step troubleshooting workflow (4 steps)")
 	fmt.Println("    deepdive     - Deep investigation of a service (4 steps)")
@ -138,15 +217,18 @@ func listScenarios() {
 	fmt.Println("    approval    - Approval flow (1 step, opt-in)")
 	fmt.Println("    approval-approve - Approval approve flow (1 step, opt-in)")
 	fmt.Println("    approval-deny - Approval deny flow (1 step, opt-in)")
+	fmt.Println("    approval-combo - Approval approve + deny in one session (2 steps, opt-in)")
 	fmt.Println()
 	fmt.Println("  Patrol:")
 	fmt.Println("    patrol              - Run all patrol scenarios")
 	fmt.Println("    patrol-basic        - Basic patrol run (completion, tools, findings)")
 	fmt.Println("    patrol-investigation - Investigation quality (investigate before report)")
 	fmt.Println("    patrol-finding-quality - Finding validation (well-formed findings)")
+	fmt.Println("    patrol-signal-coverage - Signal-to-finding coverage scoring")
 	fmt.Println()
 	fmt.Println("  Collections:")
 	fmt.Println("    all          - Run all basic scenarios")
+	fmt.Println("    matrix       - Model matrix quick run (smoke + readonly)")
 	fmt.Println("    advanced     - Run all advanced scenarios")
 	fmt.Println("    full         - Run everything")
 	fmt.Println()
@ -165,6 +247,8 @@ func getPatrolScenarios(name string) []eval.PatrolScenario {
 		return []eval.PatrolScenario{eval.PatrolInvestigationScenario()}
 	case "patrol-finding-quality":
 		return []eval.PatrolScenario{eval.PatrolFindingQualityScenario()}
+	case "patrol-signal-coverage", "patrol-quality":
+		return []eval.PatrolScenario{eval.PatrolSignalCoverageScenario()}
 	default:
 		return nil
 	}
@ -196,6 +280,26 @@ func getScenarios(name string) []eval.Scenario {
 	case "discovery":
 		return []eval.Scenario{eval.DiscoveryScenario()}

+	// Guest control scenarios
+	case "guest-control":
+		return []eval.Scenario{eval.GuestControlStopScenario()}
+	case "guest-idempotent":
+		return []eval.Scenario{eval.GuestControlIdempotentScenario()}
+	case "guest-discovery":
+		return []eval.Scenario{eval.GuestControlDiscoveryScenario()}
+	case "guest-natural":
+		return []eval.Scenario{eval.GuestControlNaturalLanguageScenario()}
+	case "guest-multi":
+		return []eval.Scenario{eval.GuestControlMultiMentionScenario()}
+
+	// Safety & filtering scenarios
+	case "readonly-filtering":
+		return []eval.Scenario{eval.ReadOnlyToolFilteringScenario()}
+	case "read-loop-recovery":
+		return []eval.Scenario{eval.ReadLoopRecoveryScenario()}
+	case "ambiguous-intent":
+		return []eval.Scenario{eval.AmbiguousIntentScenario()}
+
 	// Advanced scenarios
 	case "troubleshoot":
 		return []eval.Scenario{eval.TroubleshootingScenario()}
@ -229,6 +333,8 @@ func getScenarios(name string) []eval.Scenario {
 		return []eval.Scenario{eval.ApprovalApproveScenario()}
 	case "approval-deny":
 		return []eval.Scenario{eval.ApprovalDenyScenario()}
+	case "approval-combo":
+		return []eval.Scenario{eval.ApprovalComboScenario()}

 	// Collections
 	case "all":
@ -245,6 +351,11 @@ func getScenarios(name string) []eval.Scenario {
 			eval.ContextTargetCarryoverScenario(),
 			eval.DiscoveryScenario(),
 		}
+	case "matrix":
+		return []eval.Scenario{
+			eval.QuickSmokeTest(),
+			eval.ReadOnlyInfrastructureScenario(),
+		}
 	case "advanced":
 		return []eval.Scenario{
 			eval.TroubleshootingScenario(),
@ -255,13 +366,20 @@ func getScenarios(name string) []eval.Scenario {
 			eval.DockerInDockerScenario(),
 			eval.ContextChainScenario(),
 			eval.WriteVerifyScenario(),
+			eval.GuestControlStopScenario(),
+			eval.GuestControlIdempotentScenario(),
+			eval.GuestControlDiscoveryScenario(),
+			eval.GuestControlNaturalLanguageScenario(),
+			eval.GuestControlMultiMentionScenario(),
+			eval.ReadOnlyToolFilteringScenario(),
+			eval.ReadLoopRecoveryScenario(),
+			eval.AmbiguousIntentScenario(),
 			eval.StrictResolutionScenario(),
 			eval.StrictResolutionBlockScenario(),
 			eval.StrictResolutionRecoveryScenario(),
 			eval.ReadOnlyEnforcementScenario(),
 			eval.NonInteractiveGuardrailScenario(),
-			eval.ApprovalApproveScenario(),
-			eval.ApprovalDenyScenario(),
+			eval.ApprovalComboScenario(),
 		}
 	case "full":
 		return []eval.Scenario{
@ -284,15 +402,331 @@ func getScenarios(name string) []eval.Scenario {
 			eval.DockerInDockerScenario(),
 			eval.ContextChainScenario(),
 			eval.WriteVerifyScenario(),
+			eval.GuestControlStopScenario(),
+			eval.GuestControlIdempotentScenario(),
+			eval.GuestControlDiscoveryScenario(),
+			eval.GuestControlNaturalLanguageScenario(),
+			eval.GuestControlMultiMentionScenario(),
+			eval.ReadOnlyToolFilteringScenario(),
+			eval.ReadLoopRecoveryScenario(),
+			eval.AmbiguousIntentScenario(),
 			eval.StrictResolutionScenario(),
 			eval.StrictResolutionBlockScenario(),
 			eval.StrictResolutionRecoveryScenario(),
 			eval.ReadOnlyEnforcementScenario(),
 			eval.NonInteractiveGuardrailScenario(),
-			eval.ApprovalApproveScenario(),
-			eval.ApprovalDenyScenario(),
+			eval.ApprovalComboScenario(),
 		}
 	default:
 		return nil
 	}
 }
+
+// envBool reads the environment variable key as a boolean flag.
+//
+// The first result is the parsed value and the second reports whether the
+// variable held a recognised spelling. Matching ignores case and surrounding
+// whitespace. An unset variable or an unrecognised value yields (false, false).
+func envBool(key string) (bool, bool) {
+	raw, present := os.LookupEnv(key)
+	if !present {
+		return false, false
+	}
+	normalized := strings.ToLower(strings.TrimSpace(raw))
+	switch normalized {
+	case "1", "true", "yes", "y", "on":
+		return true, true
+	case "0", "false", "no", "n", "off":
+		return false, true
+	}
+	return false, false
+}
+
+// envInt reads the environment variable key as a decimal integer.
+//
+// The second result is false when the variable is unset or when no integer can
+// be scanned from the start of its whitespace-trimmed value.
+// NOTE(review): scanning uses fmt.Sscanf with "%d", which presumably accepts a
+// numeric prefix followed by other characters (e.g. "15s") — confirm whether
+// stricter parsing is wanted.
+func envInt(key string) (int, bool) {
+	raw, present := os.LookupEnv(key)
+	if !present {
+		return 0, false
+	}
+	var n int
+	_, err := fmt.Sscanf(strings.TrimSpace(raw), "%d", &n)
+	if err != nil {
+		return 0, false
+	}
+	return n, true
+}
+
+// parseModelList splits a comma-separated model list into trimmed, non-empty
+// entries, preserving their order. A blank input returns nil; an input made up
+// only of separators returns an empty slice.
+func parseModelList(raw string) []string {
+	cleaned := strings.TrimSpace(raw)
+	if cleaned == "" {
+		return nil
+	}
+	fields := strings.Split(cleaned, ",")
+	result := make([]string, 0, len(fields))
+	for _, field := range fields {
+		if name := strings.TrimSpace(field); name != "" {
+			result = append(result, name)
+		}
+	}
+	return result
+}
+
+// apiModelInfo is one entry of the "models" array decoded from the
+// /api/ai/models response.
+type apiModelInfo struct {
+	// ID is split on the first ":" during auto-selection; the part before it
+	// is treated as the provider name.
+	ID          string `json:"id"`
+	Name        string `json:"name"`
+	Description string `json:"description,omitempty"`
+	// Notable models are ranked ahead of all others during auto-selection.
+	Notable     bool   `json:"notable"`
+	// CreatedAt is interpreted as Unix seconds when formatted for display;
+	// larger values rank first among models with equal Notable.
+	CreatedAt   int64  `json:"created_at,omitempty"`
+}
+
+// apiModelsResponse is the JSON body decoded from the /api/ai/models endpoint.
+// A non-empty Error is reported to the caller as a failure even when the HTTP
+// status is 200.
+type apiModelsResponse struct {
+	Models []apiModelInfo `json:"models"`
+	Error  string         `json:"error,omitempty"`
+}
+
+// providerStats counts, for one provider, the models that passed the provider
+// and keyword filters during auto-selection.
+type providerStats struct {
+	// Total is the number of models that passed the filters.
+	Total   int
+	// Notable is how many of those models have Notable set.
+	Notable int
+}
+
+// autoSelectionDetail records one auto-selected model together with the
+// human-readable reason it was chosen, for printing in the run summary.
+type autoSelectionDetail struct {
+	Provider  string
+	ID        string
+	Name      string
+	Notable   bool
+	CreatedAt int64
+	// Reason is the text produced by selectionReason for this model.
+	Reason    string
+}
+
+// fetchAutoModels asks the Pulse API at baseURL for its model catalogue and
+// picks a small set of models per provider to evaluate.
+//
+// The request is a GET to <baseURL>/api/ai/models using HTTP basic auth with
+// user and pass, and a 30 second client timeout. Selection is tuned through
+// environment variables:
+//
+//   - EVAL_MODEL_PROVIDERS: comma-separated provider allow-list (a built-in
+//     default list is used when empty).
+//   - EVAL_MODEL_EXCLUDE_KEYWORDS: keywords that disqualify a model.
+//   - EVAL_MODEL_LIMIT: maximum models per provider (default 2).
+//
+// Within each provider, models are ranked notable first, then by newer
+// CreatedAt, then by ID. Providers are processed in alphabetical order.
+//
+// It returns the selected model IDs, one detail record per selected model, and
+// per-provider statistics for the models that passed the filters. On the "no
+// models" errors the stats map is still returned; every other error returns
+// nil for all three results.
+func fetchAutoModels(baseURL, user, pass string) ([]string, []autoSelectionDetail, map[string]providerStats, error) {
+	if strings.TrimSpace(baseURL) == "" {
+		return nil, nil, nil, fmt.Errorf("base URL is required")
+	}
+
+	req, err := http.NewRequest("GET", strings.TrimRight(baseURL, "/")+"/api/ai/models", nil)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("failed to build models request: %w", err)
+	}
+	req.SetBasicAuth(user, pass)
+
+	client := &http.Client{Timeout: 30 * time.Second}
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("models request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Best effort: the body only enriches the error message, so a read
+		// failure here is deliberately ignored.
+		body, _ := io.ReadAll(resp.Body)
+		return nil, nil, nil, fmt.Errorf("models request returned %d: %s", resp.StatusCode, strings.TrimSpace(string(body)))
+	}
+
+	var payload apiModelsResponse
+	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
+		return nil, nil, nil, fmt.Errorf("failed to decode models response: %w", err)
+	}
+	if payload.Error != "" {
+		// The server-supplied text must be passed as an argument, never as the
+		// format string, so that any '%' it contains is reported verbatim.
+		return nil, nil, nil, fmt.Errorf("%s", payload.Error)
+	}
+
+	providerFilter := parseProviderFilterWithDefault(os.Getenv("EVAL_MODEL_PROVIDERS"))
+	excludeKeywords := parseExcludeKeywords(os.Getenv("EVAL_MODEL_EXCLUDE_KEYWORDS"))
+	limit := 2
+	if value, ok := envInt("EVAL_MODEL_LIMIT"); ok && value > 0 {
+		limit = value
+	}
+
+	// Bucket the surviving models by provider and gather per-provider counts.
+	grouped := make(map[string][]apiModelInfo)
+	stats := make(map[string]providerStats)
+	for _, model := range payload.Models {
+		if model.ID == "" {
+			continue
+		}
+		// The provider is the part of the ID before the first ":".
+		parts := strings.SplitN(model.ID, ":", 2)
+		provider := parts[0]
+		if provider == "" {
+			continue
+		}
+		if len(providerFilter) > 0 && !providerFilter[provider] {
+			continue
+		}
+		if len(excludeKeywords) > 0 && hasAnyKeyword(model, excludeKeywords) {
+			continue
+		}
+		grouped[provider] = append(grouped[provider], model)
+		stat := stats[provider]
+		stat.Total++
+		if model.Notable {
+			stat.Notable++
+		}
+		stats[provider] = stat
+	}
+
+	if len(grouped) == 0 {
+		return nil, nil, stats, fmt.Errorf("no models found for auto-selection")
+	}
+
+	// Map iteration order is random; sort provider names for stable output.
+	providers := make([]string, 0, len(grouped))
+	for provider := range grouped {
+		providers = append(providers, provider)
+	}
+	sort.Strings(providers)
+
+	seen := make(map[string]bool)
+	selected := make([]string, 0, len(grouped)*limit)
+	details := make([]autoSelectionDetail, 0, len(grouped)*limit)
+	for _, provider := range providers {
+		models := grouped[provider]
+		// Rank: notable first, then most recently created, then ID as a
+		// deterministic tie-breaker.
+		sort.Slice(models, func(i, j int) bool {
+			if models[i].Notable != models[j].Notable {
+				return models[i].Notable
+			}
+			if models[i].CreatedAt != models[j].CreatedAt {
+				return models[i].CreatedAt > models[j].CreatedAt
+			}
+			return models[i].ID < models[j].ID
+		})
+		for _, model := range models {
+			if len(selected) >= len(grouped)*limit {
+				break
+			}
+			// Skip IDs that the API listed more than once.
+			if seen[model.ID] {
+				continue
+			}
+			seen[model.ID] = true
+			selected = append(selected, model.ID)
+			details = append(details, autoSelectionDetail{
+				Provider:  provider,
+				ID:        model.ID,
+				Name:      model.Name,
+				Notable:   model.Notable,
+				CreatedAt: model.CreatedAt,
+				Reason:    selectionReason(model, stats[provider]),
+			})
+			// Stop once this provider has contributed its quota.
+			if countProvider(selected, provider) >= limit {
+				break
+			}
+		}
+	}
+
+	if len(selected) == 0 {
+		return nil, nil, stats, fmt.Errorf("auto-selection produced no models")
+	}
+	return selected, details, stats, nil
+}
+
+// parseProviderFilter turns a comma-separated provider list into a set keyed
+// by the trimmed provider names. Blank entries are ignored, and a blank input
+// returns nil.
+func parseProviderFilter(raw string) map[string]bool {
+	cleaned := strings.TrimSpace(raw)
+	if cleaned == "" {
+		return nil
+	}
+	allowed := make(map[string]bool)
+	for _, field := range strings.Split(cleaned, ",") {
+		if name := strings.TrimSpace(field); name != "" {
+			allowed[name] = true
+		}
+	}
+	return allowed
+}
+
+// parseProviderFilterWithDefault behaves like parseProviderFilter, except that
+// a blank input selects the built-in provider set (openai, anthropic,
+// deepseek, gemini, ollama) instead of returning nil.
+func parseProviderFilterWithDefault(raw string) map[string]bool {
+	trimmed := strings.TrimSpace(raw)
+	if trimmed != "" {
+		return parseProviderFilter(trimmed)
+	}
+	defaults := make(map[string]bool, 5)
+	for _, name := range []string{"openai", "anthropic", "deepseek", "gemini", "ollama"} {
+		defaults[name] = true
+	}
+	return defaults
+}
+
+// parseExcludeKeywords builds the list of keywords that disqualify a model
+// from auto-selection.
+//
+// A blank input yields the built-in default list. The values "0", "false",
+// "off" and "none" (any case) disable filtering by returning nil. Anything
+// else is split on commas, with each entry trimmed, lowercased and dropped if
+// empty.
+func parseExcludeKeywords(raw string) []string {
+	cleaned := strings.TrimSpace(raw)
+	if cleaned == "" {
+		return []string{
+			"codex",
+			"openai:gpt-5.2-pro",
+			"image",
+			"vision",
+			"video",
+			"audio",
+			"speech",
+			"embed",
+			"embedding",
+			"moderation",
+			"rerank",
+			"tts",
+			"realtime",
+			"transcribe",
+		}
+	}
+	lowered := strings.ToLower(cleaned)
+	if lowered == "0" || lowered == "false" || lowered == "off" || lowered == "none" {
+		return nil
+	}
+	fields := strings.Split(cleaned, ",")
+	keywords := make([]string, 0, len(fields))
+	for _, field := range fields {
+		if kw := strings.TrimSpace(field); kw != "" {
+			keywords = append(keywords, strings.ToLower(kw))
+		}
+	}
+	return keywords
+}
+
+// hasAnyKeyword reports whether any non-empty keyword occurs as a substring of
+// the model's ID, name or description. The model text is lowercased before
+// matching but the keywords are used as given, so callers should supply
+// lowercase keywords.
+func hasAnyKeyword(model apiModelInfo, keywords []string) bool {
+	if len(keywords) == 0 {
+		return false
+	}
+	haystack := strings.ToLower(strings.Join([]string{model.ID, model.Name, model.Description}, " "))
+	for _, kw := range keywords {
+		if kw != "" && strings.Contains(haystack, kw) {
+			return true
+		}
+	}
+	return false
+}
+
+// countProvider returns how many entries of models start with "<provider>:".
+// An empty provider always counts as zero.
+func countProvider(models []string, provider string) int {
+	if provider == "" {
+		return 0
+	}
+	wanted := provider + ":"
+	matches := 0
+	for i := range models {
+		if strings.HasPrefix(models[i], wanted) {
+			matches++
+		}
+	}
+	return matches
+}
+
+// selectionReason describes why model was picked, as "; "-joined fragments.
+//
+// "no notable models" is prepended when the provider has none. It is followed
+// by exactly one of: "notable", "created_at=<YYYY-MM-DD>" (CreatedAt read as
+// Unix seconds, rendered in UTC), or "fallback" when neither applies.
+func selectionReason(model apiModelInfo, stat providerStats) string {
+	reasons := make([]string, 0, 2)
+	if stat.Notable == 0 {
+		reasons = append(reasons, "no notable models")
+	}
+	switch {
+	case model.Notable:
+		reasons = append(reasons, "notable")
+	case model.CreatedAt > 0:
+		day := time.Unix(model.CreatedAt, 0).UTC().Format("2006-01-02")
+		reasons = append(reasons, "created_at="+day)
+	default:
+		reasons = append(reasons, "fallback")
+	}
+	return strings.Join(reasons, "; ")
+}
+
+// sortedProviders returns the keys of stats in ascending order, so callers can
+// print per-provider summaries in a stable order.
+func sortedProviders(stats map[string]providerStats) []string {
+	names := make([]string, 0, len(stats))
+	for name := range stats {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+	return names
+}
--- a/docs/AI.md
+++ b/docs/AI.md
@ -125,6 +125,35 @@ Alert-triggered analysis runs attach a timeline event to the alert, so investiga

 > **License note**: Kubernetes AI analysis is gated by the `kubernetes_ai` Pulse Pro feature.

+## Pulse Assistant (Chat): How It Works
+
+Pulse Assistant is **tool-driven**. It does not "guess" system state — it calls live tools and reports their outputs.
+
+### The Model's Workflow (Discover → Investigate → Act)
+- **Discover**: Uses `pulse_query` (or `pulse_discovery`) to find real resources and IDs.
+- **Investigate**: Uses `pulse_read` to run bounded, read-only commands and check status/logs.
+- **Act** (optional): Uses `pulse_control` for changes, then verifies with a read.
+
+### Safety Gates That Make It Trustworthy
+- **Strict Resolution (optional)**: When enabled, the assistant must discover a resource before it can act on it. This prevents fabricated IDs.
+- **Read/Write separation**: Read-only commands go through `pulse_read`; write actions go through `pulse_control`. This keeps the workflow state machine honest.
+- **Verification after writes**: After any write, the assistant must perform a read check before it can finish the response.
+- **Non-interactive guardrails**: Commands that could hang (e.g., `tail -f`) are rewritten into bounded, safe forms.
+- **Approval mode**: In Controlled mode, every write requires explicit user approval. Autonomous mode is available only with Pro.
+
+### What You See As a User
+- **Clear tool usage**: Each step shows which tool ran and what it returned.
+- **Structured recovery**: If a tool is blocked, the assistant adapts (e.g., runs discovery, switches tools, or asks for approval).
+- **Verified outcomes**: Changes are followed by a read check before the assistant claims success.
+
+## Why It's Impressive (and Reliable)
+
+Pulse Assistant behaves like a careful operator:
+- It **grounds answers in live data** instead of assumptions.
+- It **adapts** when guardrails block an action.
+- It **verifies** changes before reporting success.
+- It **keeps you in control** with explicit approval gates.
+
 ## Configuration

 Configure in the UI: **Settings → System → AI Assistant**
@ -149,6 +178,34 @@ You can set separate models for:
 - Patrol (`patrol_model`)
 - Auto-fix remediation (`auto_fix_model`)

+## Model Matrix (Pulse Assistant)
+
+This table summarizes the most recent **Pulse Assistant** eval runs per model. Patrol is still in development and is not scored yet.
+Time/tokens reflect the combined **Smoke + Read-only** matrix run.
+Transient provider errors (rate limits, unavailable chat endpoints) are skipped when rendering the table.
+
+Update the table from eval reports:
+```
+EVAL_REPORT_DIR=tmp/eval-reports go run ./cmd/eval -scenario matrix -auto-models
+python3 scripts/eval/render_model_matrix.py tmp/eval-reports --write-doc docs/AI.md
+```
+Or use the helper script:
+```
+scripts/eval/run_model_matrix.sh
+```
+
+<!-- MODEL_MATRIX_START -->
+| Model | Smoke | Read-only | Time (matrix) | Tokens (matrix) | Last run (UTC) |
+| --- | --- | --- | --- | --- | --- |
+| anthropic:claude-3-haiku-20240307 | ✅ | ❌ | 2m 42s | — | 2026-01-29 |
+| anthropic:claude-haiku-4-5-20251001 | ✅ | ✅ | 8s | 18,923 | 2026-01-29 |
+| anthropic:claude-opus-4-5-20251101 | ✅ | ✅ | 9m 31s | 1,120,530 | 2026-01-29 |
+| gemini:gemini-3-flash-preview | ✅ | ✅ | 7m 4s | — | 2026-01-29 |
+| gemini:gemini-3-pro-preview | ✅ | ✅ | 3m 54s | 1,914 | 2026-01-29 |
+| openai:gpt-5.2 | ✅ | ✅ | 5s | 12,363 | 2026-01-29 |
+| openai:gpt-5.2-chat-latest | ✅ | ✅ | 8s | 12,595 | 2026-01-29 |
+<!-- MODEL_MATRIX_END -->
+
 ### Testing

 - Test provider connectivity: `POST /api/ai/test` and `POST /api/ai/test/{provider}`
@ -202,6 +259,14 @@ Pulse uses three AI permission levels for infrastructure control:
 - **Controlled**: AI asks for approval before executing commands or control actions.
 - **Autonomous (Pro)**: AI executes actions without prompting.

+### Using Approvals (Controlled Mode)
+
+When control level is **Controlled**, write actions pause for approval:
+
+- In chat, you'll see an approval card with the proposed command.
+- **Approve** to execute and verify the change, or **Deny** to cancel it.
+- Only users with admin privileges can approve/deny.
+
 ### Advanced Network Restrictions

 Pulse blocks AI tool HTTP fetches to loopback and link-local addresses by default. For local development, you can allow loopback targets:
--- a/docs/EVAL.md
+++ b/docs/EVAL.md
@ -20,6 +20,16 @@ Run a single scenario:
 go run ./cmd/eval -scenario readonly
 ```

+Run the model matrix quick set:
+```
+go run ./cmd/eval -scenario matrix
+```
+
+Auto-select models (latest per provider):
+```
+go run ./cmd/eval -scenario matrix -auto-models
+```
+
 ## Environment Overrides

 These env vars let you align the evals with your infrastructure naming:
@ -35,6 +45,10 @@ EVAL_HOMEASSISTANT_CONTAINER
 EVAL_MQTT_CONTAINER
 EVAL_ZIGBEE_CONTAINER
 EVAL_FRIGATE_CONTAINER
+EVAL_MODEL                  (optional model override)
+EVAL_MODEL_PROVIDERS        (optional comma-separated provider filter for auto selection; defaults to openai,anthropic,deepseek,gemini,ollama)
+EVAL_MODEL_LIMIT            (optional per-provider limit for auto selection, default 2)
+EVAL_MODEL_EXCLUDE_KEYWORDS (optional comma-separated keywords to skip models; default filters image/video/audio, codex, and specific pre-release IDs like openai:gpt-5.2-pro until chat support is live; set to "none" to disable)
 ```

 Write/verify and strict-resolution controls:
@ -51,12 +65,15 @@ EVAL_EXPECT_APPROVAL         (set to 1 to assert approval_needed event)
 Retry controls and reports:

 ```
+EVAL_HTTP_TIMEOUT           (seconds, default 300)
 EVAL_STEP_RETRIES            (default 2)
 EVAL_RETRY_ON_PHANTOM        (default 1)
 EVAL_RETRY_ON_EXPLICIT_TOOL  (default 1)
 EVAL_RETRY_ON_STREAM_FAILURE (default 1)
 EVAL_RETRY_ON_EMPTY_RESPONSE (default 1)
 EVAL_RETRY_ON_TOOL_ERRORS    (default 1)
+EVAL_RETRY_ON_RATE_LIMIT     (default 0)
+EVAL_RATE_LIMIT_COOLDOWN     (seconds, optional backoff before retry)
 EVAL_PREFLIGHT              (set to 1 to run a quick chat preflight)
 EVAL_PREFLIGHT_TIMEOUT       (seconds, default 15)
 EVAL_REPORT_DIR              (write JSON report per scenario)
@ -106,12 +123,38 @@ EVAL_EXPECT_APPROVAL=1 \
 go run ./cmd/eval -scenario approval-deny
 ```

+Approval combo flow (approve + deny in one session):
+```
+EVAL_EXPECT_APPROVAL=1 \
+go run ./cmd/eval -scenario approval-combo
+```
+
 Write then verify (safe no-op command by default):
 ```
 EVAL_REQUIRE_WRITE_VERIFY=1 \
 go run ./cmd/eval -scenario writeverify
 ```

+## Model Matrix Workflow
+
+Run the matrix and update the docs table in one step:
+```
+scripts/eval/run_model_matrix.sh
+```
+
+Key overrides:
+```
+PULSE_BASE_URL=http://127.0.0.1:7655
+PULSE_EVAL_USER=admin
+PULSE_EVAL_PASS=admin
+EVAL_MODEL_PROVIDERS=openai,anthropic,gemini
+EVAL_MODEL_LIMIT=2
+EVAL_MODELS=anthropic:claude-haiku-4-5-20251001
+EVAL_SCENARIO=matrix
+EVAL_REPORT_DIR=tmp/eval-reports
+EVAL_WRITE_DOC=1
+```
+
 ## Notes

 - The evals run against live infrastructure. Use safe commands or keep the default `EVAL_WRITE_COMMAND=true`.
--- a/docs/architecture/pulse-assistant.md
+++ b/docs/architecture/pulse-assistant.md
@ -57,6 +57,18 @@ Pulse Assistant is a **protocol-driven, safety-gated AI system** for infrastruct
 3. **Writes must be verified.** FSM enforces read-after-write before final answer.
 4. **Errors are recoverable.** Structured error responses enable self-correction without prompt engineering.

+## 1.1 User-Visible Behavior (What Feels "Impressive")
+
+When you use Pulse Assistant in chat, these behaviors are deliberate and enforced by the backend:
+
+- **Grounded answers**: The assistant uses live tools and surfaces their outputs.
+- **Discover → Investigate → Act**: It queries resources first, reads status/logs, and only then acts.
+- **Verified changes**: After a write, it performs a read check before concluding.
+- **Approval gates**: In Controlled mode, write actions emit approvals and wait for a decision.
+- **Self-recovery**: If blocked (routing mismatch, read-only violation, strict resolution), it adapts and retries with a safe path.
+
+These are not prompt conventions — they are enforced by the FSM + tool executor.
+
 ---

 ## 2. Core Design Principles (Invariants)
@ -88,6 +100,18 @@ Resolved resources are **session-scoped** and **in-memory only**. They are never

 **Enforcement:** `ResolvedContext` not serialized, rebuilt each session in `chat/session.go`

+### Approval Flow (Controlled Mode)
+
+When `control_level=controlled`, write tools emit an approval request instead of executing:
+
+1. Tool returns `APPROVAL_REQUIRED: { approval_id, command, ... }`
+2. Agentic loop emits `approval_needed` SSE event
+3. UI or API approves or denies via `/api/ai/approvals/{id}/approve` or `/api/ai/approvals/{id}/deny`
+4. On approve, the tool re-executes with `_approval_id` and proceeds
+5. On deny, the assistant returns `Command denied: <reason>`
+
+This keeps the LLM in a proposer role while letting users explicitly authorize actions.
+
 ### Invariant 6: Read/Write Tool Separation

 > **This is the most commonly violated invariant.** Read it carefully.
--- a/mock.env
+++ b/mock.env
@ -0,0 +1,10 @@
+# Mock Mode Configuration
+PULSE_MOCK_MODE=false
+PULSE_MOCK_NODES=7
+PULSE_MOCK_VMS_PER_NODE=5
+PULSE_MOCK_LXCS_PER_NODE=8
+PULSE_MOCK_DOCKER_HOSTS=3
+PULSE_MOCK_DOCKER_CONTAINERS=12
+PULSE_MOCK_RANDOM_METRICS=true
+PULSE_MOCK_STOPPED_PERCENT=20
+PULSE_LICENSE_PUBLIC_KEY="OzbVzmg+TaSGt0eWzDVpn0QkqhOzJqUbOFvSF3AmuRU="
--- a/scripts/dev-check.sh
+++ b/scripts/dev-check.sh
@ -17,7 +17,7 @@ if [[ "${1:-}" == "--kill" ]]; then
    pkill -9 -f "bin/pulse$" 2>/dev/null || true
    pkill -9 -f "^\./pulse$" 2>/dev/null || true
    pkill -f "node.*vite" 2>/dev/null || true
-    pkill -f "watch-backup.sh" 2>/dev/null || true
+    pkill -f "watch-snapshot.sh" 2>/dev/null || true
    sleep 2
    echo -e "${GREEN}✓${NC} All dev processes stopped"
    exit 0
@ -72,15 +72,15 @@ else
    echo -e "${YELLOW}⚠ Not running (enable in settings)${NC}"
 fi

-# Check file backup watcher
-echo -n "File backup watcher: "
-BACKUP_PID=$(pgrep -f "watch-backup.sh" 2>/dev/null | head -1)
-if [[ -n "$BACKUP_PID" ]]; then
-    BACKUP_COUNT=$(ls ~/.pulse-backups 2>/dev/null | wc -l | tr -d ' ')
-    echo -e "${GREEN}✓ Running (PID: $BACKUP_PID, $BACKUP_COUNT backups)${NC}"
+# Check snapshot watcher
+echo -n "Snapshot watcher: "
+SNAPSHOT_PID=$(pgrep -f "watch-snapshot.sh" 2>/dev/null | head -1)
+if [[ -n "$SNAPSHOT_PID" ]]; then
+    SNAPSHOT_COUNT=$(git -C ~/.pulse-snapshots rev-list --count HEAD 2>/dev/null || echo 0)
+    echo -e "${GREEN}✓ Running (PID: $SNAPSHOT_PID, $SNAPSHOT_COUNT snapshots)${NC}"
 else
    echo -e "${YELLOW}⚠ Not running (optional - protects against accidental file loss)${NC}"
-    echo "   Start: ./scripts/watch-backup.sh &"
+    echo "   Start: ./scripts/watch-snapshot.sh &"
 fi

 # Show recent errors
--- a/scripts/hot-dev.sh
+++ b/scripts/hot-dev.sh
@ -5,7 +5,7 @@
 # - Go backend with auto-rebuild on file changes (via inotifywait)
 # - Vite frontend dev server with HMR
 # - Auto-detection of pulse-pro module for Pro features
-# - File backup watcher (if scripts/watch-backup.sh exists)
+# - Snapshot watcher (if scripts/watch-snapshot.sh exists)
 #
 # Environment Variables:
 #   HOT_DEV_USE_PROD_DATA=true   Use /etc/pulse for data (sessions, config, etc.)
@ -204,10 +204,14 @@ pkill -x "pulse" 2>/dev/null || true
 sleep 1
 pkill -9 -x "pulse" 2>/dev/null || true

+
 kill_port "${FRONTEND_DEV_PORT}"
 kill_port "${PULSE_DEV_API_PORT}"
 kill_port "${EXTRA_CLEANUP_PORT}"

+# Truncate debug log
+:> /tmp/pulse-debug.log
+
 sleep 2

 # Verify ports are free
@ -387,7 +391,7 @@ else
    fi
 fi

-LOG_LEVEL=debug \
+LOG_LEVEL="${LOG_LEVEL:-debug}" \
 FRONTEND_PORT="${PULSE_DEV_API_PORT:-7655}" \
 PORT="${PULSE_DEV_API_PORT:-7655}" \
 PULSE_DATA_DIR="${PULSE_DATA_DIR:-}" \
@ -397,7 +401,9 @@ PULSE_DEV="${PULSE_DEV:-true}" \
 PULSE_AUTH_USER="${PULSE_AUTH_USER:-}" \
 PULSE_AUTH_PASS="${PULSE_AUTH_PASS:-}" \
 ALLOWED_ORIGINS="${ALLOWED_ORIGINS:-}" \
-./pulse >> /tmp/pulse-debug.log 2>&1 &
+LOG_FILE="/tmp/pulse-debug.log" \
+LOG_MAX_SIZE="50" \
+./pulse > /dev/null 2>&1 &
 BACKEND_PID=$!

 sleep 2
@ -418,7 +424,7 @@ log_info "Starting backend health monitor..."

        if [[ "$PULSE_COUNT" -eq 0 ]]; then
            log_warn "⚠️  Pulse died unexpectedly, restarting..."
-            LOG_LEVEL=debug \
+            LOG_LEVEL="${LOG_LEVEL:-debug}" \
            FRONTEND_PORT="${PULSE_DEV_API_PORT:-7655}" \
            PORT="${PULSE_DEV_API_PORT:-7655}" \
            PULSE_DATA_DIR="${PULSE_DATA_DIR:-}" \
@ -428,7 +434,9 @@ log_info "Starting backend health monitor..."
            PULSE_AUTH_USER="${PULSE_AUTH_USER:-}" \
            PULSE_AUTH_PASS="${PULSE_AUTH_PASS:-}" \
            ALLOWED_ORIGINS="${ALLOWED_ORIGINS:-}" \
-            ./pulse >> /tmp/pulse-debug.log 2>&1 &
+            LOG_FILE="/tmp/pulse-debug.log" \
+            LOG_MAX_SIZE="50" \
+            ./pulse > /dev/null 2>&1 &
            NEW_PID=$!
            sleep 2
            if kill -0 "$NEW_PID" 2>/dev/null; then
@ -440,7 +448,7 @@ log_info "Starting backend health monitor..."
            log_error "⚠️  Multiple Pulse processes detected ($PULSE_COUNT), killing all and restarting..."
            pkill -9 -f "^\./pulse$" 2>/dev/null || true
            sleep 2
-            LOG_LEVEL=debug \
+            LOG_LEVEL="${LOG_LEVEL:-debug}" \
            FRONTEND_PORT="${PULSE_DEV_API_PORT:-7655}" \
            PORT="${PULSE_DEV_API_PORT:-7655}" \
            PULSE_DATA_DIR="${PULSE_DATA_DIR:-}" \
@ -450,7 +458,9 @@ log_info "Starting backend health monitor..."
            PULSE_AUTH_USER="${PULSE_AUTH_USER:-}" \
            PULSE_AUTH_PASS="${PULSE_AUTH_PASS:-}" \
            ALLOWED_ORIGINS="${ALLOWED_ORIGINS:-}" \
-            ./pulse >> /tmp/pulse-debug.log 2>&1 &
+            LOG_FILE="/tmp/pulse-debug.log" \
+            LOG_MAX_SIZE="50" \
+            ./pulse > /dev/null 2>&1 &
            NEW_PID=$!
            sleep 2
            if kill -0 "$NEW_PID" 2>/dev/null; then
@ -479,7 +489,7 @@ log_info "Starting backend file watcher..."
        pkill -9 -f "^\./pulse$" 2>/dev/null || true
        sleep 1

-        LOG_LEVEL=debug \
+        LOG_LEVEL="${LOG_LEVEL:-debug}" \
        FRONTEND_PORT="${PULSE_DEV_API_PORT:-7655}" \
        PORT="${PULSE_DEV_API_PORT:-7655}" \
        PULSE_DATA_DIR="${PULSE_DATA_DIR:-}" \
@ -489,7 +499,9 @@ log_info "Starting backend file watcher..."
        PULSE_AUTH_USER="${PULSE_AUTH_USER:-}" \
        PULSE_AUTH_PASS="${PULSE_AUTH_PASS:-}" \
        ALLOWED_ORIGINS="${ALLOWED_ORIGINS:-}" \
-        ./pulse >> /tmp/pulse-debug.log 2>&1 &
+        LOG_FILE="/tmp/pulse-debug.log" \
+        LOG_MAX_SIZE="50" \
+        ./pulse > /dev/null 2>&1 &
        NEW_PID=$!
        sleep 1

@ -612,7 +624,7 @@ cleanup() {
    # Fallback cleanup
    pkill -f "inotifywait.*pulse" 2>/dev/null || true
    pkill -f "fswatch.*pulse" 2>/dev/null || true
-    pkill -f "watch-backup.sh" 2>/dev/null || true
+    pkill -f "watch-snapshot.sh" 2>/dev/null || true

    log_info "Hot-dev stopped."
 }
@ -620,12 +632,12 @@ trap cleanup INT TERM EXIT

 # --- Start File Backup Watcher (optional) ---

-BACKUP_SCRIPT="${ROOT_DIR}/scripts/watch-backup.sh"
-if [[ -x "${BACKUP_SCRIPT}" ]]; then
-    log_info "Starting file backup watcher..."
-    "${BACKUP_SCRIPT}" > /tmp/pulse-watch-backup.log 2>&1 &
+SNAPSHOT_SCRIPT="${ROOT_DIR}/scripts/watch-snapshot.sh"
+if [[ -x "${SNAPSHOT_SCRIPT}" ]]; then
+    log_info "Starting snapshot watcher..."
+    "${SNAPSHOT_SCRIPT}" > /tmp/pulse-watch-snapshot.log 2>&1 &
    BACKUP_WATCHER_PID=$!
-    log_info "File backups: ~/.pulse-backups (PID: ${BACKUP_WATCHER_PID})"
+    log_info "Snapshots: ~/.pulse-snapshots (PID: ${BACKUP_WATCHER_PID})"
 fi

 # --- Start Frontend ---