Fix QuickAnalysis cost recording and audit the pattern

Service.Narrate (b84b87d8d) and Service.NarrateFleet (d4463a615) fixed missing cost-record calls in the report-narrative path. Auditing the rest of internal/ai for the same bug class found one more: Service.QuickAnalysis. It is used for alert auto-resolve and similar lightweight decisions, so production token spend on auto-resolve analysis was invisible in the AI usage dashboard. Mirror the same fix: capture costStore under the read lock alongside provider/cfg, and after provider.Chat returns, record a UsageEvent labelled with the request's UseCase (defaulting to "quick_analysis" when the caller leaves it blank). Recording happens before the empty-content guard so failed-but-billed calls are still visible. Adds cost_recording_audit_test.go: an AST-level audit that walks internal/ai/*.go (excluding _test.go and sub-packages), finds every function calling .Chat() on a providers.Provider value, and asserts each function body also references .Record() on a cost store. Exemption is allowed via a //cost-recording-exempt: <reason> doc comment. RunPatrolToolPreflight is annotated as exempt — it is a connectivity self-test, not user workload, and should not pollute the operator's cost dashboard. The audit is intentionally local (function-scoped, not interprocedural). A passthrough wrapper that calls a recording function rather than calling Record itself would need an explicit exemption naming the wrapped callee. Keeping the scan local makes new Chat callers loud rather than letting silent gaps creep in via indirection. Future Chat callers must either record cost or carry the exemption marker. The audit fails CI otherwise, so the regression that shipped in b2bd9d114 (Narrate) and would have shipped again in d4463a615 (fleet) cannot recur silently.
2026-05-20 01:01:20 +00:00 · 2026-05-10 21:57:39 +01:00 · 2026-05-10 21:57:39 +01:00 · 08491b9f48
commit 08491b9f48
parent c6a786a36d
3 changed files with 198 additions and 0 deletions
--- a/internal/ai/cost_recording_audit_test.go
+++ b/internal/ai/cost_recording_audit_test.go
@ -0,0 +1,167 @@
+package ai
+
+import (
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// TestCostRecordingCoverage is an AST-level audit: every function in
+// internal/ai that calls .Chat() on a providers.Provider value must
+// also reference cost recording (.Record() on a cost store) within
+// the same function body, or be explicitly exempted with a doc
+// comment marker. This guards against the bug class we shipped twice
+// in the report-narrative path: a Chat call that consumes provider
+// tokens but doesn't surface in the operator's cost ledger.
+//
+// Exemption marker: place "//cost-recording-exempt: <reason>" in the
+// function's doc comment. Use sparingly — the only legitimate case so
+// far is patrol preflight (a connectivity test, not user workload).
+//
+// The scan is intentionally local (function-scoped, not interprocedural).
+// A passthrough wrapper that calls a recording function instead of
+// calling Record itself would need an exemption with the wrapped
+// callee named in the reason. Keeping it local makes false positives
+// loud rather than letting silent gaps creep in via indirection.
+func TestCostRecordingCoverage(t *testing.T) {
+	const exemptMarker = "cost-recording-exempt"
+
+	type offender struct {
+		file     string
+		funcName string
+		line     int
+	}
+	var offenders []offender
+
+	fset := token.NewFileSet()
+
+	walkErr := filepath.WalkDir(".", func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		if d.IsDir() {
+			// Stay in the canonical package directory; the providers
+			// package implements Chat itself and is out of scope, and
+			// every other subdirectory is a separate package with its
+			// own coverage test if needed.
+			if path == "." {
+				return nil
+			}
+			return fs.SkipDir
+		}
+		name := d.Name()
+		if !strings.HasSuffix(name, ".go") || strings.HasSuffix(name, "_test.go") {
+			return nil
+		}
+
+		src, readErr := os.ReadFile(path)
+		if readErr != nil {
+			return readErr
+		}
+		file, parseErr := parser.ParseFile(fset, path, src, parser.ParseComments)
+		if parseErr != nil {
+			return parseErr
+		}
+
+		ast.Inspect(file, func(n ast.Node) bool {
+			fn, ok := n.(*ast.FuncDecl)
+			if !ok || fn.Body == nil {
+				return true
+			}
+			if hasExemptionComment(fn.Doc, exemptMarker) {
+				return true
+			}
+			if !bodyCallsMethod(fn.Body, "Chat") {
+				return true
+			}
+			if bodyCallsMethod(fn.Body, "Record") {
+				return true
+			}
+			pos := fset.Position(fn.Pos())
+			offenders = append(offenders, offender{
+				file:     path,
+				funcName: fn.Name.Name,
+				line:     pos.Line,
+			})
+			return true
+		})
+		return nil
+	})
+	if walkErr != nil {
+		t.Fatalf("walk: %v", walkErr)
+	}
+
+	if len(offenders) > 0 {
+		var lines []string
+		for _, o := range offenders {
+			lines = append(lines, "  "+o.file+":"+itoa(o.line)+" "+o.funcName)
+		}
+		t.Fatalf("found %d function(s) calling provider.Chat without recording cost.\n"+
+			"Each call site must record a cost.UsageEvent or carry a //%s: <reason> doc comment.\n"+
+			"Offenders:\n%s",
+			len(offenders), exemptMarker, strings.Join(lines, "\n"))
+	}
+}
+
+func bodyCallsMethod(body *ast.BlockStmt, method string) bool {
+	found := false
+	ast.Inspect(body, func(n ast.Node) bool {
+		if found {
+			return false
+		}
+		call, ok := n.(*ast.CallExpr)
+		if !ok {
+			return true
+		}
+		sel, ok := call.Fun.(*ast.SelectorExpr)
+		if !ok {
+			return true
+		}
+		if sel.Sel != nil && sel.Sel.Name == method {
+			found = true
+			return false
+		}
+		return true
+	})
+	return found
+}
+
+func hasExemptionComment(doc *ast.CommentGroup, marker string) bool {
+	if doc == nil {
+		return false
+	}
+	for _, c := range doc.List {
+		if strings.Contains(c.Text, marker) {
+			return true
+		}
+	}
+	return false
+}
+
+// itoa avoids strconv just to keep this audit self-contained.
+func itoa(n int) string {
+	if n == 0 {
+		return "0"
+	}
+	neg := n < 0
+	if neg {
+		n = -n
+	}
+	var buf [20]byte
+	i := len(buf)
+	for n > 0 {
+		i--
+		buf[i] = byte('0' + n%10)
+		n /= 10
+	}
+	if neg {
+		i--
+		buf[i] = '-'
+	}
+	return string(buf[i:])
+}
--- a/internal/ai/patrol_preflight.go
+++ b/internal/ai/patrol_preflight.go
@ -102,6 +102,11 @@ func (s *Service) TriggerPatrolPreflightAsync(provider, model string) {
 // classified into the result's Cause / Summary / Recommendation fields
 // the same way runtime Patrol failures are, so the caller can render a
 // single response shape for every outcome.
+//
+// cost-recording-exempt: connectivity/tool-call self-test, not user
+// workload. Operator-triggered preflight is observability-shaped, not
+// billed-feature-shaped, and should not pollute the AI usage dashboard.
+// This is the only intentional exception to the cost-recording audit.
 func (s *Service) RunPatrolToolPreflight(ctx context.Context, providerName, model string) PatrolPreflightResult {
 	started := time.Now()

--- a/internal/ai/service.go
+++ b/internal/ai/service.go
@ -1678,6 +1678,7 @@ func (s *Service) QuickAnalysis(ctx context.Context, req QuickAnalysisRequest) (
 	s.mu.RLock()
 	provider := s.provider
 	cfg := s.cfg
+	costStore := s.costStore
 	s.mu.RUnlock()

 	if provider == nil {
@ -1719,11 +1720,36 @@ func (s *Service) QuickAnalysis(ctx context.Context, req QuickAnalysisRequest) (
 		chatReq = sanitizer(chatReq)
 	}

+	useCase := strings.TrimSpace(req.UseCase)
+	if useCase == "" {
+		useCase = "quick_analysis"
+	}
+
 	resp, err := provider.Chat(ctx, chatReq)
 	if err != nil {
 		return "", fmt.Errorf("Pulse Assistant analysis failed: %w", err)
 	}

+	// Record token usage in the operator-facing cost ledger so quick
+	// analysis (alert auto-resolve, etc.) shows up alongside chat and
+	// patrol spend. Recording happens before the empty-content guard
+	// because the operator was billed regardless.
+	if costStore != nil {
+		providerName, _ := config.ParseModelString(model)
+		if providerName == "" {
+			providerName = provider.Name()
+		}
+		costStore.Record(cost.UsageEvent{
+			Timestamp:     time.Now(),
+			Provider:      providerName,
+			RequestModel:  model,
+			ResponseModel: resp.Model,
+			UseCase:       useCase,
+			InputTokens:   resp.InputTokens,
+			OutputTokens:  resp.OutputTokens,
+		})
+	}
+
 	if resp.Content == "" {
 		return "", fmt.Errorf("Pulse Assistant returned empty response")
 	}