Fix QuickAnalysis cost recording and audit the pattern

Service.Narrate (b84b87d8d) and Service.NarrateFleet (d4463a615)
fixed missing cost-record calls in the report-narrative path. Auditing
the rest of internal/ai for the same bug class found one more:
Service.QuickAnalysis. It is used for alert auto-resolve and similar
lightweight decisions, so production token spend on auto-resolve
analysis was invisible in the AI usage dashboard.

Mirror the same fix: capture costStore under the read lock alongside
provider/cfg, and after provider.Chat returns, record a UsageEvent
labelled with the request's UseCase (defaulting to "quick_analysis"
when the caller leaves it blank). Recording happens before the
empty-content guard so failed-but-billed calls are still visible.

Adds cost_recording_audit_test.go: an AST-level audit that walks
internal/ai/*.go (excluding _test.go and sub-packages), finds every
function calling .Chat() on a providers.Provider value, and asserts
each function body also references .Record() on a cost store.
Exemption is allowed via a //cost-recording-exempt: <reason> doc
comment. RunPatrolToolPreflight is annotated as exempt — it is a
connectivity self-test, not user workload, and should not pollute
the operator's cost dashboard.

The audit is intentionally local (function-scoped, not
interprocedural). A passthrough wrapper that calls a recording
function rather than calling Record itself would need an explicit
exemption naming the wrapped callee. Keeping the scan local makes
new Chat callers loud rather than letting silent gaps creep in via
indirection.

Future Chat callers must either record cost or carry the exemption
marker. The audit fails CI otherwise, so the regression that shipped
in b2bd9d114 (Narrate) and would have shipped again in d4463a615
(fleet) cannot recur silently.
This commit is contained in:
rcourtman 2026-05-10 21:57:39 +01:00
parent c6a786a36d
commit 08491b9f48
3 changed files with 198 additions and 0 deletions

View file

@ -0,0 +1,167 @@
package ai
import (
"go/ast"
"go/parser"
"go/token"
"io/fs"
"os"
"path/filepath"
"strings"
"testing"
)
// TestCostRecordingCoverage is an AST-level audit: every function in
// internal/ai that calls .Chat() on a providers.Provider value must
// also reference cost recording (.Record() on a cost store) within
// the same function body, or be explicitly exempted with a doc
// comment marker. This guards against the bug class we shipped twice
// in the report-narrative path: a Chat call that consumes provider
// tokens but doesn't surface in the operator's cost ledger.
//
// Exemption marker: place "//cost-recording-exempt: <reason>" in the
// function's doc comment. Use sparingly — the only legitimate case so
// far is patrol preflight (a connectivity test, not user workload).
//
// The scan is intentionally local (function-scoped, not interprocedural).
// A passthrough wrapper that calls a recording function instead of
// calling Record itself would need an exemption with the wrapped
// callee named in the reason. Keeping it local makes false positives
// loud rather than letting silent gaps creep in via indirection.
func TestCostRecordingCoverage(t *testing.T) {
const exemptMarker = "cost-recording-exempt"
type offender struct {
file string
funcName string
line int
}
var offenders []offender
fset := token.NewFileSet()
walkErr := filepath.WalkDir(".", func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
if d.IsDir() {
// Stay in the canonical package directory; the providers
// package implements Chat itself and is out of scope, and
// every other subdirectory is a separate package with its
// own coverage test if needed.
if path == "." {
return nil
}
return fs.SkipDir
}
name := d.Name()
if !strings.HasSuffix(name, ".go") || strings.HasSuffix(name, "_test.go") {
return nil
}
src, readErr := os.ReadFile(path)
if readErr != nil {
return readErr
}
file, parseErr := parser.ParseFile(fset, path, src, parser.ParseComments)
if parseErr != nil {
return parseErr
}
ast.Inspect(file, func(n ast.Node) bool {
fn, ok := n.(*ast.FuncDecl)
if !ok || fn.Body == nil {
return true
}
if hasExemptionComment(fn.Doc, exemptMarker) {
return true
}
if !bodyCallsMethod(fn.Body, "Chat") {
return true
}
if bodyCallsMethod(fn.Body, "Record") {
return true
}
pos := fset.Position(fn.Pos())
offenders = append(offenders, offender{
file: path,
funcName: fn.Name.Name,
line: pos.Line,
})
return true
})
return nil
})
if walkErr != nil {
t.Fatalf("walk: %v", walkErr)
}
if len(offenders) > 0 {
var lines []string
for _, o := range offenders {
lines = append(lines, " "+o.file+":"+itoa(o.line)+" "+o.funcName)
}
t.Fatalf("found %d function(s) calling provider.Chat without recording cost.\n"+
"Each call site must record a cost.UsageEvent or carry a //%s: <reason> doc comment.\n"+
"Offenders:\n%s",
len(offenders), exemptMarker, strings.Join(lines, "\n"))
}
}
func bodyCallsMethod(body *ast.BlockStmt, method string) bool {
found := false
ast.Inspect(body, func(n ast.Node) bool {
if found {
return false
}
call, ok := n.(*ast.CallExpr)
if !ok {
return true
}
sel, ok := call.Fun.(*ast.SelectorExpr)
if !ok {
return true
}
if sel.Sel != nil && sel.Sel.Name == method {
found = true
return false
}
return true
})
return found
}
func hasExemptionComment(doc *ast.CommentGroup, marker string) bool {
if doc == nil {
return false
}
for _, c := range doc.List {
if strings.Contains(c.Text, marker) {
return true
}
}
return false
}
// itoa avoids strconv just to keep this audit self-contained.
func itoa(n int) string {
if n == 0 {
return "0"
}
neg := n < 0
if neg {
n = -n
}
var buf [20]byte
i := len(buf)
for n > 0 {
i--
buf[i] = byte('0' + n%10)
n /= 10
}
if neg {
i--
buf[i] = '-'
}
return string(buf[i:])
}

View file

@ -102,6 +102,11 @@ func (s *Service) TriggerPatrolPreflightAsync(provider, model string) {
// classified into the result's Cause / Summary / Recommendation fields
// the same way runtime Patrol failures are, so the caller can render a
// single response shape for every outcome.
//
// cost-recording-exempt: connectivity/tool-call self-test, not user
// workload. Operator-triggered preflight is observability-shaped, not
// billed-feature-shaped, and should not pollute the AI usage dashboard.
// This is the only intentional exception to the cost-recording audit.
func (s *Service) RunPatrolToolPreflight(ctx context.Context, providerName, model string) PatrolPreflightResult {
started := time.Now()

View file

@ -1678,6 +1678,7 @@ func (s *Service) QuickAnalysis(ctx context.Context, req QuickAnalysisRequest) (
s.mu.RLock()
provider := s.provider
cfg := s.cfg
costStore := s.costStore
s.mu.RUnlock()
if provider == nil {
@ -1719,11 +1720,36 @@ func (s *Service) QuickAnalysis(ctx context.Context, req QuickAnalysisRequest) (
chatReq = sanitizer(chatReq)
}
useCase := strings.TrimSpace(req.UseCase)
if useCase == "" {
useCase = "quick_analysis"
}
resp, err := provider.Chat(ctx, chatReq)
if err != nil {
return "", fmt.Errorf("Pulse Assistant analysis failed: %w", err)
}
// Record token usage in the operator-facing cost ledger so quick
// analysis (alert auto-resolve, etc.) shows up alongside chat and
// patrol spend. Recording happens before the empty-content guard
// because the operator was billed regardless.
if costStore != nil {
providerName, _ := config.ParseModelString(model)
if providerName == "" {
providerName = provider.Name()
}
costStore.Record(cost.UsageEvent{
Timestamp: time.Now(),
Provider: providerName,
RequestModel: model,
ResponseModel: resp.Model,
UseCase: useCase,
InputTokens: resp.InputTokens,
OutputTokens: resp.OutputTokens,
})
}
if resp.Content == "" {
return "", fmt.Errorf("Pulse Assistant returned empty response")
}