Pulse/internal/ai/chat/fsm.go

435 lines
13 KiB
Go

package chat
import (
"fmt"
"strings"
"time"
)
// SessionState identifies the stage a chat session has reached in its
// tool-use workflow. SessionFSM consults the stage to decide which kinds of
// tool call are permitted, so the safety gates are enforced structurally
// instead of relying on prompt wording or on contributors remembering to
// add checks.
type SessionState string

// Workflow stages. These are the values stored in SessionFSM.State.
const (
	// StateResolving is the initial stage: no resource has been discovered
	// yet, so write tools are blocked until a resolve or read tool succeeds.
	StateResolving = SessionState("RESOLVING")

	// StateReading follows a successful resolve or read. Every tool kind is
	// permitted here; writes remain subject to gating applied elsewhere.
	StateReading = SessionState("READING")

	// StateWriting permits every tool kind and is described as transitional.
	// As far as this file shows, no transition assigns it: a successful write
	// moves straight to StateVerifying.
	StateWriting = SessionState("WRITING")

	// StateVerifying follows a successful write. At least one read (or
	// resolve) must succeed before a final answer or another write is allowed.
	StateVerifying = SessionState("VERIFYING")
)
// ToolKind is the coarse category of a tool call. The session FSM bases its
// state transitions on the kind, not on the individual tool name.
type ToolKind int

const (
	// ToolKindResolve covers discovery/query tools that locate resources.
	ToolKindResolve ToolKind = iota
	// ToolKindRead covers read-only tools (logs, metrics, status, config).
	ToolKindRead
	// ToolKindWrite covers mutating tools (restart, stop, start, delete,
	// file write).
	ToolKindWrite
)

// String returns the lowercase name of the kind ("resolve", "read" or
// "write"). Any value outside the declared constants yields "unknown".
func (k ToolKind) String() string {
	names := [...]string{
		ToolKindResolve: "resolve",
		ToolKindRead:    "read",
		ToolKindWrite:   "write",
	}
	if k < 0 || int(k) >= len(names) {
		return "unknown"
	}
	return names[k]
}
// SessionFSM tracks the workflow state for a chat session.
// This is stored alongside ResolvedContext in the session.
//
// Use NewSessionFSM to create one. The zero value is not the initial state:
// its State is the empty string, which CanExecuteTool does not recognize and
// therefore does not gate.
type SessionFSM struct {
// State is the current workflow stage; it decides what CanExecuteTool and
// CanFinalAnswer permit.
State SessionState `json:"state"`
// WroteThisEpisode is set by OnToolSuccess when a write tool succeeds.
// CompleteVerification leaves it set; only Reset and ResetKeepProgress clear it.
WroteThisEpisode bool `json:"wrote_this_episode"`
// ReadAfterWrite is set when a read or resolve tool succeeds while State is
// VERIFYING, i.e. *after* the last write. It is cleared again by the next
// write, by CompleteVerification and by both reset methods.
ReadAfterWrite bool `json:"read_after_write"`
// LastWriteTool records the last write tool for debugging/telemetry
LastWriteTool string `json:"last_write_tool,omitempty"`
// LastWriteAt records when the last write happened.
// NOTE: encoding/json's omitempty never omits struct values, so a zero
// time.Time is still serialized for this field.
LastWriteAt time.Time `json:"last_write_at,omitempty"`
// LastReadTool records the last read or resolve tool (for verification tracking)
LastReadTool string `json:"last_read_tool,omitempty"`
// LastReadAt records when the last read or resolve happened.
// NOTE: as with LastWriteAt, omitempty has no effect on a time.Time value.
LastReadAt time.Time `json:"last_read_at,omitempty"`
// PendingRecoveries tracks blocked operations awaiting recovery.
// The key is the recovery ID returned by TrackPendingRecovery (built from the
// tool name and a timestamp, not a UUID). Entries older than RecoveryTTL are
// pruned lazily. The field is excluded from JSON (`json:"-"`), so it is not
// part of the serialized session and may be nil on a decoded value.
PendingRecoveries map[string]*PendingRecovery `json:"-"`
}
// PendingRecovery tracks a blocked operation that may be retried after recovery.
// Records are created by TrackPendingRecovery and handed back (and removed) by
// CheckRecoverySuccess when the same tool later succeeds.
type PendingRecovery struct {
// RecoveryID is the correlation ID; it is also the key in
// SessionFSM.PendingRecoveries.
RecoveryID string `json:"recovery_id"`
// ErrorCode is the code of the error that blocked the call,
// e.g. FSM_BLOCKED or STRICT_RESOLUTION.
ErrorCode string `json:"error_code"`
// Tool is the name of the original tool that was blocked.
Tool string `json:"tool"`
// CreatedAt is when the block was recorded; expiry is measured from it.
CreatedAt time.Time `json:"created_at"`
// Attempts is the number of recovery attempts. TrackPendingRecovery
// initializes it to 1; no code visible in this file increments it.
Attempts int `json:"attempts"`
}
// RecoveryTTL is how long we track pending recoveries before cleanup.
// Cleanup is lazy: expired entries are only removed when
// cleanupExpiredRecoveries runs, which happens at the start of
// TrackPendingRecovery and CheckRecoverySuccess.
const RecoveryTTL = 10 * time.Minute
// NewSessionFSM returns a fresh FSM positioned at the initial RESOLVING
// state, with an empty (non-nil) pending-recovery table.
func NewSessionFSM() *SessionFSM {
	fsm := new(SessionFSM)
	fsm.State = StateResolving
	fsm.PendingRecoveries = map[string]*PendingRecovery{}
	return fsm
}
// TrackPendingRecovery records a blocked operation that may be recovered.
//
// errorCode is the code of the error that blocked the call and tool is the
// name of the blocked tool. The returned recovery ID is the key of the new
// entry in PendingRecoveries and can be used for correlation. The ID has the
// form "<tool>-<unix-nanos>"; a numeric suffix is appended only when that key
// is already taken.
func (fsm *SessionFSM) TrackPendingRecovery(errorCode, tool string) string {
	// Prunes expired entries and also allocates the map when it is nil.
	fsm.cleanupExpiredRecoveries()

	// A single clock reading feeds both the ID and CreatedAt so the two
	// always agree.
	now := time.Now()
	base := fmt.Sprintf("%s-%d", tool, now.UnixNano())

	// On a coarse clock two blocks of the same tool can land on the same
	// timestamp. Without this loop the second entry would silently overwrite
	// the first and both callers would hold the same ID.
	recoveryID := base
	for n := 1; ; n++ {
		if _, taken := fsm.PendingRecoveries[recoveryID]; !taken {
			break
		}
		recoveryID = fmt.Sprintf("%s-%d", base, n)
	}

	fsm.PendingRecoveries[recoveryID] = &PendingRecovery{
		RecoveryID: recoveryID,
		ErrorCode:  errorCode,
		Tool:       tool,
		CreatedAt:  now,
		Attempts:   1,
	}
	return recoveryID
}
// CheckRecoverySuccess checks if a successful tool call resolves a pending recovery.
//
// tool is the name of the tool that just succeeded. If one or more unexpired
// recoveries are pending for that tool, the oldest one is removed from
// tracking and returned (for metrics); otherwise nil is returned. At most one
// recovery is resolved per call.
func (fsm *SessionFSM) CheckRecoverySuccess(tool string) *PendingRecovery {
	fsm.cleanupExpiredRecoveries()

	// Map iteration order is unspecified, so returning the first match would
	// pick an arbitrary entry when the same tool was blocked more than once.
	// Choose the oldest instead; ties on the timestamp are broken by ID so
	// the result is fully deterministic.
	var (
		oldestID string
		oldest   *PendingRecovery
	)
	for id, pr := range fsm.PendingRecoveries {
		if pr.Tool != tool {
			continue
		}
		switch {
		case oldest == nil,
			pr.CreatedAt.Before(oldest.CreatedAt),
			pr.CreatedAt.Equal(oldest.CreatedAt) && id < oldestID:
			oldestID, oldest = id, pr
		}
	}
	if oldest == nil {
		return nil
	}
	delete(fsm.PendingRecoveries, oldestID)
	return oldest
}
// cleanupExpiredRecoveries drops every pending recovery created more than
// RecoveryTTL ago. When the table is nil it is allocated instead, so callers
// can write to it right after this returns.
func (fsm *SessionFSM) cleanupExpiredRecoveries() {
	if fsm.PendingRecoveries == nil {
		fsm.PendingRecoveries = map[string]*PendingRecovery{}
		return
	}

	oldestAllowed := time.Now().Add(-RecoveryTTL)
	for id, rec := range fsm.PendingRecoveries {
		if !rec.CreatedAt.Before(oldestAllowed) {
			continue // still inside the TTL window
		}
		// Deleting the current key while ranging over a map is safe in Go.
		delete(fsm.PendingRecoveries, id)
	}
}
// CanExecuteTool reports whether the current state permits a tool call of the
// given kind. It returns nil when the call is allowed and a recoverable
// *FSMBlockedError explaining the refusal otherwise.
//
// Only write tools are ever gated:
//   - RESOLVING blocks writes until something has been discovered.
//   - VERIFYING blocks writes until the previous write has been verified.
//     The check looks at State alone, so writes stay blocked until
//     CompleteVerification moves the FSM back to READING, even if
//     ReadAfterWrite is already true.
//
// READING, WRITING and any state value not listed here allow every kind.
func (fsm *SessionFSM) CanExecuteTool(kind ToolKind, toolName string) error {
	// Resolve and read tools (and unknown kinds) pass in every state.
	if kind != ToolKindWrite {
		return nil
	}

	var reason string
	switch fsm.State {
	case StateResolving:
		reason = "No resources have been discovered yet. Use pulse_query to discover resources before performing write operations."
	case StateVerifying:
		reason = "Must verify the previous write operation before performing another write. Use a read tool (logs, status, query) to check the result first."
	default:
		return nil
	}

	return &FSMBlockedError{
		State:       fsm.State,
		ToolName:    toolName,
		ToolKind:    kind,
		Reason:      reason,
		Recoverable: true,
	}
}
// CanFinalAnswer reports whether the model may produce its final answer now.
// It returns a recoverable *FSMBlockedError while a write is still awaiting
// verification (state VERIFYING with no read since the write), signalling
// that the model should keep making tool calls. In every other situation it
// returns nil.
func (fsm *SessionFSM) CanFinalAnswer() error {
	verificationPending := fsm.State == StateVerifying && !fsm.ReadAfterWrite
	if !verificationPending {
		return nil
	}
	return &FSMBlockedError{
		State:       fsm.State,
		Reason:      "Must verify the write operation before providing a final answer. Use a read tool to check the result.",
		Recoverable: true,
	}
}
// OnToolSuccess advances the FSM after a tool call has completed
// successfully. kind is the classification of the call and toolName is
// recorded for telemetry.
//
//   - A write moves the FSM to VERIFYING, marks the episode as having written
//     and clears ReadAfterWrite.
//   - A resolve or a read is recorded as the last read. From RESOLVING it
//     moves the FSM to READING; in VERIFYING it satisfies the read-after-write
//     requirement (the state itself changes only via CompleteVerification).
//   - Any other kind leaves the FSM untouched.
func (fsm *SessionFSM) OnToolSuccess(kind ToolKind, toolName string) {
	at := time.Now()

	if kind == ToolKindWrite {
		fsm.State = StateVerifying
		fsm.WroteThisEpisode = true
		fsm.ReadAfterWrite = false
		fsm.LastWriteTool, fsm.LastWriteAt = toolName, at
		return
	}

	// Discovery counts as a read, so both kinds share one path.
	if kind != ToolKindResolve && kind != ToolKindRead {
		return
	}
	fsm.LastReadTool, fsm.LastReadAt = toolName, at

	switch fsm.State {
	case StateResolving:
		fsm.State = StateReading
	case StateVerifying:
		fsm.ReadAfterWrite = true
	}
}
// CompleteVerification closes a verification cycle: when the FSM is in
// VERIFYING and a read has happened since the last write, it returns to
// READING so that new writes are allowed again. In any other situation the
// call does nothing.
func (fsm *SessionFSM) CompleteVerification() {
	if fsm.State != StateVerifying || !fsm.ReadAfterWrite {
		return
	}
	// Re-arm the flag for the next write/verify cycle. WroteThisEpisode is
	// deliberately left alone: it records that a write happened at all, not
	// that one is currently awaiting verification.
	fsm.ReadAfterWrite = false
	fsm.State = StateReading
}
// Reset returns the FSM to the same condition NewSessionFSM produces
// (e.g., for session clear): state RESOLVING, no write/read history and no
// pending recoveries.
func (fsm *SessionFSM) Reset() {
	fsm.State = StateResolving
	fsm.WroteThisEpisode = false
	fsm.ReadAfterWrite = false
	fsm.LastWriteTool = ""
	fsm.LastWriteAt = time.Time{}
	fsm.LastReadTool = ""
	fsm.LastReadAt = time.Time{}
	// Drop recoveries recorded before the reset. Left in place, a block from
	// the cleared conversation could be matched by CheckRecoverySuccess and
	// reported as a successful recovery in the new one. A fresh map (instead
	// of nil) keeps the FSM identical to a newly constructed one.
	fsm.PendingRecoveries = make(map[string]*PendingRecovery)
}
// ResetKeepProgress clears the write/verification flags without sending the
// session back to discovery (used for context clear with keepPinned=true).
// A session that was mid-verification drops to READING; any other state is
// kept as is. The last read/write tool records are not touched.
func (fsm *SessionFSM) ResetKeepProgress() {
	fsm.WroteThisEpisode, fsm.ReadAfterWrite = false, false
	if fsm.State == StateVerifying {
		fsm.State = StateReading
	}
}
// FSMBlockedError is the error produced when the session FSM refuses an
// action, either a tool call (CanExecuteTool) or a final answer
// (CanFinalAnswer).
type FSMBlockedError struct {
	// State is the FSM state at the moment the action was refused.
	State SessionState
	// ToolName is the refused tool. CanFinalAnswer leaves it empty.
	ToolName string
	// ToolKind is the classification of the refused tool call.
	ToolKind ToolKind
	// Reason is the explanation, phrased as guidance on what to do next.
	Reason string
	// Recoverable is set to true by every construction site in this file.
	Recoverable bool
}

// Error implements the error interface. The tool name and kind are included
// only when ToolName is set.
func (e *FSMBlockedError) Error() string {
	if e.ToolName == "" {
		return fmt.Sprintf("FSM blocked in state %s: %s", e.State, e.Reason)
	}
	return fmt.Sprintf("FSM blocked tool '%s' (%s) in state %s: %s", e.ToolName, e.ToolKind, e.State, e.Reason)
}

// Code returns the error code for tool responses. It is the same for every
// FSMBlockedError.
func (e *FSMBlockedError) Code() string {
	const code = "FSM_BLOCKED"
	return code
}
// classifyToolByName maps a tool invocation onto the ToolKind that drives the
// session FSM. This is the single, central classification table: every new
// tool should be added here.
//
// Known tools are classified by name, and for a few of them by the
// lower-cased "action" argument. For tools not listed, the lower-cased
// "action" and "operation" arguments are compared against generic write verbs
// first and generic read verbs second. Anything still unmatched is treated as
// a write, the security-safe default: it requires discovery beforehand and
// verification afterwards, so a new tool cannot slip past the FSM gates.
//
// args may be nil. A missing or non-string "action"/"operation" is treated as
// the empty string.
func classifyToolByName(toolName string, args map[string]interface{}) ToolKind {
	rawAction, _ := args["action"].(string)
	rawOperation, _ := args["operation"].(string)
	act := strings.ToLower(rawAction)
	op := strings.ToLower(rawOperation)

	switch toolName {
	// Query/discovery tools, current names followed by legacy names.
	case "pulse_query", "pulse_discovery",
		"pulse_search_resources", "pulse_get_resource", "pulse_get_topology",
		"pulse_list_infrastructure", "pulse_get_connection_health":
		return ToolKindResolve

	// Tools classified as read-only whatever their arguments. pulse_read is
	// always a read and therefore never triggers VERIFYING. Reading existing
	// patrol findings does not require discovery.
	case "pulse_metrics", "pulse_storage", "pulse_kubernetes", "pulse_pmg", "pulse_read",
		"pulse_get_docker_logs", "pulse_get_performance_metrics",
		"pulse_get_temperatures", "pulse_get_baselines", "pulse_get_patterns",
		"patrol_get_findings":
		return ToolKindRead

	// Tools classified as writes whatever their arguments (guest control,
	// command execution, legacy control tools, patrol mutations).
	case "pulse_control",
		"pulse_run_command", "pulse_control_guest", "pulse_control_docker",
		"patrol_report_finding", "patrol_resolve_finding":
		return ToolKindWrite

	// Alerts: only resolving or dismissing modifies alert state.
	case "pulse_alerts":
		if act == "resolve" || act == "dismiss" {
			return ToolKindWrite
		}
		return ToolKindRead

	// Knowledge: storing is a write, everything else (e.g. recall) a read.
	case "pulse_knowledge":
		if act == "remember" || act == "note" || act == "save" {
			return ToolKindWrite
		}
		return ToolKindRead

	// Docker: control and the update family are writes; services, tasks,
	// swarm, list and anything else are reads.
	case "pulse_docker":
		switch act {
		case "control", "update", "check_updates", "trigger_update":
			return ToolKindWrite
		}
		return ToolKindRead

	// File edit: write and append mutate; read and any other action are
	// classified as reads.
	case "pulse_file_edit":
		if act == "write" || act == "append" {
			return ToolKindWrite
		}
		return ToolKindRead
	}

	// Tool not listed above: fall back to the verb in action/operation.
	// A write verb in either argument wins over a read verb in the other.
	verbs := [2]string{act, op}
	for _, verb := range verbs {
		switch verb {
		case "start", "stop", "restart", "delete",
			"shutdown", "reboot", "write", "append",
			"update", "trigger", "resolve", "dismiss",
			"control":
			return ToolKindWrite
		}
	}
	for _, verb := range verbs {
		switch verb {
		case "get", "list", "search", "query",
			"read", "logs", "status", "health",
			"describe", "inspect", "show":
			return ToolKindRead
		}
	}

	// Unknown tool with no recognizable verb: fail closed.
	return ToolKindWrite
}
// ClassifyToolCall classifies a tool call for FSM state transitions.
// This is the exported function that the agentic loop should use.
//
// toolName is the name of the tool being invoked and args its arguments (may
// be nil). The call is forwarded unchanged to classifyToolByName, so the
// result follows that function's table, including its default of
// ToolKindWrite for tools it does not recognize.
func ClassifyToolCall(toolName string, args map[string]interface{}) ToolKind {
return classifyToolByName(toolName, args)
}