diff --git a/docs/release-control/v6/internal/subsystems/ai-runtime.md b/docs/release-control/v6/internal/subsystems/ai-runtime.md index 2f09314cd..bed299645 100644 --- a/docs/release-control/v6/internal/subsystems/ai-runtime.md +++ b/docs/release-control/v6/internal/subsystems/ai-runtime.md @@ -108,7 +108,7 @@ runtime cost control, and shared AI transport surfaces. ## Completion Obligations -1. Update this contract when canonical AI runtime or transport entry points move, including transport-level provider request-shape changes such as DeepSeek `tool_choice` coercion, runtime-failure classification splits (for example separating forced tool selection rejection, no tool-capable endpoint, and generic model-level lack of tool support into distinct causes), Patrol-specific verification surfaces such as `POST /api/ai/patrol/preflight` that exercise the full chat-completions path with a minimal tool definition rather than only listing models, Patrol-preflight cache observability where the AI Service caches the most recent preflight outcome (success, soft warning, or classified failure) and the AI settings response surfaces it as `patrol_preflight` so the UI can hydrate a "last verified" indicator without forcing operators to re-run preflight on every page load, the auto-trigger contract on `HandleUpdateAISettings` where the save handler runs `TriggerPatrolPreflightAsync` only when the change actually moved Patrol transport (model swap, provider key for that model changed, or assistant just enabled with a Patrol model) so routine settings saves do not burn provider tokens, the startup-seed contract where the AI Service handler dispatches the same async preflight on Pulse boot when assistant is enabled and a Patrol model is configured so the cache is populated for the first `/api/settings/ai` poll after a restart instead of blanking back to "never verified", and the readiness-integration contract where the `tools` check in the Patrol readiness payload consults the cached preflight and surfaces the classified evidence (success, soft warning, or failure with classified summary plus "last preflight ") for the configured provider+model when available, falling back to the static `PatrolToolReadinessForModel` classifier only when the cache is empty or holds a result for a different model +1. Update this contract when canonical AI runtime or transport entry points move, including transport-level provider request-shape changes such as DeepSeek `tool_choice` coercion, runtime-failure classification splits (for example separating forced tool selection rejection, no tool-capable endpoint, and generic model-level lack of tool support into distinct causes), Patrol-specific verification surfaces such as `POST /api/ai/patrol/preflight` that exercise the full chat-completions path with a minimal tool definition rather than only listing models, Patrol-preflight cache observability where the AI Service caches the most recent preflight outcome (success, soft warning, or classified failure) and the AI settings response surfaces it as `patrol_preflight` so the UI can hydrate a "last verified" indicator without forcing operators to re-run preflight on every page load, the auto-trigger contract on `HandleUpdateAISettings` where the save handler runs `TriggerPatrolPreflightAsync` only when the change actually moved Patrol transport (model swap, provider key for that model changed, or assistant just enabled with a Patrol model) so routine settings saves do not burn provider tokens, the startup-seed contract where the AI Service handler dispatches the same async preflight on Pulse boot when assistant is enabled and a Patrol model is configured so the cache is populated for the first `/api/settings/ai` poll after a restart instead of blanking back to "never verified", the readiness-integration contract where the `tools` check in the Patrol readiness payload consults the cached preflight and surfaces the classified evidence (success, soft warning, or failure with classified summary plus "last preflight ") for the configured provider+model when available (falling back to the static `PatrolToolReadinessForModel` classifier only when the cache is empty or holds a result for a different model), and the stateless-Patrol-input contract where `ExecutePatrolStream` must pass only the current run's user prompt into the agentic loop rather than reloading the persisted `patrol-main` session history (so a prior run that ended with orphan `tool_calls` cannot poison every subsequent run with malformed conversation structure) 2. Keep AI runtime and shared API proof routing aligned in `registry.json` 3. Preserve explicit coverage for chat, Patrol, remediation, and cost-control behavior when AI runtime changes Patrol runtime failures are part of that runtime contract: provider, model, diff --git a/internal/ai/chat/service.go b/internal/ai/chat/service.go index bda65709a..50893d6e6 100644 --- a/internal/ai/chat/service.go +++ b/internal/ai/chat/service.go @@ -1906,7 +1906,7 @@ func (s *Service) ExecutePatrolStream(ctx context.Context, req PatrolRequest, ca tempLoop.SetBudgetChecker(s.budgetChecker) } - // Add user message + // Add user message to the session for forensics / audit trail. userMsg := Message{ ID: uuid.New().String(), Role: "user", @@ -1917,11 +1917,16 @@ func (s *Service) ExecutePatrolStream(ctx context.Context, req PatrolRequest, ca log.Warn().Err(err).Msg("failed to save patrol user message") } - // Get messages for context - messages, err := sessions.GetMessages(session.ID) - if err != nil { - return nil, fmt.Errorf("failed to get patrol messages: %w", err) - } + // Patrol runs are stateless investigations. The "patrol-main" session + // reuses the same id across scheduled runs, so loading prior session + // history into the agentic loop accumulates broken state: when any + // run ends after the model emitted tool_calls but before all tool + // results landed (provider error, timeout, context cancellation), + // the orphan tool_calls persist and every subsequent run hits + // "An assistant message with 'tool_calls' must be followed by tool + // messages responding to each 'tool_call_id'." Patrol must see only + // this run's user prompt; the session is just a forensic log. + messages := []Message{userMsg} // Get all tools (patrol runs in autonomous mode) filteredTools := s.filterToolsForPrompt(ctx, req.Prompt, true, true) diff --git a/internal/ai/patrol_readiness.go b/internal/ai/patrol_readiness.go index 03a9e33f0..b8bc078dc 100644 --- a/internal/ai/patrol_readiness.go +++ b/internal/ai/patrol_readiness.go @@ -27,6 +27,7 @@ const ( PatrolFailureCauseModelToolSupportUnverified PatrolFailureCause = "model_tool_support_unverified" PatrolFailureCauseToolChoiceRejected PatrolFailureCause = "tool_choice_rejected" PatrolFailureCauseNoToolCapableEndpoint PatrolFailureCause = "no_tool_capable_endpoint" + PatrolFailureCauseMalformedToolHistory PatrolFailureCause = "malformed_tool_history" PatrolFailureCauseModelUnavailable PatrolFailureCause = "model_unavailable" PatrolFailureCauseContextWindowTooSmall PatrolFailureCause = "context_window_too_small" PatrolFailureCauseProviderBilling PatrolFailureCause = "provider_billing" diff --git a/internal/ai/patrol_runtime_failure.go b/internal/ai/patrol_runtime_failure.go index 73f8803f3..cb40f1c00 100644 --- a/internal/ai/patrol_runtime_failure.go +++ b/internal/ai/patrol_runtime_failure.go @@ -100,6 +100,22 @@ func patrolNoToolCapableEndpoint(lower string) bool { return strings.Contains(lower, "no endpoints found") && strings.Contains(lower, "tool") } +// patrolMalformedToolHistory reports whether the upstream error indicates +// Pulse sent a conversation where an assistant message had tool_calls +// without matching tool result messages for every tool_call_id. Distinct +// from tool_choice / capability errors: this is a structural mismatch in +// the message slice Pulse assembled. DeepSeek phrases it as +// "An assistant message with 'tool_calls' must be followed by tool messages +// responding to each 'tool_call_id'", OpenAI uses similar wording. +func patrolMalformedToolHistory(lower string) bool { + if !strings.Contains(lower, "tool_call_id") && !strings.Contains(lower, "tool_calls") { + return false + } + return strings.Contains(lower, "must be followed by tool messages") || + strings.Contains(lower, "insufficient tool messages") || + strings.Contains(lower, "responding to each") +} + func ClassifyPatrolRuntimeFailure(err error) PatrolRuntimeFailureDiagnostic { failure := patrolRuntimeFailureFromError(err) return PatrolRuntimeFailureDiagnostic{ @@ -130,6 +146,12 @@ func patrolRuntimeFailureFromError(err error) patrolRuntimeFailure { } switch { + case patrolMalformedToolHistory(lower): + failure.Title = "Pulse Patrol: Malformed tool-call conversation history" + failure.Summary = "Malformed tool-call conversation history" + failure.Cause = PatrolFailureCauseMalformedToolHistory + failure.Description = "Pulse Patrol reached the provider, but the conversation it sent had an assistant message containing tool_calls without matching tool result messages for every tool_call_id. The provider rejects this structure. This usually means a previous Patrol run ended after the model emitted tool calls but before all results were captured, leaving orphan tool_calls in persisted state that the next run reused." + failure.Recommendation = "Pulse should treat each Patrol run as stateless. If the failure persists across runs, restart Pulse to clear any in-memory session state and report the issue." case patrolToolChoiceValueRejected(lower): failure.Title = "Pulse Patrol: Provider rejected forced tool selection" failure.Summary = "Provider rejected forced tool selection" @@ -242,6 +264,8 @@ func summarizePatrolRuntimeFailureDetail(raw string) string { } lower := strings.ToLower(raw) switch { + case patrolMalformedToolHistory(lower): + return "Pulse sent a malformed tool-call conversation. Each Patrol run should be stateless; restart Pulse if the failure persists." case patrolToolChoiceValueRejected(lower): return "Provider rejected Pulse's forced tool selection. Pulse will retry with automatic tool selection on the next Patrol run." case patrolNoToolCapableEndpoint(lower): diff --git a/internal/ai/patrol_runtime_failure_test.go b/internal/ai/patrol_runtime_failure_test.go index 0da8fcb7e..ab07c8ea2 100644 --- a/internal/ai/patrol_runtime_failure_test.go +++ b/internal/ai/patrol_runtime_failure_test.go @@ -96,6 +96,36 @@ func TestPatrolRuntimeFailureFromError_ClassifiesToolChoiceValueRejected(t *test } } +func TestPatrolRuntimeFailureFromError_ClassifiesMalformedToolHistory(t *testing.T) { + // Real failure mode that bit Patrol after the DeepSeek tool_choice fix + // landed and Patrol started actually executing tool calls: the + // patrol-main session was reused across runs, so when one run ended + // after the model emitted tool_calls but before all results landed, + // the next run inherited orphan tool_calls and the provider rejected + // the conversation structure with this exact phrasing. + err := errors.New(`agentic patrol failed: provider error: API error (400): An assistant message with 'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. (insufficient tool messages following tool_calls message)`) + + failure := patrolRuntimeFailureFromError(err) + + if failure.Title != "Pulse Patrol: Malformed tool-call conversation history" { + t.Fatalf("unexpected title %q", failure.Title) + } + if failure.Cause != PatrolFailureCauseMalformedToolHistory { + t.Fatalf("unexpected cause %q (want %q)", failure.Cause, PatrolFailureCauseMalformedToolHistory) + } + if !strings.Contains(failure.Description, "tool_calls without matching tool result messages") { + t.Fatalf("expected description to explain orphan tool_calls, got %q", failure.Description) + } + if !strings.Contains(failure.Recommendation, "stateless") { + t.Fatalf("expected recommendation to mention statelessness, got %q", failure.Recommendation) + } + // Same redaction invariant as the rest of the classifier — never + // leak the literal upstream parameter names into Evidence. + if strings.Contains(failure.Evidence, "tool_call_id") { + t.Fatalf("evidence leaked raw provider parameter name: %q", failure.Evidence) + } +} + func TestPatrolRuntimeFailureFromError_ClassifiesGenericToolUnsupported(t *testing.T) { // Generic "tools are not supported" fallback for providers that say // the model truly cannot call tools (not a value-rejection or routing