feat(core): detect tool validation retry loops and inject stop directive (#3178)

Primary change: prevent the model from burning tokens in an infinite retry
loop when a tool call repeatedly fails schema validation with the same
error (observed with ask_user_question and a malformed `questions`
parameter retrying 10+ times with the same validation error).

- Track consecutive validation failures per (tool name, error message)
  pair in CoreToolScheduler via a `validationRetryCounts` Map.
- After 3 consecutive failures for the same (tool, error) pair, append a
  RETRY LOOP DETECTED directive to the error response instructing the
  model to stop, re-examine the schema, try a fundamentally different
  approach, or surface the issue to the user.
- Reset per-tool counters when the tool invocation succeeds; reset
  globally when an incoming batch shares no tool name with any
  previously failing tool; reset the per-tool counter when the tool
  returns a different validation error so unrelated mistakes do not
  accumulate toward the threshold.
- Distinct from LoopDetectionService, which tracks model-behavior loops
  (repeated thoughts, stagnant actions); this change catches tool-API
  misuse loops at the scheduler layer.

Piggyback fixes bundled in the same PR:

- packages/cli/index.ts, packages/core/src/services/shellExecutionService.ts:
  treat PTY `EAGAIN` on the read path as an expected read error alongside
  `EIO`, avoiding noisy surface-level failures from transient
  non-blocking reads.
- scripts/build.js: switch the settings-schema generation step from
  `npx tsx` to `node --import tsx/esm` for Bun compatibility.

Tests:

- Unit tests in coreToolScheduler.test.ts cover: directive injection on
  the 3rd consecutive failure, counter reset when a different tool is
  called, and counter reset after a successful invocation of the same
  tool (fail → fail → succeed → fail → fail must not trip the directive).
This commit is contained in:
euxaristia 2026-04-17 22:24:46 -04:00 committed by GitHub
parent 699cb05206
commit 5facd8738b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 310 additions and 5 deletions

View file

@ -51,6 +51,14 @@ const isExpectedPtyRaceError = (error: unknown): boolean => {
return true;
}
// EAGAIN: transient non-blocking read error from PTY fd
if (
(code === 'EAGAIN' && message.includes('read')) ||
message.includes('read EAGAIN')
) {
return true;
}
// PTY-specific resize/exit race errors - require PTY context in message
if (
message.includes('ioctl(2) failed, EBADF') ||

View file

@ -3849,3 +3849,236 @@ describe('CoreToolScheduler IDE interaction', () => {
expect(mockIdeClient.openDiff).not.toHaveBeenCalled();
});
});
describe('CoreToolScheduler validation retry loop detection', () => {
const RETRY_LOOP_STOP_DIRECTIVE = 'RETRY LOOP DETECTED';
/** Tool with a schema that requires a string `value` param. */
class StrictStringTool extends BaseDeclarativeTool<
{ value: string },
ToolResult
> {
static readonly Name = 'strictStringTool';
constructor() {
super(
StrictStringTool.Name,
'StrictStringTool',
'A tool that requires a string value param.',
Kind.Other,
{
type: 'object',
properties: { value: { type: 'string' } },
required: ['value'],
},
);
}
protected createInvocation(params: {
value: string;
}): ToolInvocation<{ value: string }, ToolResult> {
return new (class extends BaseToolInvocation<
{ value: string },
ToolResult
> {
constructor(p: { value: string }) {
super(p);
}
getDescription(): string {
return 'strictStringTool invocation';
}
async execute(): Promise<ToolResult> {
return { llmContent: 'ok', returnDisplay: 'ok' };
}
})(params);
}
}
function createSchedulerWithTool(tool: StrictStringTool) {
const mockToolRegistry = {
getTool: () => tool,
getFunctionDeclarations: () => [],
tools: new Map(),
discovery: {},
registerTool: () => {},
getToolByName: () => tool,
getToolByDisplayName: () => tool,
getTools: () => [],
discoverTools: async () => {},
getAllTools: () => [],
getToolsByServer: () => [],
} as unknown as ToolRegistry;
const mockConfig = {
getSessionId: () => 'test-session-id',
getUsageStatisticsEnabled: () => true,
getDebugMode: () => false,
getApprovalMode: () => ApprovalMode.YOLO,
getPermissionsAllow: () => [],
getContentGeneratorConfig: () => ({
model: 'test-model',
authType: 'gemini',
}),
getShellExecutionConfig: () => ({
terminalWidth: 90,
terminalHeight: 30,
}),
storage: { getProjectTempDir: () => '/tmp' },
getTruncateToolOutputThreshold: () => 100,
getTruncateToolOutputLines: () => 10,
getToolRegistry: () => mockToolRegistry,
getUseModelRouter: () => false,
getGeminiClient: () => null,
isInteractive: () => true,
getIdeMode: () => false,
getExperimentalZedIntegration: () => false,
getChatRecordingService: () => undefined,
getMessageBus: vi.fn().mockReturnValue(undefined),
getDisableAllHooks: vi.fn().mockReturnValue(true),
setApprovalMode: vi.fn(),
} as unknown as Config;
const onAllToolCallsComplete = vi.fn();
const onToolCallsUpdate = vi.fn();
const scheduler = new CoreToolScheduler({
config: mockConfig,
onAllToolCallsComplete,
onToolCallsUpdate,
getPreferredEditor: () => 'vscode',
onEditorClose: vi.fn(),
});
return { scheduler, onToolCallsUpdate, onAllToolCallsComplete };
}
function makeRequest(
callId: string,
name: string,
args: Record<string, unknown>,
) {
return {
callId,
name,
args,
isClientInitiated: false,
prompt_id: `prompt-${callId}`,
};
}
function getLastErrorMessage(onToolCallsUpdate: Mock): string | undefined {
const calls = onToolCallsUpdate.mock.calls;
for (let i = calls.length - 1; i >= 0; i--) {
const toolCalls = calls[i][0] as ToolCall[];
for (const call of toolCalls) {
if (call.status === 'error' && call.response?.responseParts) {
for (const part of call.response.responseParts) {
if ('functionResponse' in part) {
const resp = part.functionResponse as {
response?: { error?: string };
};
if (resp.response?.error) return resp.response.error;
}
}
}
}
}
return undefined;
}
it('should inject RETRY LOOP DETECTED directive after 3 consecutive validation failures', async () => {
const tool = new StrictStringTool();
const { scheduler, onToolCallsUpdate } = createSchedulerWithTool(tool);
// Turn 1: bad params (value is number, not string)
await scheduler.schedule(
[makeRequest('c1', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
let msg = getLastErrorMessage(onToolCallsUpdate);
expect(msg).toBeDefined();
expect(msg).not.toContain(RETRY_LOOP_STOP_DIRECTIVE);
// Turn 2: same bad params
await scheduler.schedule(
[makeRequest('c2', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
msg = getLastErrorMessage(onToolCallsUpdate);
expect(msg).not.toContain(RETRY_LOOP_STOP_DIRECTIVE);
// Turn 3: same bad params — should trigger directive
await scheduler.schedule(
[makeRequest('c3', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
msg = getLastErrorMessage(onToolCallsUpdate);
expect(msg).toContain(RETRY_LOOP_STOP_DIRECTIVE);
});
it('should reset retry counter when a different tool is called', async () => {
const tool = new StrictStringTool();
const { scheduler, onToolCallsUpdate } = createSchedulerWithTool(tool);
// Turn 1-2: tool fails twice
await scheduler.schedule(
[makeRequest('c1', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
await scheduler.schedule(
[makeRequest('c2', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
// Turn 3: switch to a different tool that also fails
// We simulate by calling with a tool name that won't be found
await scheduler.schedule(
[makeRequest('c3', 'nonexistentTool', {})],
new AbortController().signal,
);
// Turn 4: back to tool — should be count 1 again (no directive)
await scheduler.schedule(
[makeRequest('c4', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
const msg = getLastErrorMessage(onToolCallsUpdate);
expect(msg).toBeDefined();
expect(msg).not.toContain(RETRY_LOOP_STOP_DIRECTIVE);
});
it('should reset retry counter after a successful invocation of the same tool', async () => {
const tool = new StrictStringTool();
const { scheduler, onToolCallsUpdate } = createSchedulerWithTool(tool);
// Two validation failures with the same error.
await scheduler.schedule(
[makeRequest('c1', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
await scheduler.schedule(
[makeRequest('c2', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
// A valid invocation succeeds, which must clear the per-tool counter.
await scheduler.schedule(
[makeRequest('c3', 'strictStringTool', { value: 'ok' })],
new AbortController().signal,
);
// Two more failures — count should restart at 1, not jump to 3+.
await scheduler.schedule(
[makeRequest('c4', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
await scheduler.schedule(
[makeRequest('c5', 'strictStringTool', { value: 123 })],
new AbortController().signal,
);
const msg = getLastErrorMessage(onToolCallsUpdate);
expect(msg).toBeDefined();
expect(msg).not.toContain(RETRY_LOOP_STOP_DIRECTIVE);
});
});

View file

@ -299,6 +299,15 @@ function toParts(input: PartListUnion): Part[] {
return parts;
}
const VALIDATION_RETRY_LOOP_THRESHOLD = 3;
/** Directive injected when a tool call repeatedly fails validation. */
const RETRY_LOOP_STOP_DIRECTIVE =
'\n\n⚠ RETRY LOOP DETECTED: This tool call has failed validation multiple times with the same error. ' +
'STOP retrying the same approach. Re-examine the tool schema and parameter requirements, then try a ' +
'fundamentally different approach. If you cannot resolve the validation error, explain the issue to the user ' +
'instead of retrying.';
const createErrorResponse = (
request: ToolCallRequestInfo,
error: Error,
@ -397,6 +406,7 @@ export class CoreToolScheduler {
private chatRecordingService?: ChatRecordingService;
private isFinalizingToolCalls = false;
private isScheduling = false;
private validationRetryCounts = new Map<string, number>();
private requestQueue: Array<{
request: ToolCallRequestInfo | ToolCallRequestInfo[];
signal: AbortSignal;
@ -463,6 +473,8 @@ export class CoreToolScheduler {
switch (newStatus) {
case 'success': {
// Successful execution only resets retry state for this tool
this.clearRetryCountsForTool(currentCall.request.name);
const durationMs = existingStartTime
? Date.now() - existingStartTime
: undefined;
@ -751,6 +763,20 @@ export class CoreToolScheduler {
return this._schedule(request, signal);
}
/**
* Removes all validation retry counters for the given tool. Keys are
* "<toolName>:<errorMessage>", so a plain `Map.delete(toolName)` would not
* match anything.
*/
private clearRetryCountsForTool(toolName: string): void {
const prefix = `${toolName}:`;
for (const key of this.validationRetryCounts.keys()) {
if (key.startsWith(prefix)) {
this.validationRetryCounts.delete(key);
}
}
}
private async _schedule(
request: ToolCallRequestInfo | ToolCallRequestInfo[],
signal: AbortSignal,
@ -764,6 +790,23 @@ export class CoreToolScheduler {
}
const requestsToProcess = Array.isArray(request) ? request : [request];
// Check if this batch continues a validation retry loop.
// Keys are "<toolName>:<errorMessage>"; if no request reuses a tool name
// that previously failed validation, reset the tracker.
if (this.validationRetryCounts.size > 0) {
const prevTools = new Set<string>();
for (const key of this.validationRetryCounts.keys()) {
const sep = key.indexOf(':');
prevTools.add(sep === -1 ? key : key.slice(0, sep));
}
const hasPrevFailingTool = requestsToProcess.some((r) =>
prevTools.has(r.name),
);
if (!hasPrevFailingTool) {
this.validationRetryCounts.clear();
}
}
const newToolCalls: ToolCall[] = [];
for (const reqInfo of requestsToProcess) {
// Check if the tool is excluded due to permissions/environment restrictions
@ -839,18 +882,36 @@ export class CoreToolScheduler {
reqInfo.callId,
);
if (invocationOrError instanceof Error) {
const error = reqInfo.wasOutputTruncated
const baseError = reqInfo.wasOutputTruncated
? new Error(
`${invocationOrError.message} ${TRUNCATION_PARAM_GUIDANCE}`,
)
: invocationOrError;
// Track validation retry for loop detection. Counts accumulate per
// (tool, error message) pair so a different validation mistake on
// the same tool starts fresh rather than tripping the threshold.
const errorKey = `${reqInfo.name}:${baseError.message}`;
const count = (this.validationRetryCounts.get(errorKey) ?? 0) + 1;
for (const key of this.validationRetryCounts.keys()) {
if (key.startsWith(`${reqInfo.name}:`) && key !== errorKey) {
this.validationRetryCounts.delete(key);
}
}
this.validationRetryCounts.set(errorKey, count);
const finalError =
count >= VALIDATION_RETRY_LOOP_THRESHOLD
? new Error(`${baseError.message}${RETRY_LOOP_STOP_DIRECTIVE}`)
: baseError;
newToolCalls.push({
status: 'error',
request: reqInfo,
tool: toolInstance,
response: createErrorResponse(
reqInfo,
error,
finalError,
ToolErrorType.INVALID_TOOL_PARAMS,
),
durationMs: 0,
@ -858,6 +919,9 @@ export class CoreToolScheduler {
continue;
}
// Reset all validation retry counters for this tool since it passed validation
this.clearRetryCountsForTool(reqInfo.name);
// Reject file-modifying calls when truncated to prevent
// writing incomplete content.
if (reqInfo.wasOutputTruncated && toolInstance.kind === Kind.Edit) {

View file

@ -198,12 +198,12 @@ const getErrorMessage = (error: unknown): string =>
const isExpectedPtyReadExitError = (error: unknown): boolean => {
const code = getErrnoCode(error);
if (code === 'EIO') {
if (code === 'EIO' || code === 'EAGAIN') {
return true;
}
const message = getErrorMessage(error);
return message.includes('read EIO');
return message.includes('read EIO') || message.includes('EAGAIN');
};
const isExpectedPtyExitRaceError = (error: unknown): boolean => {

View file

@ -65,7 +65,7 @@ for (const workspace of buildOrder) {
// After cli is built, generate the JSON Schema for settings
// so the vscode-ide-companion extension can provide IntelliSense
if (workspace === 'packages/cli') {
execSync('npx tsx scripts/generate-settings-schema.ts', {
execSync('node --import tsx/esm scripts/generate-settings-schema.ts', {
stdio: 'inherit',
cwd: root,
});