mirror of
https://github.com/QwenLM/qwen-code.git
synced 2026-04-28 03:30:40 +00:00
feat(core): detect tool validation retry loops and inject stop directive (#3178)
Primary change: prevent the model from burning tokens in an infinite retry loop when a tool call repeatedly fails schema validation with the same error (observed with ask_user_question and a malformed `questions` parameter retrying 10+ times with the same validation error). - Track consecutive validation failures per (tool name, error message) pair in CoreToolScheduler via a `validationRetryCounts` Map. - After 3 consecutive failures for the same (tool, error) pair, append a RETRY LOOP DETECTED directive to the error response instructing the model to stop, re-examine the schema, try a fundamentally different approach, or surface the issue to the user. - Reset per-tool counters when the tool invocation succeeds; reset globally when an incoming batch shares no tool name with any previously failing tool; reset the per-tool counter when the tool returns a different validation error so unrelated mistakes do not accumulate toward the threshold. - Distinct from LoopDetectionService, which tracks model-behavior loops (repeated thoughts, stagnant actions); this change catches tool-API misuse loops at the scheduler layer. Piggyback fixes bundled in the same PR: - packages/cli/index.ts, packages/core/src/services/shellExecutionService.ts: treat PTY `EAGAIN` on the read path as an expected read error alongside `EIO`, avoiding noisy surface-level failures from transient non-blocking reads. - scripts/build.js: switch the settings-schema generation step from `npx tsx` to `node --import tsx/esm` for Bun compatibility. Tests: - Unit tests in coreToolScheduler.test.ts cover: directive injection on the 3rd consecutive failure, counter reset when a different tool is called, and counter reset after a successful invocation of the same tool (fail → fail → succeed → fail → fail must not trip the directive).
This commit is contained in:
parent
699cb05206
commit
5facd8738b
5 changed files with 310 additions and 5 deletions
|
|
@ -51,6 +51,14 @@ const isExpectedPtyRaceError = (error: unknown): boolean => {
|
|||
return true;
|
||||
}
|
||||
|
||||
// EAGAIN: transient non-blocking read error from PTY fd
|
||||
if (
|
||||
(code === 'EAGAIN' && message.includes('read')) ||
|
||||
message.includes('read EAGAIN')
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// PTY-specific resize/exit race errors - require PTY context in message
|
||||
if (
|
||||
message.includes('ioctl(2) failed, EBADF') ||
|
||||
|
|
|
|||
|
|
@ -3849,3 +3849,236 @@ describe('CoreToolScheduler IDE interaction', () => {
|
|||
expect(mockIdeClient.openDiff).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe('CoreToolScheduler validation retry loop detection', () => {
|
||||
const RETRY_LOOP_STOP_DIRECTIVE = 'RETRY LOOP DETECTED';
|
||||
|
||||
/** Tool with a schema that requires a string `value` param. */
|
||||
class StrictStringTool extends BaseDeclarativeTool<
|
||||
{ value: string },
|
||||
ToolResult
|
||||
> {
|
||||
static readonly Name = 'strictStringTool';
|
||||
|
||||
constructor() {
|
||||
super(
|
||||
StrictStringTool.Name,
|
||||
'StrictStringTool',
|
||||
'A tool that requires a string value param.',
|
||||
Kind.Other,
|
||||
{
|
||||
type: 'object',
|
||||
properties: { value: { type: 'string' } },
|
||||
required: ['value'],
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
protected createInvocation(params: {
|
||||
value: string;
|
||||
}): ToolInvocation<{ value: string }, ToolResult> {
|
||||
return new (class extends BaseToolInvocation<
|
||||
{ value: string },
|
||||
ToolResult
|
||||
> {
|
||||
constructor(p: { value: string }) {
|
||||
super(p);
|
||||
}
|
||||
getDescription(): string {
|
||||
return 'strictStringTool invocation';
|
||||
}
|
||||
async execute(): Promise<ToolResult> {
|
||||
return { llmContent: 'ok', returnDisplay: 'ok' };
|
||||
}
|
||||
})(params);
|
||||
}
|
||||
}
|
||||
|
||||
function createSchedulerWithTool(tool: StrictStringTool) {
|
||||
const mockToolRegistry = {
|
||||
getTool: () => tool,
|
||||
getFunctionDeclarations: () => [],
|
||||
tools: new Map(),
|
||||
discovery: {},
|
||||
registerTool: () => {},
|
||||
getToolByName: () => tool,
|
||||
getToolByDisplayName: () => tool,
|
||||
getTools: () => [],
|
||||
discoverTools: async () => {},
|
||||
getAllTools: () => [],
|
||||
getToolsByServer: () => [],
|
||||
} as unknown as ToolRegistry;
|
||||
|
||||
const mockConfig = {
|
||||
getSessionId: () => 'test-session-id',
|
||||
getUsageStatisticsEnabled: () => true,
|
||||
getDebugMode: () => false,
|
||||
getApprovalMode: () => ApprovalMode.YOLO,
|
||||
getPermissionsAllow: () => [],
|
||||
getContentGeneratorConfig: () => ({
|
||||
model: 'test-model',
|
||||
authType: 'gemini',
|
||||
}),
|
||||
getShellExecutionConfig: () => ({
|
||||
terminalWidth: 90,
|
||||
terminalHeight: 30,
|
||||
}),
|
||||
storage: { getProjectTempDir: () => '/tmp' },
|
||||
getTruncateToolOutputThreshold: () => 100,
|
||||
getTruncateToolOutputLines: () => 10,
|
||||
getToolRegistry: () => mockToolRegistry,
|
||||
getUseModelRouter: () => false,
|
||||
getGeminiClient: () => null,
|
||||
isInteractive: () => true,
|
||||
getIdeMode: () => false,
|
||||
getExperimentalZedIntegration: () => false,
|
||||
getChatRecordingService: () => undefined,
|
||||
getMessageBus: vi.fn().mockReturnValue(undefined),
|
||||
getDisableAllHooks: vi.fn().mockReturnValue(true),
|
||||
setApprovalMode: vi.fn(),
|
||||
} as unknown as Config;
|
||||
|
||||
const onAllToolCallsComplete = vi.fn();
|
||||
const onToolCallsUpdate = vi.fn();
|
||||
|
||||
const scheduler = new CoreToolScheduler({
|
||||
config: mockConfig,
|
||||
onAllToolCallsComplete,
|
||||
onToolCallsUpdate,
|
||||
getPreferredEditor: () => 'vscode',
|
||||
onEditorClose: vi.fn(),
|
||||
});
|
||||
|
||||
return { scheduler, onToolCallsUpdate, onAllToolCallsComplete };
|
||||
}
|
||||
|
||||
function makeRequest(
|
||||
callId: string,
|
||||
name: string,
|
||||
args: Record<string, unknown>,
|
||||
) {
|
||||
return {
|
||||
callId,
|
||||
name,
|
||||
args,
|
||||
isClientInitiated: false,
|
||||
prompt_id: `prompt-${callId}`,
|
||||
};
|
||||
}
|
||||
|
||||
function getLastErrorMessage(onToolCallsUpdate: Mock): string | undefined {
|
||||
const calls = onToolCallsUpdate.mock.calls;
|
||||
for (let i = calls.length - 1; i >= 0; i--) {
|
||||
const toolCalls = calls[i][0] as ToolCall[];
|
||||
for (const call of toolCalls) {
|
||||
if (call.status === 'error' && call.response?.responseParts) {
|
||||
for (const part of call.response.responseParts) {
|
||||
if ('functionResponse' in part) {
|
||||
const resp = part.functionResponse as {
|
||||
response?: { error?: string };
|
||||
};
|
||||
if (resp.response?.error) return resp.response.error;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
it('should inject RETRY LOOP DETECTED directive after 3 consecutive validation failures', async () => {
|
||||
const tool = new StrictStringTool();
|
||||
const { scheduler, onToolCallsUpdate } = createSchedulerWithTool(tool);
|
||||
|
||||
// Turn 1: bad params (value is number, not string)
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c1', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
let msg = getLastErrorMessage(onToolCallsUpdate);
|
||||
expect(msg).toBeDefined();
|
||||
expect(msg).not.toContain(RETRY_LOOP_STOP_DIRECTIVE);
|
||||
|
||||
// Turn 2: same bad params
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c2', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
msg = getLastErrorMessage(onToolCallsUpdate);
|
||||
expect(msg).not.toContain(RETRY_LOOP_STOP_DIRECTIVE);
|
||||
|
||||
// Turn 3: same bad params — should trigger directive
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c3', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
msg = getLastErrorMessage(onToolCallsUpdate);
|
||||
expect(msg).toContain(RETRY_LOOP_STOP_DIRECTIVE);
|
||||
});
|
||||
|
||||
it('should reset retry counter when a different tool is called', async () => {
|
||||
const tool = new StrictStringTool();
|
||||
const { scheduler, onToolCallsUpdate } = createSchedulerWithTool(tool);
|
||||
|
||||
// Turn 1-2: tool fails twice
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c1', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c2', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
|
||||
// Turn 3: switch to a different tool that also fails
|
||||
// We simulate by calling with a tool name that won't be found
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c3', 'nonexistentTool', {})],
|
||||
new AbortController().signal,
|
||||
);
|
||||
|
||||
// Turn 4: back to tool — should be count 1 again (no directive)
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c4', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
const msg = getLastErrorMessage(onToolCallsUpdate);
|
||||
expect(msg).toBeDefined();
|
||||
expect(msg).not.toContain(RETRY_LOOP_STOP_DIRECTIVE);
|
||||
});
|
||||
|
||||
it('should reset retry counter after a successful invocation of the same tool', async () => {
|
||||
const tool = new StrictStringTool();
|
||||
const { scheduler, onToolCallsUpdate } = createSchedulerWithTool(tool);
|
||||
|
||||
// Two validation failures with the same error.
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c1', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c2', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
|
||||
// A valid invocation succeeds, which must clear the per-tool counter.
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c3', 'strictStringTool', { value: 'ok' })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
|
||||
// Two more failures — count should restart at 1, not jump to 3+.
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c4', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
await scheduler.schedule(
|
||||
[makeRequest('c5', 'strictStringTool', { value: 123 })],
|
||||
new AbortController().signal,
|
||||
);
|
||||
|
||||
const msg = getLastErrorMessage(onToolCallsUpdate);
|
||||
expect(msg).toBeDefined();
|
||||
expect(msg).not.toContain(RETRY_LOOP_STOP_DIRECTIVE);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -299,6 +299,15 @@ function toParts(input: PartListUnion): Part[] {
|
|||
return parts;
|
||||
}
|
||||
|
||||
const VALIDATION_RETRY_LOOP_THRESHOLD = 3;
|
||||
|
||||
/** Directive injected when a tool call repeatedly fails validation. */
|
||||
const RETRY_LOOP_STOP_DIRECTIVE =
|
||||
'\n\n⚠️ RETRY LOOP DETECTED: This tool call has failed validation multiple times with the same error. ' +
|
||||
'STOP retrying the same approach. Re-examine the tool schema and parameter requirements, then try a ' +
|
||||
'fundamentally different approach. If you cannot resolve the validation error, explain the issue to the user ' +
|
||||
'instead of retrying.';
|
||||
|
||||
const createErrorResponse = (
|
||||
request: ToolCallRequestInfo,
|
||||
error: Error,
|
||||
|
|
@ -397,6 +406,7 @@ export class CoreToolScheduler {
|
|||
private chatRecordingService?: ChatRecordingService;
|
||||
private isFinalizingToolCalls = false;
|
||||
private isScheduling = false;
|
||||
private validationRetryCounts = new Map<string, number>();
|
||||
private requestQueue: Array<{
|
||||
request: ToolCallRequestInfo | ToolCallRequestInfo[];
|
||||
signal: AbortSignal;
|
||||
|
|
@ -463,6 +473,8 @@ export class CoreToolScheduler {
|
|||
|
||||
switch (newStatus) {
|
||||
case 'success': {
|
||||
// Successful execution only resets retry state for this tool
|
||||
this.clearRetryCountsForTool(currentCall.request.name);
|
||||
const durationMs = existingStartTime
|
||||
? Date.now() - existingStartTime
|
||||
: undefined;
|
||||
|
|
@ -751,6 +763,20 @@ export class CoreToolScheduler {
|
|||
return this._schedule(request, signal);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all validation retry counters for the given tool. Keys are
|
||||
* "<toolName>:<errorMessage>", so a plain `Map.delete(toolName)` would not
|
||||
* match anything.
|
||||
*/
|
||||
private clearRetryCountsForTool(toolName: string): void {
|
||||
const prefix = `${toolName}:`;
|
||||
for (const key of this.validationRetryCounts.keys()) {
|
||||
if (key.startsWith(prefix)) {
|
||||
this.validationRetryCounts.delete(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async _schedule(
|
||||
request: ToolCallRequestInfo | ToolCallRequestInfo[],
|
||||
signal: AbortSignal,
|
||||
|
|
@ -764,6 +790,23 @@ export class CoreToolScheduler {
|
|||
}
|
||||
const requestsToProcess = Array.isArray(request) ? request : [request];
|
||||
|
||||
// Check if this batch continues a validation retry loop.
|
||||
// Keys are "<toolName>:<errorMessage>"; if no request reuses a tool name
|
||||
// that previously failed validation, reset the tracker.
|
||||
if (this.validationRetryCounts.size > 0) {
|
||||
const prevTools = new Set<string>();
|
||||
for (const key of this.validationRetryCounts.keys()) {
|
||||
const sep = key.indexOf(':');
|
||||
prevTools.add(sep === -1 ? key : key.slice(0, sep));
|
||||
}
|
||||
const hasPrevFailingTool = requestsToProcess.some((r) =>
|
||||
prevTools.has(r.name),
|
||||
);
|
||||
if (!hasPrevFailingTool) {
|
||||
this.validationRetryCounts.clear();
|
||||
}
|
||||
}
|
||||
|
||||
const newToolCalls: ToolCall[] = [];
|
||||
for (const reqInfo of requestsToProcess) {
|
||||
// Check if the tool is excluded due to permissions/environment restrictions
|
||||
|
|
@ -839,18 +882,36 @@ export class CoreToolScheduler {
|
|||
reqInfo.callId,
|
||||
);
|
||||
if (invocationOrError instanceof Error) {
|
||||
const error = reqInfo.wasOutputTruncated
|
||||
const baseError = reqInfo.wasOutputTruncated
|
||||
? new Error(
|
||||
`${invocationOrError.message} ${TRUNCATION_PARAM_GUIDANCE}`,
|
||||
)
|
||||
: invocationOrError;
|
||||
|
||||
// Track validation retry for loop detection. Counts accumulate per
|
||||
// (tool, error message) pair so a different validation mistake on
|
||||
// the same tool starts fresh rather than tripping the threshold.
|
||||
const errorKey = `${reqInfo.name}:${baseError.message}`;
|
||||
const count = (this.validationRetryCounts.get(errorKey) ?? 0) + 1;
|
||||
for (const key of this.validationRetryCounts.keys()) {
|
||||
if (key.startsWith(`${reqInfo.name}:`) && key !== errorKey) {
|
||||
this.validationRetryCounts.delete(key);
|
||||
}
|
||||
}
|
||||
this.validationRetryCounts.set(errorKey, count);
|
||||
|
||||
const finalError =
|
||||
count >= VALIDATION_RETRY_LOOP_THRESHOLD
|
||||
? new Error(`${baseError.message}${RETRY_LOOP_STOP_DIRECTIVE}`)
|
||||
: baseError;
|
||||
|
||||
newToolCalls.push({
|
||||
status: 'error',
|
||||
request: reqInfo,
|
||||
tool: toolInstance,
|
||||
response: createErrorResponse(
|
||||
reqInfo,
|
||||
error,
|
||||
finalError,
|
||||
ToolErrorType.INVALID_TOOL_PARAMS,
|
||||
),
|
||||
durationMs: 0,
|
||||
|
|
@ -858,6 +919,9 @@ export class CoreToolScheduler {
|
|||
continue;
|
||||
}
|
||||
|
||||
// Reset all validation retry counters for this tool since it passed validation
|
||||
this.clearRetryCountsForTool(reqInfo.name);
|
||||
|
||||
// Reject file-modifying calls when truncated to prevent
|
||||
// writing incomplete content.
|
||||
if (reqInfo.wasOutputTruncated && toolInstance.kind === Kind.Edit) {
|
||||
|
|
|
|||
|
|
@ -198,12 +198,12 @@ const getErrorMessage = (error: unknown): string =>
|
|||
|
||||
const isExpectedPtyReadExitError = (error: unknown): boolean => {
|
||||
const code = getErrnoCode(error);
|
||||
if (code === 'EIO') {
|
||||
if (code === 'EIO' || code === 'EAGAIN') {
|
||||
return true;
|
||||
}
|
||||
|
||||
const message = getErrorMessage(error);
|
||||
return message.includes('read EIO');
|
||||
return message.includes('read EIO') || message.includes('EAGAIN');
|
||||
};
|
||||
|
||||
const isExpectedPtyExitRaceError = (error: unknown): boolean => {
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ for (const workspace of buildOrder) {
|
|||
// After cli is built, generate the JSON Schema for settings
|
||||
// so the vscode-ide-companion extension can provide IntelliSense
|
||||
if (workspace === 'packages/cli') {
|
||||
execSync('npx tsx scripts/generate-settings-schema.ts', {
|
||||
execSync('node --import tsx/esm scripts/generate-settings-schema.ts', {
|
||||
stdio: 'inherit',
|
||||
cwd: root,
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue