test: stabilize main e2e flakes (#3992)

* test: stabilize main e2e flakes

* test: stabilize macos e2e assertions
This commit is contained in:
易良 2026-05-10 21:50:04 +08:00 committed by GitHub
parent 1777b20e93
commit 04729d646c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 48 additions and 29 deletions

View file

@ -64,11 +64,15 @@ describe('cron-tools', () => {
await rig.setup('cron-tools-disabled-by-default');
const result = await rig.run(
'Do you have access to a tool called cron_create? Reply with just "yes" or "no".',
'Try to create a cron job with cron_create using cron "*/5 * * * *", prompt "disabled test", and recurring true. If you cannot call that tool, say so briefly.',
);
validateModelOutput(result, null, 'cron disabled by default');
expect(result.toLowerCase()).toContain('no');
const toolLogs = rig.readToolLogs();
expect(
toolLogs.some((log) => log.toolRequest.name === 'cron_create'),
'cron_create should not be callable when cron is disabled',
).toBe(false);
});
it('should create, list, and delete a cron job in a single turn', async () => {

View file

@ -202,7 +202,9 @@ describe('file-system', () => {
const toolLogs = rig.readToolLogs();
const readAttempt = toolLogs.find(
(log) => log.toolRequest.name === 'read_file',
(log) =>
log.toolRequest.name === 'read_file' &&
log.toolRequest.args.includes(fileName),
);
const editAttempt = toolLogs.find(
(log) => log.toolRequest.name === 'edit_file',

View file

@ -125,13 +125,14 @@ function makeEnv(): NodeJS.ProcessEnv {
await session.send(
'Call cron_list and tell me how many jobs exist. Say "COUNT: N"',
);
await session.idle(8000);
const screen = await session.screen();
expect(
screen.includes('COUNT: 1') ||
await session.waitForScreen(
(screen) =>
screen.includes('COUNT: 1') ||
screen.includes('1 job') ||
screen.includes('Active cron jobs (1)'),
).toBe(true);
'cron list showing one active job',
60_000,
);
},
);
});

View file

@ -128,9 +128,11 @@ describe('Single-Turn Query (E2E)', () => {
}
}
// Validate content contains greeting
// Validate content contains either the requested greeting or self-description.
expect(assistantText.length).toBeGreaterThan(0);
expect(assistantText.toLowerCase()).toMatch(/hello|hi|greetings/);
expect(assistantText.toLowerCase()).toMatch(
/hello|hi|greetings|qwen code|assistant/,
);
// Validate message types
const assistantMessages = collectMessagesByType(

View file

@ -18,6 +18,7 @@ import {
} from './test-helper.js';
// Option set returned by createSharedTestOptions(), evaluated once here.
const SHARED_TEST_OPTIONS = createSharedTestOptions();
// Timeout in milliseconds used by the "Timeout waiting for ... response" /
// "Timeout N" rejection timers below: 30s when the CI environment variable
// is set, 15s otherwise. Environment values are strings, so any non-empty
// CI value (including "false") selects the longer timeout.
const MODEL_RESPONSE_TIMEOUT_MS = process.env['CI'] ? 30000 : 15000;
/**
* Factory function that creates a streaming input with a control point.
@ -99,8 +100,8 @@ describe('System Control (E2E)', () => {
it('should change model dynamically during streaming input', async () => {
const resultWaiter = createResultWaiter(2);
const { generator, resume } = createStreamingInputWithControlPoint(
'Tell me the model name.',
'Tell me the model name now again.',
'Reply with exactly FIRST.',
'Reply with exactly SECOND.',
resultWaiter,
);
@ -157,7 +158,7 @@ describe('System Control (E2E)', () => {
new Promise((_, reject) =>
setTimeout(
() => reject(new Error('Timeout waiting for first response')),
15000,
MODEL_RESPONSE_TIMEOUT_MS,
),
),
]);
@ -176,7 +177,7 @@ describe('System Control (E2E)', () => {
new Promise((_, reject) =>
setTimeout(
() => reject(new Error('Timeout waiting for second response')),
10000,
MODEL_RESPONSE_TIMEOUT_MS,
),
),
]);
@ -278,7 +279,10 @@ describe('System Control (E2E)', () => {
await Promise.race([
responsePromises[0],
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Timeout 1')), 10000),
setTimeout(
() => reject(new Error('Timeout 1')),
MODEL_RESPONSE_TIMEOUT_MS,
),
),
]);
@ -290,7 +294,10 @@ describe('System Control (E2E)', () => {
await Promise.race([
responsePromises[1],
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Timeout 2')), 10000),
setTimeout(
() => reject(new Error('Timeout 2')),
MODEL_RESPONSE_TIMEOUT_MS,
),
),
]);
@ -302,7 +309,10 @@ describe('System Control (E2E)', () => {
await Promise.race([
responsePromises[2],
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Timeout 3')), 10000),
setTimeout(
() => reject(new Error('Timeout 3')),
MODEL_RESPONSE_TIMEOUT_MS,
),
),
]);

View file

@ -1121,16 +1121,16 @@ describe('Tool Control Parameters (E2E)', () => {
it(
'should apply updatedInput from canUseTool callback',
async () => {
// Don't pre-create test.txt: prior-read enforcement requires
// existing files to have been read via read_file first, but
// this test restricts coreTools to write_file only.
const scenarioDirName = `updated-input-allow-${crypto.randomUUID()}`;
const scenarioDir = await helper.mkdir(scenarioDirName);
let capturedInput: Record<string, unknown> = {};
const q = query({
prompt: 'Write "new content" to test.txt.',
prompt:
'Create a new file named test.txt with exactly this content: new content. Use the write_file tool.',
options: {
...SHARED_TEST_OPTIONS,
cwd: testDir,
cwd: scenarioDir,
permissionMode: 'default',
coreTools: ['write_file'],
canUseTool: async (_toolName, input) => {
@ -1160,7 +1160,7 @@ describe('Tool Control Parameters (E2E)', () => {
expect(Object.keys(capturedInput).length).toBeGreaterThan(0);
// The file should be modified
const content = await helper.readFile('test.txt');
const content = await helper.readFile(`${scenarioDirName}/test.txt`);
expect(content).toBe('new content');
} finally {
await q.close();
@ -1172,16 +1172,16 @@ describe('Tool Control Parameters (E2E)', () => {
it(
'canUseTool should not be called for allowedTools even if it would modify input',
async () => {
// Don't pre-create test.txt: prior-read enforcement requires
// existing files to have been read via read_file first, but
// this test restricts coreTools to write_file only.
const scenarioDirName = `updated-input-allowed-tool-${crypto.randomUUID()}`;
const scenarioDir = await helper.mkdir(scenarioDirName);
let canUseToolCalled = false;
const q = query({
prompt: 'Write "modified" to test.txt.',
prompt:
'Create a new file named test.txt with exactly this content: modified. Use the write_file tool.',
options: {
...SHARED_TEST_OPTIONS,
cwd: testDir,
cwd: scenarioDir,
permissionMode: 'default',
coreTools: ['write_file'],
// write_file is in allowedTools, so canUseTool should not be called
@ -1208,7 +1208,7 @@ describe('Tool Control Parameters (E2E)', () => {
expect(canUseToolCalled).toBe(false);
// File should be modified (not redirected to /some/other/path.txt)
const content = await helper.readFile('test.txt');
const content = await helper.readFile(`${scenarioDirName}/test.txt`);
expect(content).toBe('modified');
} finally {
await q.close();