fix(core): cap recursive file crawler at 100k entries to prevent OOM (#3138)

When the @ autocomplete triggers RecursiveFileSearch, the crawler
materialises the entire project tree into memory with no upper bound.
For very large workspaces (missing .gitignore, huge node_modules, home
directory as cwd) this pushes Node.js past its heap limit and crashes.

- Add a `maxFiles` option to CrawlOptions; use fdir's withMaxFiles() to
  stop traversal early instead of post-hoc truncation (see the sketch
  after this list)
- Apply file-level ignore patterns during the crawl via fdir's filter()
  so ignored files don't consume the maxFiles budget
- Include maxFiles in the crawl cache key, so a result truncated at one
  cap is never served to a caller with a different (or no) cap
- Set MAX_CRAWL_FILES = 100_000 in RecursiveFileSearch, which caps peak
  memory for the file list at roughly 50 MB
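
A minimal sketch of that wiring, assuming fdir's builder API as named
above; the CrawlOptions shape, the shouldIgnoreFile helper, and the
cache-key construction are illustrative, not the repo's actual code:

    import { fdir } from 'fdir';

    interface CrawlOptions {
      crawlDirectory: string;
      maxFiles?: number; // new: hard cap on entries collected during the crawl
      // ...remaining options elided
    }

    // Hypothetical stand-in for the crawler's file-level ignore check.
    declare function shouldIgnoreFile(relativePath: string): boolean;

    async function crawlCapped(opts: CrawlOptions): Promise<string[]> {
      let builder = new fdir()
        .withRelativePaths()
        // filter() runs during traversal, so ignored files are dropped
        // before they can consume the maxFiles budget.
        .filter((path, isDirectory) => isDirectory || !shouldIgnoreFile(path));
      if (opts.maxFiles !== undefined) {
        // withMaxFiles() stops the walk once the cap is hit, instead of
        // materialising the whole tree and truncating afterwards.
        builder = builder.withMaxFiles(opts.maxFiles);
      }
      return builder.crawl(opts.crawlDirectory).withPromise();
    }

    // The cache key must distinguish capped from uncapped crawls, or a
    // result truncated at maxFiles could be served to a caller that asked
    // for everything. Key shape here is illustrative.
    const cacheKey = (opts: CrawlOptions) =>
      JSON.stringify({ dir: opts.crawlDirectory, maxFiles: opts.maxFiles ?? null });

The 100_000 cap in the last bullet lines up with the ~50 MB figure at
roughly 0.5 KB per stored path string.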

Fixes #3130
tanzhenxin 2026-04-11 16:44:02 +08:00 committed by GitHub
parent 61ad9db9c1
commit e216ab35fc
4 changed files with 137 additions and 3 deletions

@@ -178,8 +178,10 @@ describe('crawler', () => {
     });
     expect(results).toEqual(
-      expect.arrayContaining(['.', '.gitignore', 'Foo.mk', 'bar.mk']),
+      expect.arrayContaining(['.', '.gitignore', 'Foo.mk']),
     );
+    // bar.mk matches *.mk and is not negated, so it should be filtered out
+    expect(results).not.toContain('bar.mk');
   });

   it('should handle directory negation with glob', async () => {
@@ -571,4 +573,106 @@
       );
     });
   });
+
+  describe('with maxFiles', () => {
+    it('should truncate results when maxFiles is exceeded', async () => {
+      tmpDir = await createTmpDir({
+        'a.txt': '',
+        'b.txt': '',
+        'c.txt': '',
+        sub: ['d.txt', 'e.txt'],
+      });
+      const ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+      const allResults = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+      });
+      const limitedResults = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 3,
+      });
+      expect(allResults.length).toBeGreaterThan(3);
+      expect(limitedResults.length).toBe(3);
+    });
+
+    it('should not count file-ignored entries toward maxFiles budget', async () => {
+      tmpDir = await createTmpDir({
+        '.gitignore': '*.log',
+        'a.txt': '',
+        'b.txt': '',
+        'noise1.log': '',
+        'noise2.log': '',
+        'noise3.log': '',
+      });
+      const ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: true,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+      // Valid entries: '.', '.gitignore', 'a.txt', 'b.txt' = 4
+      // Ignored entries: 'noise1.log', 'noise2.log', 'noise3.log'
+      // With maxFiles=4, all valid entries should fit because
+      // .log files are filtered out before the cap is applied.
+      const results = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 4,
+      });
+      expect(results).toEqual(
+        expect.arrayContaining(['.', '.gitignore', 'a.txt', 'b.txt']),
+      );
+      for (const r of results) {
+        expect(r).not.toMatch(/\.log$/);
+      }
+    });
+
+    it('should not truncate when maxFiles exceeds total entries', async () => {
+      tmpDir = await createTmpDir({
+        'a.txt': '',
+        'b.txt': '',
+      });
+      const ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+      const results = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 1000,
+      });
+      expect(results.length).toBeLessThanOrEqual(1000);
+      expect(results).toEqual(expect.arrayContaining(['.', 'a.txt', 'b.txt']));
+    });
+  });
 });
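
For context, this is roughly what the RecursiveFileSearch call site
described in the commit message could look like. It is not part of this
diff, so everything except MAX_CRAWL_FILES and crawl's option names is
an illustrative assumption:

    // Sketch only: RecursiveFileSearch's real surroundings are not shown here.
    const MAX_CRAWL_FILES = 100_000; // ~50 MB worst case for the in-memory file list

    const files = await crawl({
      crawlDirectory: projectRoot, // illustrative variable
      cwd: projectRoot,
      ignore,
      cache: true,      // the tests above disable caching; the real
      cacheTtl: 30_000, // call site and TTL may differ
      maxFiles: MAX_CRAWL_FILES,
    });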