Mirror of https://github.com/QwenLM/qwen-code.git (synced 2026-05-04 22:51:08 +00:00)
fix(core): cap recursive file crawler at 100k entries to prevent OOM (#3138)
When the @ autocomplete triggers RecursiveFileSearch, the crawler materialises the entire project tree into memory with no upper bound. For very large workspaces (missing .gitignore, huge node_modules, home directory as cwd) this pushes Node.js past its heap limit and crashes.

- Add `maxFiles` option to CrawlOptions; use fdir's withMaxFiles() to stop traversal early instead of post-hoc truncation
- Apply file-level ignore patterns during crawl via fdir filter() so ignored files don't consume the maxFiles budget
- Include maxFiles in the crawl cache key for correctness
- Set MAX_CRAWL_FILES = 100_000 in RecursiveFileSearch (caps peak memory at ~50 MB for the file list)

Fixes #3130
parent 61ad9db9c1
commit e216ab35fc

4 changed files with 137 additions and 3 deletions
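For orientation, here is a minimal caller-side sketch of the new option, pieced together from the tests in this commit. The import paths and the concrete values (project path, cacheTtl, the 100_000 cap) are illustrative assumptions, not part of the diff:

import { crawl } from './crawler.js'; // assumed path
import { loadIgnoreRules } from './ignore.js'; // assumed path

// Build the workspace ignore rules, then crawl with an upper bound so a
// pathological tree (e.g. an un-ignored node_modules) cannot exhaust the heap.
const ignore = loadIgnoreRules({
  projectRoot: '/path/to/project',
  useGitignore: true,
  useQwenignore: true,
  ignoreDirs: [],
});

const files = await crawl({
  crawlDirectory: '/path/to/project',
  cwd: '/path/to/project',
  ignore,
  cache: true,
  cacheTtl: 30,
  maxFiles: 100_000, // traversal stops once this many entries have been collected
});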
@@ -18,6 +18,7 @@ export const getCacheKey = (
   directory: string,
   ignoreContent: string,
   maxDepth?: number,
+  maxFiles?: number,
 ): string => {
   const hash = crypto.createHash('sha256');
   hash.update(directory);
@@ -25,6 +26,9 @@
   if (maxDepth !== undefined) {
     hash.update(String(maxDepth));
   }
+  if (maxFiles !== undefined) {
+    hash.update(`maxFiles:${maxFiles}`);
+  }
   return hash.digest('hex');
 };
 
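Why maxFiles has to be part of the cache key: two crawls over the same directory with different caps must not share a cached result. A small sketch using the getCacheKey signature from the hunk above (the import path and argument values are assumed):

import { getCacheKey } from './crawlCache.js'; // assumed path

const ignoreContent = '*.log\nnode_modules/';

// Same directory and ignore rules, different caps -> different keys, so a
// capped crawl can never be served from a stale uncapped cache entry.
const uncappedKey = getCacheKey('/path/to/project', ignoreContent, undefined);
const cappedKey = getCacheKey('/path/to/project', ignoreContent, undefined, 100_000);

console.assert(uncappedKey !== cappedKey);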
@@ -178,8 +178,10 @@ describe('crawler', () => {
     });
 
     expect(results).toEqual(
-      expect.arrayContaining(['.', '.gitignore', 'Foo.mk', 'bar.mk']),
+      expect.arrayContaining(['.', '.gitignore', 'Foo.mk']),
     );
+    // bar.mk matches *.mk and is not negated, so it should be filtered out
+    expect(results).not.toContain('bar.mk');
   });
 
   it('should handle directory negation with glob', async () => {
@@ -571,4 +573,106 @@ describe('crawler', () => {
       );
     });
   });
+
+  describe('with maxFiles', () => {
+    it('should truncate results when maxFiles is exceeded', async () => {
+      tmpDir = await createTmpDir({
+        'a.txt': '',
+        'b.txt': '',
+        'c.txt': '',
+        sub: ['d.txt', 'e.txt'],
+      });
+
+      const ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+
+      const allResults = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+      });
+
+      const limitedResults = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 3,
+      });
+
+      expect(allResults.length).toBeGreaterThan(3);
+      expect(limitedResults.length).toBe(3);
+    });
+
+    it('should not count file-ignored entries toward maxFiles budget', async () => {
+      tmpDir = await createTmpDir({
+        '.gitignore': '*.log',
+        'a.txt': '',
+        'b.txt': '',
+        'noise1.log': '',
+        'noise2.log': '',
+        'noise3.log': '',
+      });
+
+      const ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: true,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+
+      // Valid entries: '.', '.gitignore', 'a.txt', 'b.txt' = 4
+      // Ignored entries: 'noise1.log', 'noise2.log', 'noise3.log'
+      // With maxFiles=4, all valid entries should fit because
+      // .log files are filtered out before the cap is applied.
+      const results = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 4,
+      });
+
+      expect(results).toEqual(
+        expect.arrayContaining(['.', '.gitignore', 'a.txt', 'b.txt']),
+      );
+      for (const r of results) {
+        expect(r).not.toMatch(/\.log$/);
+      }
+    });
+
+    it('should not truncate when maxFiles exceeds total entries', async () => {
+      tmpDir = await createTmpDir({
+        'a.txt': '',
+        'b.txt': '',
+      });
+
+      const ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+
+      const results = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 1000,
+      });
+
+      expect(results.length).toBeLessThanOrEqual(1000);
+      expect(results).toEqual(expect.arrayContaining(['.', 'a.txt', 'b.txt']));
+    });
+  });
 });
@@ -16,6 +16,8 @@ export interface CrawlOptions {
   cwd: string;
   // The fdir maxDepth option.
   maxDepth?: number;
+  // Maximum number of file entries to return. Prevents OOM on very large trees.
+  maxFiles?: number;
   // A pre-configured Ignore instance.
   ignore: Ignore;
   // Caching options.
@@ -33,6 +35,7 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
     options.crawlDirectory,
     options.ignore.getFingerprint(),
     options.maxDepth,
+    options.maxFiles,
   );
   const cachedResults = cache.read(cacheKey);
 
@@ -43,10 +46,12 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
 
   const posixCwd = toPosixPath(options.cwd);
   const posixCrawlDirectory = toPosixPath(options.crawlDirectory);
+  const relativeToCrawlDir = path.posix.relative(posixCwd, posixCrawlDirectory);
 
   let results: string[];
   try {
     const dirFilter = options.ignore.getDirectoryFilter();
+    const fileFilter = options.ignore.getFileFilter();
     const api = new fdir()
       .withRelativePaths()
       .withDirs()
@@ -54,20 +59,30 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
       .exclude((_, dirPath) => {
         const relativePath = path.posix.relative(posixCrawlDirectory, dirPath);
         return dirFilter(`${relativePath}/`);
+      })
+      .filter((filePath, isDirectory) => {
+        // Directories are already handled by the exclude() callback above.
+        if (isDirectory) return true;
+        // Apply file-level ignore patterns (e.g. *.log, *.map) during the
+        // crawl so they don't consume the maxFiles budget.
+        const cwdRelative = path.posix.join(relativeToCrawlDir, filePath);
+        return !fileFilter(cwdRelative);
       });
 
     if (options.maxDepth !== undefined) {
       api.withMaxDepth(options.maxDepth);
     }
 
+    if (options.maxFiles !== undefined) {
+      api.withMaxFiles(options.maxFiles);
+    }
+
    results = await api.crawl(options.crawlDirectory).withPromise();
   } catch (_e) {
     // The directory probably doesn't exist.
     return [];
   }
 
-  const relativeToCrawlDir = path.posix.relative(posixCwd, posixCrawlDirectory);
-
   const relativeToCwdResults = results.map((p) =>
     path.posix.join(relativeToCrawlDir, p),
   );
@@ -77,6 +92,7 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
       options.crawlDirectory,
       options.ignore.getFingerprint(),
       options.maxDepth,
+      options.maxFiles,
     );
     cache.write(cacheKey, relativeToCwdResults, options.cacheTtl * 1000);
   }
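A note on the path joining in the filter() callback above: fdir hands back paths relative to crawlDirectory, while the ignore filter expects paths relative to cwd, which is why relativeToCrawlDir is now computed before the crawl starts. A tiny illustration (the paths are made up; the behaviour follows directly from the path.posix calls in the hunk):

import path from 'node:path';

// Suppose the search is rooted one level below the cwd.
const posixCwd = '/home/user/project';
const posixCrawlDirectory = '/home/user/project/src';
const relativeToCrawlDir = path.posix.relative(posixCwd, posixCrawlDirectory); // 'src'

// fdir reports entries relative to the crawl directory...
const filePath = 'utils/build.log';

// ...so each entry is re-based onto the cwd before the ignore filter sees it,
// letting a cwd-level pattern like '*.log' match 'src/utils/build.log'.
const cwdRelative = path.posix.join(relativeToCrawlDir, filePath);
console.log(cwdRelative); // 'src/utils/build.log'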
@@ -14,6 +14,15 @@ import type { FzfResultItem } from 'fzf';
 import { AsyncFzf } from 'fzf';
 import { unescapePath } from '../paths.js';
 
+/**
+ * Safety cap on the number of file entries the recursive crawler will
+ * materialise in memory. Without this, workspaces with millions of files
+ * (e.g. missing .gitignore, huge node_modules trees) can push Node.js past
+ * its heap limit and crash with an OOM. 100_000 entries is generous enough
+ * for virtually all real projects while keeping peak memory well under 100 MB.
+ */
+const MAX_CRAWL_FILES = 100_000;
+
 export interface FileSearchOptions {
   projectRoot: string;
   ignoreDirs: string[];
@@ -108,6 +117,7 @@ class RecursiveFileSearch implements FileSearch {
       cache: this.options.cache,
       cacheTtl: this.options.cacheTtl,
       maxDepth: this.options.maxDepth,
+      maxFiles: MAX_CRAWL_FILES,
     });
     this.buildResultCache();
   }
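The memory figures ('~50 MB' in the commit message, 'well under 100 MB' in the doc comment) are order-of-magnitude estimates. A rough back-of-the-envelope under assumed numbers (average path length and V8 overheads are guesses, not measurements):

// Assumed: ~100 UTF-16 code units per relative path (deep node_modules-style
// paths), ~40 bytes of V8 string overhead, 8 bytes per array slot, and two
// live copies of the list (the raw crawl output plus the cwd-relative
// mapping that gets cached).
const entries = 100_000;
const bytesPerEntry = 100 * 2 + 40 + 8; // ≈ 248 bytes per path string
const copies = 2;
const estimatedMb = (entries * bytesPerEntry * copies) / (1024 * 1024);
console.log(`≈ ${estimatedMb.toFixed(0)} MB`); // ≈ 47 MB

Shorter average paths shrink this further; the point is that the cap keeps the worst case in the tens of megabytes rather than letting it grow without bound.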