From e216ab35fc99bcf23c454deb1e0930d64fdd5320 Mon Sep 17 00:00:00 2001 From: tanzhenxin Date: Sat, 11 Apr 2026 16:44:02 +0800 Subject: [PATCH] fix(core): cap recursive file crawler at 100k entries to prevent OOM (#3138) When the @ autocomplete triggers RecursiveFileSearch, the crawler materialises the entire project tree into memory with no upper bound. For very large workspaces (missing .gitignore, huge node_modules, home directory as cwd) this pushes Node.js past its heap limit and crashes. - Add `maxFiles` option to CrawlOptions; use fdir's withMaxFiles() to stop traversal early instead of post-hoc truncation - Apply file-level ignore patterns during crawl via fdir filter() so ignored files don't consume the maxFiles budget - Include maxFiles in the crawl cache key for correctness - Set MAX_CRAWL_FILES = 100 000 in RecursiveFileSearch (caps peak memory at ~50 MB for the file list) Fixes #3130 --- .../core/src/utils/filesearch/crawlCache.ts | 4 + .../core/src/utils/filesearch/crawler.test.ts | 106 +++++++++++++++++- packages/core/src/utils/filesearch/crawler.ts | 20 +++- .../core/src/utils/filesearch/fileSearch.ts | 10 ++ 4 files changed, 137 insertions(+), 3 deletions(-) diff --git a/packages/core/src/utils/filesearch/crawlCache.ts b/packages/core/src/utils/filesearch/crawlCache.ts index b905c9dfc..66a7e3d4c 100644 --- a/packages/core/src/utils/filesearch/crawlCache.ts +++ b/packages/core/src/utils/filesearch/crawlCache.ts @@ -18,6 +18,7 @@ export const getCacheKey = ( directory: string, ignoreContent: string, maxDepth?: number, + maxFiles?: number, ): string => { const hash = crypto.createHash('sha256'); hash.update(directory); @@ -25,6 +26,9 @@ export const getCacheKey = ( if (maxDepth !== undefined) { hash.update(String(maxDepth)); } + if (maxFiles !== undefined) { + hash.update(`maxFiles:${maxFiles}`); + } return hash.digest('hex'); }; diff --git a/packages/core/src/utils/filesearch/crawler.test.ts b/packages/core/src/utils/filesearch/crawler.test.ts index bae7e84cb..3e6b26164 100644 --- a/packages/core/src/utils/filesearch/crawler.test.ts +++ b/packages/core/src/utils/filesearch/crawler.test.ts @@ -178,8 +178,10 @@ describe('crawler', () => { }); expect(results).toEqual( - expect.arrayContaining(['.', '.gitignore', 'Foo.mk', 'bar.mk']), + expect.arrayContaining(['.', '.gitignore', 'Foo.mk']), ); + // bar.mk matches *.mk and is not negated, so it should be filtered out + expect(results).not.toContain('bar.mk'); }); it('should handle directory negation with glob', async () => { @@ -571,4 +573,106 @@ describe('crawler', () => { ); }); }); + + describe('with maxFiles', () => { + it('should truncate results when maxFiles is exceeded', async () => { + tmpDir = await createTmpDir({ + 'a.txt': '', + 'b.txt': '', + 'c.txt': '', + sub: ['d.txt', 'e.txt'], + }); + + const ignore = loadIgnoreRules({ + projectRoot: tmpDir, + useGitignore: false, + useQwenignore: false, + ignoreDirs: [], + }); + + const allResults = await crawl({ + crawlDirectory: tmpDir, + cwd: tmpDir, + ignore, + cache: false, + cacheTtl: 0, + }); + + const limitedResults = await crawl({ + crawlDirectory: tmpDir, + cwd: tmpDir, + ignore, + cache: false, + cacheTtl: 0, + maxFiles: 3, + }); + + expect(allResults.length).toBeGreaterThan(3); + expect(limitedResults.length).toBe(3); + }); + + it('should not count file-ignored entries toward maxFiles budget', async () => { + tmpDir = await createTmpDir({ + '.gitignore': '*.log', + 'a.txt': '', + 'b.txt': '', + 'noise1.log': '', + 'noise2.log': '', + 'noise3.log': '', + }); + + const 
ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: true,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+
+      // Valid entries: '.', '.gitignore', 'a.txt', 'b.txt' = 4
+      // Ignored entries: 'noise1.log', 'noise2.log', 'noise3.log'
+      // With maxFiles=4, all valid entries should fit because
+      // .log files are filtered out before the cap is applied.
+      const results = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 4,
+      });
+
+      expect(results).toEqual(
+        expect.arrayContaining(['.', '.gitignore', 'a.txt', 'b.txt']),
+      );
+      for (const r of results) {
+        expect(r).not.toMatch(/\.log$/);
+      }
+    });
+
+    it('should not truncate when maxFiles exceeds total entries', async () => {
+      tmpDir = await createTmpDir({
+        'a.txt': '',
+        'b.txt': '',
+      });
+
+      const ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+
+      const results = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 1000,
+      });
+
+      expect(results.length).toBeLessThanOrEqual(1000);
+      expect(results).toEqual(expect.arrayContaining(['.', 'a.txt', 'b.txt']));
+    });
+  });
 });
diff --git a/packages/core/src/utils/filesearch/crawler.ts b/packages/core/src/utils/filesearch/crawler.ts
index 9184ba328..0fdf282b3 100644
--- a/packages/core/src/utils/filesearch/crawler.ts
+++ b/packages/core/src/utils/filesearch/crawler.ts
@@ -16,6 +16,8 @@ export interface CrawlOptions {
   cwd: string;
   // The fdir maxDepth option.
   maxDepth?: number;
+  // Maximum number of file entries to return. Prevents OOM on very large trees.
+  maxFiles?: number;
   // A pre-configured Ignore instance.
   ignore: Ignore;
   // Caching options.
@@ -33,6 +35,7 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
     options.crawlDirectory,
     options.ignore.getFingerprint(),
     options.maxDepth,
+    options.maxFiles,
   );
   const cachedResults = cache.read(cacheKey);

@@ -43,30 +46,42 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
   const posixCwd = toPosixPath(options.cwd);
   const posixCrawlDirectory = toPosixPath(options.crawlDirectory);
+  const relativeToCrawlDir = path.posix.relative(posixCwd, posixCrawlDirectory);

   let results: string[];
   try {
     const dirFilter = options.ignore.getDirectoryFilter();
+    const fileFilter = options.ignore.getFileFilter();
     const api = new fdir()
       .withRelativePaths()
       .withDirs()
       .withPathSeparator('/')
       .exclude((_, dirPath) => {
         const relativePath = path.posix.relative(posixCrawlDirectory, dirPath);
         return dirFilter(`${relativePath}/`);
+      })
+      .filter((filePath, isDirectory) => {
+        // Directories are already handled by the exclude() callback above.
+        if (isDirectory) return true;
+        // Apply file-level ignore patterns (e.g. *.log, *.map) during the
+        // crawl so they don't consume the maxFiles budget.
+        const cwdRelative = path.posix.join(relativeToCrawlDir, filePath);
+        return !fileFilter(cwdRelative);
       });

     if (options.maxDepth !== undefined) {
       api.withMaxDepth(options.maxDepth);
     }

+    if (options.maxFiles !== undefined) {
+      api.withMaxFiles(options.maxFiles);
+    }
+
     results = await api.crawl(options.crawlDirectory).withPromise();
   } catch (_e) {
     // The directory probably doesn't exist.
    return [];
   }

-  const relativeToCrawlDir = path.posix.relative(posixCwd, posixCrawlDirectory);
-
   const relativeToCwdResults = results.map((p) =>
     path.posix.join(relativeToCrawlDir, p),
   );
@@ -77,6 +92,7 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
       options.crawlDirectory,
       options.ignore.getFingerprint(),
       options.maxDepth,
+      options.maxFiles,
     );
     cache.write(cacheKey, relativeToCwdResults, options.cacheTtl * 1000);
   }
diff --git a/packages/core/src/utils/filesearch/fileSearch.ts b/packages/core/src/utils/filesearch/fileSearch.ts
index 9a030b436..b277f1df9 100644
--- a/packages/core/src/utils/filesearch/fileSearch.ts
+++ b/packages/core/src/utils/filesearch/fileSearch.ts
@@ -14,6 +14,15 @@ import type { FzfResultItem } from 'fzf';
 import { AsyncFzf } from 'fzf';
 import { unescapePath } from '../paths.js';

+/**
+ * Safety cap on the number of file entries the recursive crawler will
+ * materialise in memory. Without this, workspaces with millions of files
+ * (e.g. missing .gitignore, huge node_modules trees) can push Node.js past
+ * its heap limit and crash with an OOM. 100 000 entries is generous enough
+ * for virtually all real projects while keeping peak memory well under 100 MB.
+ */
+const MAX_CRAWL_FILES = 100_000;
+
 export interface FileSearchOptions {
   projectRoot: string;
   ignoreDirs: string[];
@@ -108,6 +117,7 @@ class RecursiveFileSearch implements FileSearch {
       cache: this.options.cache,
       cacheTtl: this.options.cacheTtl,
       maxDepth: this.options.maxDepth,
+      maxFiles: MAX_CRAWL_FILES,
     });
     this.buildResultCache();
   }
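
Usage sketch for reviewers (not part of the patch): a minimal illustration of
how a caller exercises the new cap. `crawl` and `loadIgnoreRules` are the real
exports touched by this diff; the `./ignore.js` import path and the option
values below are assumptions for illustration, mirroring the tests above.

    import { crawl } from './crawler.js';
    import { loadIgnoreRules } from './ignore.js'; // import path assumed

    // Build ignore rules the same way the new tests do.
    const cwd = process.cwd();
    const ignore = loadIgnoreRules({
      projectRoot: cwd,
      useGitignore: true,
      useQwenignore: false,
      ignoreDirs: [],
    });

    // With maxFiles set, fdir stops traversal once the budget is spent,
    // so the returned array is bounded even in pathological trees.
    const entries = await crawl({
      crawlDirectory: cwd,
      cwd,
      ignore,
      cache: false,
      cacheTtl: 0,
      maxFiles: 100_000, // same value as MAX_CRAWL_FILES
    });
    console.log(`crawled ${entries.length} entries (capped at 100 000)`);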