From e216ab35fc99bcf23c454deb1e0930d64fdd5320 Mon Sep 17 00:00:00 2001 From: tanzhenxin Date: Sat, 11 Apr 2026 16:44:02 +0800 Subject: [PATCH] fix(core): cap recursive file crawler at 100k entries to prevent OOM (#3138) When the @ autocomplete triggers RecursiveFileSearch, the crawler materialises the entire project tree into memory with no upper bound. For very large workspaces (missing .gitignore, huge node_modules, home directory as cwd) this pushes Node.js past its heap limit and crashes. - Add `maxFiles` option to CrawlOptions; use fdir's withMaxFiles() to stop traversal early instead of post-hoc truncation - Apply file-level ignore patterns during crawl via fdir filter() so ignored files don't consume the maxFiles budget - Include maxFiles in the crawl cache key for correctness - Set MAX_CRAWL_FILES = 100 000 in RecursiveFileSearch (caps peak memory at ~50 MB for the file list) Fixes #3130 --- .../core/src/utils/filesearch/crawlCache.ts | 4 + .../core/src/utils/filesearch/crawler.test.ts | 106 +++++++++++++++++- packages/core/src/utils/filesearch/crawler.ts | 20 +++- .../core/src/utils/filesearch/fileSearch.ts | 10 ++ 4 files changed, 137 insertions(+), 3 deletions(-) diff --git a/packages/core/src/utils/filesearch/crawlCache.ts b/packages/core/src/utils/filesearch/crawlCache.ts index b905c9dfc..66a7e3d4c 100644 --- a/packages/core/src/utils/filesearch/crawlCache.ts +++ b/packages/core/src/utils/filesearch/crawlCache.ts @@ -18,6 +18,7 @@ export const getCacheKey = ( directory: string, ignoreContent: string, maxDepth?: number, + maxFiles?: number, ): string => { const hash = crypto.createHash('sha256'); hash.update(directory); @@ -25,6 +26,9 @@ export const getCacheKey = ( if (maxDepth !== undefined) { hash.update(String(maxDepth)); } + if (maxFiles !== undefined) { + hash.update(`maxFiles:${maxFiles}`); + } return hash.digest('hex'); }; diff --git a/packages/core/src/utils/filesearch/crawler.test.ts b/packages/core/src/utils/filesearch/crawler.test.ts index bae7e84cb..3e6b26164 100644 --- a/packages/core/src/utils/filesearch/crawler.test.ts +++ b/packages/core/src/utils/filesearch/crawler.test.ts @@ -178,8 +178,10 @@ describe('crawler', () => { }); expect(results).toEqual( - expect.arrayContaining(['.', '.gitignore', 'Foo.mk', 'bar.mk']), + expect.arrayContaining(['.', '.gitignore', 'Foo.mk']), ); + // bar.mk matches *.mk and is not negated, so it should be filtered out + expect(results).not.toContain('bar.mk'); }); it('should handle directory negation with glob', async () => { @@ -571,4 +573,106 @@ describe('crawler', () => { ); }); }); + + describe('with maxFiles', () => { + it('should truncate results when maxFiles is exceeded', async () => { + tmpDir = await createTmpDir({ + 'a.txt': '', + 'b.txt': '', + 'c.txt': '', + sub: ['d.txt', 'e.txt'], + }); + + const ignore = loadIgnoreRules({ + projectRoot: tmpDir, + useGitignore: false, + useQwenignore: false, + ignoreDirs: [], + }); + + const allResults = await crawl({ + crawlDirectory: tmpDir, + cwd: tmpDir, + ignore, + cache: false, + cacheTtl: 0, + }); + + const limitedResults = await crawl({ + crawlDirectory: tmpDir, + cwd: tmpDir, + ignore, + cache: false, + cacheTtl: 0, + maxFiles: 3, + }); + + expect(allResults.length).toBeGreaterThan(3); + expect(limitedResults.length).toBe(3); + }); + + it('should not count file-ignored entries toward maxFiles budget', async () => { + tmpDir = await createTmpDir({ + '.gitignore': '*.log', + 'a.txt': '', + 'b.txt': '', + 'noise1.log': '', + 'noise2.log': '', + 'noise3.log': '', + }); + + const 
ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: true,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+
+      // Valid entries: '.', '.gitignore', 'a.txt', 'b.txt' = 4
+      // Ignored entries: 'noise1.log', 'noise2.log', 'noise3.log'
+      // With maxFiles=4, all valid entries should fit because
+      // .log files are filtered out before the cap is applied.
+      const results = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 4,
+      });
+
+      expect(results).toEqual(
+        expect.arrayContaining(['.', '.gitignore', 'a.txt', 'b.txt']),
+      );
+      for (const r of results) {
+        expect(r).not.toMatch(/\.log$/);
+      }
+    });
+
+    it('should not truncate when maxFiles exceeds total entries', async () => {
+      tmpDir = await createTmpDir({
+        'a.txt': '',
+        'b.txt': '',
+      });
+
+      const ignore = loadIgnoreRules({
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useQwenignore: false,
+        ignoreDirs: [],
+      });
+
+      const results = await crawl({
+        crawlDirectory: tmpDir,
+        cwd: tmpDir,
+        ignore,
+        cache: false,
+        cacheTtl: 0,
+        maxFiles: 1000,
+      });
+
+      expect(results.length).toBeLessThanOrEqual(1000);
+      expect(results).toEqual(expect.arrayContaining(['.', 'a.txt', 'b.txt']));
+    });
+  });
 });
diff --git a/packages/core/src/utils/filesearch/crawler.ts b/packages/core/src/utils/filesearch/crawler.ts
index 9184ba328..0fdf282b3 100644
--- a/packages/core/src/utils/filesearch/crawler.ts
+++ b/packages/core/src/utils/filesearch/crawler.ts
@@ -16,6 +16,8 @@ export interface CrawlOptions {
   cwd: string;
   // The fdir maxDepth option.
   maxDepth?: number;
+  // Maximum number of file entries to return. Prevents OOM on very large trees.
+  maxFiles?: number;
   // A pre-configured Ignore instance.
   ignore: Ignore;
   // Caching options.
@@ -33,6 +35,7 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
     options.crawlDirectory,
     options.ignore.getFingerprint(),
     options.maxDepth,
+    options.maxFiles,
   );
   const cachedResults = cache.read(cacheKey);

@@ -43,30 +46,42 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
   const posixCwd = toPosixPath(options.cwd);
   const posixCrawlDirectory = toPosixPath(options.crawlDirectory);
+  const relativeToCrawlDir = path.posix.relative(posixCwd, posixCrawlDirectory);

   let results: string[];
   try {
     const dirFilter = options.ignore.getDirectoryFilter();
+    const fileFilter = options.ignore.getFileFilter();
     const api = new fdir()
       .withRelativePaths()
       .withDirs()
       .withPathSeparator('/')
       .exclude((_, dirPath) => {
         const relativePath = path.posix.relative(posixCrawlDirectory, dirPath);
         return dirFilter(`${relativePath}/`);
+      })
+      .filter((filePath, isDirectory) => {
+        // Directories are already handled by the exclude() callback above.
+        if (isDirectory) return true;
+        // Apply file-level ignore patterns (e.g. *.log, *.map) during the
+        // crawl so they don't consume the maxFiles budget.
+        const cwdRelative = path.posix.join(relativeToCrawlDir, filePath);
+        return !fileFilter(cwdRelative);
       });

     if (options.maxDepth !== undefined) {
       api.withMaxDepth(options.maxDepth);
     }

+    if (options.maxFiles !== undefined) {
+      api.withMaxFiles(options.maxFiles);
+    }
+
     results = await api.crawl(options.crawlDirectory).withPromise();
   } catch (_e) {
     // The directory probably doesn't exist.
    return [];
   }

-  const relativeToCrawlDir = path.posix.relative(posixCwd, posixCrawlDirectory);
-
   const relativeToCwdResults = results.map((p) =>
     path.posix.join(relativeToCrawlDir, p),
   );
@@ -77,6 +92,7 @@ export async function crawl(options: CrawlOptions): Promise<string[]> {
       options.crawlDirectory,
       options.ignore.getFingerprint(),
       options.maxDepth,
+      options.maxFiles,
     );
     cache.write(cacheKey, relativeToCwdResults, options.cacheTtl * 1000);
   }
diff --git a/packages/core/src/utils/filesearch/fileSearch.ts b/packages/core/src/utils/filesearch/fileSearch.ts
index 9a030b436..b277f1df9 100644
--- a/packages/core/src/utils/filesearch/fileSearch.ts
+++ b/packages/core/src/utils/filesearch/fileSearch.ts
@@ -14,6 +14,15 @@ import type { FzfResultItem } from 'fzf';
 import { AsyncFzf } from 'fzf';
 import { unescapePath } from '../paths.js';

+/**
+ * Safety cap on the number of file entries the recursive crawler will
+ * materialise in memory. Without this, workspaces with millions of files
+ * (e.g. missing .gitignore, huge node_modules trees) can push Node.js past
+ * its heap limit and crash with an OOM. 100 000 entries is generous enough
+ * for virtually all real projects while keeping peak memory well under 100 MB.
+ */
+const MAX_CRAWL_FILES = 100_000;
+
 export interface FileSearchOptions {
   projectRoot: string;
   ignoreDirs: string[];
@@ -108,6 +117,7 @@ class RecursiveFileSearch implements FileSearch {
       cache: this.options.cache,
       cacheTtl: this.options.cacheTtl,
       maxDepth: this.options.maxDepth,
+      maxFiles: MAX_CRAWL_FILES,
     });
     this.buildResultCache();
   }
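
Usage sketch for reviewers (not part of the patch): a minimal illustration of
how a caller exercises the new cap. `crawl` and `loadIgnoreRules` are the real
exports touched by this diff; the `./ignore.js` import path and the option
values below are assumptions for illustration, mirroring the tests above.

    import { crawl } from './crawler.js';
    import { loadIgnoreRules } from './ignore.js'; // import path assumed

    // Build ignore rules the same way the new tests do.
    const cwd = process.cwd();
    const ignore = loadIgnoreRules({
      projectRoot: cwd,
      useGitignore: true,
      useQwenignore: false,
      ignoreDirs: [],
    });

    // With maxFiles set, fdir stops traversal once the budget is spent,
    // so the returned array is bounded even in pathological trees.
    const entries = await crawl({
      crawlDirectory: cwd,
      cwd,
      ignore,
      cache: false,
      cacheTtl: 0,
      maxFiles: 100_000, // same value as MAX_CRAWL_FILES
    });
    console.log(`crawled ${entries.length} entries (capped at 100 000)`);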