From 005f88e2e4ccd457d867a44bd8d73bd0019c384d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=85=8B=E7=AB=9F?= <1048927295@qq.com> Date: Fri, 24 Apr 2026 16:13:32 +0800 Subject: [PATCH] fix(core): address /diff PR review comments Addresses the five open review threads on #3491: - parseShortstat: anchored and bounded the regex (`^...$` with `\d{1,10}`) so adversarial inputs can no longer drive polynomial backtracking. Closes CodeQL alert #137. - fetchGitDiff: only parse the untracked-path list when we actually need it; the fast path now counts NUL bytes in the raw `ls-files -z` stdout (wenshao P1). - fetchGitDiff: base the `MAX_FILES_FOR_DETAILS` short-circuit on `tracked + untracked`, so repos with few edits but many untracked files still take the summary-only path (wenshao P2). - fetchGitDiff: count newlines in each untracked text file (binary sniff + 1 MB read cap) and fold that into both the header `+N` and the per-file row, so a brand-new file no longer renders as `+0 / -0` (BZ-D P2). - parseGitNumstat: switch to `git diff --numstat -z`. The parser now uses index-based slicing and a rename-pair state machine, so tracked filenames containing tabs/newlines/non-ASCII keep their real bytes (BZ-D P3). Renames collapse into a single `old => new` entry. UI: untracked rows render as `+N filename (new)` (or `~ filename (binary, new)`) instead of the placeholder `?` marker; `/diff` now shows real additions for fresh files. --- packages/cli/src/i18n/locales/en.js | 2 + packages/cli/src/i18n/locales/zh.js | 2 + .../cli/src/ui/commands/diffCommand.test.ts | 42 +++- packages/cli/src/ui/commands/diffCommand.ts | 18 +- packages/core/src/utils/gitDiff.test.ts | 150 +++++++++++- packages/core/src/utils/gitDiff.ts | 229 ++++++++++++++---- scripts/unused-keys-only-in-locales.json | 2 +- 7 files changed, 363 insertions(+), 82 deletions(-) diff --git a/packages/cli/src/i18n/locales/en.js b/packages/cli/src/i18n/locales/en.js index 01b8a1535..3b6b7f020 100644 --- a/packages/cli/src/i18n/locales/en.js +++ b/packages/cli/src/i18n/locales/en.js @@ -169,6 +169,8 @@ export default { '…and {{hidden}} more (showing first {{shown}})': '…and {{hidden}} more (showing first {{shown}})', '(binary)': '(binary)', + '(binary, new)': '(binary, new)', + '(new)': '(new)', // ============================================================================ // Commands - Agents diff --git a/packages/cli/src/i18n/locales/zh.js b/packages/cli/src/i18n/locales/zh.js index 2bfde8c6f..63c26b412 100644 --- a/packages/cli/src/i18n/locales/zh.js +++ b/packages/cli/src/i18n/locales/zh.js @@ -164,6 +164,8 @@ export default { '…and {{hidden}} more (showing first {{shown}})': '…还有 {{hidden}} 个(仅显示前 {{shown}} 个)', '(binary)': '(二进制)', + '(binary, new)': '(二进制,新增)', + '(new)': '(新增)', // ============================================================================ // Commands - Agents diff --git a/packages/cli/src/ui/commands/diffCommand.test.ts b/packages/cli/src/ui/commands/diffCommand.test.ts index 92ccb40a1..71c36446d 100644 --- a/packages/cli/src/ui/commands/diffCommand.test.ts +++ b/packages/cli/src/ui/commands/diffCommand.test.ts @@ -132,28 +132,46 @@ describe('diffCommand', () => { expect(content).toContain('src/b.ts'); }); - it('aligns untracked/binary rows with the numeric stat column', async () => { + it('shows untracked text files with their line count and a (new) marker', async () => { if (!diffCommand.action) throw new Error('Command has no action'); mockFetchGitDiff.mockResolvedValue({ - stats: { filesCount: 3, linesAdded: 10, linesRemoved: 2 }, + stats: { filesCount: 2, linesAdded: 12, linesRemoved: 2 }, perFileStats: new Map([ ['src/a.ts', { added: 10, removed: 2, isBinary: false }], [ - 'new.txt', - { added: 0, removed: 0, isBinary: false, isUntracked: true }, + 'notes.md', + { added: 2, removed: 0, isBinary: false, isUntracked: true }, ], - ['img.png', { added: 0, removed: 0, isBinary: true }], ]), } satisfies GitDiffResult); const result = await diffCommand.action(mockContext, ''); - const lines = (result as { content: string }).content.split('\n'); + const content = (result as { content: string }).content; + const lines = content.split('\n'); const aLine = lines.find((l) => l.endsWith('src/a.ts'))!; - const newLine = lines.find((l) => l.endsWith('new.txt'))!; - const imgLine = lines.find((l) => l.endsWith('img.png (binary)'))!; - // Filename column starts at the same offset in every row so that `?` / `~` - // markers line up with `+X -Y` entries. - expect(aLine.indexOf('src/a.ts')).toBe(newLine.indexOf('new.txt')); - expect(aLine.indexOf('src/a.ts')).toBe(imgLine.indexOf('img.png')); + const newLine = lines.find((l) => l.includes('notes.md'))!; + expect(newLine).toContain('+ 2'); + expect(newLine).toContain('(new)'); + // Stat columns stay aligned across tracked and new rows. + expect(aLine.indexOf('src/a.ts')).toBe(newLine.indexOf('notes.md')); + }); + + it('marks binary untracked files with (binary, new) and no line count', async () => { + if (!diffCommand.action) throw new Error('Command has no action'); + mockFetchGitDiff.mockResolvedValue({ + stats: { filesCount: 1, linesAdded: 0, linesRemoved: 0 }, + perFileStats: new Map([ + [ + 'blob.bin', + { added: 0, removed: 0, isBinary: true, isUntracked: true }, + ], + ]), + } satisfies GitDiffResult); + const result = await diffCommand.action(mockContext, ''); + const content = (result as { content: string }).content; + const binaryLine = content.split('\n').find((l) => l.includes('blob.bin'))!; + expect(binaryLine).toContain('(binary, new)'); + expect(binaryLine).not.toMatch(/\+\d/); + expect(binaryLine.trimStart().startsWith('~')).toBe(true); }); it('pads counts consistently for 4-digit values', async () => { diff --git a/packages/cli/src/ui/commands/diffCommand.ts b/packages/cli/src/ui/commands/diffCommand.ts index cd67b9780..b87be89be 100644 --- a/packages/cli/src/ui/commands/diffCommand.ts +++ b/packages/cli/src/ui/commands/diffCommand.ts @@ -102,27 +102,27 @@ function formatPerFile(perFileStats: Map): string[] { let maxAdded = 0; let maxRemoved = 0; for (const s of perFileStats.values()) { - if (s.isBinary || s.isUntracked) continue; + if (s.isBinary) continue; if (s.added > maxAdded) maxAdded = s.added; if (s.removed > maxRemoved) maxRemoved = s.removed; } const addWidth = String(maxAdded).length; const remWidth = String(maxRemoved).length; - // Width of the `+X -Y` stat column so `?` / `~` rows line up with it. + // Width of the `+X -Y` stat column so `~` (binary) rows line up with it. const statColumnWidth = 1 + addWidth + 1 + 1 + remWidth; const rows: string[] = []; for (const [filename, s] of perFileStats) { - if (s.isUntracked) { - rows.push(` ${padMarker('?', statColumnWidth)} ${filename}`); - } else if (s.isBinary) { - rows.push( - ` ${padMarker('~', statColumnWidth)} ${filename} ${t('(binary)')}`, - ); + if (s.isBinary) { + const suffix = s.isUntracked + ? ` ${t('(binary, new)')}` + : ` ${t('(binary)')}`; + rows.push(` ${padMarker('~', statColumnWidth)} ${filename}${suffix}`); } else { const added = `+${String(s.added).padStart(addWidth)}`; const removed = `-${String(s.removed).padStart(remWidth)}`; - rows.push(` ${added} ${removed} ${filename}`); + const suffix = s.isUntracked ? ` ${t('(new)')}` : ''; + rows.push(` ${added} ${removed} ${filename}${suffix}`); } } return rows; diff --git a/packages/core/src/utils/gitDiff.test.ts b/packages/core/src/utils/gitDiff.test.ts index 1953cc2ff..a118d8587 100644 --- a/packages/core/src/utils/gitDiff.test.ts +++ b/packages/core/src/utils/gitDiff.test.ts @@ -38,8 +38,8 @@ async function makeRepo(): Promise { } describe('parseGitNumstat', () => { - it('parses added/removed counts and file totals', () => { - const out = '3\t1\tsrc/a.ts\n10\t0\tsrc/b.ts\n0\t5\tsrc/c.ts\n'; + it('parses added/removed counts and file totals (NUL-delimited -z format)', () => { + const out = '3\t1\tsrc/a.ts\0' + '10\t0\tsrc/b.ts\0' + '0\t5\tsrc/c.ts\0'; const { stats, perFileStats } = parseGitNumstat(out); expect(stats).toEqual({ filesCount: 3, @@ -55,7 +55,7 @@ describe('parseGitNumstat', () => { }); it('treats `-` counts as binary with zero line deltas', () => { - const out = '-\t-\timg/logo.png\n'; + const out = '-\t-\timg/logo.png\0'; const { stats, perFileStats } = parseGitNumstat(out); expect(stats.filesCount).toBe(1); expect(stats.linesAdded).toBe(0); @@ -68,28 +68,44 @@ describe('parseGitNumstat', () => { }); it('keeps accurate totals but caps per-file entries at MAX_FILES', () => { - const lines: string[] = []; + const tokens: string[] = []; const totalFiles = MAX_FILES + 5; for (let i = 0; i < totalFiles; i++) { - lines.push(`1\t0\tfile${i}.ts`); + tokens.push(`1\t0\tfile${i}.ts`); } - const { stats, perFileStats } = parseGitNumstat(lines.join('\n')); + const { stats, perFileStats } = parseGitNumstat(tokens.join('\0') + '\0'); expect(stats.filesCount).toBe(totalFiles); expect(stats.linesAdded).toBe(totalFiles); expect(perFileStats.size).toBe(MAX_FILES); }); it('ignores malformed rows without crashing', () => { - const out = 'garbage-line\n2\t1\tsrc/a.ts\n'; + const out = 'garbage-token\0' + '2\t1\tsrc/a.ts\0'; const { stats, perFileStats } = parseGitNumstat(out); expect(stats.filesCount).toBe(1); expect(perFileStats.has('src/a.ts')).toBe(true); }); - it('handles filenames containing tabs', () => { - const out = '1\t2\tweird\tname.ts\n'; + it('preserves literal tabs in tracked filenames via the -z wire format', () => { + // With -z, git emits the raw path; no C-style quoting. `split('\t')` + // would mis-attribute characters after the first tab, so the parser has + // to use index-based slicing instead. + const out = '1\t2\tweird\tname.ts\0'; const { perFileStats } = parseGitNumstat(out); expect(perFileStats.has('weird\tname.ts')).toBe(true); + expect(perFileStats.get('weird\tname.ts')).toEqual({ + added: 1, + removed: 2, + isBinary: false, + }); + }); + + it('combines rename-pair tokens into a single entry', () => { + // `-z` rename format: `\t\t\0\0\0`. + const out = '0\t0\t\0' + 'src/old.ts\0' + 'src/new.ts\0'; + const { stats, perFileStats } = parseGitNumstat(out); + expect(stats.filesCount).toBe(1); + expect(perFileStats.has('src/old.ts => src/new.ts')).toBe(true); }); }); @@ -205,7 +221,7 @@ describe('fetchGitDiff', () => { } }); - it('captures tracked modifications and untracked files', async () => { + it('captures tracked modifications and counts lines in untracked text files', async () => { await fs.writeFile(path.join(repo, 'tracked.txt'), 'one\ntwo\nthree\n'); await git(repo, 'add', '.'); await git(repo, 'commit', '-q', '-m', 'init'); @@ -214,14 +230,43 @@ describe('fetchGitDiff', () => { path.join(repo, 'tracked.txt'), 'one\ntwo\nthree\nfour\n', ); - await fs.writeFile(path.join(repo, 'new.txt'), 'brand new\n'); + await fs.writeFile(path.join(repo, 'new.txt'), 'brand new\nsecond\n'); const result = await fetchGitDiff(repo); expect(result).not.toBeNull(); - expect(result!.stats.linesAdded).toBeGreaterThanOrEqual(1); expect(result!.stats.filesCount).toBe(2); + // Tracked: +1 from adding `four`. Untracked `new.txt`: 2 lines. + expect(result!.stats.linesAdded).toBe(3); expect(result!.perFileStats.get('tracked.txt')?.added).toBe(1); - expect(result!.perFileStats.get('new.txt')?.isUntracked).toBe(true); + expect(result!.perFileStats.get('new.txt')).toEqual({ + added: 2, + removed: 0, + isBinary: false, + isUntracked: true, + }); + }); + + it('flags untracked binary files without counting lines', async () => { + await fs.writeFile(path.join(repo, 'seed.txt'), 'x\n'); + await git(repo, 'add', '.'); + await git(repo, 'commit', '-q', '-m', 'init'); + + // A NUL byte in the first few bytes is git's own binary heuristic. + await fs.writeFile( + path.join(repo, 'blob.bin'), + Buffer.from([0x89, 0x00, 0xff, 0x10]), + ); + + const result = await fetchGitDiff(repo); + expect(result).not.toBeNull(); + expect(result!.perFileStats.get('blob.bin')).toEqual({ + added: 0, + removed: 0, + isBinary: true, + isUntracked: true, + }); + // Binary bytes must not contaminate the linesAdded total. + expect(result!.stats.linesAdded).toBe(0); }); it('returns zero stats on a clean working tree', async () => { @@ -455,6 +500,85 @@ describe('fetchGitDiff transient-state detection', () => { }); }); +describe('fetchGitDiff tracked-file filename robustness', () => { + it('keeps the real filename for tracked files that contain a tab', async () => { + const repo = await fs.mkdtemp(path.join(os.tmpdir(), 'qwen-gitdiff-tab-')); + try { + await execFileAsync('git', ['init', '-q', '-b', 'main'], { cwd: repo }); + await execFileAsync('git', ['config', 'user.email', 't@e.com'], { + cwd: repo, + }); + await execFileAsync('git', ['config', 'user.name', 'T'], { cwd: repo }); + await execFileAsync('git', ['config', 'commit.gpgsign', 'false'], { + cwd: repo, + }); + const weirdName = 'tab\there.txt'; + try { + await fs.writeFile(path.join(repo, weirdName), 'x\n'); + } catch { + return; // Filesystem refused the name; nothing to assert. + } + await execFileAsync('git', ['add', '.'], { cwd: repo }); + await execFileAsync('git', ['commit', '-q', '-m', 'init'], { cwd: repo }); + await fs.writeFile(path.join(repo, weirdName), 'y\n'); + + const result = await fetchGitDiff(repo); + expect(result).not.toBeNull(); + // With plain --numstat, git would C-quote this as `"tab\\there.txt"` + // and the map key would not match the real path. `--numstat -z` gives + // us the raw bytes back. + expect(result!.perFileStats.has(weirdName)).toBe(true); + expect(result!.perFileStats.has(`"tab\\there.txt"`)).toBe(false); + } finally { + await fs.rm(repo, { recursive: true, force: true }); + } + }); + + it('combines a rename into a single "old => new" per-file entry', async () => { + const repo = await fs.mkdtemp(path.join(os.tmpdir(), 'qwen-gitdiff-mv-')); + try { + await execFileAsync('git', ['init', '-q', '-b', 'main'], { cwd: repo }); + await execFileAsync('git', ['config', 'user.email', 't@e.com'], { + cwd: repo, + }); + await execFileAsync('git', ['config', 'user.name', 'T'], { cwd: repo }); + await execFileAsync('git', ['config', 'commit.gpgsign', 'false'], { + cwd: repo, + }); + await fs.writeFile(path.join(repo, 'old.txt'), 'x\n'); + await execFileAsync('git', ['add', '.'], { cwd: repo }); + await execFileAsync('git', ['commit', '-q', '-m', 'init'], { cwd: repo }); + await execFileAsync('git', ['mv', 'old.txt', 'new.txt'], { cwd: repo }); + + const result = await fetchGitDiff(repo); + expect(result).not.toBeNull(); + // Rename detection is the git default with -M; we preserve that display + // shape rather than splitting into delete + add rows. + const keys = [...result!.perFileStats.keys()]; + expect(keys.some((k) => k.includes('old.txt => new.txt'))).toBe(true); + } finally { + await fs.rm(repo, { recursive: true, force: true }); + } + }); +}); + +describe('parseShortstat ReDoS guard', () => { + it('runs in bounded time on pathological input', () => { + // CodeQL #137 flagged the previous regex as polynomial on many `0`s. + // After hardening (anchors + bounded digit runs), even 1e5 `0`s parse fast. + const adversarial = `${'0'.repeat(100_000)} files changed, ${'0'.repeat( + 100_000, + )} insertions(+)`; + const start = Date.now(); + const result = parseShortstat(adversarial); + const elapsed = Date.now() - start; + // Expect the bounded regex to either reject (too long for \d{1,10}) or + // match trivially. Either way it must not spin. + expect(elapsed).toBeLessThan(250); + expect(result).toBeNull(); + }); +}); + describe('fetchGitDiff non-ASCII filenames', () => { it('does not octal-escape UTF-8 filenames via core.quotepath', async () => { const repo = await makeRepo(); diff --git a/packages/core/src/utils/gitDiff.ts b/packages/core/src/utils/gitDiff.ts index 1fea8d597..56592784f 100644 --- a/packages/core/src/utils/gitDiff.ts +++ b/packages/core/src/utils/gitDiff.ts @@ -5,7 +5,7 @@ */ import { execFile } from 'node:child_process'; -import { access, readFile, stat } from 'node:fs/promises'; +import { access, open, readFile, stat } from 'node:fs/promises'; import * as path from 'node:path'; import { promisify } from 'node:util'; import type { Hunk } from 'diff'; @@ -43,6 +43,10 @@ export const MAX_DIFF_SIZE_BYTES = 1_000_000; export const MAX_LINES_PER_FILE = 400; /** Skip per-file parsing when the diff touches more than this many files. */ export const MAX_FILES_FOR_DETAILS = 500; +/** How much of an untracked file to read when counting its lines. */ +const UNTRACKED_READ_CAP_BYTES = MAX_DIFF_SIZE_BYTES; +/** Scan the first N bytes for NUL to detect binary files (matches git's heuristic). */ +const BINARY_SNIFF_BYTES = 8 * 1024; /** * Fetch numstat-based git diff stats (files changed, lines added/removed) and @@ -58,22 +62,40 @@ export async function fetchGitDiff(cwd: string): Promise { if (!isGitRepository(cwd)) return null; if (await isInTransientGitState(cwd)) return null; - // Quick probe + untracked list run in parallel — both are needed regardless - // of which path we take, and shortstat is O(1) memory so it can short-circuit - // huge generated workspaces before we pay the per-file numstat cost. - const [shortstatOut, untrackedPathsRaw] = await Promise.all([ + // Shortstat probe + untracked scan run in parallel — both are needed + // regardless of which path we take, and shortstat is O(1) memory so it can + // short-circuit huge generated workspaces before we pay the per-file + // numstat cost. For untracked we hold the raw stdout rather than the parsed + // list so the fast path only has to count NUL bytes instead of allocating + // a full path array. + const [shortstatOut, untrackedOut] = await Promise.all([ runGit(['--no-optional-locks', 'diff', 'HEAD', '--shortstat'], cwd), - fetchUntrackedPaths(cwd), + runGit( + [ + '--no-optional-locks', + 'ls-files', + '-z', + '--others', + '--exclude-standard', + ], + cwd, + ), ]); - const untrackedPaths = untrackedPathsRaw ?? []; + const untrackedCount = countNulDelimited(untrackedOut); if (shortstatOut != null) { const quickStats = parseShortstat(shortstatOut); - if (quickStats && quickStats.filesCount > MAX_FILES_FOR_DETAILS) { + // Base the short-circuit on tracked + untracked so repos with relatively + // few edits but a flood of untracked files (e.g. large generated + // workspaces) still take the summary-only path. + if ( + quickStats && + quickStats.filesCount + untrackedCount > MAX_FILES_FOR_DETAILS + ) { return { stats: { ...quickStats, - filesCount: quickStats.filesCount + untrackedPaths.length, + filesCount: quickStats.filesCount + untrackedCount, }, perFileStats: new Map(), }; @@ -81,29 +103,43 @@ export async function fetchGitDiff(cwd: string): Promise { } const numstatOut = await runGit( - ['--no-optional-locks', 'diff', 'HEAD', '--numstat'], + ['--no-optional-locks', 'diff', 'HEAD', '--numstat', '-z'], cwd, ); if (numstatOut == null) return null; const { stats, perFileStats } = parseGitNumstat(numstatOut); - if (untrackedPaths.length > 0) { + if (untrackedCount > 0) { // Count every untracked file in the totals, even if the per-file map is // already full. Otherwise `filesCount` under-reports whenever tracked // changes already fill the `MAX_FILES` slot. - stats.filesCount += untrackedPaths.length; - const remainingSlots = MAX_FILES - perFileStats.size; - for (const filePath of untrackedPaths.slice( - 0, - Math.max(0, remainingSlots), - )) { - perFileStats.set(filePath, { + stats.filesCount += untrackedCount; + const untrackedPaths = splitNulDelimited(untrackedOut); + const remainingSlots = Math.max(0, MAX_FILES - perFileStats.size); + const visiblePaths = untrackedPaths.slice(0, remainingSlots); + // Count lines in each newly-created file so the header's `+N` reflects + // the true additions a user would see if they `git add`'d. Reads are + // capped per-file and binary content is detected so huge log files or + // blobs don't stall /diff. + const untrackedStats = await Promise.all( + visiblePaths.map((relPath) => + countUntrackedLines(path.join(cwd, relPath)), + ), + ); + for (let i = 0; i < visiblePaths.length; i++) { + const relPath = visiblePaths[i] ?? ''; + const u = untrackedStats[i] ?? { added: 0, - removed: 0, isBinary: false, + }; + perFileStats.set(relPath, { + added: u.added, + removed: 0, + isBinary: u.isBinary, isUntracked: true, }); + stats.linesAdded += u.added; } } @@ -126,33 +162,83 @@ export async function fetchGitDiffHunks( } /** - * Parse `git diff --numstat` output. - * Format per line: `\t\t`. Binary files use `-` for - * counts. Only the first `MAX_FILES` entries are kept in `perFileStats`, but - * total stats account for every line. + * Parse `git diff --numstat -z` output. + * + * Wire format (stable per `git-diff(1)`): + * - Non-rename: `\t\t\0` + * - Rename: `\t\t\0\0\0` + * + * Using `-z` (vs the default newline-delimited form) keeps paths byte-accurate: + * tabs, newlines, and non-ASCII characters all round-trip without git's + * C-style quoting, so `perFileStats` keys match the real on-disk filenames. + * + * Binary files use `-` for both counts. Only the first `MAX_FILES` entries are + * retained in `perFileStats`; totals account for every entry. */ export function parseGitNumstat(stdout: string): GitDiffResult { - const lines = stdout.split('\n').filter(Boolean); + // Drop the trailing empty chunk from the terminating NUL. + const tokens = stdout.split('\0'); + if (tokens.length > 0 && tokens[tokens.length - 1] === '') tokens.pop(); + let added = 0; let removed = 0; let validFileCount = 0; const perFileStats = new Map(); - for (const line of lines) { - const parts = line.split('\t'); - if (parts.length < 3) continue; + // Rename entries span three tokens ({counts}, oldPath, newPath). When we + // see an empty path in the counts token we stash the counts here and + // consume the next two tokens as the rename pair. + let pending: { added: number; removed: number; isBinary: boolean } | null = + null; + let renameOld: string | null = null; - validFileCount++; - const addStr = parts[0]; - const remStr = parts[1]; - const filePath = parts.slice(2).join('\t'); + for (const token of tokens) { + if (pending) { + if (renameOld === null) { + renameOld = token; + continue; + } + commitEntry( + `${renameOld} => ${token}`, + pending.added, + pending.removed, + pending.isBinary, + ); + pending = null; + renameOld = null; + continue; + } + + // Index-based parse — `split('\t')` is unsafe because `-z` preserves + // literal tabs inside filenames. + const firstTab = token.indexOf('\t'); + if (firstTab < 0) continue; + const secondTab = token.indexOf('\t', firstTab + 1); + if (secondTab < 0) continue; + const addStr = token.slice(0, firstTab); + const remStr = token.slice(firstTab + 1, secondTab); + const filePath = token.slice(secondTab + 1); const isBinary = addStr === '-' || remStr === '-'; - const fileAdded = isBinary ? 0 : parseInt(addStr ?? '0', 10) || 0; - const fileRemoved = isBinary ? 0 : parseInt(remStr ?? '0', 10) || 0; + const fileAdded = isBinary ? 0 : parseInt(addStr, 10) || 0; + const fileRemoved = isBinary ? 0 : parseInt(remStr, 10) || 0; + if (filePath === '') { + // Rename header — wait for oldPath and newPath tokens. + pending = { added: fileAdded, removed: fileRemoved, isBinary }; + continue; + } + commitEntry(filePath, fileAdded, fileRemoved, isBinary); + } + + function commitEntry( + filePath: string, + fileAdded: number, + fileRemoved: number, + isBinary: boolean, + ): void { + validFileCount++; added += fileAdded; removed += fileRemoved; - if (perFileStats.size < MAX_FILES) { perFileStats.set(filePath, { added: fileAdded, @@ -249,10 +335,15 @@ export function parseGitDiff(stdout: string): Map { /** * Parse `git diff --shortstat` output, e.g. * ` 3 files changed, 42 insertions(+), 7 deletions(-)`. + * + * The regex is anchored (line start/end with the `m` flag) and uses single + * literal spaces plus bounded `\d{1,10}` digit runs. This closes CodeQL alert + * #137: the previous unanchored form with `\s+` and `\d+` in nested optional + * groups could backtrack polynomially on crafted strings of `0`s. */ export function parseShortstat(stdout: string): GitDiffStats | null { const match = stdout.match( - /(\d+)\s+files?\s+changed(?:,\s+(\d+)\s+insertions?\(\+\))?(?:,\s+(\d+)\s+deletions?\(-\))?/, + /^ ?(\d{1,10}) files? changed(?:, (\d{1,10}) insertions?\(\+\))?(?:, (\d{1,10}) deletions?\(-\))?$/m, ); if (!match) return null; return { @@ -262,6 +353,62 @@ export function parseShortstat(stdout: string): GitDiffStats | null { }; } +function countNulDelimited(stdout: string | null): number { + if (!stdout) return 0; + let count = 0; + for (let i = 0; i < stdout.length; i++) { + if (stdout.charCodeAt(i) === 0) count++; + } + return count; +} + +function splitNulDelimited(stdout: string | null): string[] { + if (!stdout) return []; + return stdout.split('\0').filter(Boolean); +} + +interface UntrackedLineStats { + added: number; + isBinary: boolean; +} + +/** + * Count lines in an untracked file so the /diff totals include it. Reads up + * to `UNTRACKED_READ_CAP_BYTES`, bails on NUL in the first `BINARY_SNIFF_BYTES` + * (git's own heuristic), and swallows read errors into `{added:0,isBinary:false}` + * so one unreadable file can't block the whole command. + */ +async function countUntrackedLines( + absPath: string, +): Promise { + let fh; + try { + fh = await open(absPath, 'r'); + } catch { + return { added: 0, isBinary: false }; + } + try { + const buf = Buffer.alloc(UNTRACKED_READ_CAP_BYTES); + const { bytesRead } = await fh.read(buf, 0, UNTRACKED_READ_CAP_BYTES, 0); + if (bytesRead === 0) return { added: 0, isBinary: false }; + const sniffEnd = Math.min(BINARY_SNIFF_BYTES, bytesRead); + for (let i = 0; i < sniffEnd; i++) { + if (buf[i] === 0) return { added: 0, isBinary: true }; + } + let lines = 0; + for (let i = 0; i < bytesRead; i++) { + if (buf[i] === 0x0a) lines++; + } + // If the file does not end in a newline, count the trailing partial line. + if (buf[bytesRead - 1] !== 0x0a) lines++; + return { added: lines, isBinary: false }; + } catch { + return { added: 0, isBinary: false }; + } finally { + await fh.close().catch(() => {}); + } +} + /** * Resolve the real git directory for a working tree, following `.git` file * indirection used by linked worktrees (`git worktree add`) and submodules. @@ -310,18 +457,6 @@ async function isInTransientGitState(cwd: string): Promise { return results.some(Boolean); } -async function fetchUntrackedPaths(cwd: string): Promise { - // `-z` emits NUL-delimited paths with no C-style quoting, so filenames - // containing newlines, tabs, or non-ASCII bytes round-trip cleanly. - const stdout = await runGit( - ['--no-optional-locks', 'ls-files', '-z', '--others', '--exclude-standard'], - cwd, - ); - if (!stdout) return null; - const paths = stdout.split('\0').filter(Boolean); - return paths.length > 0 ? paths : null; -} - async function runGit(args: string[], cwd: string): Promise { // `core.quotepath=false` keeps non-ASCII filenames as UTF-8 in git's output // instead of octal-escaping them (`\346\226\207.txt`), which would otherwise diff --git a/scripts/unused-keys-only-in-locales.json b/scripts/unused-keys-only-in-locales.json index abf3e862d..2e7729bf4 100644 --- a/scripts/unused-keys-only-in-locales.json +++ b/scripts/unused-keys-only-in-locales.json @@ -1,5 +1,5 @@ { - "generatedAt": "2026-04-24T07:26:06.808Z", + "generatedAt": "2026-04-24T08:13:04.067Z", "keys": [ " Models: Qwen latest models\n", " qwen auth qwen-oauth - Authenticate with Qwen OAuth (discontinued)",