From b3a06a7c461aeec77e28fdb82063ed160594ea77 Mon Sep 17 00:00:00 2001 From: wenshao Date: Thu, 7 May 2026 00:03:00 +0800 Subject: [PATCH] fix(attribution): committed-blob validation, deleted-leaf canonicalisation, sudo/env shifts, dir-stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpt-5.5 review (issue 4389405179): 1. realpathOrSelf falls back to the non-canonical input when the leaf doesn't exist (deleted file). recordEdit stored the entry under the canonical path; lookup post-deletion misses on macOS where /var ↔ /private/var. Canonicalise the parent and rejoin the basename for missing leaves so deleted-file getFileAttribution still resolves the canonical key. Test updated to assert the lookup-after-unlink path explicitly. 2. validateOnDiskHashes read the LIVE working-tree, so a user who `git add`'d AI's content and then made additional unstaged edits would have the entry dropped on a commit whose blob still matched AI's hash. Replace with `validateAgainst(getContent)` that takes a caller-supplied reader; attachCommitAttribution now passes a reader that fetches the COMMITTED blob via `git show HEAD:`. Working-tree validation kept as `validateAgainstWorkingTree` for code paths without a committed ref. Returns null = no comparison signal (entry preserved). Tests cover all three readers (committed-blob via stub, working-tree, null-passthrough). deepseek-v4-pro review #1: sanitiseAttribution defaults missing contentHash to '' on legacy-snapshot restore. recordEdit's divergence check would then trip on every subsequent edit and silently reset all the AI work. Skip the divergence check when existing.contentHash is empty — we have no baseline to compare against, so don't drop. Test added covering legacy-snapshot preservation through validateAgainst. deepseek #4: validateAgainst now logs every entry drop via debugLogger.debug so a 3am operator can see WHICH entry got dropped and tied to which canonical key. deepseek #8: GIT_NAMESPACE removed from GIT_ENV_SHIFTS_REPO. It prefixes ref names within the same repo but doesn't redirect git to a different on-disk repository, so a commit underneath it still lands in our cwd's repo. Doc comment explains the distinction. deepseek #9: pushd/popd treated as cwd-shifting alongside cd in gitCommitContext / isAmendCommit / findAttributableCommitSegment. pushd reuses cdTargetMayChangeRepo (relative-no-escape stays in-repo); popd unconditionally flips cwdShifted because we don't track the bash dir-stack. deepseek #10: sudo's value-taking flag table now has a parallel SUDO_FLAGS_SHIFT_CWD set covering -D / --chdir (Linux sudo 1.9.2+). Any segment whose sudo wrapper sees one of those flags returns null from tokeniseSegment — same contract as env -C / --chdir and GIT_DIR=... 328 tests pass; typecheck clean both packages. --- .../src/services/commitAttribution.test.ts | 86 ++++++++++---- .../core/src/services/commitAttribution.ts | 94 ++++++++++++---- packages/core/src/tools/shell.ts | 105 ++++++++++++++---- 3 files changed, 218 insertions(+), 67 deletions(-) diff --git a/packages/core/src/services/commitAttribution.test.ts b/packages/core/src/services/commitAttribution.test.ts index 38f37d8d3..4a6be648f 100644 --- a/packages/core/src/services/commitAttribution.test.ts +++ b/packages/core/src/services/commitAttribution.test.ts @@ -148,55 +148,101 @@ describe('CommitAttributionService', () => { expect(after2.aiContribution).toBeGreaterThan(after1.aiContribution); }); - // validateOnDiskHashes runs at commit time and drops entries whose - // current on-disk hash doesn't match what AI recorded — catches - // user edits that happened entirely outside the Edit/Write tools - // (no recordEdit was called, so the input-hash check above couldn't - // see the divergence). - describe('validateOnDiskHashes', () => { + // validateAgainst runs at commit time and drops entries whose + // recorded post-write hash doesn't match the caller-supplied + // content — catches user edits that happened entirely outside the + // Edit/Write tools (no recordEdit was called, so the input-hash + // check above couldn't see the divergence). + describe('validateAgainst', () => { let tmpDir: string; beforeEach(() => { - // Real fs (not mocked) — we want validateOnDiskHashes to - // actually read the disk we wrote to. tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attr-validate-')); }); afterEach(() => { fs.rmSync(tmpDir, { recursive: true, force: true }); }); - it('drops entries whose on-disk content has diverged', () => { + it('drops entries whose content has diverged', () => { const service = CommitAttributionService.getInstance(); const filePath = path.join(tmpDir, 'diverged.ts'); fs.writeFileSync(filePath, 'AI wrote this', 'utf-8'); service.recordEdit(filePath, null, 'AI wrote this'); expect(service.getFileAttribution(filePath)).toBeDefined(); - // User overwrites the file outside the Edit/Write tools. - fs.writeFileSync(filePath, 'human replaced this', 'utf-8'); - - service.validateOnDiskHashes(); + // Caller passes a reader that returns the diverged content. + service.validateAgainst(() => 'human replaced this'); expect(service.getFileAttribution(filePath)).toBeUndefined(); }); - it('keeps entries whose on-disk content matches', () => { + it('keeps entries whose content matches', () => { const service = CommitAttributionService.getInstance(); const filePath = path.join(tmpDir, 'unchanged.ts'); fs.writeFileSync(filePath, 'AI wrote this', 'utf-8'); service.recordEdit(filePath, null, 'AI wrote this'); - service.validateOnDiskHashes(); + service.validateAgainst(() => 'AI wrote this'); expect(service.getFileAttribution(filePath)).toBeDefined(); }); - it('keeps entries for files that no longer exist on disk', () => { + it('keeps entries when getContent returns null (no comparison signal)', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'no-comparison.ts'); + fs.writeFileSync(filePath, 'will be queried', 'utf-8'); + service.recordEdit(filePath, null, 'will be queried'); + // null = "no committed blob / unreadable / out-of-scope" — the + // entry should NOT be dropped. + service.validateAgainst(() => null); + expect(service.getFileAttribution(filePath)).toBeDefined(); + }); + + // Legacy snapshot from before contentHash existed: the entry has + // an empty contentHash. We can't tell stale from fresh, so leave + // it alone (don't reset). + it('skips entries with empty contentHash (legacy snapshot)', () => { + const service = CommitAttributionService.getInstance(); + service.restoreFromSnapshot({ + type: 'attribution-snapshot', + surface: 'cli', + fileStates: { + '/legacy.ts': { + aiContribution: 50, + aiCreated: false, + contentHash: '', + }, + }, + promptCount: 0, + promptCountAtLastCommit: 0, + }); + // Even if the reader claims a different hash, an empty recorded + // hash means we have no baseline — keep the entry. + service.validateAgainst(() => 'totally different'); + expect(service.getFileAttribution('/legacy.ts')).toBeDefined(); + }); + + // Working-tree variant for code paths without committed blobs. + it('validateAgainstWorkingTree drops entries whose on-disk content diverged', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'wt-diverged.ts'); + fs.writeFileSync(filePath, 'AI wrote this', 'utf-8'); + service.recordEdit(filePath, null, 'AI wrote this'); + fs.writeFileSync(filePath, 'human replaced', 'utf-8'); + service.validateAgainstWorkingTree(); + expect(service.getFileAttribution(filePath)).toBeUndefined(); + }); + + // Deleted-file lookup must remain stable: recordEdit canonicalises + // the path via realpathSync; getFileAttribution must still resolve + // the same canonical key after the leaf is unlinked. realpathOrSelf + // canonicalises the parent and rejoins the basename for missing + // leaves so macOS /var ↔ /private/var doesn't break the lookup + // post-deletion. + it('keeps deleted-file entries reachable via the original path', () => { const service = CommitAttributionService.getInstance(); const filePath = path.join(tmpDir, 'deleted.ts'); fs.writeFileSync(filePath, 'will be deleted', 'utf-8'); service.recordEdit(filePath, null, 'will be deleted'); fs.unlinkSync(filePath); - // Deleted file: leave the attribution alone — the commit's - // deletion record is what the note will reflect, and a missing - // file isn't a divergence signal. - service.validateOnDiskHashes(); + // Lookup must still find the entry by the original path even + // though realpath of the leaf now throws. expect(service.getFileAttribution(filePath)).toBeDefined(); }); }); diff --git a/packages/core/src/services/commitAttribution.ts b/packages/core/src/services/commitAttribution.ts index c53a9f85d..07b47b753 100644 --- a/packages/core/src/services/commitAttribution.ts +++ b/packages/core/src/services/commitAttribution.ts @@ -23,8 +23,11 @@ import { createHash } from 'node:crypto'; import * as fs from 'node:fs'; import * as path from 'node:path'; +import { createDebugLogger } from '../utils/debugLogger.js'; import { isGeneratedFile } from './generatedFiles.js'; +const debugLogger = createDebugLogger('COMMIT_ATTRIBUTION'); + function computeContentHash(content: string): string { return createHash('sha256').update(content).digest('hex'); } @@ -35,14 +38,26 @@ function computeContentHash(content: string): string { * `fs.realpathSync` (what edit.ts/write-file.ts records) and * `path.relative` against `git rev-parse --show-toplevel` (which may * report either form) won't line up unless we normalise both sides. - * Falls back to the input on any fs error so a missing path can't - * make the lookup fail outright. + * + * For DELETED leaves (file no longer exists on disk), realpathSync + * throws — but the parent directory is still resolvable. Canonicalise + * the parent and rejoin the missing basename so a deleted file's + * lookup still hits the canonical key recordEdit stored before the + * file was removed. Without this, a `getFileAttribution(deletedPath)` + * call after the file was deleted would fall back to the + * non-canonical input and miss the canonical entry on macOS. */ function realpathOrSelf(p: string): string { try { return fs.realpathSync(p); } catch { - return p; + try { + const parent = path.dirname(p); + const realParent = fs.realpathSync(parent); + return path.join(realParent, path.basename(p)); + } catch { + return p; + } } } @@ -271,7 +286,13 @@ export class CommitAttributionService { // `oldContent` we're being told about doesn't match the hash we // recorded after AI's last write, the file diverged out-of-band. // Drop the accumulated counters before applying the new edit. - if (existing && oldContent !== null) { + // + // Skip the check when `existing.contentHash` is empty: that's a + // legacy snapshot (pre-divergence-detection schema) where we + // never recorded the post-write hash. Comparing an empty hash to + // the actual file hash would always trip the reset and silently + // wipe AI work that's still on disk. + if (existing && existing.contentHash && oldContent !== null) { const oldHash = computeContentHash(oldContent); if (existing.contentHash !== oldHash) { aiContribution = 0; @@ -291,34 +312,61 @@ export class CommitAttributionService { } /** - * Re-hash each tracked file's CURRENT on-disk content and drop - * entries whose hash doesn't match what AI's last write recorded. - * Catches the cases recordEdit's input-hash check can't see — i.e. - * the user (or another tool) modified the file entirely outside - * the Edit/Write tools, then committed it. Without this, the AI's - * stale aiContribution would attach to the human-only diff at - * commit time and credit AI for human work. + * Re-hash each tracked file's content via a caller-supplied reader + * and drop entries whose hash doesn't match what AI's last write + * recorded. Catches the cases recordEdit's input-hash check can't + * see — i.e. the user (or another tool) modified the file entirely + * outside the Edit/Write tools, then committed it. Without this, + * the AI's stale aiContribution would attach to the human-only + * diff at commit time and credit AI for human work. * - * Called immediately before generateNotePayload in the - * commit-attribution path. Files that no longer exist on disk - * (deletion) are left alone — the commit's deletion record is - * what the note should reflect, and reading them would just throw. + * `getContent(absPath)` returns the bytes the caller wants to + * compare against, or `null` if the entry shouldn't be checked + * (deletion, unreadable, no committed copy). Returning `null` + * leaves the entry alone rather than dropping it. + * + * Two production callers: + * 1. `attachCommitAttribution` after a commit — should pass a + * reader that fetches the COMMITTED blob (`git show HEAD:`) + * so unstaged working-tree changes the user made AFTER `git add` + * don't trip the divergence check on a commit whose blob still + * matches AI's recorded hash. + * 2. The legacy live-disk reader (`fs.readFileSync`) is exposed + * via `validateAgainstWorkingTree` for the no-committed-blob + * cases (e.g. amend-without-reflog where we can't pin a + * ref). Less precise but better than nothing. */ - validateOnDiskHashes(): void { + validateAgainst(getContent: (absPath: string) => string | null): void { for (const [key, attr] of this.fileAttributions) { - let current: string; - try { - current = fs.readFileSync(key, 'utf-8'); - } catch { - // File doesn't exist (deletion) or isn't readable — skip. - continue; - } + // Skip legacy entries that have no recorded post-write hash — + // we can't tell stale from fresh, so leave them alone. + if (!attr.contentHash) continue; + const current = getContent(key); + if (current === null) continue; // not a divergence signal if (computeContentHash(current) !== attr.contentHash) { + debugLogger.debug( + `validateAgainst: dropping stale attribution for ${key} (hash diverged)`, + ); this.fileAttributions.delete(key); } } } + /** + * Convenience wrapper around {@link validateAgainst} that reads + * the live working-tree file. Used for code paths where we can't + * read the committed blob (no commit happened, no ref available). + */ + validateAgainstWorkingTree(): void { + this.validateAgainst((p) => { + try { + return fs.readFileSync(p, 'utf-8'); + } catch { + return null; + } + }); + } + // ----------------------------------------------------------------------- // Prompt counting // ----------------------------------------------------------------------- diff --git a/packages/core/src/tools/shell.ts b/packages/core/src/tools/shell.ts index 671559ab6..3b776df2e 100644 --- a/packages/core/src/tools/shell.ts +++ b/packages/core/src/tools/shell.ts @@ -156,12 +156,16 @@ function tokeniseSegment(segment: string): string[] | null { while (i < tokens.length && tokens[i]!.startsWith('-')) { const flag = tokens[i]!; i++; - // `env -C DIR` / `env --chdir DIR` relocates the working - // directory before exec. Treat the segment as repo-shifting - // (same contract as a leading `GIT_DIR=...` assignment) so - // we don't stamp our trailer onto a commit that landed in a - // different repository. - if (wrapper === 'env' && ENV_FLAGS_SHIFT_CWD.has(flag)) { + // `env -C DIR` / `env --chdir DIR` (GNU coreutils 8.30+) and + // `sudo -D DIR` / `sudo --chdir DIR` (Linux sudo with --chdir) + // both relocate the working directory before exec. Treat the + // segment as repo-shifting (same contract as a leading + // `GIT_DIR=...` assignment) so we don't stamp our trailer onto + // a commit that landed in a different repository. + if ( + (wrapper === 'env' && ENV_FLAGS_SHIFT_CWD.has(flag)) || + (wrapper === 'sudo' && SUDO_FLAGS_SHIFT_CWD.has(flag)) + ) { return null; } // Value-taking flag tables, per wrapper: `sudo -u user`, @@ -225,6 +229,12 @@ const ENV_FLAGS_WITH_VALUE = new Set(['-u', '--unset', '-S', '--split-string']); // and the per-file note. const ENV_FLAGS_SHIFT_CWD = new Set(['-C', '--chdir']); +// `sudo`'s flags that relocate the working directory before exec. +// Linux sudo's `-D DIR` / `--chdir DIR` (1.9.2+) makes the inner +// command run in DIR, which means a `git commit` underneath it +// targets DIR's repo, not ours. Refuse the segment. +const SUDO_FLAGS_SHIFT_CWD = new Set(['-D', '--chdir']); + /** * Environment variables that redirect git's repository selection. A * leading `GIT_DIR=...`, `GIT_WORK_TREE=...`, etc. on a command makes @@ -239,12 +249,16 @@ const ENV_FLAGS_SHIFT_CWD = new Set(['-C', '--chdir']); * but don't move it to another repo, so attribution is still * meaningful. */ +// `GIT_NAMESPACE` is intentionally NOT here: it prefixes ref names +// within the same repository, but the working tree and object store +// are unchanged, so a `git commit` under it still lands in our cwd's +// repo. The set covers ONLY variables that change which on-disk +// repository git acts on. const GIT_ENV_SHIFTS_REPO = new Set([ 'GIT_DIR', 'GIT_WORK_TREE', 'GIT_COMMON_DIR', 'GIT_INDEX_FILE', - 'GIT_NAMESPACE', ]); /** @@ -360,10 +374,10 @@ function gitCommitContext(command: string): { const program = tokens[0]!; - if (program === 'cd') { - // A cd before any commit might redirect a later `git commit` into - // a different repo. A cd AFTER the commit doesn't matter for the - // commit we already saw. + if (program === 'cd' || program === 'pushd') { + // A cd / pushd before any commit might redirect a later + // `git commit` into a different repo. A cd AFTER the commit + // doesn't matter for the commit we already saw. // // A heuristic relaxation: relative cd targets that don't escape // upward (no `..`, no absolute path, no env-var/$home expansion) @@ -378,6 +392,14 @@ function gitCommitContext(command: string): { if (!hasCommit && cdTargetMayChangeRepo(tokens)) cwdShifted = true; continue; } + if (program === 'popd') { + // `popd` returns to a previous directory in the bash dir-stack. + // Without tracking the stack we can't know whether the resulting + // cwd is the same repo or a different one — treat conservatively + // as a shift before any commit. + if (!hasCommit) cwdShifted = true; + continue; + } if (program === 'git') { const { subcommand, changesCwd } = parseGitInvocation(tokens); @@ -487,10 +509,14 @@ function isAmendCommit(command: string): boolean { const tokens = tokeniseSegment(sub); if (!tokens || tokens.length === 0) continue; const program = tokens[0]!; - if (program === 'cd') { + if (program === 'cd' || program === 'pushd') { if (!cwdShifted && cdTargetMayChangeRepo(tokens)) cwdShifted = true; continue; } + if (program === 'popd') { + cwdShifted = true; + continue; + } if (program !== 'git') continue; const { subcommand, changesCwd } = parseGitInvocation(tokens); if (subcommand === 'commit' && !cwdShifted && !changesCwd) { @@ -540,13 +566,17 @@ function findAttributableCommitSegment( const tokens = tokeniseSegment(sub); if (!tokens || tokens.length === 0) continue; const program = tokens[0]!; - if (program === 'cd') { - // Mirror gitCommitContext's cd heuristic: relative paths that - // don't escape upward are treated as in-repo, so + if (program === 'cd' || program === 'pushd') { + // Mirror gitCommitContext's cd/pushd heuristic: relative paths + // that don't escape upward are treated as in-repo, so // `cd subdir && git commit ...` still finds the segment. if (!cwdShifted && cdTargetMayChangeRepo(tokens)) cwdShifted = true; continue; } + if (program === 'popd') { + cwdShifted = true; + continue; + } if (program === 'git') { const { subcommand, changesCwd } = parseGitInvocation(tokens); if (subcommand === 'commit' && !cwdShifted && !changesCwd) { @@ -2060,15 +2090,42 @@ export class ShellToolInvocation extends BaseToolInvocation< canonicalBase = baseDir; } - // Drop tracked entries whose on-disk content has diverged from - // what AI's last write recorded — catches the case where the - // user paste-replaced via an external editor, ran `git checkout`, - // or otherwise modified the file outside the Edit/Write tools. - // Without this, the AI's stale aiContribution would attach to - // the human-only diff at commit time and credit AI for the human - // work. Run this BEFORE matchCommittedFiles so the dropped - // entries are also out of the per-file payload. - attributionService.validateOnDiskHashes(); + // Drop tracked entries whose COMMITTED content has diverged + // from what AI's last write recorded — catches the case where + // the user paste-replaced via an external editor, ran + // `git checkout`, or otherwise modified the file outside the + // Edit/Write tools. Validate against the COMMITTED blob (via + // `git show HEAD:`) rather than the live working tree: + // the user can `git add` AI's content, then make additional + // unstaged edits, then `git commit` — the commit's blob still + // matches AI's recorded hash, but the working-tree file does + // not. A working-tree comparison would drop the entry on a + // commit that legitimately came from AI. Run BEFORE + // matchCommittedFiles so dropped entries are also out of the + // per-file payload. + attributionService.validateAgainst((absPath) => { + const rel = path + .relative(canonicalBase, absPath) + .split(path.sep) + .join('/'); + if (!rel || rel.startsWith('..')) return null; + try { + return childProcess + .execFileSync('git', ['show', `HEAD:${rel}`], { + cwd, + timeout: 2000, + stdio: ['ignore', 'pipe', 'ignore'], + maxBuffer: 16 * 1024 * 1024, + }) + .toString('utf-8'); + } catch { + // No committed content (deleted file, file not in commit, + // or git error) — leave the entry alone. Deletion handling + // is its own thread; what matters here is that we don't + // FALSE-positive on a missing-from-commit signal. + return null; + } + }); committedAbsolutePaths = attributionService.matchCommittedFiles( stagedInfo.files,