fix(attribution): committed-blob validation, deleted-leaf canonicalisation, sudo/env shifts, dir-stack

gpt-5.5 review (issue 4389405179):

1. realpathOrSelf falls back to the non-canonical input when the
   leaf doesn't exist (deleted file). recordEdit stored the entry
   under the canonical path; lookup post-deletion misses on macOS
   where /var ↔ /private/var. Canonicalise the parent and rejoin
   the basename for missing leaves so deleted-file getFileAttribution
   still resolves the canonical key. Test updated to assert the
   lookup-after-unlink path explicitly.

2. validateOnDiskHashes read the LIVE working-tree, so a user who
   `git add`'d AI's content and then made additional unstaged edits
   would have the entry dropped on a commit whose blob still matched
   AI's hash. Replace with `validateAgainst(getContent)` that takes
   a caller-supplied reader; attachCommitAttribution now passes a
   reader that fetches the COMMITTED blob via `git show HEAD:<rel>`.
   Working-tree validation kept as `validateAgainstWorkingTree` for
   code paths without a committed ref. Returns null = no comparison
   signal (entry preserved). Tests cover all three readers
   (committed-blob via stub, working-tree, null-passthrough).

deepseek-v4-pro review #1: sanitiseAttribution defaults missing
contentHash to '' on legacy-snapshot restore. recordEdit's
divergence check would then trip on every subsequent edit and
silently reset all the AI work. Skip the divergence check when
existing.contentHash is empty — we have no baseline to compare
against, so don't drop. Test added covering legacy-snapshot
preservation through validateAgainst.

deepseek #4: validateAgainst now logs every entry drop via
debugLogger.debug so a 3am operator can see WHICH entry got
dropped and tied to which canonical key.

deepseek #8: GIT_NAMESPACE removed from GIT_ENV_SHIFTS_REPO. It
prefixes ref names within the same repo but doesn't redirect git
to a different on-disk repository, so a commit underneath it still
lands in our cwd's repo. Doc comment explains the distinction.

deepseek #9: pushd/popd treated as cwd-shifting alongside cd in
gitCommitContext / isAmendCommit / findAttributableCommitSegment.
pushd reuses cdTargetMayChangeRepo (relative-no-escape stays
in-repo); popd unconditionally flips cwdShifted because we don't
track the bash dir-stack.

deepseek #10: sudo's value-taking flag table now has a parallel
SUDO_FLAGS_SHIFT_CWD set covering -D / --chdir (Linux sudo 1.9.2+).
Any segment whose sudo wrapper sees one of those flags returns null
from tokeniseSegment — same contract as env -C / --chdir and
GIT_DIR=...

328 tests pass; typecheck clean both packages.
This commit is contained in:
wenshao 2026-05-07 00:03:00 +08:00
parent 6ae266f52f
commit b3a06a7c46
3 changed files with 218 additions and 67 deletions

View file

@ -148,55 +148,101 @@ describe('CommitAttributionService', () => {
expect(after2.aiContribution).toBeGreaterThan(after1.aiContribution);
});
// validateOnDiskHashes runs at commit time and drops entries whose
// current on-disk hash doesn't match what AI recorded — catches
// user edits that happened entirely outside the Edit/Write tools
// (no recordEdit was called, so the input-hash check above couldn't
// see the divergence).
describe('validateOnDiskHashes', () => {
// validateAgainst runs at commit time and drops entries whose
// recorded post-write hash doesn't match the caller-supplied
// content — catches user edits that happened entirely outside the
// Edit/Write tools (no recordEdit was called, so the input-hash
// check above couldn't see the divergence).
describe('validateAgainst', () => {
let tmpDir: string;
beforeEach(() => {
// Real fs (not mocked) — we want validateOnDiskHashes to
// actually read the disk we wrote to.
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attr-validate-'));
});
afterEach(() => {
fs.rmSync(tmpDir, { recursive: true, force: true });
});
it('drops entries whose on-disk content has diverged', () => {
it('drops entries whose content has diverged', () => {
const service = CommitAttributionService.getInstance();
const filePath = path.join(tmpDir, 'diverged.ts');
fs.writeFileSync(filePath, 'AI wrote this', 'utf-8');
service.recordEdit(filePath, null, 'AI wrote this');
expect(service.getFileAttribution(filePath)).toBeDefined();
// User overwrites the file outside the Edit/Write tools.
fs.writeFileSync(filePath, 'human replaced this', 'utf-8');
service.validateOnDiskHashes();
// Caller passes a reader that returns the diverged content.
service.validateAgainst(() => 'human replaced this');
expect(service.getFileAttribution(filePath)).toBeUndefined();
});
it('keeps entries whose on-disk content matches', () => {
it('keeps entries whose content matches', () => {
const service = CommitAttributionService.getInstance();
const filePath = path.join(tmpDir, 'unchanged.ts');
fs.writeFileSync(filePath, 'AI wrote this', 'utf-8');
service.recordEdit(filePath, null, 'AI wrote this');
service.validateOnDiskHashes();
service.validateAgainst(() => 'AI wrote this');
expect(service.getFileAttribution(filePath)).toBeDefined();
});
it('keeps entries for files that no longer exist on disk', () => {
it('keeps entries when getContent returns null (no comparison signal)', () => {
const service = CommitAttributionService.getInstance();
const filePath = path.join(tmpDir, 'no-comparison.ts');
fs.writeFileSync(filePath, 'will be queried', 'utf-8');
service.recordEdit(filePath, null, 'will be queried');
// null = "no committed blob / unreadable / out-of-scope" — the
// entry should NOT be dropped.
service.validateAgainst(() => null);
expect(service.getFileAttribution(filePath)).toBeDefined();
});
// Legacy snapshot from before contentHash existed: the entry has
// an empty contentHash. We can't tell stale from fresh, so leave
// it alone (don't reset).
it('skips entries with empty contentHash (legacy snapshot)', () => {
const service = CommitAttributionService.getInstance();
service.restoreFromSnapshot({
type: 'attribution-snapshot',
surface: 'cli',
fileStates: {
'/legacy.ts': {
aiContribution: 50,
aiCreated: false,
contentHash: '',
},
},
promptCount: 0,
promptCountAtLastCommit: 0,
});
// Even if the reader claims a different hash, an empty recorded
// hash means we have no baseline — keep the entry.
service.validateAgainst(() => 'totally different');
expect(service.getFileAttribution('/legacy.ts')).toBeDefined();
});
// Working-tree variant for code paths without committed blobs.
it('validateAgainstWorkingTree drops entries whose on-disk content diverged', () => {
const service = CommitAttributionService.getInstance();
const filePath = path.join(tmpDir, 'wt-diverged.ts');
fs.writeFileSync(filePath, 'AI wrote this', 'utf-8');
service.recordEdit(filePath, null, 'AI wrote this');
fs.writeFileSync(filePath, 'human replaced', 'utf-8');
service.validateAgainstWorkingTree();
expect(service.getFileAttribution(filePath)).toBeUndefined();
});
// Deleted-file lookup must remain stable: recordEdit canonicalises
// the path via realpathSync; getFileAttribution must still resolve
// the same canonical key after the leaf is unlinked. realpathOrSelf
// canonicalises the parent and rejoins the basename for missing
// leaves so macOS /var ↔ /private/var doesn't break the lookup
// post-deletion.
it('keeps deleted-file entries reachable via the original path', () => {
const service = CommitAttributionService.getInstance();
const filePath = path.join(tmpDir, 'deleted.ts');
fs.writeFileSync(filePath, 'will be deleted', 'utf-8');
service.recordEdit(filePath, null, 'will be deleted');
fs.unlinkSync(filePath);
// Deleted file: leave the attribution alone — the commit's
// deletion record is what the note will reflect, and a missing
// file isn't a divergence signal.
service.validateOnDiskHashes();
// Lookup must still find the entry by the original path even
// though realpath of the leaf now throws.
expect(service.getFileAttribution(filePath)).toBeDefined();
});
});

View file

@ -23,8 +23,11 @@
import { createHash } from 'node:crypto';
import * as fs from 'node:fs';
import * as path from 'node:path';
import { createDebugLogger } from '../utils/debugLogger.js';
import { isGeneratedFile } from './generatedFiles.js';
const debugLogger = createDebugLogger('COMMIT_ATTRIBUTION');
function computeContentHash(content: string): string {
return createHash('sha256').update(content).digest('hex');
}
@ -35,14 +38,26 @@ function computeContentHash(content: string): string {
* `fs.realpathSync` (what edit.ts/write-file.ts records) and
* `path.relative` against `git rev-parse --show-toplevel` (which may
* report either form) won't line up unless we normalise both sides.
* Falls back to the input on any fs error so a missing path can't
* make the lookup fail outright.
*
* For DELETED leaves (file no longer exists on disk), realpathSync
* throws but the parent directory is still resolvable. Canonicalise
* the parent and rejoin the missing basename so a deleted file's
* lookup still hits the canonical key recordEdit stored before the
* file was removed. Without this, a `getFileAttribution(deletedPath)`
* call after the file was deleted would fall back to the
* non-canonical input and miss the canonical entry on macOS.
*/
function realpathOrSelf(p: string): string {
try {
return fs.realpathSync(p);
} catch {
return p;
try {
const parent = path.dirname(p);
const realParent = fs.realpathSync(parent);
return path.join(realParent, path.basename(p));
} catch {
return p;
}
}
}
@ -271,7 +286,13 @@ export class CommitAttributionService {
// `oldContent` we're being told about doesn't match the hash we
// recorded after AI's last write, the file diverged out-of-band.
// Drop the accumulated counters before applying the new edit.
if (existing && oldContent !== null) {
//
// Skip the check when `existing.contentHash` is empty: that's a
// legacy snapshot (pre-divergence-detection schema) where we
// never recorded the post-write hash. Comparing an empty hash to
// the actual file hash would always trip the reset and silently
// wipe AI work that's still on disk.
if (existing && existing.contentHash && oldContent !== null) {
const oldHash = computeContentHash(oldContent);
if (existing.contentHash !== oldHash) {
aiContribution = 0;
@ -291,34 +312,61 @@ export class CommitAttributionService {
}
/**
* Re-hash each tracked file's CURRENT on-disk content and drop
* entries whose hash doesn't match what AI's last write recorded.
* Catches the cases recordEdit's input-hash check can't see i.e.
* the user (or another tool) modified the file entirely outside
* the Edit/Write tools, then committed it. Without this, the AI's
* stale aiContribution would attach to the human-only diff at
* commit time and credit AI for human work.
* Re-hash each tracked file's content via a caller-supplied reader
* and drop entries whose hash doesn't match what AI's last write
* recorded. Catches the cases recordEdit's input-hash check can't
* see i.e. the user (or another tool) modified the file entirely
* outside the Edit/Write tools, then committed it. Without this,
* the AI's stale aiContribution would attach to the human-only
* diff at commit time and credit AI for human work.
*
* Called immediately before generateNotePayload in the
* commit-attribution path. Files that no longer exist on disk
* (deletion) are left alone the commit's deletion record is
* what the note should reflect, and reading them would just throw.
* `getContent(absPath)` returns the bytes the caller wants to
* compare against, or `null` if the entry shouldn't be checked
* (deletion, unreadable, no committed copy). Returning `null`
* leaves the entry alone rather than dropping it.
*
* Two production callers:
* 1. `attachCommitAttribution` after a commit should pass a
* reader that fetches the COMMITTED blob (`git show HEAD:<rel>`)
* so unstaged working-tree changes the user made AFTER `git add`
* don't trip the divergence check on a commit whose blob still
* matches AI's recorded hash.
* 2. The legacy live-disk reader (`fs.readFileSync`) is exposed
* via `validateAgainstWorkingTree` for the no-committed-blob
* cases (e.g. amend-without-reflog where we can't pin a
* ref). Less precise but better than nothing.
*/
validateOnDiskHashes(): void {
validateAgainst(getContent: (absPath: string) => string | null): void {
for (const [key, attr] of this.fileAttributions) {
let current: string;
try {
current = fs.readFileSync(key, 'utf-8');
} catch {
// File doesn't exist (deletion) or isn't readable — skip.
continue;
}
// Skip legacy entries that have no recorded post-write hash —
// we can't tell stale from fresh, so leave them alone.
if (!attr.contentHash) continue;
const current = getContent(key);
if (current === null) continue; // not a divergence signal
if (computeContentHash(current) !== attr.contentHash) {
debugLogger.debug(
`validateAgainst: dropping stale attribution for ${key} (hash diverged)`,
);
this.fileAttributions.delete(key);
}
}
}
/**
* Convenience wrapper around {@link validateAgainst} that reads
* the live working-tree file. Used for code paths where we can't
* read the committed blob (no commit happened, no ref available).
*/
validateAgainstWorkingTree(): void {
this.validateAgainst((p) => {
try {
return fs.readFileSync(p, 'utf-8');
} catch {
return null;
}
});
}
// -----------------------------------------------------------------------
// Prompt counting
// -----------------------------------------------------------------------

View file

@ -156,12 +156,16 @@ function tokeniseSegment(segment: string): string[] | null {
while (i < tokens.length && tokens[i]!.startsWith('-')) {
const flag = tokens[i]!;
i++;
// `env -C DIR` / `env --chdir DIR` relocates the working
// directory before exec. Treat the segment as repo-shifting
// (same contract as a leading `GIT_DIR=...` assignment) so
// we don't stamp our trailer onto a commit that landed in a
// different repository.
if (wrapper === 'env' && ENV_FLAGS_SHIFT_CWD.has(flag)) {
// `env -C DIR` / `env --chdir DIR` (GNU coreutils 8.30+) and
// `sudo -D DIR` / `sudo --chdir DIR` (Linux sudo with --chdir)
// both relocate the working directory before exec. Treat the
// segment as repo-shifting (same contract as a leading
// `GIT_DIR=...` assignment) so we don't stamp our trailer onto
// a commit that landed in a different repository.
if (
(wrapper === 'env' && ENV_FLAGS_SHIFT_CWD.has(flag)) ||
(wrapper === 'sudo' && SUDO_FLAGS_SHIFT_CWD.has(flag))
) {
return null;
}
// Value-taking flag tables, per wrapper: `sudo -u user`,
@ -225,6 +229,12 @@ const ENV_FLAGS_WITH_VALUE = new Set(['-u', '--unset', '-S', '--split-string']);
// and the per-file note.
const ENV_FLAGS_SHIFT_CWD = new Set(['-C', '--chdir']);
// `sudo`'s flags that relocate the working directory before exec.
// Linux sudo's `-D DIR` / `--chdir DIR` (1.9.2+) makes the inner
// command run in DIR, which means a `git commit` underneath it
// targets DIR's repo, not ours. Refuse the segment.
const SUDO_FLAGS_SHIFT_CWD = new Set(['-D', '--chdir']);
/**
* Environment variables that redirect git's repository selection. A
* leading `GIT_DIR=...`, `GIT_WORK_TREE=...`, etc. on a command makes
@ -239,12 +249,16 @@ const ENV_FLAGS_SHIFT_CWD = new Set(['-C', '--chdir']);
* but don't move it to another repo, so attribution is still
* meaningful.
*/
// `GIT_NAMESPACE` is intentionally NOT here: it prefixes ref names
// within the same repository, but the working tree and object store
// are unchanged, so a `git commit` under it still lands in our cwd's
// repo. The set covers ONLY variables that change which on-disk
// repository git acts on.
const GIT_ENV_SHIFTS_REPO = new Set([
'GIT_DIR',
'GIT_WORK_TREE',
'GIT_COMMON_DIR',
'GIT_INDEX_FILE',
'GIT_NAMESPACE',
]);
/**
@ -360,10 +374,10 @@ function gitCommitContext(command: string): {
const program = tokens[0]!;
if (program === 'cd') {
// A cd before any commit might redirect a later `git commit` into
// a different repo. A cd AFTER the commit doesn't matter for the
// commit we already saw.
if (program === 'cd' || program === 'pushd') {
// A cd / pushd before any commit might redirect a later
// `git commit` into a different repo. A cd AFTER the commit
// doesn't matter for the commit we already saw.
//
// A heuristic relaxation: relative cd targets that don't escape
// upward (no `..`, no absolute path, no env-var/$home expansion)
@ -378,6 +392,14 @@ function gitCommitContext(command: string): {
if (!hasCommit && cdTargetMayChangeRepo(tokens)) cwdShifted = true;
continue;
}
if (program === 'popd') {
// `popd` returns to a previous directory in the bash dir-stack.
// Without tracking the stack we can't know whether the resulting
// cwd is the same repo or a different one — treat conservatively
// as a shift before any commit.
if (!hasCommit) cwdShifted = true;
continue;
}
if (program === 'git') {
const { subcommand, changesCwd } = parseGitInvocation(tokens);
@ -487,10 +509,14 @@ function isAmendCommit(command: string): boolean {
const tokens = tokeniseSegment(sub);
if (!tokens || tokens.length === 0) continue;
const program = tokens[0]!;
if (program === 'cd') {
if (program === 'cd' || program === 'pushd') {
if (!cwdShifted && cdTargetMayChangeRepo(tokens)) cwdShifted = true;
continue;
}
if (program === 'popd') {
cwdShifted = true;
continue;
}
if (program !== 'git') continue;
const { subcommand, changesCwd } = parseGitInvocation(tokens);
if (subcommand === 'commit' && !cwdShifted && !changesCwd) {
@ -540,13 +566,17 @@ function findAttributableCommitSegment(
const tokens = tokeniseSegment(sub);
if (!tokens || tokens.length === 0) continue;
const program = tokens[0]!;
if (program === 'cd') {
// Mirror gitCommitContext's cd heuristic: relative paths that
// don't escape upward are treated as in-repo, so
if (program === 'cd' || program === 'pushd') {
// Mirror gitCommitContext's cd/pushd heuristic: relative paths
// that don't escape upward are treated as in-repo, so
// `cd subdir && git commit ...` still finds the segment.
if (!cwdShifted && cdTargetMayChangeRepo(tokens)) cwdShifted = true;
continue;
}
if (program === 'popd') {
cwdShifted = true;
continue;
}
if (program === 'git') {
const { subcommand, changesCwd } = parseGitInvocation(tokens);
if (subcommand === 'commit' && !cwdShifted && !changesCwd) {
@ -2060,15 +2090,42 @@ export class ShellToolInvocation extends BaseToolInvocation<
canonicalBase = baseDir;
}
// Drop tracked entries whose on-disk content has diverged from
// what AI's last write recorded — catches the case where the
// user paste-replaced via an external editor, ran `git checkout`,
// or otherwise modified the file outside the Edit/Write tools.
// Without this, the AI's stale aiContribution would attach to
// the human-only diff at commit time and credit AI for the human
// work. Run this BEFORE matchCommittedFiles so the dropped
// entries are also out of the per-file payload.
attributionService.validateOnDiskHashes();
// Drop tracked entries whose COMMITTED content has diverged
// from what AI's last write recorded — catches the case where
// the user paste-replaced via an external editor, ran
// `git checkout`, or otherwise modified the file outside the
// Edit/Write tools. Validate against the COMMITTED blob (via
// `git show HEAD:<rel>`) rather than the live working tree:
// the user can `git add` AI's content, then make additional
// unstaged edits, then `git commit` — the commit's blob still
// matches AI's recorded hash, but the working-tree file does
// not. A working-tree comparison would drop the entry on a
// commit that legitimately came from AI. Run BEFORE
// matchCommittedFiles so dropped entries are also out of the
// per-file payload.
attributionService.validateAgainst((absPath) => {
const rel = path
.relative(canonicalBase, absPath)
.split(path.sep)
.join('/');
if (!rel || rel.startsWith('..')) return null;
try {
return childProcess
.execFileSync('git', ['show', `HEAD:${rel}`], {
cwd,
timeout: 2000,
stdio: ['ignore', 'pipe', 'ignore'],
maxBuffer: 16 * 1024 * 1024,
})
.toString('utf-8');
} catch {
// No committed content (deleted file, file not in commit,
// or git error) — leave the entry alone. Deletion handling
// is its own thread; what matters here is that we don't
// FALSE-positive on a missing-from-commit signal.
return null;
}
});
committedAbsolutePaths = attributionService.matchCommittedFiles(
stagedInfo.files,