qwen-code/packages/core/src/services/commitAttribution.ts
wenshao b3a06a7c46 fix(attribution): committed-blob validation, deleted-leaf canonicalisation, sudo/env shifts, dir-stack
gpt-5.5 review (issue 4389405179):

1. realpathOrSelf falls back to the non-canonical input when the
   leaf doesn't exist (deleted file). recordEdit stored the entry
   under the canonical path; lookup post-deletion misses on macOS
   where /var ↔ /private/var. Canonicalise the parent and rejoin
   the basename for missing leaves so deleted-file getFileAttribution
   still resolves the canonical key. Test updated to assert the
   lookup-after-unlink path explicitly.

2. validateOnDiskHashes read the LIVE working-tree, so a user who
   `git add`'d AI's content and then made additional unstaged edits
   would have the entry dropped on a commit whose blob still matched
   AI's hash. Replace with `validateAgainst(getContent)` that takes
   a caller-supplied reader; attachCommitAttribution now passes a
   reader that fetches the COMMITTED blob via `git show HEAD:<rel>`.
   Working-tree validation kept as `validateAgainstWorkingTree` for
   code paths without a committed ref. Returns null = no comparison
   signal (entry preserved). Tests cover all three readers
   (committed-blob via stub, working-tree, null-passthrough).

deepseek-v4-pro review #1: sanitiseAttribution defaults missing
contentHash to '' on legacy-snapshot restore. recordEdit's
divergence check would then trip on every subsequent edit and
silently reset all the AI work. Skip the divergence check when
existing.contentHash is empty — we have no baseline to compare
against, so don't drop. Test added covering legacy-snapshot
preservation through validateAgainst.

deepseek #4: validateAgainst now logs every entry drop via
debugLogger.debug so a 3am operator can see WHICH entry got
dropped and tied to which canonical key.

deepseek #8: GIT_NAMESPACE removed from GIT_ENV_SHIFTS_REPO. It
prefixes ref names within the same repo but doesn't redirect git
to a different on-disk repository, so a commit underneath it still
lands in our cwd's repo. Doc comment explains the distinction.

deepseek #9: pushd/popd treated as cwd-shifting alongside cd in
gitCommitContext / isAmendCommit / findAttributableCommitSegment.
pushd reuses cdTargetMayChangeRepo (relative-no-escape stays
in-repo); popd unconditionally flips cwdShifted because we don't
track the bash dir-stack.

deepseek #10: sudo's value-taking flag table now has a parallel
SUDO_FLAGS_SHIFT_CWD set covering -D / --chdir (Linux sudo 1.9.2+).
Any segment whose sudo wrapper sees one of those flags returns null
from tokeniseSegment — same contract as env -C / --chdir and
GIT_DIR=...

328 tests pass; typecheck is clean in both packages.
2026-05-07 00:03:00 +08:00

786 lines
29 KiB
TypeScript

/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* Commit Attribution Service
*
* Tracks character-level contribution ratios between AI and humans per file.
* When a git commit is made, this data is combined with git diff analysis to
* calculate real AI vs human contribution percentages, stored as git notes.
*
* Features:
* - Character-level prefix/suffix diff algorithm
* - Real AI/human contribution ratio via git diff
* - Surface tracking (cli/ide/api/sdk)
* - Prompt counting (since-last-commit window)
* - Snapshot/restore for session persistence
* - Generated file exclusion
*/
import { createHash } from 'node:crypto';
import * as fs from 'node:fs';
import * as path from 'node:path';
import { createDebugLogger } from '../utils/debugLogger.js';
import { isGeneratedFile } from './generatedFiles.js';
const debugLogger = createDebugLogger('COMMIT_ATTRIBUTION');
/** SHA-256 hex digest of a file's full text content. */
function computeContentHash(content: string): string {
  const hasher = createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}
/**
 * Canonicalise a path by resolving symlinks, tolerating a missing leaf.
 *
 * On macOS `/var` is a symlink to `/private/var`, so paths captured via
 * `fs.realpathSync` and paths derived from `git rev-parse --show-toplevel`
 * may disagree unless both sides are normalised the same way.
 *
 * When the leaf itself no longer exists (deleted file) `realpathSync`
 * throws, but the parent directory usually still resolves: canonicalise
 * the parent and rejoin the basename so a lookup for a just-deleted file
 * still lands on the canonical key `recordEdit` stored while the file
 * existed. Only when the parent is unresolvable too do we return the
 * input unchanged.
 */
function realpathOrSelf(p: string): string {
  try {
    return fs.realpathSync(p);
  } catch {
    // Leaf is gone — canonicalise the parent directory instead and
    // rejoin the basename.
    try {
      const canonicalParent = fs.realpathSync(path.dirname(p));
      return path.join(canonicalParent, path.basename(p));
    } catch {
      // Parent unresolvable as well; fall back to the raw input.
      return p;
    }
  }
}
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
export interface FileAttribution {
  /** Total characters contributed by AI (accumulated across edits) */
  aiContribution: number;
  /** Whether the file was created by AI */
  aiCreated: boolean;
  /**
   * SHA-256 of the file content immediately after AI's last write. Used
   * to detect out-of-band mutation (paste-replace via external editor,
   * `rm` + recreate, manual save) so AI's accumulated counter doesn't
   * silently get credited to subsequent human edits. recordEdit checks
   * this on every call (resets when the input `oldContent` doesn't
   * match), and `validateAgainst` / `validateAgainstWorkingTree`
   * re-verify before a commit note is generated to catch user edits
   * that happened entirely outside the Edit/Write tools.
   */
  contentHash: string;
}
/** Per-file attribution detail in the git notes payload. */
export interface FileAttributionDetail {
  /** Characters attributed to AI for this file in this commit. */
  aiChars: number;
  /** Characters attributed to the human for this file in this commit. */
  humanChars: number;
  /** Rounded AI share of (aiChars + humanChars), 0–100. */
  percent: number;
  /** Client surface that authored the AI contribution (cli/ide/api/sdk). */
  surface?: string;
}
/** Full attribution payload stored as git notes JSON. */
export interface CommitAttributionNote {
  /** Payload schema version (currently fixed at 1). */
  version: 1;
  /** Sanitised model/generator name (see sanitizeModelName). */
  generator: string;
  /** Per-file detail, keyed by repo-relative path. */
  files: Record<string, FileAttributionDetail>;
  /** Commit-wide rollup across all attributed files. */
  summary: {
    aiPercent: number;
    aiChars: number;
    humanChars: number;
    totalFilesTouched: number;
    surfaces: string[];
  };
  /** Per-surface AI character counts and shares. */
  surfaceBreakdown: Record<string, { aiChars: number; percent: number }>;
  /**
   * Sample of generated/vendored files that were excluded from
   * attribution. Capped at `MAX_EXCLUDED_GENERATED_SAMPLE` paths so a
   * commit churning thousands of `dist/` artifacts can't blow past the
   * 30 KB note budget and silently drop attribution for the real
   * source files in the same commit. Use `excludedGeneratedCount` for
   * the true total.
   */
  excludedGenerated: string[];
  /** Total count of excluded files (≥ excludedGenerated.length). */
  excludedGeneratedCount: number;
  /** Prompts issued since the previous commit (the "N-shotted" window). */
  promptCount: number;
}
/**
 * Upper bound on the number of excluded-generated paths we serialize
 * into the git note. Keeps the JSON payload bounded for commits with
 * lots of generated artifacts.
 */
export const MAX_EXCLUDED_GENERATED_SAMPLE = 50;
/** Result of running git commands to get staged file info. */
export interface StagedFileInfo {
  /** Repo-relative paths of the files in the commit/stage. */
  files: string[];
  /**
   * Per-file change magnitude keyed by repo-relative path — derived
   * from `git diff --numstat` per generateNotePayload's comments.
   */
  diffSizes: Map<string, number>;
  /** Repo-relative paths that this commit deletes. */
  deletedFiles: Set<string>;
  /**
   * Absolute path of the repository root (`git rev-parse --show-toplevel`).
   * Optional for backward compatibility with synthetic test inputs;
   * production callers should set it so file paths in `files` (which are
   * relative to the repo root) align with absolute paths tracked by the
   * attribution service. When absent, callers may fall back to the
   * configured target directory at the cost of zeroed-out attribution
   * for files outside that directory.
   */
  repoRoot?: string;
}
/**
 * On-disk schema version for AttributionSnapshot. Bump when the shape
 * changes incompatibly so restoreFromSnapshot can refuse / migrate
 * stale payloads instead of silently producing NaN counters or
 * mismatched key shapes.
 */
export const ATTRIBUTION_SNAPSHOT_VERSION = 1;
/** Serializable snapshot for session persistence. */
export interface AttributionSnapshot {
  /** Discriminator checked by restoreFromSnapshot before anything else. */
  type: 'attribution-snapshot';
  /** Schema version; absent on pre-versioning snapshots, treated as 1. */
  version?: number;
  /** Client surface label active when the snapshot was taken. */
  surface: string;
  /** Per-file attribution entries, keyed by absolute path. */
  fileStates: Record<string, FileAttribution>;
  /** Total prompts at snapshot time. */
  promptCount: number;
  /** Prompt counter value when the last commit landed. */
  promptCountAtLastCommit: number;
}
// ---------------------------------------------------------------------------
// Model name sanitization
// ---------------------------------------------------------------------------
/**
 * Internal model-name patterns that must not appear verbatim in the
 * git-notes `generator` field; any match is replaced wholesale by the
 * public-facing name below.
 */
const INTERNAL_MODEL_PATTERNS = [
  /qwen[-_]?\d+(\.\d+)?[-_]?b?/i,
  /qwen[-_]?coder[-_]?\d*/i,
  /qwen[-_]?max/i,
  /qwen[-_]?plus/i,
  /qwen[-_]?turbo/i,
];
const SANITIZED_GENERATOR_NAME = 'Qwen-Coder';
/** Replace internal model identifiers with the public generator name. */
function sanitizeModelName(name: string): string {
  const isInternal = INTERNAL_MODEL_PATTERNS.some((pattern) =>
    pattern.test(name),
  );
  return isInternal ? SANITIZED_GENERATOR_NAME : name;
}
// ---------------------------------------------------------------------------
// Utilities
// ---------------------------------------------------------------------------
/**
 * Defensive coercions for restoring snapshot fields. A snapshot can
 * arrive with `undefined` / wrong-type fields if the on-disk JSON was
 * partially written or pre-dates the current schema; without coercion
 * they would flow through `Math.min(undefined, n) === NaN` into the
 * git-notes payload.
 */
function sanitiseCount(v: unknown): number {
  if (typeof v !== 'number' || !Number.isFinite(v) || v < 0) {
    return 0;
  }
  return v;
}
/** Coerce an untrusted snapshot entry into a well-typed FileAttribution. */
function sanitiseAttribution(v: unknown): FileAttribution {
  const partial = (v ?? {}) as Partial<FileAttribution>;
  const aiCreated = partial.aiCreated === true;
  const contentHash =
    typeof partial.contentHash === 'string' ? partial.contentHash : '';
  return {
    aiContribution: sanitiseCount(partial.aiContribution),
    aiCreated,
    contentHash,
  };
}
/**
 * Surface label embedded in the git-notes payload. Defaults to `'cli'`
 * for the qwen-code CLI; embedders (IDE extensions, SDK consumers) can
 * override by setting `QWEN_CODE_ENTRYPOINT` before construction so the
 * note records where the contribution was authored.
 */
export function getClientSurface(): string {
  const entrypoint = process.env['QWEN_CODE_ENTRYPOINT'];
  // `??` (not `||`): an explicitly-set empty string is passed through.
  return entrypoint ?? 'cli';
}
// ---------------------------------------------------------------------------
// Service
// ---------------------------------------------------------------------------
export class CommitAttributionService {
  private static instance: CommitAttributionService | null = null;
  /** Per-file AI contribution tracking (keyed by absolute path) */
  private fileAttributions: Map<string, FileAttribution> = new Map();
  /** Client surface (cli, ide, api, sdk, etc.) */
  private surface: string = getClientSurface();
  // -- Prompt counting --
  // Total prompts seen this session.
  private promptCount: number = 0;
  // Value of promptCount when the last commit landed.
  private promptCountAtLastCommit: number = 0;
  private constructor() {}
  /** Process-wide singleton accessor; lazily constructs on first call. */
  static getInstance(): CommitAttributionService {
    if (!CommitAttributionService.instance) {
      CommitAttributionService.instance = new CommitAttributionService();
    }
    return CommitAttributionService.instance;
  }
  /** Reset singleton for testing. */
  static resetInstance(): void {
    CommitAttributionService.instance = null;
  }
  // -----------------------------------------------------------------------
  // Recording
  // -----------------------------------------------------------------------
  /**
   * Record an AI edit to a file.
   * Uses prefix/suffix matching for precise character-level contribution.
   *
   * `filePath` is canonicalised via `fs.realpathSync` before being used
   * as a key, so symlinked paths (e.g. `/var/...` ↔ `/private/var/...`
   * on macOS) collapse to the same entry instead of silently producing
   * two parallel records.
   *
   * Divergence detection: if a tracked entry's recorded `contentHash`
   * doesn't match the hash of the `oldContent` we received here, the
   * file was changed out-of-band between AI's last write and this
   * call (paste-replace via external editor, `git checkout`, manual
   * save, ...). Reset `aiContribution` and `aiCreated` to 0/false
   * before applying the new edit so prior AI work that the user
   * since overwrote isn't credited to the next commit.
   */
  recordEdit(
    filePath: string,
    oldContent: string | null,
    newContent: string,
  ): void {
    const key = realpathOrSelf(filePath);
    const existing = this.fileAttributions.get(key);
    const isNewFile = oldContent === null;
    let aiContribution = existing?.aiContribution ?? 0;
    let aiCreated = existing?.aiCreated ?? false;
    // If we have a prior tracked state for this file AND the input
    // `oldContent` we're being told about doesn't match the hash we
    // recorded after AI's last write, the file diverged out-of-band.
    // Drop the accumulated counters before applying the new edit.
    //
    // Skip the check when `existing.contentHash` is empty: that's a
    // legacy snapshot (pre-divergence-detection schema) where we
    // never recorded the post-write hash. Comparing an empty hash to
    // the actual file hash would always trip the reset and silently
    // wipe AI work that's still on disk.
    if (existing && existing.contentHash && oldContent !== null) {
      const oldHash = computeContentHash(oldContent);
      if (existing.contentHash !== oldHash) {
        aiContribution = 0;
        aiCreated = false;
      }
    }
    const contribution = computeCharContribution(oldContent ?? '', newContent);
    aiContribution += contribution;
    if (isNewFile) aiCreated = true;
    this.fileAttributions.set(key, {
      aiContribution,
      aiCreated,
      contentHash: computeContentHash(newContent),
    });
  }
  /**
   * Re-hash each tracked file's content via a caller-supplied reader
   * and drop entries whose hash doesn't match what AI's last write
   * recorded. Catches the cases recordEdit's input-hash check can't
   * see — i.e. the user (or another tool) modified the file entirely
   * outside the Edit/Write tools, then committed it. Without this,
   * the AI's stale aiContribution would attach to the human-only
   * diff at commit time and credit AI for human work.
   *
   * `getContent(absPath)` returns the bytes the caller wants to
   * compare against, or `null` if the entry shouldn't be checked
   * (deletion, unreadable, no committed copy). Returning `null`
   * leaves the entry alone rather than dropping it.
   *
   * Two production callers:
   * 1. `attachCommitAttribution` after a commit — should pass a
   *    reader that fetches the COMMITTED blob (`git show HEAD:<rel>`)
   *    so unstaged working-tree changes the user made AFTER `git add`
   *    don't trip the divergence check on a commit whose blob still
   *    matches AI's recorded hash.
   * 2. The legacy live-disk reader (`fs.readFileSync`) is exposed
   *    via `validateAgainstWorkingTree` for the no-committed-blob
   *    cases (e.g. amend-without-reflog where we can't pin a
   *    ref). Less precise but better than nothing.
   */
  validateAgainst(getContent: (absPath: string) => string | null): void {
    for (const [key, attr] of this.fileAttributions) {
      // Skip legacy entries that have no recorded post-write hash —
      // we can't tell stale from fresh, so leave them alone.
      if (!attr.contentHash) continue;
      const current = getContent(key);
      if (current === null) continue; // not a divergence signal
      if (computeContentHash(current) !== attr.contentHash) {
        debugLogger.debug(
          `validateAgainst: dropping stale attribution for ${key} (hash diverged)`,
        );
        // Deleting during for..of iteration is well-defined for a JS Map.
        this.fileAttributions.delete(key);
      }
    }
  }
  /**
   * Convenience wrapper around {@link validateAgainst} that reads
   * the live working-tree file. Used for code paths where we can't
   * read the committed blob (no commit happened, no ref available).
   */
  validateAgainstWorkingTree(): void {
    this.validateAgainst((p) => {
      try {
        return fs.readFileSync(p, 'utf-8');
      } catch {
        // Unreadable / missing file: no comparison signal, keep the entry.
        return null;
      }
    });
  }
  // -----------------------------------------------------------------------
  // Prompt counting
  // -----------------------------------------------------------------------
  /** Bump the session prompt counter. */
  incrementPromptCount(): void {
    this.promptCount++;
  }
  /** Total prompts recorded this session. */
  getPromptCount(): number {
    return this.promptCount;
  }
  /** Prompts since last commit (for "N-shotted" display). */
  getPromptsSinceLastCommit(): number {
    return this.promptCount - this.promptCountAtLastCommit;
  }
  // -----------------------------------------------------------------------
  // Querying
  // -----------------------------------------------------------------------
  /** Copy of the tracked map; entries are cloned so callers can't mutate state. */
  getAttributions(): Map<string, FileAttribution> {
    const copy = new Map<string, FileAttribution>();
    for (const [k, v] of this.fileAttributions) {
      copy.set(k, { ...v });
    }
    return copy;
  }
  /** Look up one file's attribution; returns a copy, not the live entry. */
  getFileAttribution(filePath: string): FileAttribution | undefined {
    // Canonicalise so callers don't have to know about the realpath
    // normalization happening inside `recordEdit`.
    const attr = this.fileAttributions.get(realpathOrSelf(filePath));
    return attr ? { ...attr } : undefined;
  }
  /** True when at least one file currently has tracked attribution. */
  hasAttributions(): boolean {
    return this.fileAttributions.size > 0;
  }
  /** Current client-surface label (embedded in note payloads). */
  getSurface(): string {
    return this.surface;
  }
  /**
   * Clear file attribution data. Called after commit (success or failure).
   * @param commitSucceeded If true, also updates the "at last commit"
   *   counters so getPromptsSinceLastCommit() resets to 0.
   */
  clearAttributions(commitSucceeded: boolean = true): void {
    if (commitSucceeded) {
      this.promptCountAtLastCommit = this.promptCount;
    }
    this.fileAttributions.clear();
  }
  /**
   * Clear attribution data for the specific files that just landed in
   * a commit, leaving entries for files the user *didn't* include
   * (partial commits, `git add A && git commit -m "..."`) intact so
   * they're still credited on a later commit. Snapshots prompt
   * counters since a commit did succeed.
   *
   * Inputs must already be canonical absolute paths. The caller
   * should resolve repo-relative diff entries against a canonical
   * (realpath'd) repo root rather than realpathing each leaf — at
   * cleanup time the leaf for a just-deleted file no longer exists,
   * so per-leaf `fs.realpathSync` would fail and fall back to a
   * non-canonical path that misses the stored canonical key.
   */
  clearAttributedFiles(committedAbsolutePaths: Set<string>): void {
    this.promptCountAtLastCommit = this.promptCount;
    for (const p of committedAbsolutePaths) {
      this.fileAttributions.delete(p);
    }
  }
  /**
   * Snapshot the prompt counter as the new "last commit" without
   * clearing per-file attribution. Used when a commit landed but we
   * can't reliably determine which files were in it (multi-commit
   * chain we won't write a note for, attribution toggle off, diff
   * analysis failed). Wholesale-clearing in those branches would
   * silently wipe pending AI edits for *unrelated* files the user
   * didn't stage — a worse failure mode than the small risk of
   * stale per-file state for files that did just land.
   */
  noteCommitWithoutClearing(): void {
    this.promptCountAtLastCommit = this.promptCount;
  }
  /**
   * Resolve a set of repo-relative file paths to the canonical absolute
   * keys actually stored in the attribution map. Used by cleanup to
   * partial-clear only the files that just landed in a commit.
   *
   * Matching by walking `fileAttributions` (instead of resolving each
   * relative path with `path.resolve` + `fs.realpathSync`) is the only
   * approach that handles all of: deleted files (where realpathSync
   * throws), intermediate-symlink directories (where path.resolve only
   * canonicalises the base), and renamed files (where the diff-time
   * relative path differs from the recordEdit-time absolute path —
   * still no match here, that's a rename-tracking concern handled
   * separately). Each tracked key is canonical (recordEdit ran it
   * through `realpathOrSelf`), so its computed relative form against
   * the canonical repo root is what generateNotePayload uses too.
   */
  matchCommittedFiles(
    relativeFiles: Iterable<string>,
    canonicalRepoRoot: string,
  ): Set<string> {
    const wanted = new Set(relativeFiles);
    const matched = new Set<string>();
    for (const key of this.fileAttributions.keys()) {
      // Normalise separators to forward slashes so keys recorded on
      // Windows line up with git's path output.
      const rel = path
        .relative(canonicalRepoRoot, key)
        .split(path.sep)
        .join('/');
      if (wanted.has(rel)) {
        matched.add(key);
      }
    }
    return matched;
  }
  // -----------------------------------------------------------------------
  // Snapshot / restore (session persistence)
  // -----------------------------------------------------------------------
  /** Serialize current state for session persistence. */
  toSnapshot(): AttributionSnapshot {
    const fileStates: Record<string, FileAttribution> = {};
    for (const [k, v] of this.fileAttributions) {
      fileStates[k] = { ...v };
    }
    return {
      type: 'attribution-snapshot',
      version: ATTRIBUTION_SNAPSHOT_VERSION,
      surface: this.surface,
      fileStates,
      promptCount: this.promptCount,
      promptCountAtLastCommit: this.promptCountAtLastCommit,
    };
  }
  /** Restore state from a persisted snapshot. */
  restoreFromSnapshot(snapshot: AttributionSnapshot): void {
    // The resume-time caller (client.ts) passes `snapshot` as a
    // structural cast from `unknown`, so its TS-typed shape is only
    // a hint — the actual runtime value can be anything (corrupted
    // JSONL line, hand-edited session file, schema drift). Bail to
    // a clean reset on any envelope-level shape mismatch:
    //   - non-object / null / array
    //   - wrong `type` discriminator
    //   - non-numeric `version` (after the `version ?? 1` default)
    //   - non-object `fileStates`
    // Per-field coercion (sanitiseAttribution etc.) handles damage
    // INSIDE a structurally valid snapshot; this gate stops a
    // wholesale-wrong payload from polluting fileAttributions with
    // garbage keys before per-field validation can run.
    const isPlainObject = (v: unknown): v is Record<string, unknown> =>
      typeof v === 'object' && v !== null && !Array.isArray(v);
    const looksLikeSnapshot =
      isPlainObject(snapshot) &&
      (snapshot as Record<string, unknown>)['type'] === 'attribution-snapshot';
    if (!looksLikeSnapshot) {
      this.fileAttributions.clear();
      this.surface = getClientSurface();
      this.promptCount = 0;
      this.promptCountAtLastCommit = 0;
      return;
    }
    // Future schema bumps land here. Treat absent `version` as 1
    // (the schema in production at the time this field was added) so
    // existing on-disk snapshots restore cleanly.
    const snapshotVersion = snapshot.version ?? 1;
    if (snapshotVersion !== ATTRIBUTION_SNAPSHOT_VERSION) {
      // Don't trust a stale shape — its fields may have moved or
      // changed semantics. Reset to a fresh state rather than
      // splice incompatible data.
      this.fileAttributions.clear();
      this.surface = getClientSurface();
      this.promptCount = 0;
      this.promptCountAtLastCommit = 0;
      return;
    }
    // `surface` is embedded verbatim in the git-notes payload and used
    // as a Map/Record key downstream. A corrupted snapshot with a
    // non-string value (e.g. `{}`, `42`, `null`) would coerce into
    // strings like `[object Object]` and break the payload shape.
    // Fall back to the current client surface when the stored value
    // isn't a string.
    this.surface =
      typeof snapshot.surface === 'string' && snapshot.surface.length > 0
        ? snapshot.surface
        : getClientSurface();
    // A corrupted or partially-written snapshot can leave numeric
    // counters as `undefined`; without coercion, downstream
    // `Math.min(undefined, n)` produces NaN that flows into the
    // git-notes payload. Coerce per-field with a typed default.
    this.promptCount = sanitiseCount(snapshot.promptCount);
    this.promptCountAtLastCommit = sanitiseCount(
      snapshot.promptCountAtLastCommit,
    );
    // Enforce the invariant `atLastCommit <= total`: a corrupted /
    // partially-written snapshot with the inverse would surface a
    // negative `getPromptsSinceLastCommit()` and propagate as a
    // "(-3)-shotted" trailer into PR descriptions.
    if (this.promptCountAtLastCommit > this.promptCount) {
      this.promptCountAtLastCommit = this.promptCount;
    }
    this.fileAttributions.clear();
    // Reject a corrupted `fileStates` (e.g. an array, a string, or
    // null) before iterating: `Object.entries(<array>)` would happily
    // produce `[index, value]` pairs and seed fileAttributions with
    // numeric-string keys.
    const fileStates = isPlainObject(snapshot.fileStates)
      ? snapshot.fileStates
      : {};
    for (const [k, v] of Object.entries(fileStates)) {
      // Re-canonicalise on restore so old snapshots (written before
      // recordEdit started running keys through realpath) end up
      // with the same shape as newly-recorded entries. If both the
      // symlinked and canonical forms were stored under separate
      // keys (e.g. a session straddling the canonicalisation fix),
      // collapsing them onto the same canonical key MUST merge their
      // attribution rather than overwrite — otherwise the second
      // entry to land wins and the AI's accumulated contribution from
      // the first form is silently dropped.
      const canonicalKey = realpathOrSelf(k);
      const incoming = sanitiseAttribution(v);
      const existing = this.fileAttributions.get(canonicalKey);
      if (existing) {
        // Sum aiContribution and OR aiCreated. Pick the
        // most-recently-recorded contentHash (incoming wins) so
        // post-restore divergence checks compare against the freshest
        // hash; an old form's stale hash would force unnecessary
        // resets on the next recordEdit.
        this.fileAttributions.set(canonicalKey, {
          aiContribution: existing.aiContribution + incoming.aiContribution,
          aiCreated: existing.aiCreated || incoming.aiCreated,
          contentHash: incoming.contentHash || existing.contentHash,
        });
      } else {
        this.fileAttributions.set(canonicalKey, incoming);
      }
    }
  }
  // -----------------------------------------------------------------------
  // Payload generation
  // -----------------------------------------------------------------------
  /**
   * Generate the git notes JSON payload by combining tracked AI contributions
   * with staged file information from git.
   */
  generateNotePayload(
    stagedInfo: StagedFileInfo,
    baseDir: string,
    generatorName?: string,
  ): CommitAttributionNote {
    const generator = sanitizeModelName(
      generatorName ?? SANITIZED_GENERATOR_NAME,
    );
    const files: Record<string, FileAttributionDetail> = {};
    const excludedGenerated: string[] = [];
    let excludedGeneratedCount = 0;
    const surfaceCounts: Record<string, number> = {};
    let totalAiChars = 0;
    let totalHumanChars = 0;
    // Build lookup: relative path → tracked AI contribution. Keys in
    // `fileAttributions` are already canonical (recordEdit runs them
    // through realpath); we only need to canonicalise `baseDir`,
    // which comes from `git rev-parse --show-toplevel` and may be a
    // symlink (e.g. macOS `/var` → `/private/var`). Without that
    // canonicalisation `path.relative` would produce a `../...` key
    // that never matches the diff output. Normalize separators to
    // forward slashes so git paths line up on Windows.
    const canonicalBase = realpathOrSelf(baseDir);
    const aiLookup = new Map<string, FileAttribution>();
    for (const [absPath, attr] of this.fileAttributions) {
      const rel = path
        .relative(canonicalBase, absPath)
        .split(path.sep)
        .join('/');
      aiLookup.set(rel, attr);
    }
    for (const relFile of stagedInfo.files) {
      if (isGeneratedFile(relFile)) {
        excludedGeneratedCount++;
        // Cap the sample so a commit churning thousands of `dist/`
        // artifacts can't blow past the 30 KB note budget.
        if (excludedGenerated.length < MAX_EXCLUDED_GENERATED_SAMPLE) {
          excludedGenerated.push(relFile);
        }
        continue;
      }
      const tracked = aiLookup.get(relFile);
      const diffSize = stagedInfo.diffSizes.get(relFile) ?? 0;
      const isDeleted = stagedInfo.deletedFiles.has(relFile);
      let aiChars: number;
      let humanChars: number;
      if (tracked) {
        // Clamp aiChars to diffSize so aiChars+humanChars stays
        // consistent with the committed change magnitude derived from
        // `git diff --numstat`. Without this, cases where
        // tracked.aiContribution exceeds the committed change size
        // can leave aiChars > diffSize: humanChars then snaps to 0
        // but aiChars stays large, inflating the per-file total
        // beyond what was committed.
        aiChars = Math.min(tracked.aiContribution, diffSize);
        humanChars = Math.max(0, diffSize - aiChars);
      } else if (isDeleted) {
        // Deleted files with no AI tracking are attributed entirely to
        // the human. diffSize comes from `git diff --numstat` so empty
        // deletions legitimately have diffSize=0 — a magic fallback
        // would only inflate totals.
        aiChars = 0;
        humanChars = diffSize;
      } else {
        // NOTE(review): identical outcome to the deleted branch above;
        // the branches are kept separate only for the documentation.
        aiChars = 0;
        humanChars = diffSize;
      }
      const total = aiChars + humanChars;
      const percent = total > 0 ? Math.round((aiChars / total) * 100) : 0;
      files[relFile] = { aiChars, humanChars, percent, surface: this.surface };
      totalAiChars += aiChars;
      totalHumanChars += humanChars;
      surfaceCounts[this.surface] =
        (surfaceCounts[this.surface] ?? 0) + aiChars;
    }
    const totalChars = totalAiChars + totalHumanChars;
    const aiPercent =
      totalChars > 0 ? Math.round((totalAiChars / totalChars) * 100) : 0;
    // Surface breakdown
    const surfaceBreakdown: Record<
      string,
      { aiChars: number; percent: number }
    > = {};
    for (const [surf, chars] of Object.entries(surfaceCounts)) {
      surfaceBreakdown[surf] = {
        aiChars: chars,
        percent: totalChars > 0 ? Math.round((chars / totalChars) * 100) : 0,
      };
    }
    return {
      version: 1,
      generator,
      files,
      summary: {
        aiPercent,
        aiChars: totalAiChars,
        humanChars: totalHumanChars,
        totalFilesTouched: Object.keys(files).length,
        surfaces: [this.surface],
      },
      surfaceBreakdown,
      excludedGenerated,
      excludedGeneratedCount,
      promptCount: this.getPromptsSinceLastCommit(),
    };
  }
}
// ---------------------------------------------------------------------------
// Character contribution calculation (Claude's prefix/suffix algorithm)
// ---------------------------------------------------------------------------
/**
 * Compute the character contribution for a file modification.
 *
 * Strips the longest common prefix and (non-overlapping) longest common
 * suffix to isolate the actually-changed region, then returns the larger
 * of the two sides' changed lengths. An empty side means the entire
 * other side changed.
 */
export function computeCharContribution(
  oldContent: string,
  newContent: string,
): number {
  // Degenerate cases: creation / truncation-to-empty.
  if (oldContent.length === 0) return newContent.length;
  if (newContent.length === 0) return oldContent.length;
  const limit = Math.min(oldContent.length, newContent.length);
  // Longest common prefix.
  let prefix = 0;
  while (
    prefix < limit &&
    oldContent.charAt(prefix) === newContent.charAt(prefix)
  ) {
    prefix += 1;
  }
  // Longest common suffix that does not overlap the prefix.
  let suffix = 0;
  while (
    suffix < limit - prefix &&
    oldContent.charAt(oldContent.length - 1 - suffix) ===
      newContent.charAt(newContent.length - 1 - suffix)
  ) {
    suffix += 1;
  }
  return Math.max(
    oldContent.length - prefix - suffix,
    newContent.length - prefix - suffix,
  );
}