mirror of
https://github.com/QwenLM/qwen-code.git
synced 2026-05-13 15:32:19 +00:00
gpt-5.5 review (issue 4389405179): 1. realpathOrSelf falls back to the non-canonical input when the leaf doesn't exist (deleted file). recordEdit stored the entry under the canonical path; lookup post-deletion misses on macOS where /var ↔ /private/var. Canonicalise the parent and rejoin the basename for missing leaves so deleted-file getFileAttribution still resolves the canonical key. Test updated to assert the lookup-after-unlink path explicitly. 2. validateOnDiskHashes read the LIVE working-tree, so a user who `git add`'d AI's content and then made additional unstaged edits would have the entry dropped on a commit whose blob still matched AI's hash. Replace with `validateAgainst(getContent)` that takes a caller-supplied reader; attachCommitAttribution now passes a reader that fetches the COMMITTED blob via `git show HEAD:<rel>`. Working-tree validation kept as `validateAgainstWorkingTree` for code paths without a committed ref. Returns null = no comparison signal (entry preserved). Tests cover all three readers (committed-blob via stub, working-tree, null-passthrough). deepseek-v4-pro review #1: sanitiseAttribution defaults missing contentHash to '' on legacy-snapshot restore. recordEdit's divergence check would then trip on every subsequent edit and silently reset all the AI work. Skip the divergence check when existing.contentHash is empty — we have no baseline to compare against, so don't drop. Test added covering legacy-snapshot preservation through validateAgainst. deepseek #4: validateAgainst now logs every entry drop via debugLogger.debug so a 3am operator can see WHICH entry got dropped and tied to which canonical key. deepseek #8: GIT_NAMESPACE removed from GIT_ENV_SHIFTS_REPO. It prefixes ref names within the same repo but doesn't redirect git to a different on-disk repository, so a commit underneath it still lands in our cwd's repo. Doc comment explains the distinction. 
deepseek #9: pushd/popd treated as cwd-shifting alongside cd in gitCommitContext / isAmendCommit / findAttributableCommitSegment. pushd reuses cdTargetMayChangeRepo (relative-no-escape stays in-repo); popd unconditionally flips cwdShifted because we don't track the bash dir-stack. deepseek #10: sudo's value-taking flag table now has a parallel SUDO_FLAGS_SHIFT_CWD set covering -D / --chdir (Linux sudo 1.9.2+). Any segment whose sudo wrapper sees one of those flags returns null from tokeniseSegment — same contract as env -C / --chdir and GIT_DIR=... 328 tests pass; typecheck clean both packages.
786 lines
29 KiB
TypeScript
786 lines
29 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2025 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
/**
|
|
* Commit Attribution Service
|
|
*
|
|
* Tracks character-level contribution ratios between AI and humans per file.
|
|
* When a git commit is made, this data is combined with git diff analysis to
|
|
* calculate real AI vs human contribution percentages, stored as git notes.
|
|
*
|
|
* Features:
|
|
* - Character-level prefix/suffix diff algorithm
|
|
* - Real AI/human contribution ratio via git diff
|
|
* - Surface tracking (cli/ide/api/sdk)
|
|
* - Prompt counting (since-last-commit window)
|
|
* - Snapshot/restore for session persistence
|
|
* - Generated file exclusion
|
|
*/
|
|
|
|
import { createHash } from 'node:crypto';
|
|
import * as fs from 'node:fs';
|
|
import * as path from 'node:path';
|
|
import { createDebugLogger } from '../utils/debugLogger.js';
|
|
import { isGeneratedFile } from './generatedFiles.js';
|
|
|
|
// Module-scoped debug logger; used to trace attribution-entry drops
// (see validateAgainst) so an operator can see which key was discarded.
const debugLogger = createDebugLogger('COMMIT_ATTRIBUTION');
|
|
|
|
function computeContentHash(content: string): string {
|
|
return createHash('sha256').update(content).digest('hex');
|
|
}
|
|
|
|
/**
|
|
* Resolve symlinks on a path. On macOS in particular, `/var` is a
|
|
* symlink to `/private/var`, so an absolute path captured via
|
|
* `fs.realpathSync` (what edit.ts/write-file.ts records) and
|
|
* `path.relative` against `git rev-parse --show-toplevel` (which may
|
|
* report either form) won't line up unless we normalise both sides.
|
|
*
|
|
* For DELETED leaves (file no longer exists on disk), realpathSync
|
|
* throws — but the parent directory is still resolvable. Canonicalise
|
|
* the parent and rejoin the missing basename so a deleted file's
|
|
* lookup still hits the canonical key recordEdit stored before the
|
|
* file was removed. Without this, a `getFileAttribution(deletedPath)`
|
|
* call after the file was deleted would fall back to the
|
|
* non-canonical input and miss the canonical entry on macOS.
|
|
*/
|
|
function realpathOrSelf(p: string): string {
|
|
try {
|
|
return fs.realpathSync(p);
|
|
} catch {
|
|
try {
|
|
const parent = path.dirname(p);
|
|
const realParent = fs.realpathSync(parent);
|
|
return path.join(realParent, path.basename(p));
|
|
} catch {
|
|
return p;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Types
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export interface FileAttribution {
  /** Total characters contributed by AI (accumulated across edits) */
  aiContribution: number;
  /** Whether the file was created by AI */
  aiCreated: boolean;
  /**
   * SHA-256 of the file content immediately after AI's last write. Used
   * to detect out-of-band mutation (paste-replace via external editor,
   * `rm` + recreate, manual save) so AI's accumulated counter doesn't
   * silently get credited to subsequent human edits. recordEdit checks
   * this on every call (resets when the input `oldContent` doesn't
   * match), and `validateAgainst` re-verifies before a commit
   * note is generated to catch user edits that happened entirely
   * outside the Edit/Write tools.
   *
   * Empty string means a legacy (pre-divergence-detection) snapshot
   * with no recorded baseline; both recordEdit and validateAgainst
   * skip the comparison in that case rather than resetting/dropping.
   */
  contentHash: string;
}
|
|
|
|
/** Per-file attribution detail in the git notes payload. */
export interface FileAttributionDetail {
  /** Characters attributed to AI (clamped to the committed diff size). */
  aiChars: number;
  /** Characters attributed to the human (diff size minus aiChars, ≥ 0). */
  humanChars: number;
  /** Rounded AI percentage of (aiChars + humanChars); 0 when total is 0. */
  percent: number;
  /** Client surface that authored the AI contribution (cli/ide/api/sdk). */
  surface?: string;
}
|
|
|
|
/** Full attribution payload stored as git notes JSON. */
export interface CommitAttributionNote {
  /** Note schema version. */
  version: 1;
  /** Generator/model label, sanitised via `sanitizeModelName`. */
  generator: string;
  /** Per-file breakdown keyed by repo-relative path. */
  files: Record<string, FileAttributionDetail>;
  /** Commit-wide rollup across all attributed files. */
  summary: {
    aiPercent: number;
    aiChars: number;
    humanChars: number;
    totalFilesTouched: number;
    surfaces: string[];
  };
  /** AI character counts and percentages grouped by client surface. */
  surfaceBreakdown: Record<string, { aiChars: number; percent: number }>;
  /**
   * Sample of generated/vendored files that were excluded from
   * attribution. Capped at `MAX_EXCLUDED_GENERATED_SAMPLE` paths so a
   * commit churning thousands of `dist/` artifacts can't blow past the
   * 30 KB note budget and silently drop attribution for the real
   * source files in the same commit. Use `excludedGeneratedCount` for
   * the true total.
   */
  excludedGenerated: string[];
  /** Total count of excluded files (≥ excludedGenerated.length). */
  excludedGeneratedCount: number;
  /** Prompts since the last commit (the "N-shotted" window). */
  promptCount: number;
}
|
|
|
|
/**
 * Upper bound on the number of excluded-generated paths we serialize
 * into the git note. Keeps the JSON payload bounded for commits with
 * lots of generated artifacts; `excludedGeneratedCount` still carries
 * the true total when the sample is truncated.
 */
export const MAX_EXCLUDED_GENERATED_SAMPLE = 50;
|
|
|
|
/** Result of running git commands to get staged file info. */
export interface StagedFileInfo {
  /** Repo-relative paths of the staged files. */
  files: string[];
  /** Repo-relative path → changed-character magnitude for that file. */
  diffSizes: Map<string, number>;
  /** Repo-relative paths that were staged as deletions. */
  deletedFiles: Set<string>;
  /**
   * Absolute path of the repository root (`git rev-parse --show-toplevel`).
   * Optional for backward compatibility with synthetic test inputs;
   * production callers should set it so file paths in `files` (which are
   * relative to the repo root) align with absolute paths tracked by the
   * attribution service. When absent, callers may fall back to the
   * configured target directory at the cost of zeroed-out attribution
   * for files outside that directory.
   */
  repoRoot?: string;
}
|
|
|
|
/**
 * On-disk schema version for AttributionSnapshot. Bump when the shape
 * changes incompatibly so restoreFromSnapshot can refuse / migrate
 * stale payloads instead of silently producing NaN counters or
 * mismatched key shapes.
 */
export const ATTRIBUTION_SNAPSHOT_VERSION = 1;
|
|
|
|
/** Serializable snapshot for session persistence. */
export interface AttributionSnapshot {
  /** Discriminator checked by restoreFromSnapshot before trusting fields. */
  type: 'attribution-snapshot';
  /** Schema version; absent on pre-versioning snapshots, treated as 1. */
  version?: number;
  /** Client surface active when the snapshot was taken. */
  surface: string;
  /** Per-file attribution entries keyed by (possibly pre-canonical) path. */
  fileStates: Record<string, FileAttribution>;
  /** Total prompts recorded in the session. */
  promptCount: number;
  /** Value of promptCount when the last commit landed. */
  promptCountAtLastCommit: number;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Model name sanitization
|
|
// ---------------------------------------------------------------------------
|
|
|
|
const INTERNAL_MODEL_PATTERNS = [
|
|
/qwen[-_]?\d+(\.\d+)?[-_]?b?/i,
|
|
/qwen[-_]?coder[-_]?\d*/i,
|
|
/qwen[-_]?max/i,
|
|
/qwen[-_]?plus/i,
|
|
/qwen[-_]?turbo/i,
|
|
];
|
|
|
|
const SANITIZED_GENERATOR_NAME = 'Qwen-Coder';
|
|
|
|
function sanitizeModelName(name: string): string {
|
|
for (const pattern of INTERNAL_MODEL_PATTERNS) {
|
|
if (pattern.test(name)) {
|
|
return SANITIZED_GENERATOR_NAME;
|
|
}
|
|
}
|
|
return name;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Utilities
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Defensive coercions for restoring snapshot fields. A snapshot can
|
|
* arrive with `undefined` / wrong-type fields if the on-disk JSON was
|
|
* partially written or pre-dates the current schema; without coercion
|
|
* they would flow through `Math.min(undefined, n) === NaN` into the
|
|
* git-notes payload.
|
|
*/
|
|
function sanitiseCount(v: unknown): number {
|
|
return typeof v === 'number' && Number.isFinite(v) && v >= 0 ? v : 0;
|
|
}
|
|
|
|
function sanitiseAttribution(v: unknown): FileAttribution {
|
|
const obj = (v ?? {}) as Partial<FileAttribution>;
|
|
return {
|
|
aiContribution: sanitiseCount(obj.aiContribution),
|
|
aiCreated: typeof obj.aiCreated === 'boolean' ? obj.aiCreated : false,
|
|
contentHash: typeof obj.contentHash === 'string' ? obj.contentHash : '',
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Surface label embedded in the git-notes payload. Defaults to `'cli'`
|
|
* for the qwen-code CLI; embedders (IDE extensions, SDK consumers) can
|
|
* override by setting `QWEN_CODE_ENTRYPOINT` before construction so the
|
|
* note records where the contribution was authored.
|
|
*/
|
|
export function getClientSurface(): string {
|
|
return process.env['QWEN_CODE_ENTRYPOINT'] ?? 'cli';
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Service
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Singleton service that accumulates per-file AI character
 * contributions during a session and, at commit time, combines them
 * with staged-diff info to produce the git-notes attribution payload.
 * All map keys are canonical absolute paths (see realpathOrSelf).
 */
export class CommitAttributionService {
  private static instance: CommitAttributionService | null = null;

  /** Per-file AI contribution tracking (keyed by absolute path) */
  private fileAttributions: Map<string, FileAttribution> = new Map();
  /** Client surface (cli, ide, api, sdk, etc.) */
  private surface: string = getClientSurface();

  // -- Prompt counting --
  private promptCount: number = 0;
  private promptCountAtLastCommit: number = 0;

  private constructor() {}

  /** Lazily create and return the process-wide singleton. */
  static getInstance(): CommitAttributionService {
    if (!CommitAttributionService.instance) {
      CommitAttributionService.instance = new CommitAttributionService();
    }
    return CommitAttributionService.instance;
  }

  /** Reset singleton for testing. */
  static resetInstance(): void {
    CommitAttributionService.instance = null;
  }

  // -----------------------------------------------------------------------
  // Recording
  // -----------------------------------------------------------------------

  /**
   * Record an AI edit to a file.
   * Uses prefix/suffix matching for precise character-level contribution.
   *
   * `filePath` is canonicalised via `fs.realpathSync` before being used
   * as a key, so symlinked paths (e.g. `/var/...` ↔ `/private/var/...`
   * on macOS) collapse to the same entry instead of silently producing
   * two parallel records.
   *
   * Divergence detection: if a tracked entry's recorded `contentHash`
   * doesn't match the hash of the `oldContent` we received here, the
   * file was changed out-of-band between AI's last write and this
   * call (paste-replace via external editor, `git checkout`, manual
   * save, ...). Reset `aiContribution` and `aiCreated` to 0/false
   * before applying the new edit so prior AI work that the user
   * since overwrote isn't credited to the next commit.
   *
   * @param filePath Absolute path of the edited file.
   * @param oldContent Pre-edit content, or `null` for a newly created file.
   * @param newContent Post-edit content; its hash becomes the new baseline.
   */
  recordEdit(
    filePath: string,
    oldContent: string | null,
    newContent: string,
  ): void {
    const key = realpathOrSelf(filePath);

    const existing = this.fileAttributions.get(key);
    const isNewFile = oldContent === null;

    let aiContribution = existing?.aiContribution ?? 0;
    let aiCreated = existing?.aiCreated ?? false;

    // If we have a prior tracked state for this file AND the input
    // `oldContent` we're being told about doesn't match the hash we
    // recorded after AI's last write, the file diverged out-of-band.
    // Drop the accumulated counters before applying the new edit.
    //
    // Skip the check when `existing.contentHash` is empty: that's a
    // legacy snapshot (pre-divergence-detection schema) where we
    // never recorded the post-write hash. Comparing an empty hash to
    // the actual file hash would always trip the reset and silently
    // wipe AI work that's still on disk.
    if (existing && existing.contentHash && oldContent !== null) {
      const oldHash = computeContentHash(oldContent);
      if (existing.contentHash !== oldHash) {
        aiContribution = 0;
        aiCreated = false;
      }
    }

    const contribution = computeCharContribution(oldContent ?? '', newContent);
    aiContribution += contribution;
    if (isNewFile) aiCreated = true;

    this.fileAttributions.set(key, {
      aiContribution,
      aiCreated,
      contentHash: computeContentHash(newContent),
    });
  }

  /**
   * Re-hash each tracked file's content via a caller-supplied reader
   * and drop entries whose hash doesn't match what AI's last write
   * recorded. Catches the cases recordEdit's input-hash check can't
   * see — i.e. the user (or another tool) modified the file entirely
   * outside the Edit/Write tools, then committed it. Without this,
   * the AI's stale aiContribution would attach to the human-only
   * diff at commit time and credit AI for human work.
   *
   * `getContent(absPath)` returns the bytes the caller wants to
   * compare against, or `null` if the entry shouldn't be checked
   * (deletion, unreadable, no committed copy). Returning `null`
   * leaves the entry alone rather than dropping it.
   *
   * Two production callers:
   *   1. `attachCommitAttribution` after a commit — should pass a
   *      reader that fetches the COMMITTED blob (`git show HEAD:<rel>`)
   *      so unstaged working-tree changes the user made AFTER `git add`
   *      don't trip the divergence check on a commit whose blob still
   *      matches AI's recorded hash.
   *   2. The legacy live-disk reader (`fs.readFileSync`) is exposed
   *      via `validateAgainstWorkingTree` for the no-committed-blob
   *      cases (e.g. amend-without-reflog where we can't pin a
   *      ref). Less precise but better than nothing.
   */
  validateAgainst(getContent: (absPath: string) => string | null): void {
    for (const [key, attr] of this.fileAttributions) {
      // Skip legacy entries that have no recorded post-write hash —
      // we can't tell stale from fresh, so leave them alone.
      if (!attr.contentHash) continue;
      const current = getContent(key);
      if (current === null) continue; // not a divergence signal
      if (computeContentHash(current) !== attr.contentHash) {
        // Log which canonical key got dropped so operators can tie a
        // missing note entry back to a specific file.
        debugLogger.debug(
          `validateAgainst: dropping stale attribution for ${key} (hash diverged)`,
        );
        this.fileAttributions.delete(key);
      }
    }
  }

  /**
   * Convenience wrapper around {@link validateAgainst} that reads
   * the live working-tree file. Used for code paths where we can't
   * read the committed blob (no commit happened, no ref available).
   */
  validateAgainstWorkingTree(): void {
    this.validateAgainst((p) => {
      try {
        return fs.readFileSync(p, 'utf-8');
      } catch {
        // Unreadable/deleted: no comparison signal, keep the entry.
        return null;
      }
    });
  }

  // -----------------------------------------------------------------------
  // Prompt counting
  // -----------------------------------------------------------------------

  /** Bump the session-wide prompt counter (one per user prompt). */
  incrementPromptCount(): void {
    this.promptCount++;
  }

  /** Total prompts recorded this session. */
  getPromptCount(): number {
    return this.promptCount;
  }

  /** Prompts since last commit (for "N-shotted" display). */
  getPromptsSinceLastCommit(): number {
    return this.promptCount - this.promptCountAtLastCommit;
  }

  // -----------------------------------------------------------------------
  // Querying
  // -----------------------------------------------------------------------

  /** Defensive copy of the full attribution map (entries are cloned). */
  getAttributions(): Map<string, FileAttribution> {
    const copy = new Map<string, FileAttribution>();
    for (const [k, v] of this.fileAttributions) {
      copy.set(k, { ...v });
    }
    return copy;
  }

  /** Look up one file's attribution; returns a cloned entry or undefined. */
  getFileAttribution(filePath: string): FileAttribution | undefined {
    // Canonicalise so callers don't have to know about the realpath
    // normalization happening inside `recordEdit`.
    const attr = this.fileAttributions.get(realpathOrSelf(filePath));
    return attr ? { ...attr } : undefined;
  }

  /** True when at least one file has tracked AI contribution. */
  hasAttributions(): boolean {
    return this.fileAttributions.size > 0;
  }

  /** Current client surface label (cli/ide/api/sdk/...). */
  getSurface(): string {
    return this.surface;
  }

  /**
   * Clear file attribution data. Called after commit (success or failure).
   * @param commitSucceeded If true, also updates the "at last commit"
   *   counters so getPromptsSinceLastCommit() resets to 0.
   */
  clearAttributions(commitSucceeded: boolean = true): void {
    if (commitSucceeded) {
      this.promptCountAtLastCommit = this.promptCount;
    }
    this.fileAttributions.clear();
  }

  /**
   * Clear attribution data for the specific files that just landed in
   * a commit, leaving entries for files the user *didn't* include
   * (partial commits, `git add A && git commit -m "..."`) intact so
   * they're still credited on a later commit. Snapshots prompt
   * counters since a commit did succeed.
   *
   * Inputs must already be canonical absolute paths. The caller
   * should resolve repo-relative diff entries against a canonical
   * (realpath'd) repo root rather than realpathing each leaf — at
   * cleanup time the leaf for a just-deleted file no longer exists,
   * so per-leaf `fs.realpathSync` would fail and fall back to a
   * non-canonical path that misses the stored canonical key.
   */
  clearAttributedFiles(committedAbsolutePaths: Set<string>): void {
    this.promptCountAtLastCommit = this.promptCount;
    for (const p of committedAbsolutePaths) {
      this.fileAttributions.delete(p);
    }
  }

  /**
   * Snapshot the prompt counter as the new "last commit" without
   * clearing per-file attribution. Used when a commit landed but we
   * can't reliably determine which files were in it (multi-commit
   * chain we won't write a note for, attribution toggle off, diff
   * analysis failed). Wholesale-clearing in those branches would
   * silently wipe pending AI edits for *unrelated* files the user
   * didn't stage — a worse failure mode than the small risk of
   * stale per-file state for files that did just land.
   */
  noteCommitWithoutClearing(): void {
    this.promptCountAtLastCommit = this.promptCount;
  }

  /**
   * Resolve a set of repo-relative file paths to the canonical absolute
   * keys actually stored in the attribution map. Used by cleanup to
   * partial-clear only the files that just landed in a commit.
   *
   * Matching by walking `fileAttributions` (instead of resolving each
   * relative path with `path.resolve` + `fs.realpathSync`) is the only
   * approach that handles all of: deleted files (where realpathSync
   * throws), intermediate-symlink directories (where path.resolve only
   * canonicalises the base), and renamed files (where the diff-time
   * relative path differs from the recordEdit-time absolute path —
   * still no match here, that's a rename-tracking concern handled
   * separately). Each tracked key is canonical (recordEdit ran it
   * through `realpathOrSelf`), so its computed relative form against
   * the canonical repo root is what generateNotePayload uses too.
   */
  matchCommittedFiles(
    relativeFiles: Iterable<string>,
    canonicalRepoRoot: string,
  ): Set<string> {
    const wanted = new Set(relativeFiles);
    const matched = new Set<string>();
    for (const key of this.fileAttributions.keys()) {
      // Normalise to forward slashes so git-style paths match on Windows.
      const rel = path
        .relative(canonicalRepoRoot, key)
        .split(path.sep)
        .join('/');
      if (wanted.has(rel)) {
        matched.add(key);
      }
    }
    return matched;
  }

  // -----------------------------------------------------------------------
  // Snapshot / restore (session persistence)
  // -----------------------------------------------------------------------

  /** Serialize current state for session persistence. */
  toSnapshot(): AttributionSnapshot {
    const fileStates: Record<string, FileAttribution> = {};
    for (const [k, v] of this.fileAttributions) {
      fileStates[k] = { ...v };
    }
    return {
      type: 'attribution-snapshot',
      version: ATTRIBUTION_SNAPSHOT_VERSION,
      surface: this.surface,
      fileStates,
      promptCount: this.promptCount,
      promptCountAtLastCommit: this.promptCountAtLastCommit,
    };
  }

  /** Restore state from a persisted snapshot. */
  restoreFromSnapshot(snapshot: AttributionSnapshot): void {
    // The resume-time caller (client.ts) passes `snapshot` as a
    // structural cast from `unknown`, so its TS-typed shape is only
    // a hint — the actual runtime value can be anything (corrupted
    // JSONL line, hand-edited session file, schema drift). Bail to
    // a clean reset on any envelope-level shape mismatch:
    //   - non-object / null / array
    //   - wrong `type` discriminator
    //   - non-numeric `version` (after the `version ?? 1` default)
    //   - non-object `fileStates`
    // Per-field coercion (sanitiseAttribution etc.) handles damage
    // INSIDE a structurally valid snapshot; this gate stops a
    // wholesale-wrong payload from polluting fileAttributions with
    // garbage keys before per-field validation can run.
    const isPlainObject = (v: unknown): v is Record<string, unknown> =>
      typeof v === 'object' && v !== null && !Array.isArray(v);
    const looksLikeSnapshot =
      isPlainObject(snapshot) &&
      (snapshot as Record<string, unknown>)['type'] === 'attribution-snapshot';
    if (!looksLikeSnapshot) {
      this.fileAttributions.clear();
      this.surface = getClientSurface();
      this.promptCount = 0;
      this.promptCountAtLastCommit = 0;
      return;
    }
    // Future schema bumps land here. Treat absent `version` as 1
    // (the schema in production at the time this field was added) so
    // existing on-disk snapshots restore cleanly.
    const snapshotVersion = snapshot.version ?? 1;
    if (snapshotVersion !== ATTRIBUTION_SNAPSHOT_VERSION) {
      // Don't trust a stale shape — its fields may have moved or
      // changed semantics. Reset to a fresh state rather than
      // splice incompatible data.
      this.fileAttributions.clear();
      this.surface = getClientSurface();
      this.promptCount = 0;
      this.promptCountAtLastCommit = 0;
      return;
    }

    // `surface` is embedded verbatim in the git-notes payload and used
    // as a Map/Record key downstream. A corrupted snapshot with a
    // non-string value (e.g. `{}`, `42`, `null`) would coerce into
    // strings like `[object Object]` and break the payload shape.
    // Fall back to the current client surface when the stored value
    // isn't a string.
    this.surface =
      typeof snapshot.surface === 'string' && snapshot.surface.length > 0
        ? snapshot.surface
        : getClientSurface();
    // A corrupted or partially-written snapshot can leave numeric
    // counters as `undefined`; without coercion, downstream
    // `Math.min(undefined, n)` produces NaN that flows into the
    // git-notes payload. Coerce per-field with a typed default.
    this.promptCount = sanitiseCount(snapshot.promptCount);
    this.promptCountAtLastCommit = sanitiseCount(
      snapshot.promptCountAtLastCommit,
    );
    // Enforce the invariant `atLastCommit <= total`: a corrupted /
    // partially-written snapshot with the inverse would surface a
    // negative `getPromptsSinceLastCommit()` and propagate as a
    // "(-3)-shotted" trailer into PR descriptions.
    if (this.promptCountAtLastCommit > this.promptCount) {
      this.promptCountAtLastCommit = this.promptCount;
    }

    this.fileAttributions.clear();
    // Reject a corrupted `fileStates` (e.g. an array, a string, or
    // null) before iterating: `Object.entries(<array>)` would happily
    // produce `[index, value]` pairs and seed fileAttributions with
    // numeric-string keys.
    const fileStates = isPlainObject(snapshot.fileStates)
      ? snapshot.fileStates
      : {};
    for (const [k, v] of Object.entries(fileStates)) {
      // Re-canonicalise on restore so old snapshots (written before
      // recordEdit started running keys through realpath) end up
      // with the same shape as newly-recorded entries. If both the
      // symlinked and canonical forms were stored under separate
      // keys (e.g. a session straddling the canonicalisation fix),
      // collapsing them onto the same canonical key MUST merge their
      // attribution rather than overwrite — otherwise the second
      // entry to land wins and the AI's accumulated contribution from
      // the first form is silently dropped.
      const canonicalKey = realpathOrSelf(k);
      const incoming = sanitiseAttribution(v);
      const existing = this.fileAttributions.get(canonicalKey);
      if (existing) {
        // Sum aiContribution and OR aiCreated. Pick the
        // most-recently-recorded contentHash (incoming wins) so
        // post-restore divergence checks compare against the freshest
        // hash; an old form's stale hash would force unnecessary
        // resets on the next recordEdit.
        this.fileAttributions.set(canonicalKey, {
          aiContribution: existing.aiContribution + incoming.aiContribution,
          aiCreated: existing.aiCreated || incoming.aiCreated,
          contentHash: incoming.contentHash || existing.contentHash,
        });
      } else {
        this.fileAttributions.set(canonicalKey, incoming);
      }
    }
  }

  // -----------------------------------------------------------------------
  // Payload generation
  // -----------------------------------------------------------------------

  /**
   * Generate the git notes JSON payload by combining tracked AI contributions
   * with staged file information from git.
   *
   * @param stagedInfo Staged file list, per-file diff sizes, deletions.
   * @param baseDir Repo root used to relativise tracked absolute keys.
   * @param generatorName Optional model name; sanitised before embedding.
   */
  generateNotePayload(
    stagedInfo: StagedFileInfo,
    baseDir: string,
    generatorName?: string,
  ): CommitAttributionNote {
    const generator = sanitizeModelName(
      generatorName ?? SANITIZED_GENERATOR_NAME,
    );

    const files: Record<string, FileAttributionDetail> = {};
    const excludedGenerated: string[] = [];
    let excludedGeneratedCount = 0;
    const surfaceCounts: Record<string, number> = {};
    let totalAiChars = 0;
    let totalHumanChars = 0;

    // Build lookup: relative path → tracked AI contribution. Keys in
    // `fileAttributions` are already canonical (recordEdit runs them
    // through realpath); we only need to canonicalise `baseDir`,
    // which comes from `git rev-parse --show-toplevel` and may be a
    // symlink (e.g. macOS `/var` → `/private/var`). Without that
    // canonicalisation `path.relative` would produce a `../...` key
    // that never matches the diff output. Normalize separators to
    // forward slashes so git paths line up on Windows.
    const canonicalBase = realpathOrSelf(baseDir);
    const aiLookup = new Map<string, FileAttribution>();
    for (const [absPath, attr] of this.fileAttributions) {
      const rel = path
        .relative(canonicalBase, absPath)
        .split(path.sep)
        .join('/');
      aiLookup.set(rel, attr);
    }

    for (const relFile of stagedInfo.files) {
      if (isGeneratedFile(relFile)) {
        excludedGeneratedCount++;
        // Cap the sample so a commit churning thousands of `dist/`
        // artifacts can't blow past the 30 KB note budget.
        if (excludedGenerated.length < MAX_EXCLUDED_GENERATED_SAMPLE) {
          excludedGenerated.push(relFile);
        }
        continue;
      }

      const tracked = aiLookup.get(relFile);
      const diffSize = stagedInfo.diffSizes.get(relFile) ?? 0;
      const isDeleted = stagedInfo.deletedFiles.has(relFile);

      let aiChars: number;
      let humanChars: number;

      if (tracked) {
        // Clamp aiChars to diffSize so aiChars+humanChars stays
        // consistent with the committed change magnitude derived from
        // `git diff --numstat`. Without this, cases where
        // tracked.aiContribution exceeds the committed change size
        // can leave aiChars > diffSize: humanChars then snaps to 0
        // but aiChars stays large, inflating the per-file total
        // beyond what was committed.
        aiChars = Math.min(tracked.aiContribution, diffSize);
        humanChars = Math.max(0, diffSize - aiChars);
      } else if (isDeleted) {
        // Deleted files with no AI tracking are attributed entirely to
        // the human. diffSize comes from `git diff --numstat` so empty
        // deletions legitimately have diffSize=0 — a magic fallback
        // would only inflate totals.
        aiChars = 0;
        humanChars = diffSize;
      } else {
        aiChars = 0;
        humanChars = diffSize;
      }

      const total = aiChars + humanChars;
      const percent = total > 0 ? Math.round((aiChars / total) * 100) : 0;

      files[relFile] = { aiChars, humanChars, percent, surface: this.surface };
      totalAiChars += aiChars;
      totalHumanChars += humanChars;
      surfaceCounts[this.surface] =
        (surfaceCounts[this.surface] ?? 0) + aiChars;
    }

    const totalChars = totalAiChars + totalHumanChars;
    const aiPercent =
      totalChars > 0 ? Math.round((totalAiChars / totalChars) * 100) : 0;

    // Surface breakdown
    const surfaceBreakdown: Record<
      string,
      { aiChars: number; percent: number }
    > = {};
    for (const [surf, chars] of Object.entries(surfaceCounts)) {
      surfaceBreakdown[surf] = {
        aiChars: chars,
        percent: totalChars > 0 ? Math.round((chars / totalChars) * 100) : 0,
      };
    }

    return {
      version: 1,
      generator,
      files,
      summary: {
        aiPercent,
        aiChars: totalAiChars,
        humanChars: totalHumanChars,
        totalFilesTouched: Object.keys(files).length,
        surfaces: [this.surface],
      },
      surfaceBreakdown,
      excludedGenerated,
      excludedGeneratedCount,
      promptCount: this.getPromptsSinceLastCommit(),
    };
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Character contribution calculation (Claude's prefix/suffix algorithm)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
|
|
* Compute the character contribution for a file modification.
|
|
* Uses common prefix/suffix matching to find the actual changed region,
|
|
* then returns the larger of the old/new changed lengths.
|
|
*/
|
|
export function computeCharContribution(
|
|
oldContent: string,
|
|
newContent: string,
|
|
): number {
|
|
if (oldContent === '' || newContent === '') {
|
|
return oldContent === '' ? newContent.length : oldContent.length;
|
|
}
|
|
|
|
const minLen = Math.min(oldContent.length, newContent.length);
|
|
let prefixEnd = 0;
|
|
while (
|
|
prefixEnd < minLen &&
|
|
oldContent[prefixEnd] === newContent[prefixEnd]
|
|
) {
|
|
prefixEnd++;
|
|
}
|
|
|
|
let suffixLen = 0;
|
|
while (
|
|
suffixLen < minLen - prefixEnd &&
|
|
oldContent[oldContent.length - 1 - suffixLen] ===
|
|
newContent[newContent.length - 1 - suffixLen]
|
|
) {
|
|
suffixLen++;
|
|
}
|
|
|
|
const oldChangedLen = oldContent.length - prefixEnd - suffixLen;
|
|
const newChangedLen = newContent.length - prefixEnd - suffixLen;
|
|
return Math.max(oldChangedLen, newChangedLen);
|
|
}
|