diff --git a/docs/users/configuration/settings.md b/docs/users/configuration/settings.md index 48759d7f7..26c2aeda0 100644 --- a/docs/users/configuration/settings.md +++ b/docs/users/configuration/settings.md @@ -83,16 +83,17 @@ Settings are organized into categories. Most settings should be placed within th #### general -| Setting | Type | Description | Default | -| ------------------------------------------ | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | -| `general.preferredEditor` | string | The preferred editor to open files in. | `undefined` | -| `general.vimMode` | boolean | Enable Vim keybindings. | `false` | -| `general.enableAutoUpdate` | boolean | Enable automatic update checks and installations on startup. | `true` | -| `general.showSessionRecap` | boolean | Auto-show a one-line "where you left off" recap when returning to the terminal after being away. Off by default. Use `/recap` to trigger manually regardless of this setting. | `false` | -| `general.sessionRecapAwayThresholdMinutes` | number | Minutes the terminal must be blurred before an auto-recap fires on focus-in. Only used when `showSessionRecap` is enabled. | `5` | -| `general.gitCoAuthor` | boolean | Automatically add a Co-authored-by trailer to git commit messages when commits are made through Qwen Code. | `true` | -| `general.checkpointing.enabled` | boolean | Enable session checkpointing for recovery. | `false` | -| `general.defaultFileEncoding` | string | Default encoding for new files. Use `"utf-8"` (default) for UTF-8 without BOM, or `"utf-8-bom"` for UTF-8 with BOM. Only change this if your project specifically requires BOM. | `"utf-8"` | +| Setting | Type | Description | Default | +| ------------------------------------------ | ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | +| `general.preferredEditor` | string | The preferred editor to open files in. | `undefined` | +| `general.vimMode` | boolean | Enable Vim keybindings. | `false` | +| `general.enableAutoUpdate` | boolean | Enable automatic update checks and installations on startup. | `true` | +| `general.showSessionRecap` | boolean | Auto-show a one-line "where you left off" recap when returning to the terminal after being away. Off by default. Use `/recap` to trigger manually regardless of this setting. | `false` | +| `general.sessionRecapAwayThresholdMinutes` | number | Minutes the terminal must be blurred before an auto-recap fires on focus-in. Only used when `showSessionRecap` is enabled. | `5` | +| `general.gitCoAuthor.commit` | boolean | Add a Co-authored-by trailer to git commit messages AND attach a per-file AI-attribution git note (`refs/notes/ai-attribution`) for commits made through Qwen Code. Disabling skips both. | `true` | +| `general.gitCoAuthor.pr` | boolean | Append a Qwen Code attribution line to pull request descriptions when running `gh pr create`. | `true` | +| `general.checkpointing.enabled` | boolean | Enable session checkpointing for recovery. | `false` | +| `general.defaultFileEncoding` | string | Default encoding for new files. Use `"utf-8"` (default) for UTF-8 without BOM, or `"utf-8-bom"` for UTF-8 with BOM. Only change this if your project specifically requires BOM. | `"utf-8"` | #### output diff --git a/integration-tests/cli/settings-migration.test.ts b/integration-tests/cli/settings-migration.test.ts index 181dcb2ad..e67a90040 100644 --- a/integration-tests/cli/settings-migration.test.ts +++ b/integration-tests/cli/settings-migration.test.ts @@ -24,16 +24,21 @@ const { v2PreexistingEnableSettings, v3LegacyDisableSettings, v999FutureVersionSettings, + v3GitCoAuthorBooleanSettings, } = workspacesSettings; /** - * Integration tests for settings migration chain (V1 -> V2 -> V3) + * Integration tests for settings migration chain (V1 -> V2 -> V3 -> V4) * * These tests verify that: - * 1. V1 settings are automatically migrated to V3 on CLI startup - * 2. V2 settings are automatically migrated to V3 on CLI startup - * 3. V3 settings remain unchanged + * 1. V1 settings are automatically migrated to V4 on CLI startup + * 2. V2 settings are automatically migrated to V4 on CLI startup + * 3. V3 settings are automatically migrated to V4 on CLI startup * 4. Migration is idempotent (running multiple times produces same result) + * + * The numeric assertions use the literal `4` to match + * `SETTINGS_VERSION`; bump that constant and the literal together + * when adding a future migration. */ describe('settings-migration', () => { let rig: TestRig; @@ -77,7 +82,7 @@ describe('settings-migration', () => { }; describe('V1 settings migration', () => { - it('should migrate V1 settings to V3 on CLI startup', async () => { + it('should migrate V1 settings forward through the chain on CLI startup', async () => { rig.setup('v1-to-v3-migration'); // Write V1 settings directly (overwrites the one created by setup) @@ -94,8 +99,8 @@ describe('settings-migration', () => { // Read migrated settings const migratedSettings = readSettingsFile(rig); - // Verify migration to V3 - expect(migratedSettings['$version']).toBe(3); + // Verify migration to V4 (current SETTINGS_VERSION) + expect(migratedSettings['$version']).toBe(4); expect(migratedSettings['ui']).toEqual({ theme: 'dark', hideTips: false, @@ -137,7 +142,7 @@ describe('settings-migration', () => { const migratedSettings = readSettingsFile(rig); // Expected output based on stable test output - expect(migratedSettings['$version']).toBe(3); + expect(migratedSettings['$version']).toBe(4); expect(migratedSettings['tools']).toEqual({ autoAccept: false }); expect(migratedSettings['context']).toEqual({ includeDirectories: [] }); expect(migratedSettings['model']).toEqual({ name: ['gemini', 'claude'] }); @@ -161,8 +166,8 @@ describe('settings-migration', () => { // Read migrated settings const migratedSettings = readSettingsFile(rig); - // Should be migrated to V3 - expect(migratedSettings['$version']).toBe(3); + // Should be migrated to V4 + expect(migratedSettings['$version']).toBe(4); // Legacy string values for ui/general should be preserved as-is (user data) expect(migratedSettings['ui']).toBe('legacy-ui-string'); expect(migratedSettings['general']).toBe('legacy-general-string'); @@ -189,7 +194,7 @@ describe('settings-migration', () => { const migratedSettings = readSettingsFile(rig); // Expected output based on stable test output - expect(migratedSettings['$version']).toBe(3); + expect(migratedSettings['$version']).toBe(4); expect(migratedSettings['model']).toEqual({ name: 'qwen-plus' }); expect(migratedSettings['ui']).toEqual({ hideWindowTitle: true, @@ -209,7 +214,7 @@ describe('settings-migration', () => { }); describe('V2 settings migration', () => { - it('should migrate V2 settings to V3 on CLI startup', async () => { + it('should migrate V2 settings forward through the chain on CLI startup', async () => { rig.setup('v2-to-v3-migration'); // Write V2 settings directly (overwrites the one created by setup) @@ -225,8 +230,8 @@ describe('settings-migration', () => { // Read migrated settings const migratedSettings = readSettingsFile(rig); - // Verify migration to V3 - expect(migratedSettings['$version']).toBe(3); + // Verify migration to V4 (current SETTINGS_VERSION) + expect(migratedSettings['$version']).toBe(4); // Verify disable* -> enable* conversion with inversion expect( @@ -302,8 +307,8 @@ describe('settings-migration', () => { // Read migrated settings const migratedSettings = readSettingsFile(rig); - // Should be updated to V3 version - expect(migratedSettings['$version']).toBe(3); + // Should be updated to V4 version + expect(migratedSettings['$version']).toBe(4); // Other settings should remain unchanged expect(migratedSettings['ui']).toEqual({ theme: 'dark' }); expect(migratedSettings['model']).toEqual({ name: 'gemini' }); @@ -330,12 +335,12 @@ describe('settings-migration', () => { const migratedSettings = readSettingsFile(rig); // Version metadata should still be normalized to current version - expect(migratedSettings['$version']).toBe(3); + expect(migratedSettings['$version']).toBe(4); // Existing user content should be preserved expect(migratedSettings['customOnlyKey']).toBe('value'); }); - it('should coerce valid string booleans and remove invalid deprecated keys while bumping V2 to V3', async () => { + it('should coerce valid string booleans and remove invalid deprecated keys while bumping V2 forward through the chain', async () => { rig.setup('v2-non-boolean-disable-values-migration'); // Cover both coercible string booleans and invalid non-boolean values: @@ -372,7 +377,7 @@ describe('settings-migration', () => { const migratedSettings = readSettingsFile(rig); // Coercible strings are migrated; invalid disable* values are removed. - expect(migratedSettings['$version']).toBe(3); + expect(migratedSettings['$version']).toBe(4); expect(migratedSettings['general']).toEqual({ enableAutoUpdate: false, }); @@ -437,7 +442,7 @@ describe('settings-migration', () => { const migratedSettings = readSettingsFile(rig); // Expected output based on stable test output - expect(migratedSettings['$version']).toBe(3); + expect(migratedSettings['$version']).toBe(4); // Migration converts disable* to enable* by inverting the value // disableAutoUpdate: false -> enableAutoUpdate: true (inverted) // But disableUpdateNag: true may affect the consolidation @@ -501,11 +506,10 @@ describe('settings-migration', () => { // Read settings const finalSettings = readSettingsFile(rig); - // Should remain V3 - expect(finalSettings['$version']).toBe(3); - // Note: V3 settings with legacy disable* keys are left as-is - // Migration only runs when version < current version - // Since this is already V3, no migration logic is applied + // V3 → V4 migration bumps the version; V3→V4 only touches + // general.gitCoAuthor, so unrelated legacy disable* keys remain as-is + // (V2→V3 ran on original V3 load, not re-applied here). + expect(finalSettings['$version']).toBe(4); expect( (finalSettings['general'] as Record)?.[ 'disableAutoUpdate' @@ -536,6 +540,44 @@ describe('settings-migration', () => { note: 'should remain unchanged in v3', }); }); + + // V3 used to allow `general.gitCoAuthor: `. The V3→V4 + // migration must expand that boolean into the new + // `{ commit, pr }` object shape so the user's stored opt-out + // doesn't get silently overwritten by the schema defaults + // (which default both sub-toggles to `true`) on the next save. + // The unit test in `v3-to-v4.test.ts` already pins the + // migration body, but without an end-to-end fixture the real + // CLI load → migrate → write path could regress without + // this suite noticing. + it('should expand legacy boolean general.gitCoAuthor: false through V3 → V4', async () => { + rig.setup('v3-gitcoauthor-boolean'); + + overwriteSettingsFile(rig, v3GitCoAuthorBooleanSettings); + + try { + await rig.runCommand(['mcp', 'list']); + } catch { + // Expected to potentially fail + } + + const finalSettings = readSettingsFile(rig); + + expect(finalSettings['$version']).toBe(4); + expect( + (finalSettings['general'] as Record)?.['gitCoAuthor'], + ).toEqual({ commit: false, pr: false }); + // Sibling general.* keys must survive the migration unchanged. + expect( + (finalSettings['general'] as Record)?.[ + 'disableAutoUpdate' + ], + ).toBe(true); + // And so must unrelated top-level sections. + expect(finalSettings['custom']).toEqual({ + note: 'preserve me through v3->v4', + }); + }); }); describe('Future version settings handling', () => { diff --git a/integration-tests/fixtures/settings-migration/workspaces.json b/integration-tests/fixtures/settings-migration/workspaces.json index bd9798009..331276a87 100644 --- a/integration-tests/fixtures/settings-migration/workspaces.json +++ b/integration-tests/fixtures/settings-migration/workspaces.json @@ -184,5 +184,15 @@ "experimentalFlag": { "enabled": true } + }, + "v3GitCoAuthorBooleanSettings": { + "$version": 3, + "general": { + "gitCoAuthor": false, + "disableAutoUpdate": true + }, + "custom": { + "note": "preserve me through v3->v4" + } } } diff --git a/packages/cli/src/config/migration/index.test.ts b/packages/cli/src/config/migration/index.test.ts index 52bae237e..720370660 100644 --- a/packages/cli/src/config/migration/index.test.ts +++ b/packages/cli/src/config/migration/index.test.ts @@ -15,7 +15,7 @@ import { SETTINGS_VERSION } from '../settings.js'; describe('Migration Framework Integration', () => { describe('runMigrations', () => { - it('should migrate V1 settings to V3', () => { + it('should migrate V1 settings all the way to the current version', () => { const v1Settings = { theme: 'dark', model: 'gemini', @@ -25,8 +25,8 @@ describe('Migration Framework Integration', () => { const result = runMigrations(v1Settings, 'user'); - expect(result.finalVersion).toBe(3); - expect(result.executedMigrations).toHaveLength(2); + expect(result.finalVersion).toBe(SETTINGS_VERSION); + expect(result.executedMigrations).toHaveLength(SETTINGS_VERSION - 1); expect(result.executedMigrations[0]).toEqual({ fromVersion: 1, toVersion: 2, @@ -38,7 +38,7 @@ describe('Migration Framework Integration', () => { // Check V2 structure was created const settings = result.settings as Record; - expect(settings['$version']).toBe(3); + expect(settings['$version']).toBe(SETTINGS_VERSION); expect(settings['ui']).toEqual({ theme: 'dark', accessibility: { enableLoadingPhrases: true }, @@ -51,7 +51,7 @@ describe('Migration Framework Integration', () => { ).toBe(false); }); - it('should migrate V2 settings to V3', () => { + it('should migrate V2 settings forward through the chain', () => { const v2Settings = { $version: 2, ui: { theme: 'light' }, @@ -60,15 +60,15 @@ describe('Migration Framework Integration', () => { const result = runMigrations(v2Settings, 'user'); - expect(result.finalVersion).toBe(3); - expect(result.executedMigrations).toHaveLength(1); + expect(result.finalVersion).toBe(SETTINGS_VERSION); + expect(result.executedMigrations).toHaveLength(SETTINGS_VERSION - 2); expect(result.executedMigrations[0]).toEqual({ fromVersion: 2, toVersion: 3, }); const settings = result.settings as Record; - expect(settings['$version']).toBe(3); + expect(settings['$version']).toBe(SETTINGS_VERSION); expect( (settings['general'] as Record)['enableAutoUpdate'], ).toBe(true); @@ -77,18 +77,18 @@ describe('Migration Framework Integration', () => { ).toBeUndefined(); }); - it('should not modify V3 settings', () => { - const v3Settings = { - $version: 3, + it('should not modify settings already at the current version', () => { + const current = { + $version: SETTINGS_VERSION, ui: { theme: 'dark' }, general: { enableAutoUpdate: true }, }; - const result = runMigrations(v3Settings, 'user'); + const result = runMigrations(current, 'user'); - expect(result.finalVersion).toBe(3); + expect(result.finalVersion).toBe(SETTINGS_VERSION); expect(result.executedMigrations).toHaveLength(0); - expect(result.settings).toEqual(v3Settings); + expect(result.settings).toEqual(current); }); it('should be idempotent', () => { @@ -100,7 +100,7 @@ describe('Migration Framework Integration', () => { const result1 = runMigrations(v1Settings, 'user'); const result2 = runMigrations(result1.settings, 'user'); - expect(result1.executedMigrations).toHaveLength(2); + expect(result1.executedMigrations).toHaveLength(SETTINGS_VERSION - 1); expect(result2.executedMigrations).toHaveLength(0); expect(result1.finalVersion).toBe(result2.finalVersion); }); @@ -135,13 +135,13 @@ describe('Migration Framework Integration', () => { expect(needsMigration(cleanV2Settings)).toBe(true); }); - it('should return false for V3 settings', () => { - const v3Settings = { - $version: 3, + it('should return false for settings already at the current version', () => { + const current = { + $version: SETTINGS_VERSION, general: { enableAutoUpdate: true }, }; - expect(needsMigration(v3Settings)).toBe(false); + expect(needsMigration(current)).toBe(false); }); it('should return false for legacy numeric version when no migration can execute', () => { @@ -156,13 +156,12 @@ describe('Migration Framework Integration', () => { describe('ALL_MIGRATIONS', () => { it('should contain all migrations in order', () => { - expect(ALL_MIGRATIONS).toHaveLength(2); + expect(ALL_MIGRATIONS).toHaveLength(SETTINGS_VERSION - 1); - expect(ALL_MIGRATIONS[0].fromVersion).toBe(1); - expect(ALL_MIGRATIONS[0].toVersion).toBe(2); - - expect(ALL_MIGRATIONS[1].fromVersion).toBe(2); - expect(ALL_MIGRATIONS[1].toVersion).toBe(3); + for (let i = 0; i < ALL_MIGRATIONS.length; i++) { + expect(ALL_MIGRATIONS[i].fromVersion).toBe(i + 1); + expect(ALL_MIGRATIONS[i].toVersion).toBe(i + 2); + } }); }); @@ -178,10 +177,10 @@ describe('Migration Framework Integration', () => { const result = scheduler.migrate(v1Settings); - expect(result.executedMigrations).toHaveLength(2); + expect(result.executedMigrations).toHaveLength(SETTINGS_VERSION - 1); const settings = result.settings as Record; - expect(settings['$version']).toBe(3); + expect(settings['$version']).toBe(SETTINGS_VERSION); expect((settings['ui'] as Record)['theme']).toBe('dark'); expect( (settings['general'] as Record)['enableAutoUpdate'], @@ -212,16 +211,16 @@ describe('Migration Framework Integration', () => { }); it('needsMigration should return false when runMigrations would execute no migrations', () => { - const v3Settings = { - $version: 3, + const current = { + $version: SETTINGS_VERSION, general: { enableAutoUpdate: true }, }; // needsMigration should report that no migration is needed - expect(needsMigration(v3Settings)).toBe(false); + expect(needsMigration(current)).toBe(false); // runMigrations should execute no migrations - const result = runMigrations(v3Settings, 'user'); + const result = runMigrations(current, 'user'); expect(result.executedMigrations).toHaveLength(0); }); @@ -234,10 +233,10 @@ describe('Migration Framework Integration', () => { // needsMigration should report that migration is needed expect(needsMigration(cleanV2Settings)).toBe(true); - // runMigrations should execute the V2->V3 migration + // runMigrations should execute migrations forward to the current version const result = runMigrations(cleanV2Settings, 'user'); expect(result.executedMigrations.length).toBeGreaterThan(0); - expect(result.finalVersion).toBe(3); + expect(result.finalVersion).toBe(SETTINGS_VERSION); }); }); @@ -364,14 +363,14 @@ describe('Migration Framework Integration', () => { it('should avoid repeated no-op migration loops', () => { // Settings that might cause repeated migrations - const v3Settings = { - $version: 3, + const current = { + $version: SETTINGS_VERSION, general: { enableAutoUpdate: true }, }; // First check - expect(needsMigration(v3Settings)).toBe(false); - const result1 = runMigrations(v3Settings, 'user'); + expect(needsMigration(current)).toBe(false); + const result1 = runMigrations(current, 'user'); expect(result1.executedMigrations).toHaveLength(0); // Second check should be consistent diff --git a/packages/cli/src/config/migration/index.ts b/packages/cli/src/config/migration/index.ts index 40d176cbe..e37b06d8d 100644 --- a/packages/cli/src/config/migration/index.ts +++ b/packages/cli/src/config/migration/index.ts @@ -13,6 +13,7 @@ export { MigrationScheduler } from './scheduler.js'; // Export migrations export { v1ToV2Migration, V1ToV2Migration } from './versions/v1-to-v2.js'; export { v2ToV3Migration, V2ToV3Migration } from './versions/v2-to-v3.js'; +export { v3ToV4Migration, V3ToV4Migration } from './versions/v3-to-v4.js'; // Import settings version from single source of truth import { SETTINGS_VERSION } from '../settings.js'; @@ -22,6 +23,7 @@ import { SETTINGS_VERSION } from '../settings.js'; // Order matters: migrations must be sorted by ascending version import { v1ToV2Migration } from './versions/v1-to-v2.js'; import { v2ToV3Migration } from './versions/v2-to-v3.js'; +import { v3ToV4Migration } from './versions/v3-to-v4.js'; import { MigrationScheduler } from './scheduler.js'; import type { MigrationResult } from './types.js'; @@ -35,7 +37,11 @@ import type { MigrationResult } from './types.js'; * const result = scheduler.migrate(settings); * ``` */ -export const ALL_MIGRATIONS = [v1ToV2Migration, v2ToV3Migration] as const; +export const ALL_MIGRATIONS = [ + v1ToV2Migration, + v2ToV3Migration, + v3ToV4Migration, +] as const; /** * Convenience function that runs all migrations on the given settings. diff --git a/packages/cli/src/config/migration/versions/v3-to-v4.test.ts b/packages/cli/src/config/migration/versions/v3-to-v4.test.ts new file mode 100644 index 000000000..0aad1d31e --- /dev/null +++ b/packages/cli/src/config/migration/versions/v3-to-v4.test.ts @@ -0,0 +1,242 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { V3ToV4Migration } from './v3-to-v4.js'; + +describe('V3ToV4Migration', () => { + const migration = new V3ToV4Migration(); + + describe('shouldMigrate', () => { + it('returns true for V3 settings', () => { + expect( + migration.shouldMigrate({ + $version: 3, + general: { gitCoAuthor: false }, + }), + ).toBe(true); + }); + + it('returns true for V3 settings without gitCoAuthor', () => { + // Even without the relevant key, the version must still bump. + expect(migration.shouldMigrate({ $version: 3 })).toBe(true); + }); + + it('returns false for V4 settings', () => { + expect( + migration.shouldMigrate({ + $version: 4, + general: { gitCoAuthor: { commit: true, pr: true } }, + }), + ).toBe(false); + }); + + it('returns false for non-object input', () => { + expect(migration.shouldMigrate(null)).toBe(false); + expect(migration.shouldMigrate('x')).toBe(false); + expect(migration.shouldMigrate(42)).toBe(false); + }); + + // `gitCoAuthor` post-dates the V1 indicator-key list, so a settings + // file that has ONLY this legacy boolean shape (no `$version`, + // no other migration-triggering keys) wouldn't fire any earlier + // migration. The v3→v4 step must catch it directly so the dialog + // doesn't silently overwrite the user's stored opt-out with the + // schema defaults on next save. + it('returns true for versionless settings with legacy boolean gitCoAuthor', () => { + expect( + migration.shouldMigrate({ + general: { gitCoAuthor: false }, + }), + ).toBe(true); + }); + + it('returns false for versionless settings without gitCoAuthor', () => { + expect(migration.shouldMigrate({ general: {} })).toBe(false); + expect(migration.shouldMigrate({})).toBe(false); + }); + + it('returns false for versionless settings with already-object gitCoAuthor', () => { + // User who hand-edited to the v4 shape — let the loader's + // version normalization handle it without rewriting. + expect( + migration.shouldMigrate({ + general: { gitCoAuthor: { commit: false, pr: true } }, + }), + ).toBe(false); + }); + + // Without the migration firing on invalid versionless values, the + // loader would stamp $version: 4 with `"off"` / `[]` / etc. left + // on disk, and runtime normalization would silently re-enable + // attribution. The migrate() body's drop-and-warn handles these + // — shouldMigrate has to fire so it gets a chance to run. + it.each([ + ['"off"', 'off'], + ['empty array', []], + ['number', 42], + ['null', null], + ])( + 'returns true for versionless settings with invalid gitCoAuthor (%s)', + (_label, value) => { + expect( + migration.shouldMigrate({ + general: { gitCoAuthor: value }, + }), + ).toBe(true); + }, + ); + }); + + describe('migrate', () => { + it('expands legacy boolean true into { commit: true, pr: true }', () => { + const input = { $version: 3, general: { gitCoAuthor: true } }; + const { settings, warnings } = migration.migrate(input, 'user') as { + settings: Record; + warnings: string[]; + }; + + expect( + (settings['general'] as Record)['gitCoAuthor'], + ).toEqual({ commit: true, pr: true }); + expect(settings['$version']).toBe(4); + expect(warnings).toEqual([]); + }); + + it('expands legacy boolean false into { commit: false, pr: false }', () => { + const input = { $version: 3, general: { gitCoAuthor: false } }; + const { settings } = migration.migrate(input, 'user') as { + settings: Record; + warnings: string[]; + }; + + expect( + (settings['general'] as Record)['gitCoAuthor'], + ).toEqual({ commit: false, pr: false }); + }); + + it('leaves an already-object value untouched', () => { + const input = { + $version: 3, + general: { gitCoAuthor: { commit: false, pr: true } }, + }; + const { settings, warnings } = migration.migrate(input, 'user') as { + settings: Record; + warnings: string[]; + }; + + expect( + (settings['general'] as Record)['gitCoAuthor'], + ).toEqual({ commit: false, pr: true }); + expect(warnings).toEqual([]); + }); + + it('bumps version when gitCoAuthor is absent', () => { + const input = { $version: 3, general: {} }; + const { settings, warnings } = migration.migrate(input, 'user') as { + settings: Record; + warnings: string[]; + }; + + expect(settings['$version']).toBe(4); + expect( + (settings['general'] as Record)['gitCoAuthor'], + ).toBeUndefined(); + expect(warnings).toEqual([]); + }); + + // String enable-intent forms map to {commit: true, pr: true}; + // disable-intent forms map to {commit: false, pr: false}; an + // unrecognised string also defaults to disabled (safer-by-default + // — same contract as the runtime `pickBool`) but emits a warning. + it.each([ + ['"true"', 'true', { commit: true, pr: true }, false], + ['"yes"', 'yes', { commit: true, pr: true }, false], + ['"on"', 'on', { commit: true, pr: true }, false], + ['"1"', '1', { commit: true, pr: true }, false], + ['"false"', 'false', { commit: false, pr: false }, false], + ['"no"', 'no', { commit: false, pr: false }, false], + ['"off"', 'off', { commit: false, pr: false }, false], + ['"0"', '0', { commit: false, pr: false }, false], + ['empty string', '', { commit: false, pr: false }, false], + ['"OFF" (case)', 'OFF', { commit: false, pr: false }, false], + ['unknown string', 'maybe', { commit: false, pr: false }, true], + ])( + 'maps string %s to %j (warn=%s)', + (_label, str, expected, expectWarn) => { + const input = { $version: 3, general: { gitCoAuthor: str } }; + const { settings, warnings } = migration.migrate(input, 'user') as { + settings: Record; + warnings: string[]; + }; + expect( + (settings['general'] as Record)['gitCoAuthor'], + ).toEqual(expected); + if (expectWarn) { + expect(warnings).toHaveLength(1); + expect(warnings[0]).toContain('gitCoAuthor'); + } else { + expect(warnings).toHaveLength(0); + } + }, + ); + + // Non-string invalid values (null/array/number) get the + // safer-by-default disabled state with a warning. + it.each([ + ['null', null], + ['array', []], + ['number', 42], + ])( + 'treats %s as invalid and resets to disabled with a warning', + (_label, bad) => { + const input = { $version: 3, general: { gitCoAuthor: bad } }; + const { settings, warnings } = migration.migrate(input, 'user') as { + settings: Record; + warnings: string[]; + }; + + expect( + (settings['general'] as Record)['gitCoAuthor'], + ).toEqual({ commit: false, pr: false }); + expect(warnings).toHaveLength(1); + }, + ); + + it('leaves a partially-specified object unchanged', () => { + // Downstream normalizeGitCoAuthor fills missing sub-keys with defaults; + // the migration only reshapes, it does not paternalistically fill defaults. + const input = { + $version: 3, + general: { gitCoAuthor: { commit: false } }, + }; + const { settings, warnings } = migration.migrate(input, 'user') as { + settings: Record; + warnings: string[]; + }; + + expect( + (settings['general'] as Record)['gitCoAuthor'], + ).toEqual({ commit: false }); + expect(warnings).toEqual([]); + }); + + it('does not mutate the input settings object', () => { + const input = { $version: 3, general: { gitCoAuthor: false } }; + migration.migrate(input, 'user'); + + expect(input).toEqual({ + $version: 3, + general: { gitCoAuthor: false }, + }); + }); + + it('throws for non-object input', () => { + expect(() => migration.migrate(null, 'user')).toThrow(); + expect(() => migration.migrate('string', 'user')).toThrow(); + }); + }); +}); diff --git a/packages/cli/src/config/migration/versions/v3-to-v4.ts b/packages/cli/src/config/migration/versions/v3-to-v4.ts new file mode 100644 index 000000000..786cb1739 --- /dev/null +++ b/packages/cli/src/config/migration/versions/v3-to-v4.ts @@ -0,0 +1,146 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { SettingsMigration } from '../types.js'; +import { + getNestedProperty, + setNestedPropertySafe, +} from '../../../utils/settingsUtils.js'; + +const GIT_CO_AUTHOR_PATH = 'general.gitCoAuthor'; + +/** + * V3 -> V4 migration (gitCoAuthor boolean → object expansion). + * + * Before V4, `general.gitCoAuthor` was a single boolean that governed both + * commit message attribution and PR body attribution. V4 splits those into + * two independent sub-toggles so users can disable one without losing the + * other. This migration rewrites any stored boolean into `{ commit: v, + * pr: v }` so the user's prior choice carries over to both new toggles and + * the settings dialog reads the expected object shape. + * + * Compatibility strategy: + * - Boolean values are expanded in place. + * - Object values with `commit`/`pr` keys are left untouched (forward- + * compatible — a user who edited their settings.json by hand to the new + * shape is already on V4-equivalent data). + * - Any other present value (string, number, array, null) is dropped with + * a warning so the caller sees an actionable message. + */ +export class V3ToV4Migration implements SettingsMigration { + readonly fromVersion = 3; + readonly toVersion = 4; + + shouldMigrate(settings: unknown): boolean { + if (typeof settings !== 'object' || settings === null) { + return false; + } + const s = settings as Record; + if (s['$version'] === 3) { + return true; + } + // Versionless settings file (no $version key): the V1/V2 migrations + // don't list `gitCoAuthor` as an indicator key (it post-dates them), + // so a settings file with ONLY this shape wouldn't trigger any + // earlier migration. Catch it here so: + // - legacy boolean (`gitCoAuthor: false`) gets expanded to + // `{commit: false, pr: false}` instead of being silently + // overwritten by the dialog's schema defaults on first save; + // - invalid shapes (`gitCoAuthor: "off"`, `gitCoAuthor: []`, + // etc.) get reset by the migrate() body's drop-and-warn path + // so runtime normalization doesn't quietly re-enable + // attribution against the user's intent. + if (s['$version'] === undefined) { + const value = getNestedProperty(s, GIT_CO_AUTHOR_PATH); + if (value === undefined) return false; + // Already in the v4 shape — leave the loader to stamp $version: 4. + if ( + typeof value === 'object' && + value !== null && + !Array.isArray(value) + ) { + return false; + } + // Anything else (boolean, string, number, array, null) needs + // rewriting via migrate(). + return true; + } + return false; + } + + migrate( + settings: unknown, + scope: string, + ): { settings: unknown; warnings: string[] } { + if (typeof settings !== 'object' || settings === null) { + throw new Error('Settings must be an object'); + } + + const result = structuredClone(settings) as Record; + const warnings: string[] = []; + + const value = getNestedProperty(result, GIT_CO_AUTHOR_PATH); + + if (typeof value === 'boolean') { + // Legacy shape — rewrite as { commit, pr } preserving the prior choice. + setNestedPropertySafe(result, GIT_CO_AUTHOR_PATH, { + commit: value, + pr: value, + }); + } else if (typeof value === 'string') { + // String forms: a user who hand-edited `"gitCoAuthor": "off"` (or + // similar) to disable the feature must NOT see attribution + // silently re-enable just because we couldn't parse the literal + // shape. Map disable-intent strings to `{commit: false, pr: false}`, + // enable-intent strings to `{commit: true, pr: true}`, and + // anything else to disabled with a warning (safer-by-default + // than enabling against an ambiguous opt-out). + const lowered = value.trim().toLowerCase(); + const disableIntent = ['false', 'no', 'off', '0', 'disabled', '']; + const enableIntent = ['true', 'yes', 'on', '1', 'enabled']; + if (enableIntent.includes(lowered)) { + setNestedPropertySafe(result, GIT_CO_AUTHOR_PATH, { + commit: true, + pr: true, + }); + } else if (disableIntent.includes(lowered)) { + setNestedPropertySafe(result, GIT_CO_AUTHOR_PATH, { + commit: false, + pr: false, + }); + } else { + setNestedPropertySafe(result, GIT_CO_AUTHOR_PATH, { + commit: false, + pr: false, + }); + warnings.push( + `Reset '${GIT_CO_AUTHOR_PATH}' in ${scope} settings to {commit: false, pr: false} because the stored string '${value}' was not a recognized boolean form.`, + ); + } + } else if ( + value !== undefined && + (typeof value !== 'object' || value === null || Array.isArray(value)) + ) { + // Invalid non-string shape (number, array, null). Drop and + // disable rather than re-enable on ambiguity — same + // safer-by-default contract as `pickBool` at runtime. + setNestedPropertySafe(result, GIT_CO_AUTHOR_PATH, { + commit: false, + pr: false, + }); + warnings.push( + `Reset '${GIT_CO_AUTHOR_PATH}' in ${scope} settings to {commit: false, pr: false} because the stored value was not a boolean or object.`, + ); + } + // Object values (including the new shape) pass through unchanged. + + result['$version'] = 4; + + return { settings: result, warnings }; + } +} + +export const v3ToV4Migration = new V3ToV4Migration(); diff --git a/packages/cli/src/config/settings.ts b/packages/cli/src/config/settings.ts index 38e86bc9c..b5a774a41 100644 --- a/packages/cli/src/config/settings.ts +++ b/packages/cli/src/config/settings.ts @@ -65,7 +65,7 @@ export const USER_SETTINGS_DIR = path.dirname(USER_SETTINGS_PATH); export const DEFAULT_EXCLUDED_ENV_VARS = ['DEBUG', 'DEBUG_MODE']; // Settings version to track migration state -export const SETTINGS_VERSION = 3; +export const SETTINGS_VERSION = 4; export const SETTINGS_VERSION_KEY = '$version'; /** diff --git a/packages/cli/src/config/settingsSchema.ts b/packages/cli/src/config/settingsSchema.ts index c81e7ec37..4604688a9 100644 --- a/packages/cli/src/config/settingsSchema.ts +++ b/packages/cli/src/config/settingsSchema.ts @@ -78,6 +78,27 @@ export interface SettingDefinition { options?: readonly SettingEnumOption[]; /** Schema for array items when type is 'array' */ items?: SettingItemDefinition; + /** + * Primitive shapes a field accepted before it was expanded to its current + * type. The exported JSON Schema wraps the field in `anyOf` so values from + * those older shapes don't trip the IDE validator while the runtime + * migration is still pending. Has no runtime effect — it's purely a + * compatibility hint for editors. + * + * Narrowed to the subset our generator can faithfully emit as a + * one-liner `{ type: }` schema fragment. `'enum'` is + * not a valid JSON Schema `type` value at all (enum constraints + * use the `enum` keyword, not `type: 'enum'`), so allowing it here + * would silently produce an invalid `settings.schema.json`. + * `'object'` IS a valid JSON Schema type, but a bare + * `{ type: 'object' }` legacy entry would accept ANY object value + * — most likely not what the field's pre-expansion shape actually + * permitted. Future legacy shapes that need `enum` / structured- + * object compatibility should land their own branch in + * `convertSettingToJsonSchema` (with proper `enum:` / `properties:` + * companions) instead of widening this set. + */ + legacyTypes?: ReadonlyArray<'boolean' | 'string' | 'number' | 'array'>; /** * Escape hatch for the JSON Schema generator: when set, this object is * emitted verbatim under the setting's properties entry instead of the @@ -387,14 +408,47 @@ const SETTINGS_SCHEMA = { showInDialog: true, }, gitCoAuthor: { - type: 'boolean', - label: 'Attribution: commit', + type: 'object', + label: 'Attribution', category: 'General', requiresRestart: false, - default: true, + // Match `normalizeGitCoAuthor`'s runtime defaults so the IDE + // schema publishes the same "enabled by default" hint users see + // at runtime. The empty-object form here would silently lose + // editor-surfaced defaults. + default: { commit: true, pr: true }, description: - 'Automatically add a Co-authored-by trailer to git commit messages when commits are made through Qwen Code.', - showInDialog: true, + 'Attribution added to git commits and pull requests created through Qwen Code.', + showInDialog: false, + // Pre-V4 settings stored this as a single boolean. The V3→V4 + // migration rewrites those on first launch, but the IDE schema + // validator runs before that — accept the boolean shape so users + // editing settings.json in VS Code don't see a spurious warning + // until they run qwen once. Config.normalizeGitCoAuthor handles + // the boolean at runtime. + legacyTypes: ['boolean'], + properties: { + commit: { + type: 'boolean', + label: 'Attribution: commit', + category: 'General', + requiresRestart: false, + default: true, + description: + 'Add a Co-authored-by trailer to git commit messages AND attach a per-file AI-attribution git note (`refs/notes/ai-attribution`) for commits made through Qwen Code. Disabling skips both.', + showInDialog: true, + }, + pr: { + type: 'boolean', + label: 'Attribution: PR', + category: 'General', + requiresRestart: false, + default: true, + description: + 'Append a Qwen Code attribution line to PR descriptions when running `gh pr create`.', + showInDialog: true, + }, + }, }, checkpointing: { type: 'object', diff --git a/packages/cli/src/utils/settingsUtils.ts b/packages/cli/src/utils/settingsUtils.ts index 0effeb738..f36b26bb2 100644 --- a/packages/cli/src/utils/settingsUtils.ts +++ b/packages/cli/src/utils/settingsUtils.ts @@ -295,7 +295,8 @@ const SETTINGS_DIALOG_ORDER: readonly string[] = [ 'ui.enableWelcomeBack', // Git Behavior - 'general.gitCoAuthor', + 'general.gitCoAuthor.commit', + 'general.gitCoAuthor.pr', // File Filtering 'context.fileFiltering.respectGitIgnore', diff --git a/packages/core/src/config/config.test.ts b/packages/core/src/config/config.test.ts index 07beb5788..c1d9ef2a9 100644 --- a/packages/core/src/config/config.test.ts +++ b/packages/core/src/config/config.test.ts @@ -1030,6 +1030,95 @@ describe('Server Config (config.ts)', () => { }); }); + describe('GitCoAuthor Settings', () => { + it('defaults both commit and pr to true when not specified', () => { + const config = new Config({ ...baseParams, gitCoAuthor: undefined }); + const settings = config.getGitCoAuthor(); + expect(settings.commit).toBe(true); + expect(settings.pr).toBe(true); + }); + + it('accepts an object with independent commit and pr toggles', () => { + const config = new Config({ + ...baseParams, + gitCoAuthor: { commit: true, pr: false }, + }); + const settings = config.getGitCoAuthor(); + expect(settings.commit).toBe(true); + expect(settings.pr).toBe(false); + }); + + // Legacy shape: before commit and PR attribution were split, this + // setting was a single boolean. Treat it as governing both toggles so + // existing users' preferences carry over. + it.each([true, false])( + 'coerces legacy boolean %s to { commit, pr } with the same value', + (value) => { + const config = new Config({ ...baseParams, gitCoAuthor: value }); + const settings = config.getGitCoAuthor(); + expect(settings.commit).toBe(value); + expect(settings.pr).toBe(value); + }, + ); + + // settings.json is hand-editable; without intent-aware string + // parsing a hand-edited `{ commit: "false" }` would silently + // inflate to `commit: true` (the previous "default-to-true on + // mismatch" policy). Honor common string disable-intent forms + // and fall through to disabled on genuinely unrecognisable + // input — safer-by-default than turning attribution on against + // the user's clear opt-out. + it.each([ + // Disable-intent strings. + ['string "false"', 'false', false], + ['string "FALSE"', 'FALSE', false], + ['string "no"', 'no', false], + ['string "off"', 'off', false], + ['string "0"', '0', false], + ['empty string', '', false], + // Enable-intent strings. + ['string "true"', 'true', true], + ['string "yes"', 'yes', true], + ['string "on"', 'on', true], + ['string "1"', '1', true], + // Numbers. + ['number 1', 1, true], + ['number 0', 0, false], + ['number 42', 42, false], + // Other types fall through to disabled. + ['null', null, false], + ['object', {}, false], + ['array', [], false], + // Unknown strings → disabled (don't quietly enable). + ['unknown string', 'maybe', false], + ])( + 'parses %s as %s for both commit and pr', + (_label, badValue, expected) => { + const config = new Config({ + ...baseParams, + gitCoAuthor: { + commit: badValue as unknown as boolean, + pr: badValue as unknown as boolean, + }, + }); + const settings = config.getGitCoAuthor(); + expect(settings.commit).toBe(expected); + expect(settings.pr).toBe(expected); + }, + ); + + // A genuinely-absent sub-field still defaults to true (schema default). + it('defaults absent commit/pr to true', () => { + const config = new Config({ + ...baseParams, + gitCoAuthor: {} as { commit?: boolean; pr?: boolean }, + }); + const settings = config.getGitCoAuthor(); + expect(settings.commit).toBe(true); + expect(settings.pr).toBe(true); + }); + }); + describe('Telemetry Settings', () => { it('should return default telemetry target if not provided', () => { const params: ConfigParameters = { diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts index d25b00e2e..f635ea1b8 100644 --- a/packages/core/src/config/config.ts +++ b/packages/core/src/config/config.ts @@ -130,6 +130,9 @@ import { import { getAutoMemoryRoot } from '../memory/paths.js'; import { readAutoMemoryIndex } from '../memory/store.js'; import { MemoryManager } from '../memory/manager.js'; +import { CommitAttributionService } from '../services/commitAttribution.js'; + +const gitCoAuthorLogger = createDebugLogger('GIT_CO_AUTHOR'); import { ModelsConfig, @@ -237,11 +240,72 @@ export interface OutputSettings { } export interface GitCoAuthorSettings { - enabled?: boolean; + commit: boolean; + pr: boolean; name?: string; email?: string; } +/** + * Shape accepted by the Config constructor for the `gitCoAuthor` param. + * + * A plain `boolean` is accepted for backward compatibility: older settings + * (shipped before commit and PR attribution were split) stored this field as + * a single boolean, and we treat that as applying to both sub-toggles so + * nobody's stored preference silently flips. + */ +export type GitCoAuthorParam = boolean | { commit?: boolean; pr?: boolean }; + +function normalizeGitCoAuthor(value: GitCoAuthorParam | undefined): { + commit: boolean; + pr: boolean; +} { + if (typeof value === 'boolean') { + return { commit: value, pr: value }; + } + // Default to `true` (the schema default) ONLY when the sub-field + // is genuinely absent. For PRESENT-but-non-boolean values, honor + // common string forms (`"true"`/`"yes"`/`"on"`/`"1"` → true, + // `"false"`/`"no"`/`"off"`/`"0"`/`""` → false) and treat anything + // else as opt-out. settings.json is user-editable, and the previous + // "default-to-true on mismatch" policy meant a hand-edited + // `{ "commit": "false" }` silently activated attribution against + // the user's clear intent. Safer-by-default: ambiguous values + // disable rather than enable. + const pickBool = (v: unknown, fieldName: string): boolean => { + if (v === undefined) return true; + if (typeof v === 'boolean') return v; + if (typeof v === 'string') { + const lowered = v.trim().toLowerCase(); + if ( + lowered === 'true' || + lowered === 'yes' || + lowered === 'on' || + lowered === '1' + ) { + return true; + } + // Known disable-intent forms — silent (matches user intent). + const knownDisable = ['false', 'no', 'off', '0', 'disabled', '']; + if (!knownDisable.includes(lowered)) { + // Unrecognised string — disable (safer-by-default) but log + // so a user wondering "why is my setting being ignored?" + // can see the actual coercion in QWEN_DEBUG_LOG_FILE. + gitCoAuthorLogger.warn( + `Unrecognized string value for general.gitCoAuthor.${fieldName}: ${JSON.stringify(v)}; treating as false. Accepted forms: true/yes/on/1, false/no/off/0/empty.`, + ); + } + return false; + } + if (typeof v === 'number') return v === 1; + return false; + }; + return { + commit: pickBool(value?.commit, 'commit'), + pr: pickBool(value?.pr, 'pr'), + }; +} + export type ExtensionOriginSource = 'QwenCode' | 'Claude' | 'Gemini'; export interface ExtensionInstallMetadata { @@ -375,7 +439,7 @@ export interface ConfigParameters { contextFileName?: string | string[]; accessibility?: AccessibilitySettings; telemetry?: TelemetrySettings; - gitCoAuthor?: boolean; + gitCoAuthor?: GitCoAuthorParam; usageStatisticsEnabled?: boolean; /** * If true, disables the per-session FileReadCache short-circuit @@ -781,7 +845,7 @@ export class Config { useCollector: params.telemetry?.useCollector, }; this.gitCoAuthor = { - enabled: params.gitCoAuthor ?? true, + ...normalizeGitCoAuthor(params.gitCoAuthor), name: 'Qwen-Coder', email: 'qwen-coder@alibabacloud.com', }; @@ -1379,6 +1443,12 @@ export class Config { // constructed via Object.create — those should clear their own // cache, not the parent's. this.getFileReadCache().clear(); + // The commit-attribution singleton accumulates per-file AI edits + // and a session-scoped prompt counter — both stop being meaningful + // when the session resets. Without this, pending attributions + // from the previous session could attach to a commit in the new + // one, and the "N-shotted" PR label would span sessions. + CommitAttributionService.resetInstance(); if (this.initialized) { logStartSession(this, new StartSessionEvent(this)); } diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts index 47a8c9a87..b4bc187cc 100644 --- a/packages/core/src/core/client.test.ts +++ b/packages/core/src/core/client.test.ts @@ -2728,6 +2728,64 @@ Other open files: expect(mockMessageBus.request).toHaveBeenCalled(); }); }); + + describe('attribution snapshot persistence', () => { + let recordAttributionSnapshot: ReturnType; + + beforeEach(() => { + recordAttributionSnapshot = vi.fn(); + vi.mocked(mockConfig.getChatRecordingService).mockReturnValue({ + recordAttributionSnapshot, + recordUserMessage: vi.fn(), + recordCronPrompt: vi.fn(), + } as unknown as ReturnType); + + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { type: 'content', value: 'ok' }; + })(), + ); + }); + + it('records a snapshot on ToolResult turns so post-tool state is captured', async () => { + const stream = client.sendMessageStream( + [{ text: 'tool-result' }], + new AbortController().signal, + 'prompt-tr', + { type: SendMessageType.ToolResult }, + ); + for await (const _ of stream) { + /* consume */ + } + expect(recordAttributionSnapshot).toHaveBeenCalled(); + }); + + it('records a snapshot on UserQuery turns', async () => { + const stream = client.sendMessageStream( + [{ text: 'user' }], + new AbortController().signal, + 'prompt-uq', + { type: SendMessageType.UserQuery }, + ); + for await (const _ of stream) { + /* consume */ + } + expect(recordAttributionSnapshot).toHaveBeenCalled(); + }); + + it('does not record a snapshot on Retry turns', async () => { + const stream = client.sendMessageStream( + [{ text: 'retry' }], + new AbortController().signal, + 'prompt-retry-snap', + { type: SendMessageType.Retry }, + ); + for await (const _ of stream) { + /* consume */ + } + expect(recordAttributionSnapshot).not.toHaveBeenCalled(); + }); + }); }); describe('generateContent', () => { diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts index 5f6156f2e..902d29664 100644 --- a/packages/core/src/core/client.ts +++ b/packages/core/src/core/client.ts @@ -46,6 +46,7 @@ import { COMPRESSION_TOKEN_THRESHOLD, } from '../services/chatCompressionService.js'; import { LoopDetectionService } from '../services/loopDetectionService.js'; +import { CommitAttributionService } from '../services/commitAttribution.js'; // Models import { buildAgentContentGeneratorConfig } from '../models/content-generator-config.js'; @@ -220,11 +221,44 @@ export class GeminiClient { this.getChat().setLastPromptTokenCount( uiTelemetryService.getLastPromptTokenCount(), ); + + // Restore attribution state from the last snapshot in the session + this.restoreAttributionFromSession(resumedSessionData.conversation); } else { await this.startChat(); } } + /** + * Restore attribution state from the last snapshot in a resumed session. + */ + private restoreAttributionFromSession(conversation: { + messages: Array<{ subtype?: string; systemPayload?: unknown }>; + }): void { + // Find the last attribution snapshot in the session + let lastSnapshot: unknown = null; + for (const msg of conversation.messages) { + if ( + msg.subtype === 'attribution_snapshot' && + msg.systemPayload && + typeof msg.systemPayload === 'object' && + 'snapshot' in msg.systemPayload + ) { + lastSnapshot = (msg.systemPayload as { snapshot: unknown }).snapshot; + } + } + if (lastSnapshot && typeof lastSnapshot === 'object') { + try { + CommitAttributionService.getInstance().restoreFromSnapshot( + lastSnapshot as import('../services/commitAttribution.js').AttributionSnapshot, + ); + debugLogger.debug('Restored attribution state from session snapshot'); + } catch { + debugLogger.warn('Failed to restore attribution snapshot'); + } + } + } + private getContentGeneratorOrFail(): ContentGenerator { if (!this.config.getContentGenerator()) { throw new Error('Content generator not initialized'); @@ -782,6 +816,18 @@ export class GeminiClient { ); } + // Track prompt count for commit attribution. Only the user typing a + // fresh prompt should bump the counter — `ToolResult` (tool-call + // continuation), `Retry`, `Hook`, `Cron`, and `Notification` are all + // model-driven or background-driven re-entries of the same logical + // turn. Counting them inflates the "N-shotted" label in the PR + // attribution trailer (one user message becomes "10-shotted" when it + // triggered ten tool calls). + const attributionService = CommitAttributionService.getInstance(); + if (messageType === SendMessageType.UserQuery) { + attributionService.incrementPromptCount(); + } + // record user/cron message for session management if (messageType === SendMessageType.Cron) { this.config @@ -820,7 +866,18 @@ export class GeminiClient { ); } } + if (messageType !== SendMessageType.Retry) { + // Snapshot on every non-retry turn. ToolResult turns run right after + // tool execution, so their snapshot captures edits that a prior + // UserQuery turn scheduled. Without this, a resumed session only sees + // the UserQuery-time snapshot (empty) and loses tool-driven edits. + this.config + .getChatRecordingService() + ?.recordAttributionSnapshot( + CommitAttributionService.getInstance().toSnapshot(), + ); + this.sessionTurnCount++; if ( diff --git a/packages/core/src/services/attributionTrailer.test.ts b/packages/core/src/services/attributionTrailer.test.ts new file mode 100644 index 000000000..4e02af600 --- /dev/null +++ b/packages/core/src/services/attributionTrailer.test.ts @@ -0,0 +1,100 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { buildGitNotesCommand } from './attributionTrailer.js'; +import type { CommitAttributionNote } from './commitAttribution.js'; + +const sampleNote: CommitAttributionNote = { + version: 1, + generator: 'Qwen-Coder', + files: { + 'src/main.ts': { aiChars: 150, humanChars: 50, percent: 75 }, + 'src/utils.ts': { aiChars: 0, humanChars: 200, percent: 0 }, + }, + summary: { + aiPercent: 38, + aiChars: 150, + humanChars: 250, + totalFilesTouched: 2, + surfaces: ['cli'], + }, + surfaceBreakdown: { cli: { aiChars: 150, percent: 38 } }, + excludedGenerated: ['package-lock.json'], + excludedGeneratedCount: 1, + promptCount: 3, +}; + +describe('attributionTrailer', () => { + describe('buildGitNotesCommand', () => { + const TARGET_SHA = 'abc1234567890abcdef1234567890abcdef12345'; + + it('should build a valid git notes invocation', () => { + const cmd = buildGitNotesCommand(sampleNote, TARGET_SHA); + expect(cmd).not.toBeNull(); + expect(cmd!.command).toBe('git'); + expect(cmd!.args.slice(0, 6)).toEqual([ + 'notes', + '--ref=refs/notes/ai-attribution', + 'add', + '-f', + '-m', + // index 5 is the JSON note payload, asserted below + cmd!.args[5], + ]); + // Note must target the captured SHA, not the symbolic `HEAD` — + // otherwise a post-commit hook or chained command can move HEAD + // between capture and exec, and `-f` lands the note on the + // wrong commit. + expect(cmd!.args.at(-1)).toBe(TARGET_SHA); + }); + + it('should pass the JSON note as a single argv entry (no shell quoting)', () => { + // The `-f` flag is at args[3]; the note JSON sits at args[5] between + // `-m` and the target commit. Returning argv (rather than a + // shell-quoted command string) keeps the payload off the shell + // parser entirely so quotes, command substitution, and + // platform-specific escaping cannot break it on cmd.exe / PowerShell. + const cmd = buildGitNotesCommand(sampleNote, TARGET_SHA)!; + const noteArg = cmd.args[5]!; + const parsed = JSON.parse(noteArg); + expect(parsed.version).toBe(1); + expect(parsed.summary.aiPercent).toBe(38); + expect(parsed.files['src/main.ts'].percent).toBe(75); + }); + + it('should return null when note exceeds size limit', () => { + const hugeNote: CommitAttributionNote = { + ...sampleNote, + files: {}, + excludedGenerated: [], + excludedGeneratedCount: 0, + }; + for (let i = 0; i < 2000; i++) { + hugeNote.files[ + `src/very/long/path/to/some/deeply/nested/file_${i}.ts` + ] = { aiChars: 999999, humanChars: 999999, percent: 50 }; + } + expect(buildGitNotesCommand(hugeNote, TARGET_SHA)).toBeNull(); + }); + + it('should leave single quotes literal in the argv payload', () => { + // The previous string-based command needed bash-style quote escaping. + // With argv, the apostrophe stays literal — the executor passes it + // through to git unmolested. + const noteWithQuotes: CommitAttributionNote = { + ...sampleNote, + files: { + "it's-a-file.ts": { aiChars: 10, humanChars: 5, percent: 67 }, + }, + }; + const cmd = buildGitNotesCommand(noteWithQuotes, TARGET_SHA); + expect(cmd).not.toBeNull(); + const parsed = JSON.parse(cmd!.args[5]!); + expect(parsed.files["it's-a-file.ts"].percent).toBe(67); + }); + }); +}); diff --git a/packages/core/src/services/attributionTrailer.ts b/packages/core/src/services/attributionTrailer.ts new file mode 100644 index 000000000..fe17330af --- /dev/null +++ b/packages/core/src/services/attributionTrailer.ts @@ -0,0 +1,80 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Attribution Trailer Utility + * + * Generates git notes commands for storing per-file AI attribution metadata + * on commits. This keeps the commit message clean (only Co-Authored-By trailer) + * while storing detailed contribution data in git notes. + */ + +import type { CommitAttributionNote } from './commitAttribution.js'; + +const GIT_NOTES_REF = 'refs/notes/ai-attribution'; + +/** + * Maximum byte length for the JSON note. Sized for the most + * restrictive ARG_MAX in the wild: Windows' `CreateProcess` + * lpCommandLine is capped around 32,768 UTF-16 chars including the + * git executable path, the other argv entries, and separators, so + * the note itself has to fit in that minus a safety margin (~2 KB) + * for everything else. Linux/macOS ARG_MAX is much larger; sizing + * for Windows just means we cap earlier on those platforms — the + * note is meant to be small metadata, not a payload, so the limit + * is rarely the binding constraint. + */ +const MAX_NOTE_BYTES = 30 * 1024; // 30 KB + +/** + * argv-form git notes invocation, designed for `child_process.execFile`. + * + * We return argv rather than a shell-quoted command string because the JSON + * note travels as a separate argv entry — no shell quoting is needed and no + * shell metacharacters can be re-evaluated. This matters most on Windows + * where bash-style single-quote escaping (`'\''`) is invalid and would + * corrupt the note (or, worse, allow interpolation under PowerShell/cmd). + */ +export interface GitNotesCommand { + command: string; + args: string[]; +} + +/** + * Build the git notes add invocation to attach attribution metadata to a + * specific commit. `targetCommit` MUST be the SHA the caller captured + * after detecting the commit's HEAD movement — passing the symbolic + * `'HEAD'` opens a TOCTOU window where a post-commit hook, a chained + * `git commit && git tag -m ...`, or a parallel process can advance + * HEAD between capture and exec, and `-f` would silently overwrite the + * note on the wrong commit. + * + * Caller should pass the result to a process-spawning API + * (`child_process.execFile`) along with a `cwd` option. + * + * Returns null if the serialized note exceeds MAX_NOTE_BYTES. + */ +export function buildGitNotesCommand( + note: CommitAttributionNote, + targetCommit: string, +): GitNotesCommand | null { + const noteJson = JSON.stringify(note); + if (Buffer.byteLength(noteJson, 'utf-8') > MAX_NOTE_BYTES) { + return null; + } + return { + command: 'git', + args: [ + 'notes', + `--ref=${GIT_NOTES_REF}`, + 'add', + '-f', + '-m', + noteJson, + targetCommit, + ], + }; +} diff --git a/packages/core/src/services/chatRecordingService.test.ts b/packages/core/src/services/chatRecordingService.test.ts index 2bbc6143e..ef31daceb 100644 --- a/packages/core/src/services/chatRecordingService.test.ts +++ b/packages/core/src/services/chatRecordingService.test.ts @@ -594,6 +594,138 @@ describe('ChatRecordingService', () => { }); }); + describe('recordAttributionSnapshot', () => { + const baseSnapshot = { + type: 'attribution-snapshot' as const, + version: 1, + surface: 'cli', + fileStates: {}, + promptCount: 0, + promptCountAtLastCommit: 0, + }; + + it('should write each distinct snapshot', async () => { + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + chatRecordingService.recordAttributionSnapshot({ + ...baseSnapshot, + promptCount: 1, + }); + chatRecordingService.recordAttributionSnapshot({ + ...baseSnapshot, + promptCount: 2, + }); + await chatRecordingService.flush(); + expect(jsonl.writeLine).toHaveBeenCalledTimes(3); + }); + + // Sessions that touch many files emit a non-retry turn snapshot + // every prompt cycle. Without dedup, repeated identical snapshots + // (no edits, no prompt-counter change) would re-serialize the entire + // attribution state into the JSONL on every turn, inflating session + // size and slowing /resume. + it('should skip a snapshot identical to the previous write', async () => { + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + await chatRecordingService.flush(); + expect(jsonl.writeLine).toHaveBeenCalledTimes(1); + }); + + // After rewindRecording, the previous attribution snapshot lives on + // the abandoned branch, so the dedup key has to clear — otherwise + // the post-rewind identical snapshot would be silently skipped and + // /resume on the rewound session would lose all attribution state. + it('should re-write an identical snapshot after rewindRecording', async () => { + chatRecordingService.recordUserMessage([{ text: 'turn 1' }]); + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + await chatRecordingService.flush(); + const beforeRewind = vi.mocked(jsonl.writeLine).mock.calls.length; + + chatRecordingService.rewindRecording(0, { truncatedCount: 0 }); + // Same snapshot bytes — without the rewind reset this would dedup. + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + await chatRecordingService.flush(); + // 1 rewind record + 1 fresh snapshot = 2 more writes after rewind. + expect(vi.mocked(jsonl.writeLine).mock.calls.length).toBe( + beforeRewind + 2, + ); + }); + + // A transient write failure must NOT permanently suppress future + // identical snapshots: if the dedup key were committed before the + // write, the next identical snapshot would dedup and the session + // would have no attribution snapshot at all. + it('should retry an identical snapshot after a write failure', async () => { + vi.mocked(jsonl.writeLine).mockRejectedValueOnce(new Error('disk full')); + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + // Wait for the queued (failing) write to settle so the rollback runs. + await chatRecordingService.flush(); + const afterFailure = vi.mocked(jsonl.writeLine).mock.calls.length; + + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + await chatRecordingService.flush(); + // Retry should fire, so we get a new write call. + expect(vi.mocked(jsonl.writeLine).mock.calls.length).toBe( + afterFailure + 1, + ); + }); + + // appendRecord is fire-and-forget for non-snapshot callers + // (recordUserMessage / recordAssistantTurn / recordAtCommand / + // ...). When jsonl.writeLine rejects, the rejection MUST be + // swallowed inside the service — otherwise it surfaces as an + // unhandled-promise-rejection in production (and as a flaky + // failure under vitest's --reporter=default). + it('should swallow async writeLine rejection for fire-and-forget callers', async () => { + vi.mocked(jsonl.writeLine).mockRejectedValueOnce(new Error('disk full')); + // Track unhandled rejections during this test. + const unhandled: unknown[] = []; + const handler = (err: unknown) => unhandled.push(err); + process.on('unhandledRejection', handler); + try { + chatRecordingService.recordUserMessage([{ text: 'hi' }]); + await chatRecordingService.flush(); + // Microtask drain to give any unhandled rejections a chance + // to surface before we assert. + await new Promise((resolve) => setImmediate(resolve)); + expect(unhandled).toHaveLength(0); + } finally { + process.off('unhandledRejection', handler); + } + }); + + // appendRecord can throw SYNCHRONOUSLY before returning a promise + // (e.g. ensureConversationFile fails because the conversation + // file can't be created). Without rollback in the outer catch, + // the dedup key stays set on a write that never happened, so + // all future identical snapshots get suppressed. + it('should retry an identical snapshot after a synchronous failure', async () => { + // First call: force writeFileSync (used by ensureConversationFile + // to wx-create the JSONL file) to throw a non-EEXIST error. + // ensureConversationFile rethrows that, which propagates through + // appendRecord SYNCHRONOUSLY before any promise is returned. + const writeFileSpy = vi.spyOn(fs, 'writeFileSync'); + writeFileSpy.mockImplementationOnce(() => { + const e = new Error( + 'EACCES: permission denied', + ) as NodeJS.ErrnoException; + e.code = 'EACCES'; + throw e; + }); + + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + await chatRecordingService.flush(); + // Sync failure: writeLine never reached. + expect(vi.mocked(jsonl.writeLine)).not.toHaveBeenCalled(); + + // Identical snapshot on retry: dedup key should have been + // rolled back so this fires a fresh write. + chatRecordingService.recordAttributionSnapshot(baseSnapshot); + await chatRecordingService.flush(); + expect(vi.mocked(jsonl.writeLine)).toHaveBeenCalledTimes(1); + }); + }); + // Note: Session management tests (listSessions, loadSession, deleteSession, etc.) // have been moved to sessionService.test.ts // Session resume integration tests should test via SessionService mock diff --git a/packages/core/src/services/chatRecordingService.ts b/packages/core/src/services/chatRecordingService.ts index 240498372..691a45238 100644 --- a/packages/core/src/services/chatRecordingService.ts +++ b/packages/core/src/services/chatRecordingService.ts @@ -19,6 +19,7 @@ import { import * as jsonl from '../utils/jsonl-utils.js'; import { getGitBranch } from '../utils/gitUtils.js'; import { createDebugLogger } from '../utils/debugLogger.js'; +import type { AttributionSnapshot } from './commitAttribution.js'; import { tryGenerateSessionTitle } from './sessionTitle.js'; import type { ChatCompressionInfo, @@ -216,6 +217,7 @@ export interface ChatRecord { | 'slash_command' | 'ui_telemetry' | 'at_command' + | 'attribution_snapshot' | 'notification' | 'cron' | 'custom_title' @@ -262,6 +264,7 @@ export interface ChatRecord { | SlashCommandRecordPayload | UiTelemetryRecordPayload | AtCommandRecordPayload + | AttributionSnapshotPayload | CustomTitleRecordPayload | NotificationRecordPayload | RewindRecordPayload @@ -374,6 +377,14 @@ export interface UiTelemetryRecordPayload { uiEvent: UiEvent; } +/** + * Stored payload for attribution state snapshots. + * Enables session persistence of AI contribution tracking. + */ +export interface AttributionSnapshotPayload { + snapshot: AttributionSnapshot; +} + /** * Stored payload for conversation rewind events. */ @@ -466,6 +477,16 @@ export class ChatRecordingService { */ private autoTitleController: AbortController | undefined; + /** + * JSON-serialized form of the most recent attribution snapshot we + * wrote, used to deduplicate identical writes on every non-retry + * turn. Without this, sessions that touch many files would write a + * full duplicate of the entire snapshot to the JSONL on every turn, + * inflating the on-disk session and making `/resume` slower to + * hydrate. + */ + private lastAttributionSnapshotJson: string | undefined; + constructor(config: Config) { this.config = config; this.lastRecordUuid = @@ -612,7 +633,25 @@ export class ChatRecordingService { * local-disk writes failures are rare enough to accept the fire-and-forget * simplification. */ - private appendRecord(record: ChatRecord): void { + /** + * Fire-and-forget: queues a JSONL write on the internal writeChain + * and swallows async failures (logs them via debugLogger). All + * existing call sites — recordUserMessage, recordAssistantTurn, + * etc. — invoke this synchronously without awaiting, so the + * internal swallow keeps an unhandled-promise-rejection from + * surfacing on a single transient writeLine failure. + * + * Callers that need to react to per-record write FAILURE (e.g. the + * snapshot dedup-key rollback in `recordAttributionSnapshot`) pass + * an `onError` callback, which fires after the write rejects (and + * after the rejection has been logged + the chain re-armed). Sync + * throws still propagate so the caller's outer try/catch can roll + * back optimistic state — see the synchronous-failure test. + */ + private appendRecord( + record: ChatRecord, + onError?: (err: unknown) => void, + ): void { let conversationFile: string; try { conversationFile = this.ensureConversationFile(); @@ -626,6 +665,13 @@ export class ChatRecordingService { .then(() => jsonl.writeLine(conversationFile, record)) .catch((err) => { debugLogger.error('Error appending record (async):', err); + if (onError) { + try { + onError(err); + } catch (cbErr) { + debugLogger.error('appendRecord onError callback threw:', cbErr); + } + } }); } @@ -945,6 +991,12 @@ export class ChatRecordingService { this.lastRecordUuid = this.turnParentUuids[targetTurnIndex] ?? null; // Trim future boundaries — they no longer exist in the active branch. this.turnParentUuids = this.turnParentUuids.slice(0, targetTurnIndex); + // The previous attribution snapshot now sits on the abandoned + // branch — clear the dedup key so the next snapshot lands on the + // active branch and `/resume` can find it. Without this, a + // post-rewind identical snapshot would be skipped and the rewound + // session would lose all attribution state on restore. + this.lastAttributionSnapshotJson = undefined; const record: ChatRecord = { ...this.createBaseRecord('system'), @@ -1083,4 +1135,59 @@ export class ChatRecordingService { debugLogger.error('Error saving @-command record:', error); } } + + /** + * Records an attribution state snapshot for session persistence. + * Called at the start of every non-retry turn so that a resumed session + * sees the most recent state including edits made during the prior turn. + * + * Deduplicates identical successive writes: if the snapshot's JSON + * form is byte-identical to the last one we wrote, skip the append. + * Without this, sessions that touch many files would write a full + * duplicate of the entire snapshot to the JSONL on every turn, even + * when nothing changed — inflating session size and slowing /resume. + * + * Set the dedup key optimistically and roll it back if the write + * fails. Synchronous identical calls (common during a tool-driven + * turn) all dedup correctly, but a transient write failure clears + * the key so the next identical snapshot retries the write rather + * than being permanently suppressed. + */ + recordAttributionSnapshot(snapshot: AttributionSnapshot): void { + let json: string | undefined; + try { + json = JSON.stringify(snapshot); + if (json === this.lastAttributionSnapshotJson) { + return; + } + const record: ChatRecord = { + ...this.createBaseRecord('system'), + type: 'system', + subtype: 'attribution_snapshot', + systemPayload: { snapshot }, + }; + + this.lastAttributionSnapshotJson = json; + this.appendRecord(record, () => { + // Async write failed — only roll back if the key still + // belongs to our snapshot (a later distinct write may have + // overwritten it). + if (this.lastAttributionSnapshotJson === json) { + this.lastAttributionSnapshotJson = undefined; + } + }); + } catch (error) { + // appendRecord (and createBaseRecord/JSON.stringify) can throw + // synchronously — e.g. ensureConversationFile() fails because + // the project temp dir isn't writable. The .catch() handler + // attached to the promise never runs in that case, so we'd + // otherwise leave the dedup key set without a write ever + // having landed and permanently suppress identical retries. + // Roll back here too. + if (json !== undefined && this.lastAttributionSnapshotJson === json) { + this.lastAttributionSnapshotJson = undefined; + } + debugLogger.error('Error saving attribution snapshot:', error); + } + } } diff --git a/packages/core/src/services/commitAttribution.test.ts b/packages/core/src/services/commitAttribution.test.ts new file mode 100644 index 000000000..81cca7c67 --- /dev/null +++ b/packages/core/src/services/commitAttribution.test.ts @@ -0,0 +1,762 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, beforeEach, vi, afterEach } from 'vitest'; + +// Stub `fs.realpathSync` so the symlink-aware tests below can simulate +// macOS-style `/var` ↔ `/private/var` mapping without needing a real +// symlink in the filesystem. Other tests don't touch realpath, so the +// pass-through default keeps them unaffected. +vi.mock('node:fs', async () => { + const actual = await vi.importActual('node:fs'); + return { ...actual, realpathSync: vi.fn(actual.realpathSync) }; +}); + +import * as fs from 'node:fs'; +import * as os from 'node:os'; +import * as path from 'node:path'; +import { + CommitAttributionService, + computeCharContribution, + type StagedFileInfo, +} from './commitAttribution.js'; + +function makeStagedInfo( + files: string[], + diffSizes?: Record, + deleted?: string[], + renamed?: Record, +): StagedFileInfo { + return { + files, + diffSizes: new Map(Object.entries(diffSizes ?? {})), + deletedFiles: new Set(deleted ?? []), + renamedFiles: new Map(Object.entries(renamed ?? {})), + }; +} + +describe('computeCharContribution', () => { + it('should return new content length for file creation', () => { + expect(computeCharContribution('', 'hello world')).toBe(11); + }); + + it('should return old content length for file deletion', () => { + expect(computeCharContribution('hello world', '')).toBe(11); + }); + + it('should handle same-length replacement via prefix/suffix', () => { + expect(computeCharContribution('Esc', 'esc')).toBe(1); + }); + + it('should handle insertion in the middle', () => { + expect(computeCharContribution('ab', 'aXb')).toBe(1); + }); + + it('should handle deletion in the middle', () => { + expect(computeCharContribution('aXb', 'ab')).toBe(1); + }); + + it('should handle complete replacement', () => { + expect(computeCharContribution('abc', 'xyz')).toBe(3); + }); + + it('should return 0 for identical content', () => { + expect(computeCharContribution('same', 'same')).toBe(0); + }); + + it('should handle multi-line changes', () => { + const old = 'line1\nline2\nline3'; + const now = 'line1\nchanged\nline3'; + expect(computeCharContribution(old, now)).toBe(7); // "changed" > "line2" + }); +}); + +describe('CommitAttributionService', () => { + beforeEach(() => { + CommitAttributionService.resetInstance(); + }); + + it('should return the same singleton instance', () => { + const a = CommitAttributionService.getInstance(); + const b = CommitAttributionService.getInstance(); + expect(a).toBe(b); + }); + + it('should track new file creation', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/src/file.ts', null, 'hello world'); + + const attr = service.getFileAttribution('/project/src/file.ts'); + expect(attr!.aiCreated).toBe(true); + expect(attr!.aiContribution).toBe(11); + }); + + it('should NOT treat empty existing file as new file creation', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/empty.ts', '', 'new content'); + + const attr = service.getFileAttribution('/project/empty.ts'); + expect(attr!.aiCreated).toBe(false); + expect(attr!.aiContribution).toBe(11); + }); + + it('should track edits with prefix/suffix algorithm', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/f.ts', 'Hello World', 'Hello world'); + expect(service.getFileAttribution('/project/f.ts')!.aiContribution).toBe(1); + }); + + it('should accumulate contributions across multiple edits', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/f.ts', 'aaa', 'bbb'); // 3 + service.recordEdit('/project/f.ts', 'bbb', 'bbbccc'); // 3 + expect(service.getFileAttribution('/project/f.ts')!.aiContribution).toBe(6); + }); + + // Out-of-band mutation detection: if the input `oldContent` doesn't + // match the contentHash AI recorded after its previous edit, the + // file was changed externally between AI's two writes — drop the + // accumulator before counting the new edit so prior AI work the + // user has since overwritten doesn't get credited later. + it('should reset accumulator when oldContent diverges from AI last write', () => { + const service = CommitAttributionService.getInstance(); + // First AI edit: file goes from 'abc' to 'AI block of 100 chars padded' (28 chars). + const aiBlock = 'AI block of 100 chars padded'; + service.recordEdit('/project/f.ts', 'abc', aiBlock); + const after1 = service.getFileAttribution('/project/f.ts')!; + expect(after1.aiContribution).toBeGreaterThan(0); + + // Now a DIFFERENT oldContent shows up — the user paste-replaced + // the file via an external editor in between. AI's recordEdit + // should reset the counter before applying the new contribution. + service.recordEdit('/project/f.ts', 'user paste replacement', 'final'); + const after2 = service.getFileAttribution('/project/f.ts')!; + // aiContribution is now bounded by the divergent edit alone, NOT + // accumulated on top of after1.aiContribution. + expect(after2.aiContribution).toBeLessThan(after1.aiContribution); + }); + + // Fresh-file lifetime: when AI re-creates a file at a path that was + // previously tracked but has since been deleted (oldContent === null + // signals "no file existed on disk"), the previous tracked state is + // from a different file lifetime. Without this reset, AI's + // accumulated chars from the deleted file would carry over and + // double-count toward the new file's attribution. + it('should reset accumulator when re-creating a previously-tracked deleted file', () => { + const service = CommitAttributionService.getInstance(); + // First lifetime: AI creates 'foo.ts' with 100 chars of content. + const firstContent = 'A'.repeat(100); + service.recordEdit('/project/foo.ts', null, firstContent); + const after1 = service.getFileAttribution('/project/foo.ts')!; + expect(after1.aiContribution).toBe(100); + expect(after1.aiCreated).toBe(true); + + // Second lifetime: file was deleted (e.g. user `rm foo.ts`), then + // AI re-creates it with new (shorter) content. oldContent=null + // signals "didn't exist on disk before this write". + const secondContent = 'short'; + service.recordEdit('/project/foo.ts', null, secondContent); + const after2 = service.getFileAttribution('/project/foo.ts')!; + // aiContribution should reflect ONLY the second write's chars, not + // 100 + 5. aiCreated stays true (this lifetime is also a creation). + expect(after2.aiContribution).toBe(5); + expect(after2.aiCreated).toBe(true); + }); + + it('should NOT reset accumulator when oldContent matches AI last write', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/f.ts', 'abc', 'AI step one'); + const after1 = service.getFileAttribution('/project/f.ts')!; + // Second AI edit picks up where the first left off — oldContent + // matches the post-first hash, so accumulation continues. + service.recordEdit('/project/f.ts', 'AI step one', 'AI step two final'); + const after2 = service.getFileAttribution('/project/f.ts')!; + expect(after2.aiContribution).toBeGreaterThan(after1.aiContribution); + }); + + // validateAgainst runs at commit time and drops entries whose + // recorded post-write hash doesn't match the caller-supplied + // content — catches user edits that happened entirely outside the + // Edit/Write tools (no recordEdit was called, so the input-hash + // check above couldn't see the divergence). + describe('validateAgainst', () => { + let tmpDir: string; + beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attr-validate-')); + }); + afterEach(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + it('drops entries whose content has diverged', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'diverged.ts'); + fs.writeFileSync(filePath, 'AI wrote this', 'utf-8'); + service.recordEdit(filePath, null, 'AI wrote this'); + expect(service.getFileAttribution(filePath)).toBeDefined(); + + // Caller passes a reader that returns the diverged content. + service.validateAgainst(() => 'human replaced this'); + expect(service.getFileAttribution(filePath)).toBeUndefined(); + }); + + it('keeps entries whose content matches', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'unchanged.ts'); + fs.writeFileSync(filePath, 'AI wrote this', 'utf-8'); + service.recordEdit(filePath, null, 'AI wrote this'); + service.validateAgainst(() => 'AI wrote this'); + expect(service.getFileAttribution(filePath)).toBeDefined(); + }); + + it('keeps entries when getContent returns null (no comparison signal)', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'no-comparison.ts'); + fs.writeFileSync(filePath, 'will be queried', 'utf-8'); + service.recordEdit(filePath, null, 'will be queried'); + // null = "no committed blob / unreadable / out-of-scope" — the + // entry should NOT be dropped. + service.validateAgainst(() => null); + expect(service.getFileAttribution(filePath)).toBeDefined(); + }); + + // BOM/CRLF normalisation: writeTextFile preserves the file's BOM + // and CRLF line-ending choice independently of whether AI's + // recordEdit input string contained the BOM char or used LF. The + // on-disk bytes returned by `git show` can therefore include a + // leading U+FEFF and CRLFs that AI never wrote — the hash MUST + // canonicalise both sides so a BOM/CRLF file isn't dropped on + // every commit. + it('keeps entries when on-disk content has BOM but AI input did not', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'bom.ts'); + // Simulate the on-disk file having a BOM (writeTextFile wrote + // it because the previous file version had one). + const aiContent = 'export const foo = 42;'; + const onDiskWithBom = '' + aiContent; + fs.writeFileSync(filePath, onDiskWithBom, 'utf-8'); + service.recordEdit(filePath, null, aiContent); + + // Reader returns the on-disk content (with BOM). After + // canonicalisation, both sides hash to the same value. + service.validateAgainst(() => onDiskWithBom); + expect(service.getFileAttribution(filePath)).toBeDefined(); + }); + + it('keeps entries when on-disk uses CRLF but AI input used LF', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'crlf.ts'); + const aiContent = 'line one\nline two\n'; + const onDiskCrlf = 'line one\r\nline two\r\n'; + fs.writeFileSync(filePath, onDiskCrlf, 'utf-8'); + service.recordEdit(filePath, null, aiContent); + service.validateAgainst(() => onDiskCrlf); + expect(service.getFileAttribution(filePath)).toBeDefined(); + }); + + // Combined: BOM + CRLF on disk, plain LF + no BOM in AI input. + // The most common case for a Windows-edited file the model + // returned in unix form. + it('keeps entries when on-disk has BOM AND CRLF, AI input had neither', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'bom-crlf.ts'); + const aiContent = 'foo\nbar\n'; + const onDisk = 'foo\r\nbar\r\n'; + fs.writeFileSync(filePath, onDisk, 'utf-8'); + service.recordEdit(filePath, null, aiContent); + service.validateAgainst(() => onDisk); + expect(service.getFileAttribution(filePath)).toBeDefined(); + }); + + // Legacy snapshot from before contentHash existed: the entry has + // an empty contentHash. We can't tell stale from fresh, so leave + // it alone (don't reset). + it('skips entries with empty contentHash (legacy snapshot)', () => { + const service = CommitAttributionService.getInstance(); + service.restoreFromSnapshot({ + type: 'attribution-snapshot', + surface: 'cli', + fileStates: { + '/legacy.ts': { + aiContribution: 50, + aiCreated: false, + contentHash: '', + }, + }, + promptCount: 0, + promptCountAtLastCommit: 0, + }); + // Even if the reader claims a different hash, an empty recorded + // hash means we have no baseline — keep the entry. + service.validateAgainst(() => 'totally different'); + expect(service.getFileAttribution('/legacy.ts')).toBeDefined(); + }); + + // Deleted-file lookup must remain stable: recordEdit canonicalises + // the path via realpathSync; getFileAttribution must still resolve + // the same canonical key after the leaf is unlinked. realpathOrSelf + // canonicalises the parent and rejoins the basename for missing + // leaves so macOS /var ↔ /private/var doesn't break the lookup + // post-deletion. + it('keeps deleted-file entries reachable via the original path', () => { + const service = CommitAttributionService.getInstance(); + const filePath = path.join(tmpDir, 'deleted.ts'); + fs.writeFileSync(filePath, 'will be deleted', 'utf-8'); + service.recordEdit(filePath, null, 'will be deleted'); + fs.unlinkSync(filePath); + // Lookup must still find the entry by the original path even + // though realpath of the leaf now throws. + expect(service.getFileAttribution(filePath)).toBeDefined(); + }); + }); + + it('should save session baseline on first edit', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/f.ts', 'original content', 'new content'); + + // Baseline should have been saved from oldContent + // We can verify indirectly: after clear, baseline is gone + service.clearAttributions(); + expect(service.hasAttributions()).toBe(false); + }); + + it('should return defensive copies', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/f.ts', null, 'content'); + + const copy = service.getFileAttribution('/project/f.ts')!; + copy.aiContribution = 99999; + + expect( + service.getFileAttribution('/project/f.ts')!.aiContribution, + ).not.toBe(99999); + }); + + describe('prompt counting', () => { + it('should track prompt counts', () => { + const service = CommitAttributionService.getInstance(); + expect(service.getPromptCount()).toBe(0); + + service.incrementPromptCount(); + service.incrementPromptCount(); + service.incrementPromptCount(); + + expect(service.getPromptCount()).toBe(3); + expect(service.getPromptsSinceLastCommit()).toBe(3); + }); + + it('should reset prompts-since-commit counter on successful clear', () => { + const service = CommitAttributionService.getInstance(); + service.incrementPromptCount(); + service.incrementPromptCount(); + service.clearAttributions(true); + + expect(service.getPromptCount()).toBe(2); + expect(service.getPromptsSinceLastCommit()).toBe(0); + }); + + it('should NOT reset prompts-since-commit on failed clear', () => { + const service = CommitAttributionService.getInstance(); + service.incrementPromptCount(); + service.incrementPromptCount(); + service.recordEdit('/project/f.ts', null, 'x'); + service.clearAttributions(false); + + // File data cleared, but prompt counter preserved + expect(service.hasAttributions()).toBe(false); + expect(service.getPromptCount()).toBe(2); + expect(service.getPromptsSinceLastCommit()).toBe(2); + }); + }); + + describe('surface tracking', () => { + it('should default to cli surface', () => { + const service = CommitAttributionService.getInstance(); + expect(service.getSurface()).toBe('cli'); + }); + }); + + describe('snapshot / restore', () => { + it('should serialize and restore state', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/f.ts', null, 'hello'); + service.incrementPromptCount(); + service.incrementPromptCount(); + + const snapshot = service.toSnapshot(); + expect(snapshot.type).toBe('attribution-snapshot'); + expect(snapshot.promptCount).toBe(2); + expect(Object.keys(snapshot.fileStates)).toHaveLength(1); + + // Restore into a fresh instance + CommitAttributionService.resetInstance(); + const restored = CommitAttributionService.getInstance(); + restored.restoreFromSnapshot(snapshot); + + expect(restored.getPromptCount()).toBe(2); + expect(restored.getFileAttribution('/project/f.ts')!.aiContribution).toBe( + 5, + ); + }); + }); + + describe('generateNotePayload', () => { + it('should compute real AI/human percentages', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/src/main.ts', '', 'x'.repeat(200)); + + const staged = makeStagedInfo(['src/main.ts', 'src/human.ts'], { + 'src/main.ts': 400, + 'src/human.ts': 200, + }); + + const note = service.generateNotePayload( + staged, + '/project', + 'Qwen-Coder', + ); + + expect(note.files['src/main.ts']!.percent).toBe(50); + expect(note.files['src/human.ts']!.percent).toBe(0); + expect(note.summary.aiPercent).toBe(33); + expect(note.summary.surfaces).toContain('cli'); + expect(note.surfaceBreakdown['cli']).toBeDefined(); + }); + + it('should exclude generated files', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/src/main.ts', null, 'code'); + + const staged = makeStagedInfo( + ['src/main.ts', 'package-lock.json', 'dist/bundle.js'], + { + 'src/main.ts': 100, + 'package-lock.json': 50000, + 'dist/bundle.js': 30000, + }, + ); + + const note = service.generateNotePayload(staged, '/project'); + expect(Object.keys(note.files)).toHaveLength(1); + expect(note.excludedGenerated).toContain('package-lock.json'); + expect(note.excludedGenerated).toContain('dist/bundle.js'); + }); + + it('should include promptCount', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/f.ts', null, 'code'); + service.incrementPromptCount(); + service.incrementPromptCount(); + + const staged = makeStagedInfo(['f.ts'], { 'f.ts': 100 }); + const note = service.generateNotePayload(staged, '/project'); + expect(note.promptCount).toBe(2); + }); + + it('should sanitize internal model codenames', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/project/f.ts', null, 'x'); + const staged = makeStagedInfo(['f.ts'], { 'f.ts': 10 }); + + expect( + service.generateNotePayload(staged, '/project', 'qwen-72b').generator, + ).toBe('Qwen-Coder'); + expect( + service.generateNotePayload(staged, '/project', 'CustomAgent') + .generator, + ).toBe('CustomAgent'); + }); + + // Long-line edits inflate the tracked AI char count (we count actual + // characters), but diffSize comes from `git diff --stat` which + // approximates each changed line as ~40 chars. Without clamping, + // aiChars stays large while humanChars snaps to 0, leaving + // aiChars+humanChars > the committed change magnitude. + it('should clamp aiChars to diffSize so totals stay consistent', () => { + const service = CommitAttributionService.getInstance(); + // Big AI edit but small reported diff (one long-line change). + service.recordEdit('/project/src/big.ts', '', 'x'.repeat(1000)); + + const staged = makeStagedInfo(['src/big.ts'], { 'src/big.ts': 40 }); + const note = service.generateNotePayload(staged, '/project'); + + const detail = note.files['src/big.ts']!; + expect(detail.aiChars).toBe(40); + expect(detail.humanChars).toBe(0); + // aiChars + humanChars now equals the reported diff size. + expect(detail.aiChars + detail.humanChars).toBe(40); + expect(note.summary.aiChars).toBe(40); + }); + }); + + // The service realpath's file paths at every entry/exit point so a + // symlinked vs canonical absolute path collapses to one entry. This + // matters most on macOS (`/var` → `/private/var`), where edit.ts + // can record a path under one form while git rev-parse reports the + // other — without canonicalisation, the lookup never matches and + // AI attribution silently zeroes out. + describe('symlink-aware path canonicalisation', () => { + beforeEach(() => { + // Map any /var/... input to /private/var/... (the macOS-ism). + // Anything else passes through unchanged. + vi.mocked(fs.realpathSync).mockImplementation(((input: unknown) => { + const s = String(input); + if (s.startsWith('/var/')) return s.replace('/var/', '/private/var/'); + if (s === '/var') return '/private/var'; + return s; + }) as unknown as typeof fs.realpathSync); + }); + afterEach(() => { + vi.mocked(fs.realpathSync).mockReset(); + }); + + it('records and looks up under the canonical path', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/var/repo/src/main.ts', '', 'x'.repeat(50)); + + // Lookup with EITHER form should work — the service canonicalises + // both write and read. + expect(service.getFileAttribution('/var/repo/src/main.ts')).toBeDefined(); + expect( + service.getFileAttribution('/private/var/repo/src/main.ts'), + ).toBeDefined(); + }); + + it('matches diff paths when baseDir is the symlinked form', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/var/repo/src/main.ts', '', 'x'.repeat(80)); + + // generateNotePayload receives the symlinked baseDir; the loop + // canonicalises it before computing path.relative against the + // (already-canonical) keys. + const staged = makeStagedInfo(['src/main.ts'], { 'src/main.ts': 80 }); + const note = service.generateNotePayload(staged, '/var/repo'); + + expect(note.files['src/main.ts']!.aiChars).toBe(80); + expect(note.files['src/main.ts']!.percent).toBe(100); + }); + + it('clearAttributedFiles deletes by canonical key without realpath-ing the leaf', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/var/repo/src/deleted.ts', '', 'will be removed'); + expect( + service.getFileAttribution('/var/repo/src/deleted.ts'), + ).toBeDefined(); + + // Caller composes paths against a canonical baseDir (mirrors + // attachCommitAttribution's pattern), so the leaf doesn't need + // to exist for the delete to find the right key. + service.clearAttributedFiles( + new Set(['/private/var/repo/src/deleted.ts']), + ); + expect( + service.getFileAttribution('/var/repo/src/deleted.ts'), + ).toBeUndefined(); + }); + + it('moves attribution across committed renames before payload generation', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/var/repo/src/old.ts', '', 'renamed content'); + + service.applyCommittedRenames( + new Map([['src/old.ts', 'src/new.ts']]), + '/private/var/repo', + ); + + expect( + service.getFileAttribution('/var/repo/src/old.ts'), + ).toBeUndefined(); + expect(service.getFileAttribution('/var/repo/src/new.ts')).toBeDefined(); + + const staged = makeStagedInfo(['src/new.ts'], { 'src/new.ts': 80 }, [], { + 'src/old.ts': 'src/new.ts', + }); + const note = service.generateNotePayload(staged, '/var/repo'); + expect(note.files['src/new.ts']!.aiChars).toBe(15); + expect(note.files['src/new.ts']!.percent).toBe(19); + }); + + it('merges old-path attribution into an existing destination entry', () => { + const service = CommitAttributionService.getInstance(); + service.recordEdit('/var/repo/src/old.ts', '', 'old ai text'); + service.recordEdit('/var/repo/src/new.ts', '', 'new ai text'); + + service.applyCommittedRenames( + new Map([['src/old.ts', 'src/new.ts']]), + '/private/var/repo', + ); + + const attr = service.getFileAttribution('/var/repo/src/new.ts')!; + expect(attr.aiContribution).toBe( + 'old ai text'.length + 'new ai text'.length, + ); + expect( + service.getFileAttribution('/var/repo/src/old.ts'), + ).toBeUndefined(); + }); + + it('canonicalises keys on snapshot restore', () => { + const service = CommitAttributionService.getInstance(); + service.restoreFromSnapshot({ + type: 'attribution-snapshot', + surface: 'cli', + // Snapshot written before the canonicalisation fix could carry + // either form; restore should normalise to canonical. + fileStates: { + '/var/repo/src/legacy.ts': { + aiContribution: 99, + aiCreated: false, + contentHash: '', + }, + }, + promptCount: 0, + promptCountAtLastCommit: 0, + }); + + // Lookup under the canonical form succeeds even though the + // snapshot wrote the symlink form. + expect( + service.getFileAttribution('/private/var/repo/src/legacy.ts')! + .aiContribution, + ).toBe(99); + }); + + // A snapshot straddling the canonicalisation fix can carry both + // the symlinked and canonical paths for the same file. After + // realpathOrSelf normalises them, the second entry to land + // would overwrite the first if we just `set()` — losing the + // first form's accumulated aiContribution. Merge instead. + it('merges duplicate entries collapsed by canonicalisation', () => { + const service = CommitAttributionService.getInstance(); + service.restoreFromSnapshot({ + type: 'attribution-snapshot', + surface: 'cli', + fileStates: { + '/var/repo/src/dup.ts': { + aiContribution: 30, + aiCreated: false, + contentHash: 'old', + }, + '/private/var/repo/src/dup.ts': { + aiContribution: 70, + aiCreated: true, + contentHash: 'new', + }, + }, + promptCount: 0, + promptCountAtLastCommit: 0, + }); + + const restored = service.getFileAttribution( + '/private/var/repo/src/dup.ts', + )!; + expect(restored.aiContribution).toBe(100); + // aiCreated is OR'd: any form carrying true wins. + expect(restored.aiCreated).toBe(true); + }); + + // A corrupted snapshot with promptCountAtLastCommit > promptCount + // would surface a negative `getPromptsSinceLastCommit()` and + // propagate as a "(-3)-shotted" trailer into PR text. + it('clamps promptCountAtLastCommit to promptCount on restore', () => { + const service = CommitAttributionService.getInstance(); + service.restoreFromSnapshot({ + type: 'attribution-snapshot', + surface: 'cli', + fileStates: {}, + promptCount: 5, + promptCountAtLastCommit: 99, + }); + expect(service.getPromptsSinceLastCommit()).toBe(0); + }); + + // `surface` lands verbatim in the git-notes payload and is used + // as a Map key. Non-string values would coerce into + // `[object Object]` etc. Fall back to the current client surface. + it.each([ + ['object', { foo: 'bar' }], + ['number', 42], + ['null', null], + ['empty string', ''], + ])( + 'falls back to client surface when snapshot.surface is non-string (%s)', + (_label, badValue) => { + const service = CommitAttributionService.getInstance(); + service.restoreFromSnapshot({ + type: 'attribution-snapshot', + surface: badValue as unknown as string, + fileStates: {}, + promptCount: 0, + promptCountAtLastCommit: 0, + }); + // getClientSurface() returns 'cli' in tests (no env var set). + expect(service.getSurface()).toBe('cli'); + }, + ); + + // Envelope-level corruption: a payload whose `type` discriminator + // is wrong (or whose top-level shape is non-object) must reset to + // a clean state instead of polluting fileAttributions. The + // resume-time caller passes `snapshot as AttributionSnapshot` + // from a structural cast off `unknown`, so the runtime value + // could be anything. + it.each([ + ['null', null], + ['array', []], + ['string', 'snapshot'], + ['number', 42], + ['wrong type discriminator', { type: 'something-else' }], + ['missing type', { fileStates: {} }], + ])( + 'resets to fresh state when snapshot envelope is malformed (%s)', + (_label, badPayload) => { + const service = CommitAttributionService.getInstance(); + // Seed some pre-existing state to confirm the reset clears it. + service.recordEdit('/project/preexisting.ts', null, 'hello'); + expect( + service.getFileAttribution('/project/preexisting.ts'), + ).toBeDefined(); + + service.restoreFromSnapshot( + badPayload as unknown as Parameters< + typeof service.restoreFromSnapshot + >[0], + ); + expect( + service.getFileAttribution('/project/preexisting.ts'), + ).toBeUndefined(); + expect(service.getSurface()).toBe('cli'); + expect(service.getPromptsSinceLastCommit()).toBe(0); + }, + ); + + // `fileStates` must be a plain object; otherwise Object.entries + // would happily iterate an array's [index, value] pairs and seed + // fileAttributions with numeric-string keys. + it.each([ + ['array', []], + ['string', 'oops'], + ['number', 42], + ['null', null], + ])( + 'ignores non-object fileStates (%s) without polluting attribution map', + (_label, badFileStates) => { + const service = CommitAttributionService.getInstance(); + service.restoreFromSnapshot({ + type: 'attribution-snapshot', + surface: 'cli', + fileStates: badFileStates as unknown as Record< + string, + { aiContribution: number; aiCreated: boolean; contentHash: string } + >, + promptCount: 0, + promptCountAtLastCommit: 0, + }); + expect(service.hasAttributions()).toBe(false); + }, + ); + }); +}); diff --git a/packages/core/src/services/commitAttribution.ts b/packages/core/src/services/commitAttribution.ts new file mode 100644 index 000000000..0a1956bdd --- /dev/null +++ b/packages/core/src/services/commitAttribution.ts @@ -0,0 +1,916 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Commit Attribution Service + * + * Tracks character-level contribution ratios between AI and humans per file. + * When a git commit is made, this data is combined with git diff analysis to + * calculate real AI vs human contribution percentages, stored as git notes. + * + * Features: + * - Character-level prefix/suffix diff algorithm + * - Real AI/human contribution ratio via git diff + * - Surface tracking (cli/ide/api/sdk) + * - Prompt counting (since-last-commit window) + * - Snapshot/restore for session persistence + * - Generated file exclusion + */ + +import { createHash } from 'node:crypto'; +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import { createDebugLogger } from '../utils/debugLogger.js'; +import { isGeneratedFile } from './generatedFiles.js'; + +const debugLogger = createDebugLogger('COMMIT_ATTRIBUTION'); + +/** + * Strip the per-platform / per-encoding noise (leading UTF-8 BOM, + * CRLF line endings) so two byte-different but semantically-identical + * versions of the same content hash to the same value. + * + * Edit and WriteFile preserve the user's BOM/lineEnding choice when + * writing back, so the on-disk bytes can include CRLF or a leading + * U+FEFF even when AI's `recordEdit` was given LF-normalised content + * with no BOM. Without this normalisation, a `git show :` + * read of a BOM/CRLF file would always mismatch AI's recorded hash + * and drop the entry on every commit. The hash is metadata (used for + * divergence detection only); collapsing these visual differences is + * the right comparison semantics. + */ +function canonicaliseForHash(content: string): string { + // Strip a single leading UTF-8 BOM (U+FEFF). writeTextFile's + // BOM mode prepends BOM bytes to the on-disk file independently of + // whether AI's input included the BOM character in its string. + let normalised = + content.length > 0 && content.charCodeAt(0) === 0xfeff + ? content.slice(1) + : content; + // Normalise CRLF → LF. writeTextFile writes CRLF when the file's + // detected line-ending is CRLF; AI's recordEdit input is typically + // LF-normalised (Edit's `currentContent.replace(/\r\n/g, '\n')` + // happens before recordEdit fires). + normalised = normalised.replace(/\r\n/g, '\n'); + return normalised; +} + +function computeContentHash(content: string): string { + return createHash('sha256') + .update(canonicaliseForHash(content)) + .digest('hex'); +} + +/** + * Resolve symlinks on a path. On macOS in particular, `/var` is a + * symlink to `/private/var`, so an absolute path captured via + * `fs.realpathSync` (what edit.ts/write-file.ts records) and + * `path.relative` against `git rev-parse --show-toplevel` (which may + * report either form) won't line up unless we normalise both sides. + * + * For DELETED leaves (file no longer exists on disk), realpathSync + * throws — but the parent directory is still resolvable. Canonicalise + * the parent and rejoin the missing basename so a deleted file's + * lookup still hits the canonical key recordEdit stored before the + * file was removed. Without this, a `getFileAttribution(deletedPath)` + * call after the file was deleted would fall back to the + * non-canonical input and miss the canonical entry on macOS. + */ +function realpathOrSelf(p: string): string { + try { + return fs.realpathSync(p); + } catch { + try { + const parent = path.dirname(p); + const realParent = fs.realpathSync(parent); + return path.join(realParent, path.basename(p)); + } catch { + return p; + } + } +} + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface FileAttribution { + /** Total characters contributed by AI (accumulated across edits) */ + aiContribution: number; + /** Whether the file was created by AI */ + aiCreated: boolean; + /** + * SHA-256 of the file content immediately after AI's last write. Used + * to detect out-of-band mutation (paste-replace via external editor, + * `rm` + recreate, manual save) so AI's accumulated counter doesn't + * silently get credited to subsequent human edits. recordEdit checks + * this on every call (resets when the input `oldContent` doesn't + * match), and `validateAgainst` re-verifies before a commit note is + * generated to catch user edits that happened entirely outside the + * Edit/Write tools. + */ + contentHash: string; +} + +/** + * Per-file attribution detail in the git notes payload. + * + * Field naming caveat: `aiChars` and `humanChars` look like literal + * UTF-16/UTF-8 character counts, but they are NOT. Both are + * heuristic diff-size proxies derived from `git diff --numstat`: + * for text files the value is `(addedLines + deletedLines) × 40` + * (the 40-char/line heuristic), and for binary files both sides + * are reported as a flat `1024`. The per-file AI accumulator from + * `recordEdit` is then clamped against this same line-based ceiling. + * + * Practical consequence: a commit adding 1000 one-character lines + * and one adding 1000 thousand-character lines both report + * `aiChars = 40000`; a 5 MB image change and a 1-byte binary tweak + * both report `1024`. `percent` (and `summary.aiPercent`) is + * largely insulated from this — both numerator and denominator use + * the same heuristic — but consumers aggregating raw + * `aiChars`/`humanChars` for compliance reporting will get + * systematically biased numbers and should treat these fields as + * "approximate change size in proxy-chars" rather than literal + * char counts. + */ +export interface FileAttributionDetail { + /** Heuristic diff-size proxy (NOT a literal char count — see interface doc). */ + aiChars: number; + /** Heuristic diff-size proxy (NOT a literal char count — see interface doc). */ + humanChars: number; + /** + * AI share of the per-file diff, rounded to integer percent. + * Robust against the heuristic in `aiChars`/`humanChars` because + * both sides of the ratio use the same proxy; safe to aggregate. + */ + percent: number; + surface?: string; +} + +/** + * Full attribution payload stored as git notes JSON. + * + * Same `aiChars`/`humanChars` caveat as `FileAttributionDetail`: + * those summed totals are sums of heuristic diff-size proxies, not + * literal character counts. `aiPercent` (and per-file `percent`) + * use the same proxy on both sides of the ratio, so the percentage + * is the field consumers should rely on for cross-commit + * aggregation; the raw chars values are useful for ordering + * commits within the same payload but should not be summed across + * unrelated commits as if they were byte counts. + */ +export interface CommitAttributionNote { + version: 1; + generator: string; + files: Record; + summary: { + /** AI share of the whole commit, rounded to integer percent. */ + aiPercent: number; + /** Sum of per-file `aiChars` heuristic proxies (see FileAttributionDetail). */ + aiChars: number; + /** Sum of per-file `humanChars` heuristic proxies (see FileAttributionDetail). */ + humanChars: number; + totalFilesTouched: number; + surfaces: string[]; + }; + surfaceBreakdown: Record; + /** + * Sample of generated/vendored files that were excluded from + * attribution. Capped at `MAX_EXCLUDED_GENERATED_SAMPLE` paths so a + * commit churning thousands of `dist/` artifacts can't blow past the + * 30 KB note budget and silently drop attribution for the real + * source files in the same commit. Use `excludedGeneratedCount` for + * the true total. + */ + excludedGenerated: string[]; + /** Total count of excluded files (≥ excludedGenerated.length). */ + excludedGeneratedCount: number; + promptCount: number; +} + +/** + * Upper bound on the number of excluded-generated paths we serialize + * into the git note. Keeps the JSON payload bounded for commits with + * lots of generated artifacts. + */ +export const MAX_EXCLUDED_GENERATED_SAMPLE = 50; + +/** Result of running git commands to get staged file info. */ +export interface StagedFileInfo { + files: string[]; + diffSizes: Map; + deletedFiles: Set; + /** + * Git rename map from old repo-relative path to new repo-relative path. + * Populated from `git diff --name-status --find-renames`. Used to move + * pending attribution from the pre-rename absolute key to the post-rename + * key before payload generation and cleanup. + */ + renamedFiles: Map; + /** + * Absolute path of the repository root (`git rev-parse --show-toplevel`). + * Optional for backward compatibility with synthetic test inputs; + * production callers should set it so file paths in `files` (which are + * relative to the repo root) align with absolute paths tracked by the + * attribution service. When absent, callers may fall back to the + * configured target directory at the cost of zeroed-out attribution + * for files outside that directory. + */ + repoRoot?: string; +} + +/** + * On-disk schema version for AttributionSnapshot. Bump when the shape + * changes incompatibly so restoreFromSnapshot can refuse / migrate + * stale payloads instead of silently producing NaN counters or + * mismatched key shapes. + */ +export const ATTRIBUTION_SNAPSHOT_VERSION = 1; + +/** Serializable snapshot for session persistence. */ +export interface AttributionSnapshot { + type: 'attribution-snapshot'; + /** Schema version; absent on pre-versioning snapshots, treated as 1. */ + version?: number; + surface: string; + fileStates: Record; + promptCount: number; + promptCountAtLastCommit: number; +} + +// --------------------------------------------------------------------------- +// Model name sanitization +// --------------------------------------------------------------------------- + +const INTERNAL_MODEL_PATTERNS = [ + /qwen[-_]?\d+(\.\d+)?[-_]?b?/i, + /qwen[-_]?coder[-_]?\d*/i, + /qwen[-_]?max/i, + /qwen[-_]?plus/i, + /qwen[-_]?turbo/i, +]; + +const SANITIZED_GENERATOR_NAME = 'Qwen-Coder'; + +function sanitizeModelName(name: string): string { + for (const pattern of INTERNAL_MODEL_PATTERNS) { + if (pattern.test(name)) { + return SANITIZED_GENERATOR_NAME; + } + } + return name; +} + +// --------------------------------------------------------------------------- +// Utilities +// --------------------------------------------------------------------------- + +/** + * Defensive coercions for restoring snapshot fields. A snapshot can + * arrive with `undefined` / wrong-type fields if the on-disk JSON was + * partially written or pre-dates the current schema; without coercion + * they would flow through `Math.min(undefined, n) === NaN` into the + * git-notes payload. + */ +function sanitiseCount(v: unknown): number { + return typeof v === 'number' && Number.isFinite(v) && v >= 0 ? v : 0; +} + +function sanitiseAttribution(v: unknown): FileAttribution { + const obj = (v ?? {}) as Partial; + return { + aiContribution: sanitiseCount(obj.aiContribution), + aiCreated: typeof obj.aiCreated === 'boolean' ? obj.aiCreated : false, + contentHash: typeof obj.contentHash === 'string' ? obj.contentHash : '', + }; +} + +/** + * Surface label embedded in the git-notes payload. Defaults to `'cli'` + * for the qwen-code CLI; embedders (IDE extensions, SDK consumers) can + * override by setting `QWEN_CODE_ENTRYPOINT` before construction so the + * note records where the contribution was authored. + */ +export function getClientSurface(): string { + return process.env['QWEN_CODE_ENTRYPOINT'] ?? 'cli'; +} + +// --------------------------------------------------------------------------- +// Service +// --------------------------------------------------------------------------- + +export class CommitAttributionService { + private static instance: CommitAttributionService | null = null; + + /** Per-file AI contribution tracking (keyed by absolute path) */ + private fileAttributions: Map = new Map(); + /** Client surface (cli, ide, api, sdk, etc.) */ + private surface: string = getClientSurface(); + + // -- Prompt counting -- + private promptCount: number = 0; + private promptCountAtLastCommit: number = 0; + + private constructor() {} + + static getInstance(): CommitAttributionService { + if (!CommitAttributionService.instance) { + CommitAttributionService.instance = new CommitAttributionService(); + } + return CommitAttributionService.instance; + } + + /** Reset singleton for testing. */ + static resetInstance(): void { + CommitAttributionService.instance = null; + } + + // ----------------------------------------------------------------------- + // Recording + // ----------------------------------------------------------------------- + + /** + * Record an AI edit to a file. + * Uses prefix/suffix matching for precise character-level contribution. + * + * `filePath` is canonicalised via `fs.realpathSync` before being used + * as a key, so symlinked paths (e.g. `/var/...` ↔ `/private/var/...` + * on macOS) collapse to the same entry instead of silently producing + * two parallel records. + * + * Divergence detection: if a tracked entry's recorded `contentHash` + * doesn't match the hash of the `oldContent` we received here, the + * file was changed out-of-band between AI's last write and this + * call (paste-replace via external editor, `git checkout`, manual + * save, ...). Reset `aiContribution` and `aiCreated` to 0/false + * before applying the new edit so prior AI work that the user + * since overwrote isn't credited to the next commit. + */ + recordEdit( + filePath: string, + oldContent: string | null, + newContent: string, + ): void { + const key = realpathOrSelf(filePath); + + const existing = this.fileAttributions.get(key); + const isNewFile = oldContent === null; + + let aiContribution = existing?.aiContribution ?? 0; + let aiCreated = existing?.aiCreated ?? false; + + // Fresh-file lifetime: if we have a prior tracked state for this + // path BUT the caller is reporting `oldContent === null` (the file + // didn't exist on disk at edit time), the previous tracking was + // for a since-deleted file at the same path — accumulating across + // distinct file lifetimes would credit AI for chars from the old + // file that no longer exist. Reset before counting the new + // contribution. Common path: AI creates `foo.ts` → user / shell + // `rm foo.ts` → AI re-creates `foo.ts` from scratch. + if (existing && isNewFile) { + aiContribution = 0; + aiCreated = false; + } + + // Out-of-band mutation: if we have a prior tracked state AND the + // input `oldContent` doesn't match the hash we recorded after + // AI's last write, the file diverged out-of-band (paste-replace + // via external editor, `git checkout`, manual save). Reset the + // accumulator before applying the new edit. + // + // Skip when `existing.contentHash` is empty: that's a legacy + // snapshot (pre-divergence-detection schema) where we never + // recorded the post-write hash. Comparing an empty hash to the + // actual file hash would always trip the reset and silently wipe + // AI work that's still on disk. + if (existing && existing.contentHash && oldContent !== null) { + const oldHash = computeContentHash(oldContent); + if (existing.contentHash !== oldHash) { + aiContribution = 0; + aiCreated = false; + } + } + + const contribution = computeCharContribution(oldContent ?? '', newContent); + aiContribution += contribution; + if (isNewFile) aiCreated = true; + + this.fileAttributions.set(key, { + aiContribution, + aiCreated, + contentHash: computeContentHash(newContent), + }); + } + + /** + * Re-hash each tracked file's content via a caller-supplied reader + * and drop entries whose hash doesn't match what AI's last write + * recorded. Catches the cases recordEdit's input-hash check can't + * see — i.e. the user (or another tool) modified the file entirely + * outside the Edit/Write tools, then committed it. Without this, + * the AI's stale aiContribution would attach to the human-only + * diff at commit time and credit AI for human work. + * + * `getContent(absPath)` returns the bytes the caller wants to + * compare against, or `null` if the entry shouldn't be checked + * (deletion, unreadable, file not in the relevant scope). Returning + * `null` leaves the entry alone rather than dropping it. + * + * Production caller (`attachCommitAttribution`) passes a reader + * that fetches the COMMITTED blob (`git show HEAD:`) for files + * actually in the just-made commit, returning null for everything + * else. The "committed blob" choice (rather than the live working + * tree) is what makes a `git add AI's edit && extra unstaged edits + * && git commit` flow correctly attribute the commit to AI even + * though the working-tree file no longer matches AI's recorded + * hash. + */ + validateAgainst(getContent: (absPath: string) => string | null): void { + for (const [key, attr] of this.fileAttributions) { + // Skip legacy entries that have no recorded post-write hash — + // we can't tell stale from fresh, so leave them alone. + if (!attr.contentHash) continue; + const current = getContent(key); + if (current === null) continue; // not a divergence signal + if (computeContentHash(current) !== attr.contentHash) { + debugLogger.debug( + `validateAgainst: dropping stale attribution for ${key} (hash diverged)`, + ); + this.fileAttributions.delete(key); + } + } + } + + // ----------------------------------------------------------------------- + // Prompt counting + // ----------------------------------------------------------------------- + + incrementPromptCount(): void { + this.promptCount++; + } + + getPromptCount(): number { + return this.promptCount; + } + + /** Prompts since last commit (for "N-shotted" display). */ + getPromptsSinceLastCommit(): number { + return this.promptCount - this.promptCountAtLastCommit; + } + + // ----------------------------------------------------------------------- + // Querying + // ----------------------------------------------------------------------- + + getAttributions(): Map { + const copy = new Map(); + for (const [k, v] of this.fileAttributions) { + copy.set(k, { ...v }); + } + return copy; + } + + getFileAttribution(filePath: string): FileAttribution | undefined { + // Canonicalise so callers don't have to know about the realpath + // normalization happening inside `recordEdit`. + const attr = this.fileAttributions.get(realpathOrSelf(filePath)); + return attr ? { ...attr } : undefined; + } + + hasAttributions(): boolean { + return this.fileAttributions.size > 0; + } + + getSurface(): string { + return this.surface; + } + + /** + * Clear file attribution data. Called after commit (success or failure). + * @param commitSucceeded If true, also updates the "at last commit" + * counters so getPromptsSinceLastCommit() resets to 0. + */ + clearAttributions(commitSucceeded: boolean = true): void { + if (commitSucceeded) { + this.promptCountAtLastCommit = this.promptCount; + } + this.fileAttributions.clear(); + } + + /** + * Clear attribution data for the specific files that just landed in + * a commit, leaving entries for files the user *didn't* include + * (partial commits, `git add A && git commit -m "..."`) intact so + * they're still credited on a later commit. Snapshots prompt + * counters since a commit did succeed. + * + * Inputs must already be canonical absolute paths. The caller + * should resolve repo-relative diff entries against a canonical + * (realpath'd) repo root rather than realpathing each leaf — at + * cleanup time the leaf for a just-deleted file no longer exists, + * so per-leaf `fs.realpathSync` would fail and fall back to a + * non-canonical path that misses the stored canonical key. + */ + clearAttributedFiles(committedAbsolutePaths: Set): void { + this.promptCountAtLastCommit = this.promptCount; + for (const p of committedAbsolutePaths) { + this.fileAttributions.delete(p); + } + } + + /** + * Snapshot the prompt counter as the new "last commit" without + * clearing per-file attribution. Used when a commit landed but we + * can't reliably determine which files were in it (multi-commit + * chain we won't write a note for, attribution toggle off, diff + * analysis failed). Wholesale-clearing in those branches would + * silently wipe pending AI edits for *unrelated* files the user + * didn't stage — a worse failure mode than the small risk of + * stale per-file state for files that did just land. + */ + noteCommitWithoutClearing(): void { + this.promptCountAtLastCommit = this.promptCount; + } + + /** + * Resolve a set of repo-relative file paths to the canonical absolute + * keys actually stored in the attribution map. Used by cleanup to + * partial-clear only the files that just landed in a commit. + * + * Matching by walking `fileAttributions` (instead of resolving each + * relative path with `path.resolve` + `fs.realpathSync`) is the only + * approach that handles all of: deleted files (where realpathSync + * throws), intermediate-symlink directories (where path.resolve only + * canonicalises the base), and renamed files (where the diff-time + * relative path differs from the recordEdit-time absolute path — + * still no match here, that's a rename-tracking concern handled + * separately). Each tracked key is canonical (recordEdit ran it + * through `realpathOrSelf`), so its computed relative form against + * the canonical repo root is what generateNotePayload uses too. + */ + matchCommittedFiles( + relativeFiles: Iterable, + canonicalRepoRoot: string, + ): Set { + const wanted = new Set(relativeFiles); + const matched = new Set(); + for (const key of this.fileAttributions.keys()) { + const rel = path + .relative(canonicalRepoRoot, key) + .split(path.sep) + .join('/'); + if (wanted.has(rel)) { + matched.add(key); + } + } + return matched; + } + + /** + * Move pending attribution across git renames before matching committed files. + * + * `recordEdit` stores attribution by canonical absolute path at edit time. + * If the user later commits `git mv old.ts new.ts`, git reports the committed + * file as `new.ts` while our map is still keyed by `old.ts`. Without moving + * the key first, the note either misses the AI-authored rename entirely or + * treats the old path as a deletion depending on diff settings. + */ + applyCommittedRenames( + renamedFiles: ReadonlyMap, + canonicalRepoRoot: string, + ): void { + if (renamedFiles.size === 0) return; + + // Build the new-key path using POSIX semantics so the joined + // string matches `git diff --name-only` output (which is always + // forward-slash) and isn't subject to `path.win32.join`'s + // backslash conversion. `realpathOrSelf` is what canonicalises + // back to the platform's actual storage form: on Windows it + // calls `fs.realpathSync` which accepts mixed slashes and returns + // backslash form, matching what `recordEdit` stored. + const posixRepoRoot = canonicalRepoRoot.split(path.sep).join('/'); + for (const [key, attr] of [...this.fileAttributions.entries()]) { + const rel = path + .relative(canonicalRepoRoot, key) + .split(path.sep) + .join('/'); + const renamedRel = renamedFiles.get(rel); + if (!renamedRel) continue; + + const renamedAbs = realpathOrSelf( + path.posix.join(posixRepoRoot, renamedRel), + ); + if (renamedAbs === key) continue; + + const existing = this.fileAttributions.get(renamedAbs); + if (existing) { + this.fileAttributions.set(renamedAbs, { + aiContribution: existing.aiContribution + attr.aiContribution, + aiCreated: existing.aiCreated || attr.aiCreated, + // Prefer the destination hash: if AI edited after the rename, + // that entry reflects the freshest post-write content. + contentHash: existing.contentHash || attr.contentHash, + }); + } else { + this.fileAttributions.set(renamedAbs, { ...attr }); + } + this.fileAttributions.delete(key); + } + } + + // ----------------------------------------------------------------------- + // Snapshot / restore (session persistence) + // ----------------------------------------------------------------------- + + /** Serialize current state for session persistence. */ + toSnapshot(): AttributionSnapshot { + const fileStates: Record = {}; + for (const [k, v] of this.fileAttributions) { + fileStates[k] = { ...v }; + } + return { + type: 'attribution-snapshot', + version: ATTRIBUTION_SNAPSHOT_VERSION, + surface: this.surface, + fileStates, + promptCount: this.promptCount, + promptCountAtLastCommit: this.promptCountAtLastCommit, + }; + } + + /** Restore state from a persisted snapshot. */ + restoreFromSnapshot(snapshot: AttributionSnapshot): void { + // The resume-time caller (client.ts) passes `snapshot` as a + // structural cast from `unknown`, so its TS-typed shape is only + // a hint — the actual runtime value can be anything (corrupted + // JSONL line, hand-edited session file, schema drift). Bail to + // a clean reset on any envelope-level shape mismatch: + // - non-object / null / array + // - wrong `type` discriminator + // - non-numeric `version` (after the `version ?? 1` default) + // - non-object `fileStates` + // Per-field coercion (sanitiseAttribution etc.) handles damage + // INSIDE a structurally valid snapshot; this gate stops a + // wholesale-wrong payload from polluting fileAttributions with + // garbage keys before per-field validation can run. + const isPlainObject = (v: unknown): v is Record => + typeof v === 'object' && v !== null && !Array.isArray(v); + const looksLikeSnapshot = + isPlainObject(snapshot) && + (snapshot as Record)['type'] === 'attribution-snapshot'; + if (!looksLikeSnapshot) { + this.fileAttributions.clear(); + this.surface = getClientSurface(); + this.promptCount = 0; + this.promptCountAtLastCommit = 0; + return; + } + // Future schema bumps land here. Treat absent `version` as 1 + // (the schema in production at the time this field was added) so + // existing on-disk snapshots restore cleanly. + const snapshotVersion = snapshot.version ?? 1; + if (snapshotVersion !== ATTRIBUTION_SNAPSHOT_VERSION) { + // Don't trust a stale shape — its fields may have moved or + // changed semantics. Reset to a fresh state rather than + // splice incompatible data. + this.fileAttributions.clear(); + this.surface = getClientSurface(); + this.promptCount = 0; + this.promptCountAtLastCommit = 0; + return; + } + + // `surface` is embedded verbatim in the git-notes payload and used + // as a Map/Record key downstream. A corrupted snapshot with a + // non-string value (e.g. `{}`, `42`, `null`) would coerce into + // strings like `[object Object]` and break the payload shape. + // Fall back to the current client surface when the stored value + // isn't a string. + this.surface = + typeof snapshot.surface === 'string' && snapshot.surface.length > 0 + ? snapshot.surface + : getClientSurface(); + // A corrupted or partially-written snapshot can leave numeric + // counters as `undefined`; without coercion, downstream + // `Math.min(undefined, n)` produces NaN that flows into the + // git-notes payload. Coerce per-field with a typed default. + this.promptCount = sanitiseCount(snapshot.promptCount); + this.promptCountAtLastCommit = sanitiseCount( + snapshot.promptCountAtLastCommit, + ); + // Enforce the invariant `atLastCommit <= total`: a corrupted / + // partially-written snapshot with the inverse would surface a + // negative `getPromptsSinceLastCommit()` and propagate as a + // "(-3)-shotted" trailer into PR descriptions. + if (this.promptCountAtLastCommit > this.promptCount) { + this.promptCountAtLastCommit = this.promptCount; + } + + this.fileAttributions.clear(); + // Reject a corrupted `fileStates` (e.g. an array, a string, or + // null) before iterating: `Object.entries()` would happily + // produce `[index, value]` pairs and seed fileAttributions with + // numeric-string keys. + const fileStates = isPlainObject(snapshot.fileStates) + ? snapshot.fileStates + : {}; + for (const [k, v] of Object.entries(fileStates)) { + // Re-canonicalise on restore so old snapshots (written before + // recordEdit started running keys through realpath) end up + // with the same shape as newly-recorded entries. If both the + // symlinked and canonical forms were stored under separate + // keys (e.g. a session straddling the canonicalisation fix), + // collapsing them onto the same canonical key MUST merge their + // attribution rather than overwrite — otherwise the second + // entry to land wins and the AI's accumulated contribution from + // the first form is silently dropped. + const canonicalKey = realpathOrSelf(k); + const incoming = sanitiseAttribution(v); + const existing = this.fileAttributions.get(canonicalKey); + if (existing) { + // Sum aiContribution and OR aiCreated. Pick the + // most-recently-recorded contentHash (incoming wins) so + // post-restore divergence checks compare against the freshest + // hash; an old form's stale hash would force unnecessary + // resets on the next recordEdit. + this.fileAttributions.set(canonicalKey, { + aiContribution: existing.aiContribution + incoming.aiContribution, + aiCreated: existing.aiCreated || incoming.aiCreated, + contentHash: incoming.contentHash || existing.contentHash, + }); + } else { + this.fileAttributions.set(canonicalKey, incoming); + } + } + } + + // ----------------------------------------------------------------------- + // Payload generation + // ----------------------------------------------------------------------- + + /** + * Generate the git notes JSON payload by combining tracked AI contributions + * with staged file information from git. + */ + generateNotePayload( + stagedInfo: StagedFileInfo, + baseDir: string, + generatorName?: string, + ): CommitAttributionNote { + const generator = sanitizeModelName( + generatorName ?? SANITIZED_GENERATOR_NAME, + ); + + const files: Record = {}; + const excludedGenerated: string[] = []; + let excludedGeneratedCount = 0; + const surfaceCounts: Record = {}; + let totalAiChars = 0; + let totalHumanChars = 0; + + // Build lookup: relative path → tracked AI contribution. Keys in + // `fileAttributions` are already canonical (recordEdit runs them + // through realpath); we only need to canonicalise `baseDir`, + // which comes from `git rev-parse --show-toplevel` and may be a + // symlink (e.g. macOS `/var` → `/private/var`). Without that + // canonicalisation `path.relative` would produce a `../...` key + // that never matches the diff output. Normalize separators to + // forward slashes so git paths line up on Windows. + const canonicalBase = realpathOrSelf(baseDir); + const aiLookup = new Map(); + for (const [absPath, attr] of this.fileAttributions) { + const rel = path + .relative(canonicalBase, absPath) + .split(path.sep) + .join('/'); + aiLookup.set(rel, attr); + } + + for (const relFile of stagedInfo.files) { + if (isGeneratedFile(relFile)) { + excludedGeneratedCount++; + // Cap the sample so a commit churning thousands of `dist/` + // artifacts can't blow past the 30 KB note budget. + if (excludedGenerated.length < MAX_EXCLUDED_GENERATED_SAMPLE) { + excludedGenerated.push(relFile); + } + continue; + } + + const tracked = aiLookup.get(relFile); + const diffSize = stagedInfo.diffSizes.get(relFile) ?? 0; + const isDeleted = stagedInfo.deletedFiles.has(relFile); + + let aiChars: number; + let humanChars: number; + + if (tracked) { + // Clamp aiChars to diffSize so aiChars+humanChars stays + // consistent with the committed change magnitude derived from + // `git diff --numstat`. Without this, cases where + // tracked.aiContribution exceeds the committed change size + // can leave aiChars > diffSize: humanChars then snaps to 0 + // but aiChars stays large, inflating the per-file total + // beyond what was committed. + aiChars = Math.min(tracked.aiContribution, diffSize); + humanChars = Math.max(0, diffSize - aiChars); + } else if (isDeleted) { + // Deleted files with no AI tracking are attributed entirely to + // the human. diffSize comes from `git diff --numstat` so empty + // deletions legitimately have diffSize=0 — a magic fallback + // would only inflate totals. + aiChars = 0; + humanChars = diffSize; + } else { + aiChars = 0; + humanChars = diffSize; + } + + const total = aiChars + humanChars; + const percent = total > 0 ? Math.round((aiChars / total) * 100) : 0; + + files[relFile] = { aiChars, humanChars, percent, surface: this.surface }; + totalAiChars += aiChars; + totalHumanChars += humanChars; + surfaceCounts[this.surface] = + (surfaceCounts[this.surface] ?? 0) + aiChars; + } + + const totalChars = totalAiChars + totalHumanChars; + const aiPercent = + totalChars > 0 ? Math.round((totalAiChars / totalChars) * 100) : 0; + + // Surface breakdown + const surfaceBreakdown: Record< + string, + { aiChars: number; percent: number } + > = {}; + for (const [surf, chars] of Object.entries(surfaceCounts)) { + surfaceBreakdown[surf] = { + aiChars: chars, + percent: totalChars > 0 ? Math.round((chars / totalChars) * 100) : 0, + }; + } + + return { + version: 1, + generator, + files, + summary: { + aiPercent, + aiChars: totalAiChars, + humanChars: totalHumanChars, + totalFilesTouched: Object.keys(files).length, + surfaces: [this.surface], + }, + surfaceBreakdown, + excludedGenerated, + excludedGeneratedCount, + promptCount: this.getPromptsSinceLastCommit(), + }; + } +} + +// --------------------------------------------------------------------------- +// Character contribution calculation (Claude's prefix/suffix algorithm) +// --------------------------------------------------------------------------- + +/** + * Compute the character contribution for a file modification. + * Uses common prefix/suffix matching to find the actual changed region, + * then returns the larger of the old/new changed lengths. + */ +export function computeCharContribution( + oldContent: string, + newContent: string, +): number { + if (oldContent === '' || newContent === '') { + return oldContent === '' ? newContent.length : oldContent.length; + } + + const minLen = Math.min(oldContent.length, newContent.length); + let prefixEnd = 0; + while ( + prefixEnd < minLen && + oldContent[prefixEnd] === newContent[prefixEnd] + ) { + prefixEnd++; + } + + let suffixLen = 0; + while ( + suffixLen < minLen - prefixEnd && + oldContent[oldContent.length - 1 - suffixLen] === + newContent[newContent.length - 1 - suffixLen] + ) { + suffixLen++; + } + + const oldChangedLen = oldContent.length - prefixEnd - suffixLen; + const newChangedLen = newContent.length - prefixEnd - suffixLen; + return Math.max(oldChangedLen, newChangedLen); +} diff --git a/packages/core/src/services/generatedFiles.test.ts b/packages/core/src/services/generatedFiles.test.ts new file mode 100644 index 000000000..a37d74289 --- /dev/null +++ b/packages/core/src/services/generatedFiles.test.ts @@ -0,0 +1,95 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { isGeneratedFile } from './generatedFiles.js'; + +describe('isGeneratedFile', () => { + it('should exclude lock files', () => { + expect(isGeneratedFile('package-lock.json')).toBe(true); + expect(isGeneratedFile('yarn.lock')).toBe(true); + expect(isGeneratedFile('pnpm-lock.yaml')).toBe(true); + expect(isGeneratedFile('Cargo.lock')).toBe(true); + }); + + it('should exclude minified files', () => { + expect(isGeneratedFile('app.min.js')).toBe(true); + expect(isGeneratedFile('styles.min.css')).toBe(true); + expect(isGeneratedFile('lib-min.js')).toBe(true); + }); + + it('should exclude files in dist/build directories', () => { + expect(isGeneratedFile('dist/bundle.js')).toBe(true); + expect(isGeneratedFile('build/output.js')).toBe(true); + expect(isGeneratedFile('src/.next/cache.js')).toBe(true); + }); + + // `.d.ts` files are commonly authored by hand (declaration files + // for projects without TS sources, ambient module declarations, + // asset shims like `*.d.ts` for `import './x.svg'`); the prior + // blanket exclusion silently dropped AI edits to those files. + // Auto-generated `.d.ts` (e.g. `tsc --declaration` output) tends + // to live under `/dist/` etc. and is still excluded by the + // directory rules. + it('should NOT exclude hand-authored TypeScript declaration files', () => { + expect(isGeneratedFile('types/index.d.ts')).toBe(false); + expect(isGeneratedFile('src/assets.d.ts')).toBe(false); + }); + + it('should still exclude .d.ts emitted into build directories', () => { + expect(isGeneratedFile('dist/index.d.ts')).toBe(true); + expect(isGeneratedFile('build/types.d.ts')).toBe(true); + }); + + it('should exclude generated code files', () => { + expect(isGeneratedFile('api.generated.ts')).toBe(true); + expect(isGeneratedFile('schema.pb.go')).toBe(true); + expect(isGeneratedFile('service.grpc.ts')).toBe(true); + }); + + it('should exclude vendor directories', () => { + expect(isGeneratedFile('vendor/lib/utils.js')).toBe(true); + expect(isGeneratedFile('node_modules/pkg/index.js')).toBe(true); + }); + + it('should NOT exclude normal source files', () => { + expect(isGeneratedFile('src/main.ts')).toBe(false); + expect(isGeneratedFile('lib/utils.js')).toBe(false); + expect(isGeneratedFile('README.md')).toBe(false); + expect(isGeneratedFile('package.json')).toBe(false); + expect(isGeneratedFile('src/components/Button.tsx')).toBe(false); + }); + + // Segment-boundary check: project dirs that *contain* a reserved + // word as a substring (e.g. `my-dist`, `xbuild`, `prebuild`) must + // not be caught by the directory rule. + it('should NOT exclude dirs that merely substring-match a reserved name', () => { + expect(isGeneratedFile('my-dist/file.ts')).toBe(false); + expect(isGeneratedFile('xbuild/core.ts')).toBe(false); + expect(isGeneratedFile('rebuild/notes.md')).toBe(false); + expect(isGeneratedFile('preout/x.ts')).toBe(false); + // The filename itself isn't subject to the directory rule. + expect(isGeneratedFile('src/dist.ts')).toBe(false); + }); + + // `.lock` is no longer a blanket exclusion — only the explicit + // EXCLUDED_FILENAMES (yarn.lock etc.) are dropped. + it('should NOT exclude unknown .lock files (only well-known ones)', () => { + expect(isGeneratedFile('config/feature.lock')).toBe(false); + // Sanity: known lockfiles still excluded. + expect(isGeneratedFile('yarn.lock')).toBe(true); + }); + + // Terraform: `.terraform.lock.hcl` is generated by `terraform init` + // and dominates Terraform-repo commits. Listed as a known + // generated lockfile in EXCLUDED_FILENAMES. + it('should exclude Terraform provider lockfile', () => { + expect(isGeneratedFile('.terraform.lock.hcl')).toBe(true); + expect(isGeneratedFile('infra/modules/network/.terraform.lock.hcl')).toBe( + true, + ); + }); +}); diff --git a/packages/core/src/services/generatedFiles.ts b/packages/core/src/services/generatedFiles.ts new file mode 100644 index 000000000..a8b4a7760 --- /dev/null +++ b/packages/core/src/services/generatedFiles.ts @@ -0,0 +1,157 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Generated / vendored file detection for attribution exclusion. + * Based on GitHub Linguist vendored patterns and common generated file patterns. + */ + +import { basename, extname, posix, sep } from 'node:path'; + +// Exact file name matches (case-insensitive) +const EXCLUDED_FILENAMES = new Set([ + 'package-lock.json', + 'yarn.lock', + 'pnpm-lock.yaml', + 'bun.lockb', + 'bun.lock', + 'composer.lock', + 'gemfile.lock', + 'cargo.lock', + 'poetry.lock', + 'pipfile.lock', + 'shrinkwrap.json', + 'npm-shrinkwrap.json', + // Terraform: provider lockfile generated by `terraform init`. In + // Terraform repos this file often dominates a commit's churn and + // would skew per-file AI ratios toward provider noise rather than + // the human-authored Terraform sources the note is meant to + // describe. + '.terraform.lock.hcl', +]); + +// File extension patterns (case-insensitive). Note: `.d.ts` is NOT +// listed here — `.d.ts` files are commonly authored by hand +// (declaration files for projects without TS sources, ambient module +// declarations, asset shims like `*.d.ts` for `import './x.svg'`), +// and treating every one as generated would silently drop AI edits +// to those files. Auto-generated `.d.ts` (e.g. `tsc --declaration` +// output) tends to live under `/dist/`, `/build/`, or `/out/`, +// which are already covered by `EXCLUDED_DIRECTORY_SEGMENTS`. +const EXCLUDED_EXTENSIONS = new Set([ + '.min.js', + '.min.css', + '.min.html', + '.bundle.js', + '.bundle.css', + '.generated.ts', + '.generated.js', +]); + +// Directory segments that indicate generated/vendored content. Compared +// against path segments (split on `/`) rather than substrings, so a +// project dir named `my-dist` or `xbuild` doesn't get caught by a +// `/dist/` substring match and silently drop AI attribution. +const EXCLUDED_DIRECTORY_SEGMENTS = new Set([ + 'dist', + 'build', + 'out', + 'output', + 'node_modules', + 'vendor', + 'vendored', + 'third_party', + 'third-party', + 'external', + '.next', + '.nuxt', + '.svelte-kit', + 'coverage', + '__pycache__', + '.tox', + 'venv', + '.venv', +]); + +// Multi-segment directory patterns that need contiguous matches +// (e.g. `target/release` and `target/debug` for Rust — `target` alone +// is too noisy as it's a common app name too). +const EXCLUDED_DIRECTORY_PATH_SUFFIXES = ['target/release', 'target/debug']; + +// Filename patterns using regex for more complex matching +const EXCLUDED_FILENAME_PATTERNS = [ + /^.*\.min\.[a-z]+$/i, + /^.*-min\.[a-z]+$/i, + /^.*\.bundle\.[a-z]+$/i, + /^.*\.generated\.[a-z]+$/i, + /^.*\.gen\.[a-z]+$/i, + /^.*\.auto\.[a-z]+$/i, + /^.*_generated\.[a-z]+$/i, + /^.*_gen\.[a-z]+$/i, + /^.*\.pb\.(go|js|ts|py|rb)$/i, + /^.*_pb2?\.py$/i, + /^.*\.pb\.h$/i, + /^.*\.grpc\.[a-z]+$/i, + /^.*\.swagger\.[a-z]+$/i, + /^.*\.openapi\.[a-z]+$/i, +]; + +/** + * Check if a file should be excluded from attribution based on Linguist-style rules. + * + * @param filePath - Relative file path from repository root + * @returns true if the file should be excluded from attribution + */ +export function isGeneratedFile(filePath: string): boolean { + const normalizedPath = + posix.sep + filePath.split(sep).join(posix.sep).replace(/^\/+/, ''); + const fileName = basename(filePath).toLowerCase(); + const ext = extname(filePath).toLowerCase(); + + if (EXCLUDED_FILENAMES.has(fileName)) { + return true; + } + + if (EXCLUDED_EXTENSIONS.has(ext)) { + return true; + } + + // Check for compound extensions like .min.js + const parts = fileName.split('.'); + if (parts.length > 2) { + const compoundExt = '.' + parts.slice(-2).join('.'); + if (EXCLUDED_EXTENSIONS.has(compoundExt)) { + return true; + } + } + + const normalizedPathLower = normalizedPath.toLowerCase(); + // Segment-boundary check: split on `/` and test each segment against + // EXCLUDED_DIRECTORY_SEGMENTS so `/repo/my-dist/file.ts` (a literal + // dir name) doesn't get caught by the `dist` rule the way a naïve + // `.includes('/dist/')` substring match could appear to suggest. + const segments = normalizedPathLower.split('/').filter(Boolean); + // The last segment is the filename — directory rules only apply to + // intermediate path components. + for (const seg of segments.slice(0, -1)) { + if (EXCLUDED_DIRECTORY_SEGMENTS.has(seg)) { + return true; + } + } + for (const suffix of EXCLUDED_DIRECTORY_PATH_SUFFIXES) { + if (normalizedPathLower.includes(`/${suffix}/`)) { + return true; + } + } + + for (const pattern of EXCLUDED_FILENAME_PATTERNS) { + if (pattern.test(fileName)) { + return true; + } + } + + return false; +} diff --git a/packages/core/src/tools/agent/agent-override.test.ts b/packages/core/src/tools/agent/agent-override.test.ts index 6185ce3a9..f0f97e46a 100644 --- a/packages/core/src/tools/agent/agent-override.test.ts +++ b/packages/core/src/tools/agent/agent-override.test.ts @@ -140,15 +140,10 @@ describe('createApprovalModeOverride bound-tool isolation', () => { expect(parent.getApprovalMode()).toBe(ApprovalMode.DEFAULT); - const child = await createApprovalModeOverride( - parent, - ApprovalMode.YOLO, - ); + const child = await createApprovalModeOverride(parent, ApprovalMode.YOLO); expect(child.getApprovalMode()).toBe(ApprovalMode.YOLO); - const childEdit = await child - .getToolRegistry() - .ensureTool(ToolNames.EDIT); + const childEdit = await child.getToolRegistry().ensureTool(ToolNames.EDIT); // eslint-disable-next-line @typescript-eslint/no-explicit-any const boundConfig = (childEdit as any).config as Config; expect(boundConfig.getApprovalMode()).toBe(ApprovalMode.YOLO); diff --git a/packages/core/src/tools/edit.test.ts b/packages/core/src/tools/edit.test.ts index e1cb47335..5bc53d495 100644 --- a/packages/core/src/tools/edit.test.ts +++ b/packages/core/src/tools/edit.test.ts @@ -30,6 +30,7 @@ import { ApprovalMode } from '../config/config.js'; import { createMockWorkspaceContext } from '../test-utils/mockWorkspaceContext.js'; import { FileReadCache } from '../services/fileReadCache.js'; import { StandardFileSystemService } from '../services/fileSystemService.js'; +import { CommitAttributionService } from '../services/commitAttribution.js'; describe('EditTool', () => { let tool: EditTool; @@ -500,6 +501,60 @@ describe('EditTool', () => { expect(display.fileName).toBe(testFile); }); + // The Edit tool feeds the commit-attribution singleton on success so + // commit notes can later report per-file AI/human ratios. Service- + // level tests for `recordEdit` already exist; these guard against + // the wiring at the tool boundary regressing (e.g. someone moves + // the call out of the success path). + describe('commit-attribution wiring', () => { + beforeEach(() => { + CommitAttributionService.resetInstance(); + }); + + it('records AI-originated edits in the attribution service', async () => { + const initial = 'old line'; + const updated = 'new line'; + fs.writeFileSync(filePath, initial, 'utf8'); + // Prior-read enforcement (origin/main #3774) requires the file + // to have been Read before Edit can mutate it. + seedPriorRead(filePath); + const invocation = tool.build({ + file_path: filePath, + old_string: 'old', + new_string: 'new', + }); + + await invocation.execute(new AbortController().signal); + + const attribution = + CommitAttributionService.getInstance().getFileAttribution(filePath); + expect(attribution).toBeDefined(); + // The actual char count is implementation detail of + // computeCharContribution; we only assert the entry exists + // with a positive contribution. + expect(attribution!.aiContribution).toBeGreaterThan(0); + // Length sanity: contribution is bounded by the new content. + expect(attribution!.aiContribution).toBeLessThanOrEqual(updated.length); + }); + + it('skips attribution when the edit is modified_by_user', async () => { + fs.writeFileSync(filePath, 'old line', 'utf8'); + seedPriorRead(filePath); + const invocation = tool.build({ + file_path: filePath, + old_string: 'old', + new_string: 'new', + modified_by_user: true, + }); + + await invocation.execute(new AbortController().signal); + + expect( + CommitAttributionService.getInstance().getFileAttribution(filePath), + ).toBeUndefined(); + }); + }); + it('should create a new file if old_string is empty and file does not exist, and return created message', async () => { const newFileName = 'brand_new_file.txt'; const newFilePath = path.join(rootDir, newFileName); diff --git a/packages/core/src/tools/edit.ts b/packages/core/src/tools/edit.ts index a19f19741..7ec717829 100644 --- a/packages/core/src/tools/edit.ts +++ b/packages/core/src/tools/edit.ts @@ -45,6 +45,7 @@ import type { ModifiableDeclarativeTool, ModifyContext, } from './modifiable-tool.js'; +import { CommitAttributionService } from '../services/commitAttribution.js'; import { safeLiteralReplace } from '../utils/textUtils.js'; import { countOccurrences, @@ -568,6 +569,15 @@ class EditToolInvocation implements ToolInvocation { }); } + // Track AI contribution for commit attribution + if (!this.params.modified_by_user) { + CommitAttributionService.getInstance().recordEdit( + this.params.file_path, + editData.currentContent, + editData.newContent, + ); + } + // Mark the cache entry written, capturing the post-write stats // so a follow-up Read sees `lastReadAt < lastWriteAt` and falls // through to the full pipeline instead of returning the diff --git a/packages/core/src/tools/shell.test.ts b/packages/core/src/tools/shell.test.ts index f3d329f9f..4e293cdae 100644 --- a/packages/core/src/tools/shell.test.ts +++ b/packages/core/src/tools/shell.test.ts @@ -35,9 +35,10 @@ import * as fs from 'node:fs'; import * as os from 'node:os'; import * as crypto from 'node:crypto'; import { ToolErrorType } from './tool-error.js'; -import { OUTPUT_UPDATE_INTERVAL_MS } from './shell.js'; +import { OUTPUT_UPDATE_INTERVAL_MS, parseNumstat } from './shell.js'; import { createMockWorkspaceContext } from '../test-utils/mockWorkspaceContext.js'; import { PermissionManager } from '../permissions/permission-manager.js'; +import { CommitAttributionService } from '../services/commitAttribution.js'; describe('ShellTool', () => { let shellTool: ShellTool; @@ -68,8 +69,10 @@ describe('ShellTool', () => { getTruncateToolOutputLines: vi.fn().mockReturnValue(0), getPermissionManager: vi.fn().mockReturnValue(undefined), getGeminiClient: vi.fn(), + getModel: vi.fn().mockReturnValue('qwen3-coder-plus'), getGitCoAuthor: vi.fn().mockReturnValue({ - enabled: true, + commit: true, + pr: true, name: 'Qwen-Coder', email: 'qwen-coder@alibabacloud.com', }), @@ -110,6 +113,9 @@ describe('ShellTool', () => { }), }; }); + + // Ensure attribution singleton is clean between tests + CommitAttributionService.resetInstance(); }); describe('isCommandAllowed', () => { @@ -1459,10 +1465,49 @@ describe('ShellTool', () => { ); }); - it('should not add co-author when disabled in config', async () => { - // Mock config with disabled co-author + it('should not add co-author when only pr is enabled (commit off)', async () => { + // Commit attribution must be independent from PR attribution: + // disabling commit should skip the Co-authored-by trailer even if + // pr remains enabled. (mockConfig.getGitCoAuthor as Mock).mockReturnValue({ - enabled: false, + commit: false, + pr: true, + name: 'Qwen-Coder', + email: 'qwen-coder@alibabacloud.com', + }); + + const command = 'git commit -m "Initial commit"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + it('should not add co-author when disabled in config', async () => { + // Mock config with commit co-author disabled + (mockConfig.getGitCoAuthor as Mock).mockReturnValue({ + commit: false, + pr: false, name: 'Qwen-Coder', email: 'qwen-coder@alibabacloud.com', }); @@ -1497,7 +1542,8 @@ describe('ShellTool', () => { it('should use custom name and email from config', async () => { // Mock config with custom co-author details (mockConfig.getGitCoAuthor as Mock).mockReturnValue({ - enabled: true, + commit: true, + pr: true, name: 'Custom Bot', email: 'custom@example.com', }); @@ -1531,7 +1577,12 @@ describe('ShellTool', () => { ); }); - it('should add co-author when git commit is prefixed with cd command', async () => { + // `cd /elsewhere && git commit` could be redirecting the commit + // into a different repo than our cwd. We can't take a meaningful + // pre-HEAD snapshot or write notes to the right place without + // resolving the cd target, so we conservatively skip the + // co-author rewrite altogether. + it('should NOT add co-author when git commit is preceded by cd', async () => { const command = 'cd /tmp/test && git commit -m "Test commit"'; const invocation = shellTool.build({ command, is_background: false }); const promise = invocation.execute(mockAbortSignal); @@ -1550,9 +1601,7 @@ describe('ShellTool', () => { await promise; expect(mockShellExecutionService).toHaveBeenCalledWith( - expect.stringContaining( - 'Co-authored-by: Qwen-Coder ', - ), + expect.not.stringContaining('Co-authored-by:'), expect.any(String), expect.any(Function), expect.any(AbortSignal), @@ -1561,6 +1610,887 @@ describe('ShellTool', () => { ); }); + // `cd subdir && git commit` (relative cd that doesn't escape + // upward) is a very common workflow — entering a subdirectory + // before committing. The cd target stays inside the same repo, + // so attribution should still apply. The earlier blanket + // "any cd shifts cwd" gate broke this; the heuristic now only + // marks shifted on absolute paths, `..`-prefixed paths, env-var + // expansions, etc. + it('should add co-author for cd subdir && git commit (relative same-repo)', async () => { + const command = 'cd src && git commit -m "Test commit"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // `cd ..` could escape the repo root — conservative shift. + // Embedded `..` traversal — `cd foo/../../escape` — could + // escape the repo just as much as a leading `..`, so the + // heuristic must reject it. Without this the trailer would + // be appended to a commit landing in a different repo. + it('should NOT add co-author for cd with embedded .. (escapes via traversal)', async () => { + const command = 'cd foo/../../escape && git commit -m "Test"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // `env` is a shell wrapper like `sudo`/`command`, with the + // additional twist that it accepts `KEY=VALUE` argv entries + // before the program. Without explicit handling, the regex + // would see `KEY=VALUE` as the program name and skip + // attribution entirely. + it('should add co-author when git commit is wrapped in env KEY=VAL', async () => { + const command = + 'env GIT_COMMITTER_DATE=now git commit -m "Test commit"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // `env -u NAME` unsets a variable. The flag takes a value, so + // tokeniseSegment has to skip it; otherwise NAME would be left + // as the next token and the parser would treat it as the + // program, masking the real `git commit`. + it('should add co-author when git commit is wrapped in env -u NAME', async () => { + const command = 'env -u GIT_AUTHOR_DATE git commit -m "Test commit"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // `GIT_DIR=...` and friends redirect git's repo selection; a + // commit prefixed with one of these lands in a different repo + // than our cwd. Stamping the trailer onto it would corrupt a + // commit in a repo the user didn't expect us to touch. + it.each([ + ['GIT_DIR', 'GIT_DIR=/tmp/other/.git git commit -m "msg"'], + ['GIT_WORK_TREE', 'GIT_WORK_TREE=/tmp/other git commit -m "msg"'], + ['GIT_COMMON_DIR', 'GIT_COMMON_DIR=/tmp/other git commit -m "msg"'], + [ + 'GIT_INDEX_FILE', + 'GIT_INDEX_FILE=/tmp/other/index git commit -m "msg"', + ], + [ + 'env-wrapped GIT_DIR', + 'env GIT_DIR=/tmp/other/.git git commit -m "msg"', + ], + // GNU coreutils 8.30+'s `env -C DIR` / `--chdir` relocates + // the working directory before exec — same repo-shifting + // contract as `cd /elsewhere && git commit`. + ['env -C', 'env -C /tmp/other git commit -m "msg"'], + ['env --chdir', 'env --chdir /tmp/other git commit -m "msg"'], + // Attached-value forms: `shell-quote` tokenises `--chdir=/tmp` + // and `-C/tmp` as single argv entries, so the bare-flag set + // membership check would miss them. Without explicit + // attached-form handling, `sudo --chdir=/tmp git commit` and + // `env -C/tmp git commit` would silently land our trailer on + // a commit in the wrong repo. + ['env --chdir=', 'env --chdir=/tmp/other git commit -m "msg"'], + ['env -C attached', 'env -C/tmp/other git commit -m "msg"'], + ['sudo --chdir=', 'sudo --chdir=/tmp/other git commit -m "msg"'], + ['sudo -D attached', 'sudo -D/tmp/other git commit -m "msg"'], + ])( + 'should NOT add co-author for repo-redirecting %s assignment', + async (_label, command) => { + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + await promise; + const observed = mockShellExecutionService.mock.calls[0][0]; + expect(observed).not.toContain('Co-authored-by:'); + }, + ); + + // GIT_AUTHOR_DATE / GIT_COMMITTER_DATE / etc. tweak commit + // metadata but don't relocate the repo — attribution still + // applies as normal. + it('should still add co-author with benign GIT_COMMITTER_DATE assignment', async () => { + const command = + 'GIT_COMMITTER_DATE="2026-01-01T00:00:00Z" git commit -m "Test commit"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + await promise; + const observed = mockShellExecutionService.mock.calls[0][0]; + expect(observed).toContain('Co-authored-by:'); + }); + + it('should NOT add co-author for cd .. && git commit (could escape repo)', async () => { + const command = 'cd .. && git commit -m "Test commit"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // `cd $HOME && git commit` would land in whatever repo `$HOME` + // points to — typically NOT our cwd. With the default + // `shell-quote` parse, `$HOME` collapses to `''` and the + // `target.includes('$')` repo-shift check silently fails. The + // env-preserving parse keeps `$NAME` literal in tokens so this + // case is correctly flagged. + it.each([ + ['$HOME', 'cd $HOME && git commit -m "elsewhere"'], + ['$REPO_ROOT', 'cd $REPO_ROOT && git commit -m "elsewhere"'], + ])( + 'should NOT add co-author for cd %s && git commit (env-var target)', + async (_label, command) => { + const invocation = shellTool.build({ + command, + is_background: false, + }); + const promise = invocation.execute(mockAbortSignal); + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + await promise; + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }, + ); + + // A cd that comes AFTER an in-cwd commit doesn't invalidate the + // commit's attribution — the commit already landed in our repo. + it('should add co-author when cd comes AFTER git commit', async () => { + const command = 'git commit -m "Test" && cd /tmp/test'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // `git -C commit` runs in , not our cwd — same risk + // as the cd case, so the rewrite should be skipped. Also covers + // the attached-value form `-C/path` (single token from + // shell-quote) and the long-flag attached forms + // `--git-dir=/path` / `--work-tree=/path`. + it.each([ + ['git -C /tmp/other commit', 'git -C /tmp/other commit -m "Other"'], + [ + 'git -C/tmp/other commit (attached)', + 'git -C/tmp/other commit -m "Other"', + ], + [ + 'git --git-dir=/tmp/other/.git commit', + 'git --git-dir=/tmp/other/.git commit -m "Other"', + ], + [ + 'git --work-tree=/tmp/other commit', + 'git --work-tree=/tmp/other commit -m "Other"', + ], + ])('should NOT add co-author for %s', async (_label, command) => { + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // `git -C .` (or `-C ./` or `-C .` attached as `-C.`) is a + // semantic no-op — the cwd doesn't actually change. The + // previous "any -C → cwd-shifted" rule silently skipped + // attribution for what's basically `git commit` with an + // explicit cwd marker. Treat dot-form as in-cwd. + it.each([ + ['git -C . commit', 'git -C . commit -m "in cwd"'], + ['git -C ./ commit', 'git -C ./ commit -m "in cwd"'], + ['git -C. commit (attached)', 'git -C. commit -m "in cwd"'], + ])('should add co-author for %s', async (_label, command) => { + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + await promise; + const observed = mockShellExecutionService.mock.calls[0][0]; + expect(observed).toContain('Co-authored-by:'); + }); + + // `shell-quote` parses an unresolved env-var (`$HOME`, `$REPO`) + // or unknown command-substitution as the empty string, which is + // indistinguishable from a literal `-C ""`. Treating that as + // no-op would let `git -C $HOME commit` silently land our trailer + // on a commit that goes to a different repo. Conservative skip is + // safer than the rare `-C $PWD` miss. + it.each([ + ['git -C $HOME commit', 'git -C $HOME commit -m "elsewhere"'], + ['git -C "" commit', 'git -C "" commit -m "literal empty"'], + ])( + 'should NOT add co-author for %s (env-var/empty target)', + async (_label, command) => { + const invocation = shellTool.build({ + command, + is_background: false, + }); + const promise = invocation.execute(mockAbortSignal); + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + await promise; + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }, + ); + + // Trailing shell comments must not confuse the `-m` rewrite: + // `git commit -m "real" # -m "fake"` would otherwise have + // `lastMatchOf` pick the comment's `-m "fake"` and splice the + // trailer into a `-m` flag bash discards, leaving the actual + // commit unattributed. The unquoted-`#` truncation in the + // segment slicing keeps the rewrite scoped to the live part. + it('should add co-author for git commit followed by # comment', async () => { + const command = 'git commit -m "real" # -m "fake"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0] as string; + // Trailer must land in the live `-m "real"` body, BEFORE the `#`. + expect(observed).toContain('Co-authored-by:'); + const realIdx = observed.indexOf('-m "real'); + const hashIdx = observed.indexOf(' # '); + const coAuthorIdx = observed.indexOf('Co-authored-by:'); + expect(realIdx).toBeGreaterThanOrEqual(0); + expect(hashIdx).toBeGreaterThan(realIdx); + expect(coAuthorIdx).toBeGreaterThan(realIdx); + expect(coAuthorIdx).toBeLessThan(hashIdx); + }); + + // A `#` inside a quoted commit body is NOT a comment marker. + // `git commit -m "fix #123"` should still get the trailer + // appended inside the quoted body. + it('should add co-author for git commit -m with # inside body', async () => { + const command = 'git commit -m "fix #123 add feature"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + const observed = mockShellExecutionService.mock.calls[0][0] as string; + expect(observed).toContain('Co-authored-by:'); + // The `#123` MUST still be inside the body (not pushed out by + // the comment-truncation logic mistaking it for a comment). + expect(observed).toContain('#123'); + }); + + // git's global flags (`-c`, `--no-pager`, etc.) push the + // subcommand past index 1; a fixed-position check at arg1 used + // to silently skip these forms. Make sure we still inject the + // trailer for them. + it('should add co-author for git -c key=val commit', async () => { + const command = 'git -c user.email=x@y commit -m "Test"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + it('should add co-author for git --no-pager commit', async () => { + const command = 'git --no-pager commit -m "Test"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // Common real-world prefixes — env-var assignment and `sudo` — must + // still be detected so attribution doesn't silently skip the trailer. + it('should add co-author when git commit is prefixed with env vars', async () => { + const command = 'GIT_COMMITTER_DATE=now git commit -m "Test"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // `sudo -u user git commit` puts the program at index [3], not + // [1]; a naive flag-only consumer would leave `user` standing + // in for the program name. + it('should add co-author for sudo with value-taking flag (-u user)', async () => { + const command = 'sudo -u other git commit -m "Test"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // git's `-m` can be passed multiple times — `git interpret-trailers` + // only recognises trailers that sit at the end of the *last* `-m` + // value, so the rewrite must target the last match. + it('should add Co-authored-by trailer to the LAST -m when multiple are present', async () => { + const command = 'git commit -m "Title" -m "Body line 1"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0]; + // The trailer must land inside the second `-m` quote pair, not + // the first; a simple way to assert this is that `Body line 1` + // and the trailer share the same closing quote. + expect(observed).toMatch( + /-m\s+"Body line 1\s+Co-authored-by: Qwen-Coder "/s, + ); + // And the first -m's title is unchanged. + expect(observed).toMatch(/-m\s+"Title"\s/); + }); + + // Concern: a literal `-m '...'` *inside* a quoted commit + // message body could be picked up by the regex as if it were a + // real later argument, splicing the trailer mid-message and + // breaking the command's quoting. + it('should not be fooled by a literal -m token inside the quoted message body', async () => { + const command = + 'git commit -m "docs mention -m \'flag\' for completeness"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0]; + // The original message body must be preserved end-to-end — + // no trailer spliced before its closing quote. + expect(observed).toContain( + "-m \"docs mention -m 'flag' for completeness", + ); + // The trailer must land AFTER the original body, just before + // the message's outer closing quote. + expect(observed).toMatch( + /docs mention -m 'flag' for completeness\s+Co-authored-by:[^"]+"/s, + ); + }); + + // Concern: a later `git tag -m "..."` in the same compound + // command could be mistaken for the commit message because the + // regex was matching across the whole command string. + it('should target the commit message, not a later git tag -m in the same chain', async () => { + const command = + 'git commit -m "fix" && git tag -a v1 -m "release notes"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0]; + // The trailer is appended to the commit message body... + expect(observed).toMatch(/git commit -m "fix\s+Co-authored-by:[^"]+"/s); + // ...and the later `git tag -m` is left exactly as the user + // wrote it. + expect(observed).toContain('git tag -a v1 -m "release notes"'); + // The tag annotation must not have a trailer spliced in. + const tagMatch = observed.match(/git tag .*-m "([^"]*)"/); + expect(tagMatch?.[1]).toBe('release notes'); + }); + + // The tool description recommends `git commit -m "$(cat <<'EOF' + // ... EOF)"` for multi-line messages. The body contains nested + // `"` from interior shell tokens — the regex would match only + // up to the first interior quote and splice the trailer + // mid-substitution, breaking the command. Bail explicitly. + it('should NOT rewrite -m bodies that contain $(...) command substitution', async () => { + const command = + 'git commit -m "$(cat <<\'EOF\'\nfix: title\n\ndetails\nEOF\n)"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0]; + // The original command must reach the executor unchanged. + expect(observed).toBe(command); + expect(observed).not.toContain('Co-authored-by:'); + }); + + // `--message` is git's documented long alias for `-m`. Without + // explicit handling the trailer would be silently skipped on + // commits that use the long form. + it('should add co-author for git commit --message "..."', async () => { + const command = 'git commit --message "Test commit"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + it('should add co-author for git commit --message="..."', async () => { + const command = 'git commit --message="Test commit"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + it('should add co-author when git commit is prefixed with sudo', async () => { + const command = 'sudo git commit -m "Test"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // Quoted "git commit" should not look like an executed commit. + it('should NOT add co-author when git commit appears only inside quoted text', async () => { + const command = 'echo "git commit -m foo"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Co-authored-by:'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // Bash's apostrophe-via-`'\''` form (close-escape-reopen) is a + // single logical body. The trailer must land at the FINAL + // closing `'` — not in the middle of the escape — so the regex + // body group has to recognise the escape sequence as a whole. + // Mirrors the bodySinglePattern in addAttributionToPR. + it("should append trailer after the final ' in -m 'don'\\''t' apostrophe-escape", async () => { + const command = "git commit -m 'don'\\''t'"; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0]; + // The full apostrophe-escape body survives intact and the + // trailer lands AFTER it (before the closing `'`), not in the + // middle of `'\''`. + expect(observed).toMatch( + /git commit -m 'don'\\''t[\s\S]*Co-authored-by:[^']*'/, + ); + }); + it('should add co-author to git commit with multi-line message', async () => { const command = `git commit -m "Fix bug @@ -1593,6 +2523,560 @@ describe('ShellTool', () => { {}, ); }); + + // Bash accepts `-mfoo` as well as `-m foo`. The previous regex + // required at least one whitespace and silently no-op'd on the + // shorthand form, so users who used `git commit -m"msg"` got no + // co-author trailer. + it('should add co-author to git commit -m"msg" shorthand (no space)', async () => { + const command = 'git commit -m"Quick fix"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining( + 'Co-authored-by: Qwen-Coder ', + ), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // Without escaping, a co-author name containing `$()`, backticks, + // or `"` would either break the user-approved `git commit` command + // or be evaluated as command substitution. + it('should escape shell metacharacters in name/email', async () => { + (mockConfig.getGitCoAuthor as Mock).mockReturnValue({ + commit: true, + pr: true, + name: 'Bot $(rm -rf /) `eval` "danger"', + email: 'bot@example.com', + }); + + const command = 'git commit -m "msg"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observedCmd = mockShellExecutionService.mock.calls[0][0]; + // Each metacharacter must be escaped, not literal. + expect(observedCmd).toContain('\\$'); + expect(observedCmd).toContain('\\`'); + expect(observedCmd).toContain('\\"'); + // The `-m "..."` quote pair must stay closed. + expect(observedCmd).toMatch(/-m\s+".+"/s); + }); + }); + + describe('addAttributionToPR', () => { + // Non-inline-body flows: `--body-file ` reads the body + // from a file on disk, `--fill` populates it from commit + // messages, and bare `gh pr create` opens an editor. None of + // these have a body argv we can splice the attribution into. + // We can't safely modify them automatically (would either + // mutate the user's file on disk or break the editor flow), + // so we leave the command untouched and rely on the debug + // warning to surface the skip when QWEN_DEBUG_LOG_FILE is set. + it.each([ + ['--body-file', 'gh pr create --title "x" --body-file /tmp/body.md'], + ['--fill', 'gh pr create --title "x" --fill'], + ['no body flag (editor)', 'gh pr create --title "x"'], + ])( + 'should leave gh pr create %s unchanged (non-inline-body flow)', + async (_label, command) => { + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0] as string; + expect(observed).toBe(command); + expect(observed).not.toContain('Generated with Qwen Code'); + }, + ); + + // `gh pr new` is a documented alias for `gh pr create`. Without + // explicit alias handling the rewrite silently misses it. + it('should append attribution to `gh pr new --body "..."` (alias form)', async () => { + const command = 'gh pr new --title "x" --body "Summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Generated with Qwen Code'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // Same `$(...)` bailout as addCoAuthorToGitCommit: a heredoc + // body must not have the trailer spliced in mid-substitution. + it('should NOT rewrite --body that contains $(...) command substitution', async () => { + const command = + 'gh pr create --title "x" --body "$(cat <<\'EOF\'\nSummary\nEOF\n)"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0]; + expect(observed).toBe(command); + expect(observed).not.toContain('Generated with Qwen Code'); + }); + + // `-b` is gh's documented short alias for `--body`. Without + // explicit handling the rewrite would silently miss it. + // `curl -b "session=abc" && gh pr create --body "summary"` — + // without segment scoping the body regex would match curl's + // `-b` cookie flag (since it's the same `-b "..."` shape) and + // inject attribution into the cookie value, breaking curl. + it('should NOT match -b in earlier non-gh segments of a compound', async () => { + const command = + 'curl -b "session=abc" https://example.com && gh pr create --title "x" --body "summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0]; + // curl's -b cookie value must be exactly preserved. + expect(observed).toContain('curl -b "session=abc"'); + // The trailer should land in gh's --body, not in curl's -b. + expect(observed).toMatch( + /gh pr create --title "x" --body "summary[\s\S]*Generated with Qwen Code"/, + ); + }); + + it('should append attribution to gh pr create -b "..." (short form)', async () => { + const command = 'gh pr create --title "x" -b "Summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Generated with Qwen Code'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // A `-b 'flag'` mention literally inside the outer `--body "..."` + // text must NOT be picked as the body argument: the trailer + // would land mid-body, corrupting the user-approved command. + // Mirrors addCoAuthorToGitCommit's nested-match check. + it('should pick the OUTER --body when an inner -b appears in body text', async () => { + const command = + 'gh pr create --title "x" --body "docs mention -b \'flag\' here"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + await promise; + + const calls = mockShellExecutionService.mock.calls; + const cmd = calls[calls.length - 1]?.[0] as string; + // The trailer must appear AFTER the closing `"` of the outer + // body, not between `flag` and `here`. + expect(cmd).toMatch( + /--body "docs mention -b 'flag' here[\s\S]*Generated with Qwen Code"/, + ); + expect(cmd).not.toMatch( + /-b 'flag[\s\S]*Generated with Qwen Code[\s\S]*' here"/, + ); + }); + + it('should append attribution to gh pr create --body when pr enabled', async () => { + const command = 'gh pr create --title "x" --body "Summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Generated with Qwen Code'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // gh CLI uses the *last* `--body` flag when multiple are + // provided. Splicing into the first one would silently drop + // attribution. Mirrors the matchAll/last-match behaviour in + // addCoAuthorToGitCommit. + it('should target the LAST --body when gh pr create has multiple', async () => { + const command = + 'gh pr create --title "x" --body "ignored" --body "real summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + await promise; + + const calls = mockShellExecutionService.mock.calls; + const cmd = calls[calls.length - 1]?.[0] as string; + expect(cmd).toMatch( + /--body "ignored" --body "real summary[\s\S]*Generated with Qwen Code/, + ); + // The trailer must NOT be inside the first --body. + expect(cmd).not.toMatch( + /--body "ignored[\s\S]*Generated with Qwen Code[\s\S]*" --body/, + ); + }); + + // `gh --repo owner/repo pr create` shifts pr/create past the + // fixed `tokens[1]/tokens[2]` slots; a literal-position check + // misses these forms. + it('should append attribution when gh has global flags before pr create', async () => { + const command = + 'gh --repo owner/repo pr create --title "x" --body "Summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Generated with Qwen Code'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // The `--body=value` (equals-sign) form is common with gh; the + // earlier `\s+` separator only matched `--body value`. + it('should append attribution to --body="..." equals-sign form', async () => { + const command = 'gh pr create --title "x" --body="Summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.stringContaining('Generated with Qwen Code'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // Quoted "gh pr create" should not look like an executed PR command. + it('should NOT rewrite when gh pr create appears only inside quoted text', async () => { + const command = 'echo "gh pr create --title x --body \\"Summary\\""'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Generated with Qwen Code'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + it('should skip PR attribution when pr is off even if commit is on', async () => { + // Commit and PR toggles must be independent. + (mockConfig.getGitCoAuthor as Mock).mockReturnValue({ + commit: true, + pr: false, + name: 'Qwen-Coder', + email: 'qwen-coder@alibabacloud.com', + }); + + const command = 'gh pr create --title "x" --body "Summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + expect(mockShellExecutionService).toHaveBeenCalledWith( + expect.not.stringContaining('Generated with Qwen Code'), + expect.any(String), + expect.any(Function), + expect.any(AbortSignal), + false, + {}, + ); + }); + + // Without escaping, a generator name containing `"`, `$`, or a + // backtick would either break the user-approved `gh pr create` + // command or be evaluated as command substitution. The fix was to + // shell-escape the appended text for the surrounding quote style. + it('should escape generator names with shell metacharacters in double-quoted body', async () => { + (mockConfig.getGitCoAuthor as Mock).mockReturnValue({ + commit: true, + pr: true, + // A name designed to break double-quote interpolation if not escaped. + name: 'Bot $(rm -rf /) "danger" `eval`', + email: 'bot@example.com', + }); + // Generator name only ends up in the attribution when shots > 0. + const svc = CommitAttributionService.getInstance(); + svc.incrementPromptCount(); + + const command = 'gh pr create --title "x" --body "Summary"'; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observedCmd = mockShellExecutionService.mock.calls[0][0]; + // Each metacharacter must be escaped, not literal. + expect(observedCmd).toContain('\\$'); + expect(observedCmd).toContain('\\"'); + expect(observedCmd).toContain('\\`'); + // And the original `--body` quote must still close properly + // (`s` flag — body contains newlines from the attribution). + expect(observedCmd).toMatch(/--body\s+".+"/s); + }); + + it('should escape single-quoted body containing apostrophes in generator name', async () => { + (mockConfig.getGitCoAuthor as Mock).mockReturnValue({ + commit: true, + pr: true, + name: "O'Brien-Bot", + email: 'bot@example.com', + }); + const svc = CommitAttributionService.getInstance(); + svc.incrementPromptCount(); + + const command = "gh pr create --title 'x' --body 'Summary'"; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observedCmd = mockShellExecutionService.mock.calls[0][0]; + // The bash close-escape-reopen trick yields `'\''` in place of `'`. + expect(observedCmd).toContain("O'\\''Brien-Bot"); + }); + + // A body that already uses bash's `'\''` apostrophe-escape form + // should be matched as a single complete argument so the trailer + // appends after the full body, not after the first quote-segment. + it("should match the full body across '\\\\'' apostrophe escapes", async () => { + const command = "gh pr create --title 'x' --body 'don'\\''t break me'"; + const invocation = shellTool.build({ command, is_background: false }); + const promise = invocation.execute(mockAbortSignal); + + resolveExecutionPromise({ + rawOutput: Buffer.from(''), + output: '', + exitCode: 0, + signal: null, + error: null, + aborted: false, + pid: 12345, + executionMethod: 'child_process', + }); + + await promise; + + const observed = mockShellExecutionService.mock.calls[0][0]; + // The original body content is preserved end-to-end. + expect(observed).toContain("don'\\''t break me"); + // The attribution lands AFTER the original body, not in the + // middle of it. + expect(observed).toMatch( + /don'\\''t break me[\s\S]*Generated with Qwen Code/, + ); + }); }); }); @@ -1943,6 +3427,45 @@ describe('ShellTool', () => { }); }); +describe('parseNumstat', () => { + it('parses text-diff entries as (additions + deletions) * 40', () => { + // Format: "\t\t" + const out = '2\t3\tsrc/main.ts'; + expect(parseNumstat(out).get('src/main.ts')).toBe(200); + }); + + it('uses a fixed fallback for binary entries (- - path)', () => { + const out = ['-\t-\tassets/logo.png', '5\t0\tsrc/main.ts'].join('\n'); + const sizes = parseNumstat(out); + // Binary file still lands in the map so attribution doesn't drop + // it via diffSize=0; exact size doesn't matter, the constant just + // needs to be > 0. + expect(sizes.get('assets/logo.png')).toBeGreaterThan(0); + expect(sizes.get('src/main.ts')).toBe(200); + }); + + it('normalizes brace rename notation to the new path', () => { + const out = '3\t1\tsrc/{old => new}/file.ts'; + expect([...parseNumstat(out).keys()]).toEqual(['src/new/file.ts']); + }); + + it('normalizes bare cross-directory rename to the new path', () => { + const out = '1\t1\told/dir/file.ts => new/dir/file.ts'; + expect([...parseNumstat(out).keys()]).toEqual(['new/dir/file.ts']); + }); + + it('ignores malformed lines instead of crashing', () => { + const out = [ + '', + 'garbage line', + '5\t2\tsrc/ok.ts', + 'a\tb\tsrc/bad.ts', + ].join('\n'); + const sizes = parseNumstat(out); + expect([...sizes.keys()]).toEqual(['src/ok.ts']); + }); +}); + describe('detectBlockedSleepPattern', () => { it('blocks standalone sleep >= 2s', () => { expect(detectBlockedSleepPattern('sleep 5')).toBe('standalone sleep 5'); diff --git a/packages/core/src/tools/shell.ts b/packages/core/src/tools/shell.ts index 74e472514..42e0f9fd8 100644 --- a/packages/core/src/tools/shell.ts +++ b/packages/core/src/tools/shell.ts @@ -8,6 +8,7 @@ import fs from 'node:fs'; import path from 'node:path'; import os from 'node:os'; import crypto from 'node:crypto'; +import * as childProcess from 'node:child_process'; import type { Config } from '../config/config.js'; import { ToolNames, ToolDisplayNames } from './tool-names.js'; import { ToolErrorType } from './tool-error.js'; @@ -24,6 +25,11 @@ import type { PermissionDecision } from '../permissions/types.js'; import { BaseDeclarativeTool, BaseToolInvocation, Kind } from './tools.js'; import { getErrorMessage } from '../utils/errors.js'; import { truncateToolOutput } from '../utils/truncation.js'; +import { + CommitAttributionService, + type StagedFileInfo, +} from '../services/commitAttribution.js'; +import { buildGitNotesCommand } from '../services/attributionTrailer.js'; import type { ShellExecutionConfig, ShellOutputEvent, @@ -37,9 +43,11 @@ import { isSubpaths } from '../utils/paths.js'; import { getCommandRoot, getCommandRoots, + getShellConfiguration, splitCommands, stripShellWrapper, } from '../utils/shell-utils.js'; +import { parse } from 'shell-quote'; import { createDebugLogger } from '../utils/debugLogger.js'; import { isShellCommandReadOnlyAST, @@ -48,6 +56,840 @@ import { const debugLogger = createDebugLogger('SHELL'); +/** + * Strip a single bare trailing `&` (bash background operator) from a + * command string. Returns the input unchanged if the trailing form is + * `&&` (logical AND), `\&` (escaped literal `&`), or there is no `&` + * at the end at all. Linear time, no regex backtracking risk. + */ +function stripTrailingBackgroundAmp(command: string): string { + const trimmed = command.trimEnd(); + if (!trimmed.endsWith('&')) return command; + if (trimmed.endsWith('&&')) return command; + if (trimmed.endsWith('\\&')) return command; + return trimmed.slice(0, -1).trimEnd(); +} + +/** + * Escape `s` so it is safe to interpolate inside a bash double-quoted + * string. Inside `"..."`, bash still interprets `$`, backtick, `\`, and + * `"`; escape those four. Newlines and other characters are literal. + */ +function escapeForBashDoubleQuote(s: string): string { + return s.replace(/[\\"$`]/g, '\\$&'); +} + +/** + * Escape `s` so it is safe to interpolate inside a bash single-quoted + * string. Bash single quotes have no escape mechanism — the standard + * trick is to close the quote, emit a backslash-escaped `'`, and reopen. + */ +function escapeForBashSingleQuote(s: string): string { + return s.replace(/'/g, "'\\''"); +} + +/** + * Return the LAST match from a RegExp.matchAll iterator, or `null` if + * the iterator is empty. Used to find the final `-m` / `--body` flag + * in a command segment: git/gh both honour the LAST occurrence when + * multiple are passed, so the trailer has to land in that match to be + * picked up by the actual commit / PR body. + */ +function lastMatchOf( + matches: IterableIterator, +): T | null { + let result: T | null = null; + for (const m of matches) result = m; + return result; +} + +/** + * Return the position of the first unquoted `#` (start-of-comment) in + * `s`, or -1 if none. Bash treats `#` as a comment marker only when it + * begins a word — at start of input or preceded by whitespace — and + * not when it appears inside a single- or double-quoted region. This + * mirrors that semantics so the `-m` / `--body` rewriters can scope + * their regex to the pre-comment part of a segment and avoid splicing + * the trailer into a comment-out flag like + * `git commit -m "real" # -m "fake"`, where the actual commit gets + * "real" but `lastMatchOf` would otherwise pick the comment's `-m + * "fake"` and put the trailer there. + */ +function findUnquotedCommentStart(s: string): number { + let inSingle = false; + let inDouble = false; + let i = 0; + while (i < s.length) { + const c = s[i]!; + if (c === '\\' && !inSingle && i + 1 < s.length) { + i += 2; + continue; + } + if (c === "'" && !inDouble) { + inSingle = !inSingle; + i++; + continue; + } + if (c === '"' && !inSingle) { + inDouble = !inDouble; + i++; + continue; + } + if (c === '#' && !inSingle && !inDouble) { + const prev = i === 0 ? '' : s[i - 1]!; + if (prev === '' || /\s/.test(prev)) return i; + } + i++; + } + return -1; +} + +/** + * Helpers for the nested-match-rejection logic shared between + * addCoAuthorToGitCommit and addAttributionToPR. Both functions pick + * the LAST `-m` / `--body` occurrence across two quote styles, but + * have to reject a candidate that's nested INSIDE the other's range + * — e.g. `git commit -m "docs mention -m 'flag'"` where the inner + * `-m 'flag'` lives entirely inside the outer `-m "..."`. Without + * the nesting check the inner (later) match would win and the + * trailer would land in the body text. + * + * Extracted to module scope so future bug fixes can't apply to only + * one of the two call sites. + */ +function matchSpan( + m: RegExpMatchArray | null, +): { start: number; end: number } | null { + return m ? { start: m.index ?? 0, end: (m.index ?? 0) + m[0].length } : null; +} + +function isMatchInside( + inner: RegExpMatchArray | null, + outer: RegExpMatchArray | null, +): boolean { + const i = matchSpan(inner); + const o = matchSpan(outer); + return !!(i && o && i.start >= o.start && i.end <= o.end); +} + +/** + * Pick the LAST non-nested match across two quote styles. Mirrors the + * algorithm both rewriters use: prefer whichever appears later in the + * segment, but if either match lives inside the other's range, take + * the OUTER one. Returns the chosen match plus a marker telling the + * caller which style won (so they can pick the right escape function). + */ +function pickOuterLastMatch( + doubleMatch: T, + singleMatch: T, +): { match: T; isDouble: boolean } { + if (doubleMatch && singleMatch) { + if (isMatchInside(singleMatch, doubleMatch)) { + return { match: doubleMatch, isDouble: true }; + } + if (isMatchInside(doubleMatch, singleMatch)) { + return { match: singleMatch, isDouble: false }; + } + return (doubleMatch.index ?? 0) > (singleMatch.index ?? 0) + ? { match: doubleMatch, isDouble: true } + : { match: singleMatch, isDouble: false }; + } + if (doubleMatch) return { match: doubleMatch, isDouble: true }; + return { match: singleMatch, isDouble: false }; +} + +/** + * Tokenise a single shell-command segment via `shell-quote`. Returns + * the parsed string tokens with leading env-var assignments and a + * small allowlist of safe wrappers (`sudo`, `command`, with their + * flag block consumed) stripped. Returns `null` if the segment + * doesn't parse — the caller should then skip the segment. + * + * Using `shell-quote.parse` (rather than a regex scan) is what makes + * quoted env values (`FOO="a b" cmd`) tokenise correctly and avoids + * the polynomial regex behaviour CodeQL flagged on the previous + * `\S*\s+`-based slicing loop. + */ +function tokeniseSegment(segment: string): string[] | null { + let tokens: string[]; + try { + // Pass an env getter that preserves `$NAME` references in tokens + // rather than collapsing them to `''` (shell-quote's default). + // Without this, `cd $HOME` parses as `['cd', '']` and the downstream + // `target.includes('$')` repo-shift detection silently fails: an + // env-var that points to another repo would get treated as a + // same-repo no-op and our Co-authored-by trailer would land on a + // commit in whatever repo `$HOME`/`$REPO_ROOT` resolves to at + // runtime. Same problem in `parseGitInvocation` for `git -C $HOME`. + // Single-quoted forms (`cd '$HOME'`) end up looking like a variable + // reference too, but in practice nobody creates a directory named + // literally `$HOME`, so over-flagging is the conservative-correct + // choice. + tokens = parse(segment, (key) => '$' + key).filter( + (t): t is string => typeof t === 'string', + ); + } catch (e) { + debugLogger.warn( + `tokeniseSegment: parse failed for "${segment.slice(0, 80)}": ${ + e instanceof Error ? e.message : String(e) + }`, + ); + return null; + } + let i = 0; + // Skip env-var assignments (KEY=value). If the key is one of the + // git-repo-redirecting variables, refuse to tokenise the segment at + // all: `GIT_DIR=elsewhere/.git git commit ...` runs against another + // repository, so treating it as an in-cwd commit and stamping our + // attribution onto it would be wrong (and a `Co-authored-by` trailer + // would land on a commit in a repo the user didn't expect us to touch). + while (i < tokens.length) { + const key = leadingEnvAssignmentKey(tokens[i]!); + if (key === null) break; + if (GIT_ENV_SHIFTS_REPO.has(key)) return null; + i++; + } + // Strip a single safe wrapper, then any leading flag tokens it + // took. Sudo's value-taking flags (`-u user`, `-g group`, + // `-h host`, `-D path`, `-r role`, `-t type`) consume the next + // argv slot, so without explicitly knowing which take values we'd + // leave e.g. `user` standing in for the program in + // `sudo -u user git commit ...`. `command` doesn't take any flag + // values. `env` accepts both flags (`-i`, `-S`, `-u name`) AND + // `KEY=VALUE` argv entries before the program — both need + // skipping so `env GIT_COMMITTER_DATE=now git commit ...` resolves + // to `git`. + if (tokens[i] === 'sudo' || tokens[i] === 'command' || tokens[i] === 'env') { + const wrapper = tokens[i]; + i++; + while (i < tokens.length && tokens[i]!.startsWith('-')) { + const flag = tokens[i]!; + i++; + // `env -C DIR` / `env --chdir DIR` (GNU coreutils 8.30+) and + // `sudo -D DIR` / `sudo --chdir DIR` (Linux sudo with --chdir) + // both relocate the working directory before exec. Treat the + // segment as repo-shifting (same contract as a leading + // `GIT_DIR=...` assignment) so we don't stamp our trailer onto + // a commit that landed in a different repository. + // + // Also catch the attached-value forms `--chdir=DIR` and the + // short-form `-CDIR` / `-DDIR` that shell-quote tokenises as a + // single argv entry. Without this, `sudo --chdir=/tmp git + // commit` and `env -C/tmp git commit` would both pass through + // the bare-flag check (which is set-membership, not prefix- + // match) and silently land our trailer on a commit in the + // wrong repo. + const shiftSet = + wrapper === 'env' + ? ENV_FLAGS_SHIFT_CWD + : wrapper === 'sudo' + ? SUDO_FLAGS_SHIFT_CWD + : null; + if (shiftSet && isShiftCwdFlag(flag, shiftSet)) { + return null; + } + // Value-taking flag tables, per wrapper: `sudo -u user`, + // `env -u NAME` (unset), `env -S string` (split-string args). + // `command` has no value-taking options in this allowlist. + // Without skipping the value, `env -u FOO git commit ...` + // would leave `FOO` as `tokens[0]` and the parser would treat + // it as the program — masking the real `git commit`. + const takesValue = + (wrapper === 'sudo' && SUDO_FLAGS_WITH_VALUE.has(flag)) || + (wrapper === 'env' && ENV_FLAGS_WITH_VALUE.has(flag)); + if (takesValue && i < tokens.length) { + i++; + } + } + // `env` puts KEY=VALUE pairs between its flags and the real + // program, so skip those too. Same git-repo-redirect bail as + // above applies — a `env GIT_DIR=elsewhere git commit` segment + // is non-attributable. + if (wrapper === 'env') { + while (i < tokens.length) { + const key = leadingEnvAssignmentKey(tokens[i]!); + if (key === null) break; + if (GIT_ENV_SHIFTS_REPO.has(key)) return null; + i++; + } + } + } + return tokens.slice(i); +} + +const SUDO_FLAGS_WITH_VALUE = new Set([ + '-u', + '-g', + '-h', + '-D', + '-r', + '-t', + '-C', + '--user', + '--group', + '--host', + '--chdir', + '--role', + '--type', +]); + +// `env`'s value-taking flags. `-u NAME` unsets a variable; +// `-S "string"` splits a single string into args. Without skipping +// the value, `env -u FOO git commit ...` would leave `FOO` as the +// next token and the parser would treat it as the program. +const ENV_FLAGS_WITH_VALUE = new Set(['-u', '--unset', '-S', '--split-string']); + +// `env`'s flags that relocate the working directory (and therefore +// the implicit repository) before exec — GNU coreutils 8.30+'s +// `-C DIR` / `--chdir DIR`. A `git commit` inside such an env wrapper +// runs against whatever repo lives at DIR, NOT our cwd, so we must +// refuse the segment outright the same way `cd /elsewhere && git +// commit` is refused. Returning null from tokeniseSegment makes the +// segment non-attributable, which suppresses both trailer injection +// and the per-file note. +const ENV_FLAGS_SHIFT_CWD = new Set(['-C', '--chdir']); + +// `sudo`'s flags that relocate the working directory before exec. +// Linux sudo's `-D DIR` / `--chdir DIR` (1.9.2+) makes the inner +// command run in DIR, which means a `git commit` underneath it +// targets DIR's repo, not ours. Refuse the segment. +const SUDO_FLAGS_SHIFT_CWD = new Set(['-D', '--chdir']); + +/** + * Match a flag token against a SHIFT_CWD set, including attached-value + * forms. Bare `--chdir`/`-D`/`-C` are caught by direct set membership; + * the long attached form `--name=value` matches when `--name` is in the + * set, and the short attached form `-Xvalue` matches when `-X` is in + * the set AND the token is longer than the flag (so `-D` alone doesn't + * spuriously match `-D` against itself twice). + */ +function isShiftCwdFlag(flag: string, set: ReadonlySet): boolean { + if (set.has(flag)) return true; + for (const f of set) { + if (f.startsWith('--') && flag.startsWith(f + '=')) return true; + if ( + f.length === 2 && + f.startsWith('-') && + flag.startsWith(f) && + flag.length > 2 + ) { + return true; + } + } + return false; +} + +/** + * Environment variables that redirect git's repository selection. A + * leading `GIT_DIR=...`, `GIT_WORK_TREE=...`, etc. on a command makes + * the inner `git commit` operate on a different repo than our cwd + * suggests; treating it as an in-cwd commit would attach our + * `Co-authored-by` trailer (and per-file note) to the wrong + * repository. tokeniseSegment refuses to parse such segments so the + * caller skips them. + * + * Identity / date variables (`GIT_AUTHOR_*`, `GIT_COMMITTER_*`) are + * deliberately NOT in this set — they tweak the commit's metadata + * but don't move it to another repo, so attribution is still + * meaningful. + */ +// `GIT_NAMESPACE` is intentionally NOT here: it prefixes ref names +// within the same repository, but the working tree and object store +// are unchanged, so a `git commit` under it still lands in our cwd's +// repo. The set covers ONLY variables that change which on-disk +// repository git acts on. +const GIT_ENV_SHIFTS_REPO = new Set([ + 'GIT_DIR', + 'GIT_WORK_TREE', + 'GIT_COMMON_DIR', + 'GIT_INDEX_FILE', +]); + +/** + * Match the `KEY=` prefix of a `KEY=value` token and return KEY, + * or null if the token isn't a leading env-var assignment. Centralised + * so the leading-env-strip and the env-wrapper KEY=VALUE strip share + * the same parsing. + */ +function leadingEnvAssignmentKey(token: string): string | null { + const m = /^([A-Za-z_][A-Za-z0-9_]*)=/.exec(token); + return m ? m[1]! : null; +} + +/** + * Walk a `git ...` token sequence past git's global flags + * (`-c key=val`, `-C path`, `--no-pager`, `--git-dir`, `--work-tree`, + * `--namespace`, etc.) to find the actual subcommand. Without this, + * `git -c k=v commit -m x` and `git --no-pager commit -m x` would + * silently slip past a fixed-position check at index 1. + * + * `changesCwd` is true when any of the consumed flags would relocate + * the working directory (`-C`, `--git-dir`, `--work-tree`). + */ +// Two-token global flags whose second token is consumed as a value. +const GIT_GLOBAL_FLAGS_TAKES_VALUE = new Set([ + '-c', + '-C', + '--git-dir', + '--work-tree', + '--namespace', + '--exec-path', + '--config-env', + '--super-prefix', + '--list-cmds', +]); +// Flags whose presence shifts cwd interpretation. +const GIT_GLOBAL_FLAGS_SHIFTS_CWD = new Set(['-C', '--git-dir', '--work-tree']); + +// `-C .` (and `./`, attached `-C.`) are no-op cwd shifts; treating +// them as cwd-changing would suppress attribution for `git -C . commit` +// (a common alias for "explicit current dir"). +// +// Empty string is intentionally NOT treated as no-op even though +// `-C "" commit` is technically a no-op — `shell-quote` returns '' +// for any env-var or command-substitution that it cannot resolve at +// parse time (e.g. `-C $HOME`, `-C $REPO_ROOT`, `-C $UNSET`), so +// the literal-empty and the unknown-env-var cases are +// indistinguishable from our static view. Treating them as no-op +// would silently stamp our Co-authored-by trailer onto a commit +// that lands in whatever repo `$HOME`/`$REPO_ROOT` resolves to at +// runtime. Conservative skip is the safer call; the only missed +// attribution is for `-C $PWD commit` (rare) and literal `-C "" +// commit` (malformed and won't actually commit). +// +// Same conservatism applies to literal absolute paths that happen +// to resolve to cwd at runtime — we only have the argv at parse +// time, so the cheap textual comparison is what we can reasonably +// check here. +function isNoopCwdTarget(target: string): boolean { + const t = target.trim(); + return t === '.' || t === './'; +} + +function parseGitInvocation(tokens: string[]): { + subcommand: string | undefined; + changesCwd: boolean; +} { + let i = 1; // skip 'git' + let changesCwd = false; + while (i < tokens.length) { + const t = tokens[i]!; + if (GIT_GLOBAL_FLAGS_TAKES_VALUE.has(t)) { + const value = tokens[i + 1] ?? ''; + // For `-C` specifically, the value is the new cwd. `-C .` is + // a no-op so don't flip changesCwd. (`--git-dir`/`--work-tree` + // path arguments aren't cwd in the same sense — leave those + // unconditional.) + if (t === '-C') { + if (!isNoopCwdTarget(value)) changesCwd = true; + } else if (GIT_GLOBAL_FLAGS_SHIFTS_CWD.has(t)) { + changesCwd = true; + } + i += 2; + continue; + } + // Attached-value form: `--git-dir=path`, `--work-tree=path`, etc. + if (t.startsWith('--git-dir=') || t.startsWith('--work-tree=')) { + changesCwd = true; + i++; + continue; + } + // Attached-value form for `-C`: `git -C/path commit ...` and + // `git -C. commit ...`. Git accepts both `-C path` (handled + // above by TAKES_VALUE) and the concatenated form. shell-quote + // tokenises the latter as a single `-Cpath` token. + if (t.length > 2 && t.startsWith('-C')) { + const value = t.slice(2); + if (!isNoopCwdTarget(value)) changesCwd = true; + i++; + continue; + } + // Other long/short flag (no separate arg, e.g. --no-pager, + // --version, --bare, -p). + if (t.startsWith('-')) { + i++; + continue; + } + // First non-flag is the subcommand. + return { subcommand: t, changesCwd }; + } + return { subcommand: undefined, changesCwd }; +} + +/** + * Classify whether a command chain (potentially compound) contains a + * `git commit` invocation, and whether that invocation lands in the + * tool's initial cwd. + * + * Two flags are returned because the answers feed different decisions: + * - `hasCommit` is the broader "did the user try to commit anywhere + * in this chain?" — used to refuse background mode and to gate + * prompt-counter snapshotting. + * - `attributableInCwd` is the stricter "is it safe to capture HEAD + * in our cwd and write a note to that repo?" — used by the actual + * trailer rewrite and git-notes write. + * + * Walks segments in order so a `cd` AFTER an in-cwd commit doesn't + * invalidate that commit's attribution; only a `cd` (or `git -C` / + * `--git-dir` / `--work-tree`) BEFORE the commit shifts safety. + * + * `cwdShifted` is intentionally a one-way latch — it isn't reset on + * a subsequent `cd .` or `cd ..`, so harmless cd cycles like + * `cd src && cd .. && git commit -m x` will conservatively skip + * attribution. The trade-off matches the wrong-repo guard's intent + * (better miss than corrupt unrelated repos). + */ +function gitCommitContext(command: string): { + hasCommit: boolean; + attributableInCwd: boolean; +} { + let hasCommit = false; + let attributable = false; + let cwdShifted = false; + + for (const sub of splitCommands(command)) { + const tokens = tokeniseSegment(sub); + if (!tokens || tokens.length === 0) continue; + + const program = tokens[0]!; + + if (program === 'cd' || program === 'pushd') { + // A cd / pushd before any commit might redirect a later + // `git commit` into a different repo. A cd AFTER the commit + // doesn't matter for the commit we already saw. + // + // A heuristic relaxation: relative cd targets that don't escape + // upward (no `..`, no absolute path, no env-var/$home expansion) + // almost always stay within the same repo. The very common + // `cd subdir && git commit -m "..."` flow is the motivating case + // — same repo, same toplevel, attribution is still safe. Only + // mark as shifted when the target *could* land us in a different + // repo. We can't be 100% certain without running `git rev-parse + // --show-toplevel` after the cd, which would require a synchronous + // fs/exec call that the rest of this walk avoids — the heuristic + // covers the common case and stays conservative on the rest. + if (!hasCommit && cdTargetMayChangeRepo(tokens)) cwdShifted = true; + continue; + } + if (program === 'popd') { + // `popd` returns to a previous directory in the bash dir-stack. + // Without tracking the stack we can't know whether the resulting + // cwd is the same repo or a different one — treat conservatively + // as a shift before any commit. + if (!hasCommit) cwdShifted = true; + continue; + } + + if (program === 'git') { + const { subcommand, changesCwd } = parseGitInvocation(tokens); + if (subcommand === 'commit') { + hasCommit = true; + // The commit lands in our cwd only if no preceding cd shifted + // us and this very invocation didn't redirect via -C/--git-dir. + if (!cwdShifted && !changesCwd) attributable = true; + } else if (changesCwd && !hasCommit) { + // `git -C /path status` and friends signal cwd-elsewhere + // intent; subsequent in-cwd commits in this chain are unusual + // enough to be conservative about. + cwdShifted = true; + } + } + } + + return { hasCommit, attributableInCwd: attributable }; +} + +/** + * Walk a `gh ...` token sequence past gh's global flags + * (`--repo owner/repo`, `--hostname host`, `--help`, `--version`) and + * return the resulting subcommand chain. Same purpose as + * `parseGitInvocation`: a fixed-position check at index 1 misses + * `gh --repo owner/repo pr create ...`, which is a common form. + */ +const GH_GLOBAL_FLAGS_TAKES_VALUE = new Set(['--repo', '-R', '--hostname']); + +function parseGhInvocation(tokens: string[]): string[] { + let i = 1; // skip 'gh' + while (i < tokens.length) { + const t = tokens[i]!; + if (GH_GLOBAL_FLAGS_TAKES_VALUE.has(t)) { + i += 2; + continue; + } + if ( + t.startsWith('--repo=') || + t.startsWith('--hostname=') || + t.startsWith('-R=') + ) { + i++; + continue; + } + if (t.startsWith('-')) { + i++; + continue; + } + return tokens.slice(i); + } + return []; +} + +/** + * Heuristic: does this `cd` invocation potentially redirect us into + * a different repository? Used by `gitCommitContext` to decide + * whether a subsequent `git commit` in the same chain is still + * attributable in our cwd. + * + * Returns true (conservative — assume shift) when the target is + * absolute, escapes upward (`..`), goes to `$HOME` / `~`, contains an + * env-var (we can't resolve it statically), or is missing entirely + * (`cd` alone goes to `$HOME`). Plain relative paths like `cd src`, + * `cd ./packages/foo`, or `cd subdir/nested` are treated as in-repo. + */ +function cdTargetMayChangeRepo(tokens: string[]): boolean { + // tokens[0] is 'cd'. The next non-flag token is the target. + let i = 1; + while (i < tokens.length && tokens[i]!.startsWith('-')) i++; + const target = tokens[i]; + // `cd` with no argument goes to $HOME. + if (target === undefined) return true; + if (target.startsWith('/')) return true; + if (target.startsWith('~')) return true; + // Env-var reference (e.g. `$HOME`, `$REPO`) — can't resolve here. + if (target.includes('$')) return true; + // `..`, `../..`, `..\\foo` etc. could escape the repo root. + if (target === '..') return true; + if (target.startsWith('../') || target.startsWith('..\\')) return true; + // Embedded parent-dir traversal can also escape: `foo/../../escape`, + // `./..`, `nested/..`, etc. Catching `/..` and `\..` anywhere in + // the path covers both POSIX and Windows separators without + // false-positiving on legitimate names that happen to contain `..` + // (which only escape when followed by a separator). + if (target.includes('/..') || target.includes('\\..')) return true; + // `-` is bash's "previous directory" — could be anywhere. + if (target === '-') return true; + return false; +} + +/** + * Detect whether the attributable `git commit` invocation in + * `command` carries the `--amend` flag. Used so attachCommitAttribution + * can switch the diff range from `${postHead}~1..${postHead}` (the + * amended commit vs its parent — too broad for amend, since the + * amended commit's parent is the original commit's parent, so this + * diff lumps both commits' worth of changes) to + * `${preHead}..${postHead}` (the actual amend delta — `preHead` was + * captured synchronously before spawn and is the pre-amend SHA). + * + * Only the *first* commit segment that runs in the same cwd as the + * shell tool counts. `git -C ../other commit --amend && git commit -m x` + * must not flip the diff range for the second (fresh) commit, since + * `preHead` would be the inner repo's SHA there, not ours. + */ +function isAmendCommit(command: string): boolean { + let cwdShifted = false; + for (const sub of splitCommands(command)) { + const tokens = tokeniseSegment(sub); + if (!tokens || tokens.length === 0) continue; + const program = tokens[0]!; + if (program === 'cd' || program === 'pushd') { + if (!cwdShifted && cdTargetMayChangeRepo(tokens)) cwdShifted = true; + continue; + } + if (program === 'popd') { + cwdShifted = true; + continue; + } + if (program !== 'git') continue; + const { subcommand, changesCwd } = parseGitInvocation(tokens); + if (subcommand === 'commit' && !cwdShifted && !changesCwd) { + return ( + tokens.includes('--amend') || + tokens.some((t) => t.startsWith('--amend=')) + ); + } + if (changesCwd && !cwdShifted) cwdShifted = true; + } + return false; +} + +/** + * Locate the character range of the *first* attributable + * `git commit` invocation in the (potentially compound) command, or + * `null` if none is attributable in the current cwd. The range + * covers the segment as `splitCommands` tokenised it — i.e. just + * the `git commit ...` part, NOT later `&& git tag -m ...` or + * earlier `git status &&` segments. + * + * Used by `addCoAuthorToGitCommit` to scope the `-m` regex rewrite + * so a later `git tag -m "..."` (different sub-command in the same + * compound) can't be mistaken for the commit message. + */ +function findAttributableCommitSegment( + command: string, +): { start: number; end: number } | null { + let cursor = 0; + let cwdShifted = false; + for (const sub of splitCommands(command)) { + const start = command.indexOf(sub, cursor); + if (start < 0) { + // splitCommands strips line continuations (`\`) and + // some whitespace, so the trimmed segment text may not appear + // verbatim in the original command. Log so a multi-line + // command silently dropping its trailer is at least visible + // when QWEN_DEBUG_LOG_FILE is set. + debugLogger.warn( + `findAttributableCommitSegment: cannot map segment "${sub.slice(0, 60)}" ` + + `back to the original command (likely line-continuation / whitespace mismatch).`, + ); + continue; + } + const end = start + sub.length; + cursor = end; + const tokens = tokeniseSegment(sub); + if (!tokens || tokens.length === 0) continue; + const program = tokens[0]!; + if (program === 'cd' || program === 'pushd') { + // Mirror gitCommitContext's cd/pushd heuristic: relative paths + // that don't escape upward are treated as in-repo, so + // `cd subdir && git commit ...` still finds the segment. + if (!cwdShifted && cdTargetMayChangeRepo(tokens)) cwdShifted = true; + continue; + } + if (program === 'popd') { + cwdShifted = true; + continue; + } + if (program === 'git') { + const { subcommand, changesCwd } = parseGitInvocation(tokens); + if (subcommand === 'commit' && !cwdShifted && !changesCwd) { + return { start, end }; + } + if (changesCwd && !cwdShifted) cwdShifted = true; + } + } + return null; +} + +/** + * Locate the character range of the `gh pr create` (or alias + * `gh pr new`) segment in a potentially compound command. Used by + * `addAttributionToPR` so the `--body`/`-b` rewrite is scoped to + * just that segment — without scoping, a command like + * `curl -b "session=abc" && gh pr create --body "summary"` would + * have the regex match `curl`'s `-b` cookie flag and inject + * attribution there. + */ +function findGhPrCreateSegment( + command: string, +): { start: number; end: number } | null { + let cursor = 0; + for (const sub of splitCommands(command)) { + const start = command.indexOf(sub, cursor); + if (start < 0) { + debugLogger.warn( + `findGhPrCreateSegment: cannot map segment "${sub.slice(0, 60)}" ` + + `back to the original command (likely line-continuation / whitespace mismatch).`, + ); + continue; + } + const end = start + sub.length; + cursor = end; + const tokens = tokeniseSegment(sub); + if (!tokens || tokens[0] !== 'gh') continue; + const rest = parseGhInvocation(tokens); + if (rest[0] === 'pr' && (rest[1] === 'create' || rest[1] === 'new')) { + return { start, end }; + } + } + return null; +} + +/** + * Approximate characters per text line for the diff-size proxy. + * `numstat` reports added+deleted line counts; we multiply by this + * constant to get a coarse "change magnitude" the per-file AI + * accumulator can be clamped against. The downstream `aiChars` / + * `humanChars` fields in the git-notes payload are literally + * (lines × this constant) — they are NOT real character counts. + * See the `FileAttributionDetail` interface doc for the consequences + * for consumers that aggregate the raw values. + */ +const APPROX_CHARS_PER_LINE = 40; +/** + * Fallback diff-size proxy for binary files. `numstat` reports `-` + * (instead of integer counts) for any non-text blob, so we can't + * compute a per-line estimate; this flat value lets the entry + * survive into the payload at a consistent (if coarse) size. + * Same heuristic-not-literal caveat as `APPROX_CHARS_PER_LINE` — + * a 5 MB image change and a 1-byte binary tweak both report this + * value. + */ +const BINARY_DIFF_SIZE_FALLBACK = 1024; + +/** + * Parse `git diff --numstat` output into a `path → approximate change + * size` map for attribution accounting. The result feeds in as the + * denominator clamp for `aiChars`, so missing entries would silently + * drop a file from attribution — every changed file must land in the + * map. + * + * `--numstat` is preferred over `--stat` because the columns are exact + * integers (no graphical bars to parse). Each line is: + * `\t\t` + * For binary files, both counts are `-`; we fall back to a fixed + * estimate so binary-only changes still get a non-zero entry. + * + * The `(adds + dels) * 40` figure remains a heuristic — git diff has no + * cheap way to surface exact character counts. The clamp in + * `generateNotePayload` keeps the math consistent (aiChars never + * exceeds diffSize), so the heuristic drives the precision of the + * percentage but cannot make `aiChars + humanChars` diverge from + * `diffSize`. + * + * Rename notations (`{old => new}` and bare `old => new`) are + * normalized to the new path so lookups match `--name-only` output. + * + * Exported for unit testing — the function is otherwise an + * implementation detail of `attachCommitAttribution`. + */ +export function parseNumstat(numstatOutput: string): Map { + const sizes = new Map(); + const lines = numstatOutput.split('\n').filter(Boolean); + + const normalizeFilePath = (filePath: string): string => { + let p = filePath.trim(); + // Brace rename: `{old => new}` or `dir/{old => new}/file` + p = p.replace(/\{[^}]*?=>\s*([^}]*)\}/g, '$1'); + // Bare rename across directories: `old/path/file => new/path/file` + if (p.includes('=>')) { + const m = p.match(/^(.*?)\s=>\s(.*)$/); + if (m) p = m[2]!.trim(); + } + return p; + }; + + for (const line of lines) { + // Format: "\t\t" — a literal "-" stands + // in for both counts on binary entries. + const m = line.match(/^([\d-]+)\t([\d-]+)\t(.+)$/); + if (!m) continue; + const filePath = normalizeFilePath(m[3]!); + if (m[1] === '-' && m[2] === '-') { + // Binary file: numstat omits exact counts. Fall back to a fixed + // estimate so the entry isn't missing entirely (which would zero + // out attribution for the file). + sizes.set(filePath, BINARY_DIFF_SIZE_FALLBACK); + continue; + } + const adds = parseInt(m[1]!, 10); + const dels = parseInt(m[2]!, 10); + if (Number.isNaN(adds) || Number.isNaN(dels)) continue; + sizes.set(filePath, (adds + dels) * APPROX_CHARS_PER_LINE); + } + + return sizes; +} + export const OUTPUT_UPDATE_INTERVAL_MS = 1000; const DEFAULT_FOREGROUND_TIMEOUT_MS = 120000; @@ -572,6 +1414,8 @@ export class ShellToolInvocation extends BaseToolInvocation< shellExecutionConfig?: ShellExecutionConfig, setPidCallback?: (pid: number) => void, ): Promise { + const strippedCommand = stripShellWrapper(this.params.command); + if (signal.aborted) { return { llmContent: 'Command was cancelled by user before it could start.', @@ -593,13 +1437,47 @@ export class ShellToolInvocation extends BaseToolInvocation< combinedSignal = AbortSignal.any([signal, timeoutSignal]); } - // Add co-author to git commit commands - const processedCommand = this.addCoAuthorToGitCommit( - this.params.command.trim(), + // Add co-author to git commit commands and Qwen Code attribution to + // `gh pr create` bodies. Both wrappers are no-ops on commands they + // don't recognise. Apply to the *trimmed original* (not strippedCommand) + // so leading env assignments and shell wrappers (`FOO=bar bash -c '...'`) + // are preserved through to execution; the rewriters operate at the + // top-level shell layer and become no-ops when the commit hides + // inside a wrapper. + const processedCommand = this.addAttributionToPR( + this.addCoAuthorToGitCommit(this.params.command.trim()), ); const commandToExecute = processedCommand; const cwd = this.params.directory || this.config.getTargetDir(); + // Snapshot HEAD before running so attachCommitAttribution can detect + // commit creation by HEAD movement instead of trusting the shell + // exit code (which is unreliable for compound commands). + // + // Synchronous capture via `execFileSync`: a fire-and-forget async + // rev-parse can resolve AFTER a fast-cached `git commit` moves + // HEAD (real race seen on slow filesystems / heavy contention), + // leaving preHead === postHead and silently skipping the + // attribution note. ~10–50ms event-loop block per commit-shaped + // command, only when `commitCtx.hasCommit` is true. + // + // We act on `gitCommitContext` rather than a raw regex so quoted + // text like `echo "git commit"` doesn't trigger snapshot/notes, + // and so attribution still runs after a `git commit && cd ..` + // chain (which would have failed an "any cd anywhere" gate). + const commitCtx = gitCommitContext(strippedCommand); + // Capture preHead only when the commit will actually be + // attributed in our cwd: that's the only consumer (the + // `attributableInCwd` branch below feeds preHead into + // `attachCommitAttribution`). For non-attributable + // hasCommit cases (`cd /elsewhere && git commit`, + // `git -C /other commit`), no consumer reads preHead and the + // ~10–50 ms execFileSync is dead work that just blocks the + // event loop before the user's real command spawns. + const preHead: string | null = commitCtx.attributableInCwd + ? this.getGitHeadSync(cwd) + : null; + let cumulativeOutput: string | AnsiOutput = ''; let lastUpdateTime = Date.now(); let isBinaryStream = false; @@ -737,6 +1615,42 @@ export class ShellToolInvocation extends BaseToolInvocation< // (Long-run advisory append happens AFTER `truncateToolOutput` // below — see the explanation there for why post-truncation.) } + + // Run attribution outside the aborted/non-aborted branch: a + // `git commit -m "x" && sleep 999` chain can move HEAD and then + // time out, leaving the new commit without its attribution note + // while the stale per-file attribution stays around for a later + // unrelated commit. attachCommitAttribution already gates on HEAD + // movement, so it's a no-op when no commit was actually created. + let attributionWarning: string | null = null; + if (commitCtx.attributableInCwd) { + // `git commit --amend` rewrites HEAD in place, so the standard + // parent-vs-postHead diff (`${postHead}~1..${postHead}`) would + // span the entire amended commit (the amended commit's parent + // is the original's parent, so diffing against it lumps both + // commits' worth of changes). Detect the flag so + // `getCommittedFileInfo` can switch to `${preHead}..${postHead}` + // — `preHead` was captured synchronously before spawn and is + // the pre-amend SHA, so this range captures only the amend + // delta. + const isAmend = isAmendCommit(strippedCommand); + attributionWarning = await this.attachCommitAttribution( + cwd, + preHead, + isAmend, + ); + } + // Intentionally NO `else if (commitCtx.hasCommit)` cleanup branch: + // commands that match `hasCommit` but not `attributableInCwd` + // (e.g. `cd /abs/path/to/this/repo && git commit`, `git -C . commit`) + // can land a commit in our cwd, but we don't know which files were + // staged — the user may have done a partial `git add A` and left + // unstaged AI edits to B and C pending. A wholesale + // `clearAttributions(true)` here would silently lose B and C even + // though they weren't committed. Leave the singleton alone; the + // next attributable commit's `attachCommitAttribution` will do a + // proper partial clear via `clearAttributedFiles`. + // Decide whether to emit the long-run advisory. Conditions: // - Process completed under its own steam (no AbortSignal // trigger, no external signal). Specifically: @@ -868,6 +1782,23 @@ export class ShellToolInvocation extends BaseToolInvocation< // someone adds a non-string return path. } + // Surface AI-attribution failures (note exec failure, payload too + // large, diff-analysis exception, shallow clone, etc.) on the tool + // result so the user knows their commit succeeded but the per-file + // git note didn't land. Without this, the only signal is a + // QWEN_DEBUG_LOG_FILE entry the user has likely never set up. + // Appended to BOTH llmContent (so the agent can react / report) and + // returnDisplayMessage (so the human sees it in the TUI). Skipped + // when null (intentional skips like a bare `git commit` with no + // tracked AI edits don't need user-visible feedback). + if (attributionWarning) { + if (typeof llmContent === 'string') { + llmContent += `\n\n${attributionWarning}`; + } + returnDisplayMessage += + (returnDisplayMessage ? '\n\n' : '') + attributionWarning; + } + // When `result.error` is set, `coreToolScheduler` builds the // model-facing functionResponse from `error.message`, NOT from // `llmContent` (see `convertToFunctionResponse` and the error @@ -921,8 +1852,55 @@ export class ShellToolInvocation extends BaseToolInvocation< signal: AbortSignal, shellExecutionConfig?: ShellExecutionConfig, ): Promise { - const processedCommand = this.addCoAuthorToGitCommit( - this.params.command.trim(), + const strippedCommand = stripShellWrapper(this.params.command); + + // The background lifecycle (BackgroundShellRegistry) doesn't run + // the post-command attribution path — there's no clean place to + // hook pre/post-HEAD comparison and `git notes` writes between + // the early `Background shell started` return and the eventual + // process exit. Allowing `git commit` to slip through would leave + // the new commit without notes and let stale per-file attribution + // leak into the next foreground commit. Refuse the request and + // tell the user to run it foreground. + // + // Use the broader `hasCommit` flag rather than `attributableInCwd`: + // `cd /elsewhere && git commit` should still be refused even + // though we wouldn't attribute it. + if (gitCommitContext(strippedCommand).hasCommit) { + return { + llmContent: + 'Refusing to run `git commit` in background mode: AI-attribution notes ' + + 'are written by the foreground completion path. Re-run the commit ' + + 'with is_background=false (or split it out of the compound command).', + returnDisplay: + 'Refused: `git commit` is not supported in background shell mode.', + }; + } + // Strip a single bare trailing `&` (the bash background operator) before + // spawn: bash treats it as background-detach, exits the wrapper + // immediately, and the real child outlives the wrapper — the registry + // would settle as `completed` while the shell is still running, and + // chunked output would land on a closed stream. The managed path is + // itself the backgrounding mechanism, so the trailing `&` is redundant. + // + // Deliberately precise: do not touch `&&` (logical AND), `\&` (escaped + // literal `&`), or commands without a trailing `&`. Earlier `\s*&+\s*$` + // was both too greedy (it ate `&&` and `\&`) and a ReDoS hazard on + // long all-`&` inputs. Plain string checks here are linear and clearer + // than a lookbehind regex. + // + // Operate on the trimmed *original* command so leading env assignments + // / shell wrappers survive through to execution; ShellExecutionService + // re-runs the user-approved invocation verbatim. + const trimmedOriginal = this.params.command.trim(); + const noTrailingAmp = stripTrailingBackgroundAmp(trimmedOriginal); + if (noTrailingAmp !== trimmedOriginal) { + debugLogger.warn( + 'Stripped trailing & from background shell command — managed path handles backgrounding', + ); + } + const processedCommand = this.addAttributionToPR( + this.addCoAuthorToGitCommit(noTrailingAmp), ); const cwd = this.params.directory || this.config.getTargetDir(); @@ -1050,54 +2028,1077 @@ export class ShellToolInvocation extends BaseToolInvocation< }; } + /** + * Count the commits between `preHead` (exclusive) and `postHead` + * (inclusive). SHA-pinned on both ends so a post-commit hook moving + * HEAD between this check and the note write can't change the + * answer (`HEAD~1..HEAD` here would race the same TOCTOU window + * the diff calls were just pinned against). Returns 0 if either + * side is unreadable. Goes through `child_process.execFile` with + * argv to stay independent of the mockable `ShellExecutionService`. + */ + private async countCommitsAfter( + cwd: string, + preHead: string, + postHead: string, + ): Promise { + return this.runGitCount(cwd, [ + 'rev-list', + '--count', + `${preHead}..${postHead}`, + ]); + } + + /** + * Count commits reachable from `postHead` when the repo had no prior + * HEAD before the user's command — i.e. the very first commit (or + * compound `init && commit && commit ...`). Without this fallback + * the multi-commit guard would be skipped on a brand-new repo and + * mis-attribute combined data to the final commit. SHA-pinned for + * the same reason as `countCommitsAfter`. + */ + private async countCommitsFromRoot( + cwd: string, + postHead: string, + ): Promise { + return this.runGitCount(cwd, ['rev-list', '--count', postHead]); + } + + /** Shared helper for the two `rev-list --count` invocations. */ + private async runGitCount(cwd: string, args: string[]): Promise { + return new Promise((resolve) => { + const child = childProcess.execFile( + 'git', + args, + { cwd, timeout: 2000 }, + (error, stdout) => { + if (error) { + resolve(0); + return; + } + const n = parseInt(String(stdout).trim(), 10); + resolve(Number.isFinite(n) && n > 0 ? n : 0); + }, + ); + child.on('error', () => {}); + }); + } + + /** + * Read the current HEAD SHA, or null if unavailable (no commits + * yet, not a git repo, or git failed). Used to detect whether a + * `git commit` actually created a new commit, independent of the + * shell's exit code. Goes through `child_process.execFile` rather + * than {@link ShellExecutionService} so the lookup is unaffected + * by test mocks of the shell service and stays well clear of any + * user-supplied shell wrapper. + */ + private async getGitHead(cwd: string): Promise { + return new Promise((resolve) => { + const child = childProcess.execFile( + 'git', + ['rev-parse', 'HEAD'], + { cwd, timeout: 2000 }, + (error, stdout) => { + if (error) { + resolve(null); + return; + } + const sha = String(stdout).trim(); + resolve(sha.length > 0 ? sha : null); + }, + ); + // Suppress unhandled-error events from the child stream (e.g. ENOENT + // when git is missing); the callback still receives the error. + child.on('error', () => {}); + }); + } + + /** + * Synchronous companion to {@link getGitHead}. Captured BEFORE the + * user's shell command spawns so a fast `git commit` (hot-cached, + * no hooks) cannot move HEAD before our async rev-parse has a chance + * to read it — a real race seen on slow filesystems / heavy contention + * where preHead would otherwise resolve to the new SHA, postHead would + * match, and `attachCommitAttribution` would silently skip writing the + * attribution note even though the commit succeeded. + * + * Worst case is ~10–50 ms of event-loop block per commit-shaped shell + * command; acceptable trade for correctness of the post-command HEAD + * comparison. + */ + private getGitHeadSync(cwd: string): string | null { + try { + const stdout = childProcess.execFileSync('git', ['rev-parse', 'HEAD'], { + cwd, + timeout: 2000, + // Discard stderr noise (e.g. "fatal: not a git repository") — + // the catch-or-empty-output path already covers failure. + stdio: ['ignore', 'pipe', 'ignore'], + }); + const sha = String(stdout).trim(); + return sha.length > 0 ? sha : null; + } catch { + return null; + } + } + + /** + * After a successful git commit, attach per-file AI attribution metadata + * as git notes. Analyzes staged files via `git diff` to calculate real + * AI vs human contribution percentages. + * + * Detects commit creation by HEAD movement, not by shell exit code: + * for compound commands like `git commit -m "x" && npm test`, the + * commit can succeed and a later step can fail. Gating on `exitCode + * !== 0` would skip attribution for the successful commit, so we + * compare pre- and post-command HEAD instead. + * + * Respects the gitCoAuthor.commit setting: if the user disables commit + * attribution, the per-file note is skipped too (same toggle governs + * the Co-authored-by trailer and the git-notes payload). + */ + private async attachCommitAttribution( + cwd: string, + preHead: string | null, + isAmend: boolean, + ): Promise { + // Returns a one-line warning suitable for appending to the tool's + // returnDisplay when a write that the user could plausibly fix + // (note exec failure, payload too large, exception during diff + // analysis) drops the AI-attribution note. Returns null when the + // skip is intentional / inherent to the situation (no commit + // landed, multi-commit chain, attribution toggle off, no tracked + // edits) — those don't need user-visible feedback. + // Caller (`execute`) gates this with `commitCtx.attributableInCwd`, + // so we don't re-parse the command here. Re-parsing would be dead + // work and a maintenance trap — if the two checks ever drifted, + // trailer injection and git-notes writes could diverge silently. + + const postHead = await this.getGitHead(cwd); + const commitCreated = postHead !== null && postHead !== preHead; + const attributionService = CommitAttributionService.getInstance(); + + if (!commitCreated) { + // HEAD didn't move in this cwd. Possible causes: + // 1. Commit failed (hook rejected, nothing staged, etc.) + // 2. User did `git commit && git reset HEAD~1` — HEAD reverted + // 3. Submodule case (`cd submodule && git commit`) — the inner + // repo's HEAD moved, ours didn't + // We can't tell these apart reliably from here. Dropping the + // per-file attributions on (1)/(2) is fine in isolation, but on + // (3) we'd silently lose the user's outer-repo edits even though + // none of them were committed. Leave attributions intact instead: + // a later successful commit will overwrite the counters and the + // accumulated aiContribution still represents real AI work. + return null; + } + + // Refuse to attribute when a single shell command produced more + // than one commit (e.g. `git commit -m a && git commit -m b`). + // Our singleton has no way to partition the per-file AI + // contribution across the individual commits, so attaching the + // combined note to HEAD would mis-attribute earlier commits' + // changes to the last one. Snapshot prompt counters and bail. + // + // For a brand-new repo (preHead === null), use `git rev-list + // --count HEAD` so the very first compound `init && commit a && + // commit b` chain still gets caught. + const commitCount = + preHead !== null + ? await this.countCommitsAfter(cwd, preHead, postHead) + : await this.countCommitsFromRoot(cwd, postHead); + // commitCreated has already established that HEAD moved, so we + // expect exactly 1 commit. Anything else is suspicious: + // - >1: actual multi-commit chain we can't partition + // - 0: rev-list errored / timed out — could not verify, so + // we'd otherwise silently attribute as a single commit even + // though the count is unknown + // Bail in either case. + if (commitCount !== 1) { + const reason = + commitCount === 0 + ? 'commit count unavailable (rev-list failed) ' + + 'after HEAD moved — refusing to assume single commit' + : `multi-commit shell command (${commitCount} commits since ` + + `${preHead ? preHead.slice(0, 12) : 'repo root'})`; + debugLogger.warn(`Refusing AI attribution: ${reason}.`); + // Snapshot the prompt counter but do NOT clear per-file + // attributions: in a `commit a && commit b` chain, the user + // may have unstaged AI edits to files that appeared in NEITHER + // commit. Wholesale-clearing here would erase those even + // though the rest of the flow is built to preserve unstaged + // entries across partial commits. + attributionService.noteCommitWithoutClearing(); + return null; + } + + // A new commit landed. Even when no per-file attribution was + // tracked (rare but possible — e.g. user committed external + // changes), we still need to snapshot the prompt counters as + // "at last commit" so a later `gh pr create` doesn't report an + // inflated N-shotted count spanning multiple commits. + if (!attributionService.hasAttributions()) { + attributionService.noteCommitWithoutClearing(); + return null; + } + + let committedAbsolutePaths: Set | null = null; + // Separate from `committedAbsolutePaths` so a failed note write + // (oversized payload, `git notes` non-zero exit, exception) does + // NOT also delete the per-file attribution data the user might + // need to amend & retry. `shouldClear` flips to the partial-clear + // set only on (a) note-write success, or (b) attribution toggle + // OFF — both cases where the file is genuinely "done" from the + // attribution path's POV. + let shouldClear: Set | null = null; + let warning: string | null = null; + try { + // Analyze the just-committed files by diffing the captured + // `postHead` against its parent (or `preHead` for amend). All + // diff calls are SHA-pinned so a post-commit hook / chained + // `git tag` / parallel git process moving HEAD between the + // analysis phase and the note write can't leave the note + // attached to commit A but describing commit B. + const stagedInfo = await this.getCommittedFileInfo( + cwd, + isAmend, + postHead, + preHead, + ); + + // null = analysis failed (shallow clone, --amend without reflog, + // partial diff failure, etc.). Leave `committedAbsolutePaths` + // null so the finally block calls `noteCommitWithoutClearing()` + // — snapshotting the prompt counter while leaving per-file + // attributions intact. (Earlier revisions of this code did a + // wholesale clear here, but that erased pending unstaged AI + // edits for files outside the just-failed commit; the + // smaller-evil trade-off is documented in the finally block.) + // Skip the note write entirely — emitting a structurally valid + // but factually wrong all-zero note is worse than no note. + if (stagedInfo === null) { + warning = + 'AI attribution note skipped: could not analyze the commit ' + + 'diff (shallow clone, missing reflog for --amend, or partial ' + + '`git diff` failure). Co-authored-by trailer is unaffected.'; + return warning; // finally still runs for cleanup + } + + // Pass the actual model name (e.g. `qwen3-coder-plus`) rather than the + // co-author display label so the note's `generator` field reflects + // which model produced the changes — and so generateNotePayload's + // sanitizeModelName() actually has the codename it's meant to scrub. + // The base directory must be the git repo root: getCommittedFileInfo + // returns paths relative to `git rev-parse --show-toplevel`, and any + // mismatch here would cause path.relative to produce `../...` keys + // that never match in the AI-attribution lookup. + const baseDir = stagedInfo.repoRoot ?? this.config.getTargetDir(); + + // Capture the absolute paths actually included in this commit so + // the finally block can do a partial clear: files the AI edited + // but the user didn't `git add` should still be tracked for a + // later commit. + // + // Match against the canonical keys already stored in + // `fileAttributions` (recordEdit canonicalises every component + // via realpathSync) rather than re-resolving each diff path on + // the fly. Re-resolving fails for deleted files (realpathSync + // throws on a missing leaf) and for files behind intermediate + // symlinked directories (path.resolve only canonicalises the + // base) — both cases produced cleanup keys that didn't match + // the stored canonical keys, leaking stale per-file attribution + // into subsequent commits. + let canonicalBase: string; + try { + canonicalBase = fs.realpathSync(baseDir); + } catch { + canonicalBase = baseDir; + } + + attributionService.applyCommittedRenames( + stagedInfo.renamedFiles, + canonicalBase, + ); + + // First-pass match: which tracked entries are part of THIS + // commit? Validation must run against this subset only — a + // tracked file the user didn't stage isn't in HEAD's new tree + // post-commit (HEAD still has the pre-AI-edit version), so + // `git show HEAD:` would return the OLD content and the + // hash divergence check would drop the AI's pending unstaged + // work. Scope the reader to the committed set only. + const committedScope = attributionService.matchCommittedFiles( + stagedInfo.files, + canonicalBase, + ); + + // Drop tracked entries whose COMMITTED content has diverged + // from what AI's last write recorded — catches the case where + // the user paste-replaced via an external editor, ran + // `git checkout`, or otherwise modified the file outside the + // Edit/Write tools. Validate against the COMMITTED blob rather + // than the live working tree: the user can `git add` AI's + // content, then make additional unstaged edits, then + // `git commit` — the commit's blob still matches AI's recorded + // hash, but the working-tree file does not. A working-tree + // comparison would drop the entry on a commit that legitimately + // came from AI. + // + // Pin the read to the captured `postHead` SHA, NOT the symbolic + // `HEAD`, for the same TOCTOU reason `buildGitNotesCommand` + // does: a post-commit hook or chained command can advance HEAD + // between our postHead capture and these reads, and a symbolic + // `git show HEAD:` would then compare against the WRONG + // commit's content and spuriously drop entries. + attributionService.validateAgainst((absPath) => { + // ONLY check files that landed in this commit. Anything else + // (unstaged AI work, files in other directories) returns null + // so validateAgainst leaves them alone. + if (!committedScope.has(absPath)) return null; + const rel = path + .relative(canonicalBase, absPath) + .split(path.sep) + .join('/'); + if (!rel || rel.startsWith('..')) return null; + try { + return childProcess + .execFileSync('git', ['show', `${postHead}:${rel}`], { + cwd, + timeout: 2000, + stdio: ['ignore', 'pipe', 'ignore'], + maxBuffer: 16 * 1024 * 1024, + }) + .toString('utf-8'); + } catch { + // No committed content (deleted file, file not in the + // commit, or git error) — leave the entry alone. + return null; + } + }); + + // Recompute the committed set after validation: dropped entries + // shouldn't appear in the per-file payload OR in the partial + // clear set (they were already deleted from fileAttributions). + committedAbsolutePaths = attributionService.matchCommittedFiles( + stagedInfo.files, + canonicalBase, + ); + + // No file in this commit was AI-touched in the current session. + // Writing a note anyway would emit an all-zero "0% AI" payload + // attached to a commit that legitimately had no AI involvement + // — actively misleading. Skip the note; the partial clear in + // the finally block is a no-op (empty set) so unrelated pending + // attributions stay tracked for a later commit. + if (committedAbsolutePaths.size === 0) { + return null; + } + + // Toggle gate AFTER computing committedAbsolutePaths so the + // finally block still does a proper partial clear of files + // that just landed. Without this, a user who turned off + // attribution would have those just-committed files' tracked + // AI work sit in the singleton; flipping the toggle back on + // and committing the same file again would re-attribute the + // earlier (already-committed) AI edits to the new commit. + const gitCoAuthorSettings = this.config.getGitCoAuthor(); + if (!gitCoAuthorSettings.commit) { + // Toggle-off but the commit landed — partial-clear the files + // that just landed so re-enabling later doesn't re-attribute + // earlier (already-committed) AI edits to a future commit. + shouldClear = committedAbsolutePaths; + return null; + } + + const note = attributionService.generateNotePayload( + stagedInfo, + baseDir, + this.config.getModel(), + ); + // Pin the note to the SHA we captured at commit-detection time + // (`postHead`) rather than the symbolic `HEAD`. A post-commit + // hook, chained `git commit && git tag -m ...`, or parallel + // process can advance HEAD between that capture and this + // execFile — without the SHA pin, `-f` would silently land the + // note on the wrong commit. + const notesCommand = buildGitNotesCommand(note, postHead); + + if (!notesCommand) { + debugLogger.warn( + 'AI attribution note too large, skipping git notes attachment', + ); + warning = + 'AI attribution note skipped: payload exceeded the 30 KB ' + + 'size cap (large generated-file exclusion list?). ' + + 'Co-authored-by trailer is unaffected.'; + // Leave per-file state intact: the user might `git commit + // --amend` after pruning excluded paths, and partial-clearing + // here would erase the data they'd need to retry. + return warning; + } + + // Use execFile with argv (rather than ShellExecutionService) so the + // JSON note isn't subjected to shell quoting at all — important on + // Windows where the bash-style escape used previously is invalid + // for cmd.exe / PowerShell. 5s timeout keeps a wedged repo from + // stalling the user-visible turn. + const { exitCode, output, timedOut } = await new Promise<{ + exitCode: number | null; + output: string; + timedOut: boolean; + }>((resolve) => { + const child = childProcess.execFile( + notesCommand.command, + notesCommand.args, + { cwd, timeout: 5000 }, + (error, stdout, stderr) => { + const merged = (stdout || '') + (stderr || ''); + if (error) { + // execFile signals timeout via either `error.killed === true` + // + `error.signal === 'SIGTERM'` (default kill), or + // `error.code === 'ETIMEDOUT'` on some platforms. Detect + // both so the caller's warning can name the actual cause + // ("timed out") instead of mislabeling it as exit-code 1. + const errno = error as NodeJS.ErrnoException & { + killed?: boolean; + signal?: string | null; + }; + const isTimeout = + errno.code === 'ETIMEDOUT' || + (errno.killed === true && errno.signal === 'SIGTERM'); + const code = + typeof errno.code === 'number' + ? (errno.code as unknown as number) + : null; + resolve({ + exitCode: code ?? 1, + output: merged, + timedOut: isTimeout, + }); + } else { + resolve({ exitCode: 0, output: merged, timedOut: false }); + } + }, + ); + child.on('error', () => {}); + }); + + if (exitCode !== 0) { + if (timedOut) { + debugLogger.warn(`git notes timed out after 5s: ${output}`); + warning = + 'AI attribution note skipped: `git notes add` timed out ' + + 'after 5s' + + (output ? ` (${output.trim().slice(0, 120)})` : '') + + '. Co-authored-by trailer is unaffected.'; + } else { + debugLogger.warn(`git notes exited with code ${exitCode}: ${output}`); + warning = + `AI attribution note skipped: \`git notes add\` exited ${exitCode}` + + (output ? ` (${output.trim().slice(0, 120)})` : '') + + '. Co-authored-by trailer is unaffected.'; + } + // Note didn't land — leave per-file state intact so the user + // can amend the commit (or manually run `git notes add`) + // without losing attribution data they'd need to reproduce. + } else { + debugLogger.debug( + `Attached AI attribution note: ${note.summary.aiPercent}% AI, ${note.summary.totalFilesTouched} file(s)`, + ); + // Successful note write — partial-clear the just-committed + // files so a later commit doesn't re-attribute them. + shouldClear = committedAbsolutePaths; + } + } catch (err) { + debugLogger.warn( + `Failed to attach AI attribution note: ${getErrorMessage(err)}`, + ); + warning = + `AI attribution note skipped: ${getErrorMessage(err)}. ` + + 'Co-authored-by trailer is unaffected.'; + } finally { + // Partial clear: only drop tracking for files that landed in + // this commit AND the note write actually succeeded (or the + // user disabled the toggle). `shouldClear` stays null when the + // note was skipped (oversized payload, non-zero exit, exception) + // so the user can amend & retry without their per-file + // attribution being silently destroyed first. When `shouldClear` + // is null, just snapshot the prompt counter — DON'T + // wholesale-clear, since that would erase pending AI edits for + // files the user never staged in this commit. + if (shouldClear) { + attributionService.clearAttributedFiles(shouldClear); + } else { + attributionService.noteCommitWithoutClearing(); + } + } + return warning; + } + + /** + * Get information about files in the just-landed commit by diffing + * the captured `postHead` against its parent (`${postHead}~1`), or + * for amend against `preHead` (the captured pre-amend SHA). All + * probes/diffs are SHA-pinned so a post-commit hook moving HEAD + * between this call and the eventual `git notes` write can't make + * the note describe a different commit than it attaches to. + * + * Returns: + * - A populated `StagedFileInfo` when analysis succeeded. + * - An empty `StagedFileInfo` when the commit truly has no files + * (e.g. `--allow-empty`). The caller does a no-op partial clear so + * pending AI attributions stay tracked for the next real commit. + * - `null` when analysis itself failed (shallow clone with no parent + * object, --amend with `preHead === null` or unresolvable `preHead`, + * partial diff failure, exception). + * The caller treats this as "could not determine the committed + * set" and falls back to `noteCommitWithoutClearing()` — snapshots + * the prompt counter but leaves per-file attribution intact, so + * pending AI edits for files NOT in the just-committed set don't + * get wiped along with the analysis failure. (The just-committed + * file's stale entry may re-attribute on a later commit; that's + * the smaller evil compared to wholesale loss.) + */ + private async getCommittedFileInfo( + cwd: string, + isAmend: boolean, + postHead: string, + preHead: string | null, + ): Promise { + const empty: StagedFileInfo = { + files: [], + diffSizes: new Map(), + deletedFiles: new Set(), + renamedFiles: new Map(), + }; + + // Distinguish a successful git command with no output (e.g. + // `--allow-empty` -> empty `--name-only` listing) from a failed + // git command (silenced by ShellExecutionService) so the caller + // can choose between the empty-commit sentinel and the analysis- + // failure sentinel. Returning the same `''` for both used to + // alias `--allow-empty` to a `--name-only` failure, which left + // pending attributions tracked across the just-committed file + // and re-attributed it on the next commit. + const runGit = async (args: string): Promise => { + const handle = await ShellExecutionService.execute( + `git ${args}`, + cwd, + () => {}, + AbortSignal.timeout(5000), + false, + {}, + ); + const r = await handle.result; + return r.exitCode === 0 ? r.output : null; + }; + + try { + // SHA-pin every probe and diff to the captured `postHead` (and + // `preHead` for amend). Using symbolic `HEAD` here would re-open + // the same TOCTOU class that the `git notes` write was already + // pinned against: between this analysis phase and the note write, + // a post-commit hook (husky/lefthook auto-amend, sign-off, signed + // commits adjustment), a chained `git tag -m ...`, or a parallel + // git process can advance HEAD — and then `HEAD~1..HEAD` / + // `diff-tree HEAD` would describe whatever commit HEAD now + // points at, while the note still attaches to the original + // `postHead`. The result is a note on commit A whose contents + // describe commit B. Pinning to `postHead` keeps the analysis + // and the note consistent. + // + // The three calls are independent — fan out so we don't pay the + // spawn latency serially. Same for the three diff calls below + // once we know which form to use. + // - `rev-parse --verify ${postHead}~1`: probe whether the parent + // OBJECT is locally available (fails in shallow clones where + // the parent was pruned). + // - `log -1 --pretty=%P ${postHead}`: read the parent SHA from + // the commit metadata. Works regardless of shallow status + // because the parent SHA is recorded on the commit itself, not + // derived by walking. Empty output = postHead is a true root + // commit. Non-empty output = postHead has a parent (whether or + // not its object is locally available). + // - `rev-parse --show-toplevel`: capture the repo root (HEAD- + // independent). + // + // `rev-list --count` looks tempting as a "is this a root + // commit?" probe but it returns 1 in a depth-1 shallow clone + // (only the local object is reachable), aliasing the shallow + // and root cases. The parent-SHA approach disambiguates them + // correctly. + const [hasParentOutput, parentShaOutput, repoRootOutput] = + await Promise.all([ + runGit(`rev-parse --verify ${postHead}~1`), + runGit(`log -1 --pretty=%P ${postHead}`), + runGit('rev-parse --show-toplevel'), + ]); + // `rev-parse --verify ~1` is allowed to fail (shallow + // clone, true root commit) — treat null and '' uniformly. + const hasParent = hasParentOutput !== null && hasParentOutput.length > 0; + // `log -1 --pretty=%P ` MUST succeed; if git can't read + // postHead's metadata we have no way to tell shallow apart from + // a real root commit. Bail. + if (parentShaOutput === null) { + debugLogger.warn( + 'getCommittedFileInfo: log -1 --pretty=%P failed; ' + + 'cannot distinguish shallow clone from true root commit.', + ); + return null; + } + const isTrueRootCommit = parentShaOutput.trim().length === 0; + // Shallow clone: postHead has a parent recorded but the object + // isn't local. Bail rather than over-attribute via --root. + if (!hasParent && !isTrueRootCommit) { + debugLogger.warn( + 'getCommittedFileInfo: ~1 unreadable but commit is not ' + + 'the true root (shallow clone?); skipping attribution to avoid ' + + 'attributing the entire commit contents.', + ); + return null; + } + // Capture the repo root so the attribution service can + // reconcile paths from `git diff` (relative to the toplevel) + // against absolute paths recorded by the edit/write tools. + // Using the configured target directory as base would zero out + // attribution for any file outside it. Tolerate failure (null + // -> empty string -> caller falls back to targetDir). + const repoRoot = (repoRootOutput ?? '').trim(); + + // Choose the diff range: + // - amend: `${preHead}..${postHead}` — the actual amend delta. + // `preHead` was captured BEFORE the user's command ran and so + // points at the original (pre-amend) commit. The amend rewrote + // that commit into postHead; diffing them captures only what + // changed in this amend, not the entire amended commit's + // contents (which `${postHead}~1..${postHead}` would falsely + // include — postHead's parent is the original's parent, so + // diffing against it spans both commits' worth of changes). + // - has parent: `${postHead}~1..${postHead}` — pin both ends. + // We do NOT use `${preHead}..${postHead}` here: in chains like + // `git reset HEAD~3 && git commit`, preHead points well above + // postHead's parent and the diff would include the reset-away + // commits as deletions, dramatically over-attributing. + // - root commit: `diff-tree --root ` against the empty + // tree. + let diffArgs: { name: string; status: string; numstat: string }; + if (isAmend) { + // For amend, the pre-amend SHA we need is `preHead`. It must + // be non-null (caller's `attributableInCwd` gate already + // captured it for any commit attempt); a missing preHead means + // a brand-new repo where amend isn't meaningful anyway. + if (preHead === null) { + debugLogger.warn( + 'getCommittedFileInfo: --amend with no preHead; skipping ' + + 'attribution note (cannot determine amend delta).', + ); + return null; + } + // Verify the pre-amend SHA still resolves. preHead is captured + // synchronously before spawn, but a concurrent `git gc` / + // `git prune` could in principle remove the object before we + // try to diff against it. + const preHeadProbe = await runGit(`rev-parse --verify ${preHead}`); + if (preHeadProbe === null || preHeadProbe.length === 0) { + debugLogger.warn( + 'getCommittedFileInfo: --amend preHead unresolvable; skipping ' + + 'attribution note (cannot determine amend delta).', + ); + return null; + } + diffArgs = { + name: `diff --find-renames --name-only ${preHead} ${postHead}`, + status: `diff --find-renames --name-status ${preHead} ${postHead}`, + numstat: `diff --find-renames --numstat ${preHead} ${postHead}`, + }; + } else if (hasParent) { + diffArgs = { + name: `diff --find-renames --name-only ${postHead}~1 ${postHead}`, + status: `diff --find-renames --name-status ${postHead}~1 ${postHead}`, + numstat: `diff --find-renames --numstat ${postHead}~1 ${postHead}`, + }; + } else { + diffArgs = { + name: `diff-tree --root --find-renames --no-commit-id -r --name-only ${postHead}`, + status: `diff-tree --root --find-renames --no-commit-id -r --name-status ${postHead}`, + numstat: `diff-tree --root --find-renames --no-commit-id -r --numstat ${postHead}`, + }; + } + const [nameOutput, statusOutput, numstatOutput] = await Promise.all([ + runGit(diffArgs.name), + runGit(diffArgs.status), + runGit(diffArgs.numstat), + ]); + + // ANY of the three diffs failing (null) is an analysis failure, + // NOT an empty commit. Without this check, a `--name-only` that + // failed silently used to alias to `--allow-empty`, leaving the + // just-committed file's tracked AI edit in the singleton and + // re-attributing it to the next commit. + if ( + nameOutput === null || + statusOutput === null || + numstatOutput === null + ) { + debugLogger.warn( + 'getCommittedFileInfo: one or more diff calls failed; ' + + 'cannot distinguish empty commit from analysis failure.', + ); + return null; + } + + const files = nameOutput + .split('\n') + .map((f) => f.trim()) + .filter(Boolean); + if (files.length === 0) return empty; + + // Get deleted files + const deletedFiles = new Set(); + const renamedFiles = new Map(); + for (const line of statusOutput.split('\n')) { + if (line.startsWith('D\t')) { + deletedFiles.add(line.slice(2).trim()); + continue; + } + const parts = line.split('\t'); + const status = parts[0] ?? ''; + if (status.startsWith('R') && parts.length >= 3) { + renamedFiles.set(parts[1]!.trim(), parts[2]!.trim()); + } + } + + // Get diff sizes from numstat output. Bail if `--numstat` + // returned nothing while `--name-only` succeeded — that's the + // partial-failure signal for `Promise.all`, and writing a note + // anyway would force every file's diffSize to 0, then + // generateNotePayload would clamp aiChars to 0 and emit a + // structurally valid but factually wrong all-zero attribution. + const diffSizes = parseNumstat(numstatOutput); + if (diffSizes.size === 0) { + debugLogger.warn( + 'getCommittedFileInfo: --numstat returned empty while ' + + '--name-only listed files; skipping attribution note to ' + + 'avoid emitting all-zero AI percentages.', + ); + return null; + } + + return { + files, + diffSizes, + deletedFiles, + renamedFiles, + repoRoot: repoRoot.length > 0 ? repoRoot : undefined, + }; + } catch { + return null; + } + } + + /** + * Append a configured `Co-authored-by:` trailer to `git commit` + * commands when the commit co-author feature is enabled. No-op for + * commands that don't carry an inline `-m`/`-am` message (those open + * an editor, which we don't try to rewrite). + */ private addCoAuthorToGitCommit(command: string): string { - // Check if co-author feature is enabled + // Check if commit co-author feature is enabled const gitCoAuthorSettings = this.config.getGitCoAuthor(); - if (!gitCoAuthorSettings.enabled) { + if (!gitCoAuthorSettings.commit) { return command; } - // Check if this is a git commit command (anywhere in the command, e.g., after "cd /path &&") - const gitCommitPattern = /\bgit\s+commit\b/; - if (!gitCommitPattern.test(command)) { + // Same shell-type guard as addAttributionToPR — bash escaping is + // wrong for cmd/PowerShell. Gating on the active shell rather than + // the OS platform keeps Windows + Git Bash users (where + // getShellConfiguration() reports shell:'bash') working. + if (getShellConfiguration().shell !== 'bash') { return command; } - // Define the co-author line using configuration - const coAuthor = ` - -Co-authored-by: ${gitCoAuthorSettings.name} <${gitCoAuthorSettings.email}>`; + // Shell-aware detection — a raw regex would falsely match quoted + // text such as `echo "git commit"` and hand a corrupted command + // (with the trailer mid-string) back to the executor. The stricter + // `attributableInCwd` is what we want here: only inject the + // trailer when we're confident the commit lands in our cwd. + const segmentRange = findAttributableCommitSegment(command); + if (!segmentRange) { + return command; + } // Handle different git commit patterns: // Match -m "message" or -m 'message', including combined flags like -am - // Use separate patterns to avoid ReDoS (catastrophic backtracking) + // Use separate patterns to avoid ReDoS (catastrophic backtracking). + // The regex tolerates `-m"msg"` shorthand (no space) — bash accepts + // both `-m foo` and `-mfoo`, and we shouldn't silently skip the + // shorthand form. + // + // The regex is scoped to the actual `git commit` segment (not the + // whole compound command) so a later `git tag -a v1 -m "..."` in + // the same chain can't be mistaken for the commit message. // // Pattern breakdown: // -[a-zA-Z]*m matches -m, -am, -nm, etc. (combined short flags) - // \s+ matches whitespace after the flag + // \s* matches optional whitespace after the flag // [^"\\] matches any char except double-quote and backslash // \\. matches escape sequences like \" or \\ // (?:...|...)* matches normal chars or escapes, repeated - const doubleQuotePattern = /(-[a-zA-Z]*m\s+)"((?:[^"\\]|\\.)*)"/; - const singleQuotePattern = /(-[a-zA-Z]*m\s+)'((?:[^'\\]|\\.)*)'/; - const doubleMatch = command.match(doubleQuotePattern); - const singleMatch = command.match(singleQuotePattern); - const match = doubleMatch ?? singleMatch; - const quote = doubleMatch ? '"' : "'"; + // Match both the short form (`-m`, `-am`, combined short flags) + // and git's long alias `--message` (with optional `=` separator: + // `--message="..."`). Inner alternation is non-capturing so the + // existing `[full, prefix, body]` destructure still applies. + const FLAG_PREFIX = `(?:-[a-zA-Z]*m|--message)\\s*=?\\s*`; + const doubleQuotePattern = new RegExp( + `(${FLAG_PREFIX})"((?:[^"\\\\]|\\\\.)*)"`, + 'g', + ); + // Bash single quotes can't be escaped, so apostrophes inside a + // single-quoted message use the close-escape-reopen form `'\''` + // (e.g. `git commit -m 'don'\''t'`). The inner alternation matches + // either a non-apostrophe character or that escape sequence as a + // whole, so the trailer lands at the true end of the body — at the + // FINAL closing `'` after the user's content — rather than after + // the first interior apostrophe. Mirrors `bodySinglePattern` in + // `addAttributionToPR`. + const singleQuotePattern = new RegExp( + `(${FLAG_PREFIX})'((?:[^']|'\\\\'')*)'`, + 'g', + ); + // Trim a trailing shell comment from the segment so an inert + // `git commit -m "real" # -m "fake"` doesn't have `lastMatchOf` + // pick the comment's `-m "fake"` and splice the trailer into the + // comment (where bash discards it), leaving the actual commit + // unattributed. + const fullSegment = command.slice(segmentRange.start, segmentRange.end); + const commentStart = findUnquotedCommentStart(fullSegment); + const segment = + commentStart >= 0 ? fullSegment.slice(0, commentStart) : fullSegment; + // Git concatenates multiple `-m` values with a blank line, so the + // co-author trailer has to land in the *last* `-m` value to be + // recognised by `git interpret-trailers`. matchAll → take the + // last match (`lastMatchOf` is the shared helper). + const doubleMatch = lastMatchOf(segment.matchAll(doubleQuotePattern)); + const singleMatch = lastMatchOf(segment.matchAll(singleQuotePattern)); + + // Pick whichever match appears LAST in the segment, regardless of + // quote style — but reject any candidate that's nested inside the + // other's range. For `git commit -m "docs mention -m 'flag'"` the + // single-quoted `-m 'flag'` lives INSIDE the double-quoted real + // message; without the nesting check the later (inner) `-m` would + // win and the trailer would be spliced into the body text. + const picked = pickOuterLastMatch(doubleMatch, singleMatch); + const match = picked.match; + const quote = picked.isDouble ? '"' : "'"; + + // Escape the configured name/email for the surrounding quote + // style — has to follow the actually-selected match. + const escape = picked.isDouble + ? escapeForBashDoubleQuote + : escapeForBashSingleQuote; + const escapedName = escape(gitCoAuthorSettings.name ?? ''); + const escapedEmail = escape(gitCoAuthorSettings.email ?? ''); + const coAuthor = `\n\nCo-authored-by: ${escapedName} <${escapedEmail}>`; if (match) { const [fullMatch, prefix, existingMessage] = match; + + // Bail on `$(...)` command substitution inside the captured + // body: our regex's `(?:[^"\\]|\\.)*` body group stops at the + // first interior `"`, so a heredoc-style + // `git commit -m "$(cat <<'HEREDOC' ... HEREDOC)"` (which the + // tool description recommends for multi-line messages) would + // be matched only up to the first inner `"`, then the trailer + // would be spliced into the middle of the command + // substitution and break the shell command. Recognising + // `$(` is enough — if it's there we can't safely rewrite + // without a real shell parser. + // + // We do NOT bail on a bare backtick: while `\`cmd "with" quotes\`` + // suffers the same regex-truncation bug, the common markdown- + // style `\`func()\`` in a commit body has no inner `"` and works + // fine. Bailing on any backtick would lose attribution for the + // common case to defend against a near-zero-traffic pathological + // case where the user typed raw backticks INSIDE a double-quoted + // body and put inner double-quotes inside the backtick span. + // bash itself would interpret that as command substitution + // anyway — almost certainly a user error rather than a real + // commit message — so the rewrite is at most one of several + // things that go wrong. + if (existingMessage.includes('$(')) { + return command; + } + const newMessage = existingMessage + coAuthor; const replacement = prefix + quote + newMessage + quote; - return command.replace(fullMatch, replacement); + // Splice the modified segment back into the original command, + // preserving everything outside the commit segment exactly as + // the caller had it. + const matchStart = (match.index ?? 0) + segmentRange.start; + if (matchStart >= segmentRange.start) { + return ( + command.slice(0, matchStart) + + replacement + + command.slice(matchStart + fullMatch.length) + ); + } } // If no -m flag found, the command might open an editor // In this case, we can't easily modify it, so return as-is return command; } + + /** + * Detect `gh pr create` commands and append AI attribution text to the + * PR body. Format: "🤖 Generated with Qwen Code (N-shotted by Qwen-Coder)" + * when at least one user prompt has been recorded since the last commit; + * otherwise just "🤖 Generated with Qwen Code". + * + * Skipped on Windows: the appended text relies on bash quote-escape + * conventions (`\$`, `'\''`) that cmd.exe and PowerShell don't honor, + * so on those shells our injection could either break the user-approved + * `gh pr create` command or be evaluated as command substitution. + * Losing PR attribution on Windows is an acceptable trade for safety. + */ + private addAttributionToPR(command: string): string { + // Shell-aware detection — a raw regex would falsely match quoted + // text such as `echo "gh pr create --body \"x\""` and rewrite a + // command that wasn't actually creating a PR. + const ghSegment = findGhPrCreateSegment(command); + if (!ghSegment) { + return command; + } + + // Gate on shell type rather than OS platform: bash escaping is + // invalid under cmd/PowerShell but works fine under Windows + + // Git Bash, which `getShellConfiguration()` reports as `'bash'`. + if (getShellConfiguration().shell !== 'bash') { + return command; + } + + const gitCoAuthorSettings = this.config.getGitCoAuthor(); + if (!gitCoAuthorSettings.pr) { + return command; + } + + const attributionService = CommitAttributionService.getInstance(); + const shots = attributionService.getPromptsSinceLastCommit(); + const generator = gitCoAuthorSettings.name ?? 'Qwen-Coder'; + + const attribution = + shots > 0 + ? `\n\n🤖 Generated with Qwen Code (${shots}-shotted by ${generator})` + : `\n\n🤖 Generated with Qwen Code`; + + // Match both the long form `--body` and the short alias `-b` + // (documented in `gh pr create --help`), with either space or + // `=` separator: `--body "..."`, `--body="..."`, `-b "..."`, + // `-b="..."`. Inner alternation is non-capturing so the existing + // `[full, prefix, body]` destructure stays intact. + // + // Run the regex against just the gh segment, NOT the full + // command. Otherwise a compound like + // `curl -b "session=abc" && gh pr create --body "summary"` would + // have the body regex match `curl`'s `-b` cookie flag and inject + // attribution into the cookie value, corrupting the curl call. + const BODY_FLAG = `(?:--body|-b)[\\s=]+`; + const bodyDoublePattern = new RegExp( + `(${BODY_FLAG})"((?:[^"\\\\]|\\\\.)*)"`, + 'g', + ); + // Bash apostrophes inside a single-quoted body use the + // close-escape-reopen form `'\''`. The inner alternation matches + // either a non-apostrophe character or that escape sequence as a + // whole, so the trailer lands at the true end of the body rather + // than after only the first quoted segment. + const bodySinglePattern = new RegExp( + `(${BODY_FLAG})'((?:[^']|'\\\\'')*)'`, + 'g', + ); + // Trim a trailing shell comment off the segment for the same + // reason as addCoAuthorToGitCommit — `gh pr create --body "real" + // # --body "fake"` would otherwise let `lastMatchOf` pick the + // comment's `--body "fake"` and inject attribution into a `--body` + // flag bash discards. + const fullSegment = command.slice(ghSegment.start, ghSegment.end); + const commentStart = findUnquotedCommentStart(fullSegment); + const segment = + commentStart >= 0 ? fullSegment.slice(0, commentStart) : fullSegment; + // gh ignores all but the last `--body`/`-b` flag, so the trailer + // has to land in the final occurrence to actually appear in the PR. + // matchAll → take the last match for each quote style, then pick + // whichever sits later in the segment (mirrors addCoAuthorToGitCommit; + // shares the `lastMatchOf` helper). + const bodyDoubleMatch = lastMatchOf(segment.matchAll(bodyDoublePattern)); + const bodySingleMatch = lastMatchOf(segment.matchAll(bodySinglePattern)); + // Pick whichever match appears LAST in the segment, regardless of + // quote style — but reject any candidate that's nested inside the + // other's range. For `gh pr create --body "docs mention -b 'flag'"` + // the inner `-b 'flag'` is INSIDE the outer `--body "..."`; without + // a nesting check the inner (later) `-b` would win and the trailer + // would be spliced into the body text rather than appended after it. + // Shared with addCoAuthorToGitCommit via `pickOuterLastMatch`. + const pickedBody = pickOuterLastMatch(bodyDoubleMatch, bodySingleMatch); + const bodyMatch = pickedBody.match; + const bodyQuote = pickedBody.isDouble ? '"' : "'"; + + if (bodyMatch) { + const [fullMatch, prefix, existingBody] = bodyMatch; + // Same `$(...)` bailout as addCoAuthorToGitCommit: a heredoc- + // style body (`gh pr create --body "$(cat <<'EOF' ... EOF)"`) + // contains nested `"` that our regex's `(?:[^"\\]|\\.)*` body + // group can't span — the match would terminate at the first + // interior quote and the splice would land mid-substitution, + // corrupting the user-approved command. + if (existingBody.includes('$(')) { + return command; + } + // Escape the appended text for the surrounding quote style. + // Without this, a configured generator name containing `"`, `$`, a + // backtick, or `'` would either break the user-approved `gh pr + // create` command or, worse, be interpreted as command substitution. + const escapedAttribution = pickedBody.isDouble + ? escapeForBashDoubleQuote(attribution) + : escapeForBashSingleQuote(attribution); + const newBody = existingBody + escapedAttribution; + // Splice the modified segment back into the original command, + // offsetting the in-segment match index by the segment start. + const idx = (bodyMatch.index ?? 0) + ghSegment.start; + if (idx >= ghSegment.start) { + const replacement = prefix + bodyQuote + newBody + bodyQuote; + return ( + command.slice(0, idx) + + replacement + + command.slice(idx + fullMatch.length) + ); + } + } + + // Reached here means: `gh pr create`/`gh pr new` was detected, + // `gitCoAuthor.pr` is enabled, but the regex found no inline + // `--body`/`-b` to splice the attribution into. Common causes + // are `--body-file `, `--fill` (uses commit messages as + // body), or just bare `gh pr create` (opens an editor). The + // command runs as the user typed it; we just don't add the + // attribution line. Surface this as a debug warning so a user + // wondering "why isn't my PR getting the trailer?" can see the + // skip in `QWEN_DEBUG_LOG_FILE`. Inline-body rewriting is the + // only safe automatic path — `--body-file` would require us to + // mutate the user's file on disk; `--fill` and editor flows + // have no body in argv at all. + debugLogger.warn( + 'addAttributionToPR: gh pr create detected but no inline ' + + '`--body`/`-b` argument found to append attribution to ' + + '(--body-file / --fill / editor flows are unsupported); ' + + 'PR will be created without the AI attribution line. ' + + 'Pass `--body "..."` inline to enable automatic attribution.', + ); + return command; + } } function getShellToolDescription(): string { diff --git a/packages/core/src/tools/write-file.test.ts b/packages/core/src/tools/write-file.test.ts index 4967bd97c..512126c7c 100644 --- a/packages/core/src/tools/write-file.test.ts +++ b/packages/core/src/tools/write-file.test.ts @@ -28,6 +28,7 @@ import { GeminiClient } from '../core/client.js'; import { createMockWorkspaceContext } from '../test-utils/mockWorkspaceContext.js'; import { FileReadCache } from '../services/fileReadCache.js'; import { StandardFileSystemService } from '../services/fileSystemService.js'; +import { CommitAttributionService } from '../services/commitAttribution.js'; const rootDir = path.resolve(os.tmpdir(), 'qwen-code-test-root'); @@ -826,6 +827,79 @@ describe('WriteFileTool', () => { }); }); + // Same as edit.test's wiring guard: the WriteFileTool feeds the + // commit-attribution singleton on success. The recordEdit call + // distinguishes a true file creation (`null` old content) from + // overwriting an existing empty file (`''` old content); these + // tests pin both shapes so the distinction can't drift silently. + describe('commit-attribution wiring', () => { + const abortSignal = new AbortController().signal; + + beforeEach(() => { + CommitAttributionService.resetInstance(); + }); + + it('records AI-originated writes in the attribution service', async () => { + const filePath = path.join(rootDir, 'attr_write.txt'); + const invocation = tool.build({ + file_path: filePath, + content: 'fresh content', + }); + await invocation.execute(abortSignal); + + const attribution = + CommitAttributionService.getInstance().getFileAttribution(filePath); + expect(attribution).toBeDefined(); + expect(attribution!.aiContribution).toBeGreaterThan(0); + // A truly new file should be flagged so deletions later in the + // session can be reconciled. + expect(attribution!.aiCreated).toBe(true); + + fs.unlinkSync(filePath); + }); + + it('skips attribution when modified_by_user', async () => { + const filePath = path.join(rootDir, 'attr_skip.txt'); + const invocation = tool.build({ + file_path: filePath, + content: 'human-edited', + modified_by_user: true, + }); + await invocation.execute(abortSignal); + + expect( + CommitAttributionService.getInstance().getFileAttribution(filePath), + ).toBeUndefined(); + + fs.unlinkSync(filePath); + }); + + it('marks aiCreated=false when overwriting an existing empty file', async () => { + const filePath = path.join(rootDir, 'attr_existing_empty.txt'); + // Create an empty file first — the distinction we're guarding + // is that overwriting an empty existing file should NOT be + // counted as a creation, even though both old contents are + // length-0. + fs.writeFileSync(filePath, '', 'utf8'); + // Prior-read enforcement (origin/main #3774) requires the file + // to have been Read before WriteFile can overwrite it. + seedPriorRead(filePath); + + const invocation = tool.build({ + file_path: filePath, + content: 'overwrite content', + }); + await invocation.execute(abortSignal); + + const attribution = + CommitAttributionService.getInstance().getFileAttribution(filePath); + expect(attribution).toBeDefined(); + expect(attribution!.aiCreated).toBe(false); + + fs.unlinkSync(filePath); + }); + }); + describe('prior-read enforcement', () => { const abortSignal = new AbortController().signal; diff --git a/packages/core/src/tools/write-file.ts b/packages/core/src/tools/write-file.ts index 6736d5270..f427f77c7 100644 --- a/packages/core/src/tools/write-file.ts +++ b/packages/core/src/tools/write-file.ts @@ -49,6 +49,7 @@ import { fileExists as isFilefileExists, } from '../utils/fileUtils.js'; import { getLanguageFromFilePath } from '../utils/language-detection.js'; +import { CommitAttributionService } from '../services/commitAttribution.js'; import { createDebugLogger } from '../utils/debugLogger.js'; const debugLogger = createDebugLogger('WRITE_FILE'); @@ -426,6 +427,17 @@ class WriteFileToolInvocation extends BaseToolInvocation< }, }); + // Track AI contribution for commit attribution. + // Pass null only when the file truly did not exist before this write; + // an empty string means the file existed but was empty. + if (!modified_by_user) { + CommitAttributionService.getInstance().recordEdit( + file_path, + fileExists ? originalContent : null, + content, + ); + } + // Mark the cache entry written, capturing the post-write stats // so a follow-up Read sees `lastReadAt < lastWriteAt` and falls // through to the full pipeline instead of returning the diff --git a/packages/sdk-python/scripts/get-release-version.js b/packages/sdk-python/scripts/get-release-version.js index 105617376..ca5b977da 100644 --- a/packages/sdk-python/scripts/get-release-version.js +++ b/packages/sdk-python/scripts/get-release-version.js @@ -272,7 +272,10 @@ function isTimeoutError(error) { ); } -async function getReleaseState({ packageVersion, releaseVersion }, allVersions) { +async function getReleaseState( + { packageVersion, releaseVersion }, + allVersions, +) { const state = { packageVersionExistsOnPyPI: allVersions.includes(packageVersion), gitTagExists: false, diff --git a/packages/vscode-ide-companion/schemas/settings.schema.json b/packages/vscode-ide-companion/schemas/settings.schema.json index 3b835bcc9..f5ffa4dc5 100644 --- a/packages/vscode-ide-companion/schemas/settings.schema.json +++ b/packages/vscode-ide-companion/schemas/settings.schema.json @@ -66,9 +66,31 @@ "default": 5 }, "gitCoAuthor": { - "description": "Automatically add a Co-authored-by trailer to git commit messages when commits are made through Qwen Code.", - "type": "boolean", - "default": true + "description": "Attribution added to git commits and pull requests created through Qwen Code.", + "default": { + "commit": true, + "pr": true + }, + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "commit": { + "description": "Add a Co-authored-by trailer to git commit messages AND attach a per-file AI-attribution git note (`refs/notes/ai-attribution`) for commits made through Qwen Code. Disabling skips both.", + "type": "boolean", + "default": true + }, + "pr": { + "description": "Append a Qwen Code attribution line to PR descriptions when running `gh pr create`.", + "type": "boolean", + "default": true + } + } + } + ] }, "checkpointing": { "description": "Session checkpointing settings.", @@ -2129,7 +2151,7 @@ "$version": { "type": "number", "description": "Settings schema version for migration tracking.", - "default": 3 + "default": 4 } }, "additionalProperties": true diff --git a/scripts/generate-settings-schema.ts b/scripts/generate-settings-schema.ts index 0494437df..281aea99f 100644 --- a/scripts/generate-settings-schema.ts +++ b/scripts/generate-settings-schema.ts @@ -25,6 +25,7 @@ import type { SettingsSchema, } from '../packages/cli/src/config/settingsSchema.js'; import { getSettingsSchema } from '../packages/cli/src/config/settingsSchema.js'; +import { SETTINGS_VERSION } from '../packages/cli/src/config/settings.js'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); @@ -161,7 +162,7 @@ function convertSettingToJsonSchema( break; } - // Add default value for simple types only + // Add default value for simple and object types if (setting.default !== undefined && setting.default !== null) { const defaultVal = setting.default; if ( @@ -172,9 +173,39 @@ function convertSettingToJsonSchema( schema.default = defaultVal; } else if (Array.isArray(defaultVal) && defaultVal.length > 0) { schema.default = defaultVal; + } else if ( + typeof defaultVal === 'object' && + !Array.isArray(defaultVal) && + Object.keys(defaultVal).length > 0 + ) { + // Non-empty plain object — publish so IDE editors can surface the + // default value (e.g. `{commit: true, pr: true}` for gitCoAuthor). + schema.default = defaultVal; } } + // If the field accepts a legacy primitive shape (e.g. a boolean that was + // later expanded into an object), wrap with `anyOf` so existing values + // in users' settings.json don't trip the IDE schema validator while + // they wait for our migration to rewrite them on the next launch. + // + // Lift `description` and `default` to the outer (anyOf) level so IDE + // editors that surface schema-driven defaults / descriptions still see + // them — burying these behind `anyOf[N]` makes most validators ignore + // the `default`, which loses the "enabled by default" hint for any + // setting using `legacyTypes`. + if (setting.legacyTypes && setting.legacyTypes.length > 0) { + const description = schema.description; + const defaultVal = schema.default; + delete schema.description; + delete schema.default; + return { + ...(description ? { description } : {}), + ...(defaultVal !== undefined ? { default: defaultVal } : {}), + anyOf: [...setting.legacyTypes.map((t) => ({ type: t })), schema], + }; + } + return schema; } @@ -195,11 +226,12 @@ function generateJsonSchema( ); } - // Add $version property + // Add $version property — sourced from settings.ts so a SETTINGS_VERSION + // bump propagates here instead of needing a parallel manual edit. jsonSchema.properties!['$version'] = { type: 'number', description: 'Settings schema version for migration tracking.', - default: 3, + default: SETTINGS_VERSION, }; return jsonSchema;