fix(shell): scope -m rewrite to commit segment, reject nested matches

Two Critical findings on addCoAuthorToGitCommit, plus a Copilot
maintainability nit:

- The `-m` regex used to scan the whole compound command, so
  `git commit -m "fix" && git tag -a v1 -m "release"` would target
  the LATER tag annotation (last -m wins) and splice the trailer
  there instead of the commit message. The rewrite now scopes to
  the actual `git commit` segment via a new
  findAttributableCommitSegment(): same shell-aware walk
  gitCommitContext does, but returning the segment's character
  range so the regex can be run on a slice and spliced back into
  the original command.

- Within the segment, a literal `-m '...'` *inside* a quoted body
  was treated as a real later -m. For
  `git commit -m "docs mention -m 'flag' for completeness"`, the
  inner single-quoted -m sits at a higher index than the real
  outer -m, and the previous index comparison would have it win —
  splicing the trailer mid-message and corrupting the quoting.
  The new code checks whether the candidate is nested inside the
  other quote-style's range (start/end containment) and prefers
  the outer match when so.

- Hoisted three constant Sets (sudo flag list, git global flags
  taking values, git global flags shifting cwd, gh global flags)
  out of the per-call scope to module constants. Functional
  no-op, but keeps the parsing helpers easier to read and avoids
  re-allocating the Sets on every command.

Two regression tests added for the cases above:
- inner `-m '...'` inside the outer message body is preserved
  literally and the trailer lands after the body
- `git tag -a v1 -m "release notes"` after a real
  `git commit -m "fix"` is left untouched, with the trailer
  appended to "fix" only
This commit is contained in:
wenshao 2026-05-02 11:04:31 +08:00
parent 2dd792c87e
commit 222b1884dc
2 changed files with 195 additions and 61 deletions

View file

@ -1315,6 +1315,75 @@ describe('ShellTool', () => {
expect(observed).toMatch(/-m\s+"Title"\s/);
});
// Concern: a literal `-m '...'` *inside* a quoted commit
// message body could be picked up by the regex as if it were a
// real later argument, splicing the trailer mid-message and
// breaking the command's quoting.
it('should not be fooled by a literal -m token inside the quoted message body', async () => {
const command =
'git commit -m "docs mention -m \'flag\' for completeness"';
const invocation = shellTool.build({ command, is_background: false });
const promise = invocation.execute(mockAbortSignal);
resolveExecutionPromise({
rawOutput: Buffer.from(''),
output: '',
exitCode: 0,
signal: null,
error: null,
aborted: false,
pid: 12345,
executionMethod: 'child_process',
});
await promise;
const observed = mockShellExecutionService.mock.calls[0][0];
// The original message body must be preserved end-to-end —
// no trailer spliced before its closing quote.
expect(observed).toContain(
"-m \"docs mention -m 'flag' for completeness",
);
// The trailer must land AFTER the original body, just before
// the message's outer closing quote.
expect(observed).toMatch(
/docs mention -m 'flag' for completeness\s+Co-authored-by:[^"]+"/s,
);
});
// Concern: a later `git tag -m "..."` in the same compound
// command could be mistaken for the commit message because the
// regex was matching across the whole command string.
it('should target the commit message, not a later git tag -m in the same chain', async () => {
const command =
'git commit -m "fix" && git tag -a v1 -m "release notes"';
const invocation = shellTool.build({ command, is_background: false });
const promise = invocation.execute(mockAbortSignal);
resolveExecutionPromise({
rawOutput: Buffer.from(''),
output: '',
exitCode: 0,
signal: null,
error: null,
aborted: false,
pid: 12345,
executionMethod: 'child_process',
});
await promise;
const observed = mockShellExecutionService.mock.calls[0][0];
// The trailer is appended to the commit message body...
expect(observed).toMatch(/git commit -m "fix\s+Co-authored-by:[^"]+"/s);
// ...and the later `git tag -m` is left exactly as the user
// wrote it.
expect(observed).toContain('git tag -a v1 -m "release notes"');
// The tag annotation must not have a trailer spliced in.
const tagMatch = observed.match(/git tag .*-m "([^"]*)"/);
expect(tagMatch?.[1]).toBe('release notes');
});
it('should add co-author when git commit is prefixed with sudo', async () => {
const command = 'sudo git commit -m "Test"';
const invocation = shellTool.build({ command, is_background: false });

View file

@ -127,21 +127,6 @@ function tokeniseSegment(segment: string): string[] | null {
if (tokens[i] === 'sudo' || tokens[i] === 'command') {
const wrapper = tokens[i];
i++;
const sudoFlagsWithValue = new Set([
'-u',
'-g',
'-h',
'-D',
'-r',
'-t',
'-C',
'--user',
'--group',
'--host',
'--chdir',
'--role',
'--type',
]);
while (i < tokens.length && tokens[i]!.startsWith('-')) {
const flag = tokens[i]!;
i++;
@ -149,7 +134,7 @@ function tokeniseSegment(segment: string): string[] | null {
// `command`'s flag-only options we leave i alone.
if (
wrapper === 'sudo' &&
sudoFlagsWithValue.has(flag) &&
SUDO_FLAGS_WITH_VALUE.has(flag) &&
i < tokens.length
) {
i++;
@ -159,6 +144,22 @@ function tokeniseSegment(segment: string): string[] | null {
return tokens.slice(i);
}
const SUDO_FLAGS_WITH_VALUE = new Set([
'-u',
'-g',
'-h',
'-D',
'-r',
'-t',
'-C',
'--user',
'--group',
'--host',
'--chdir',
'--role',
'--type',
]);
/**
* Walk a `git ...` token sequence past git's global flags
* (`-c key=val`, `-C path`, `--no-pager`, `--git-dir`, `--work-tree`,
@ -169,31 +170,31 @@ function tokeniseSegment(segment: string): string[] | null {
* `changesCwd` is true when any of the consumed flags would relocate
* the working directory (`-C`, `--git-dir`, `--work-tree`).
*/
// Two-token global flags whose second token is consumed as a value.
const GIT_GLOBAL_FLAGS_TAKES_VALUE = new Set([
'-c',
'-C',
'--git-dir',
'--work-tree',
'--namespace',
'--exec-path',
'--config-env',
'--super-prefix',
'--list-cmds',
]);
// Flags whose presence shifts cwd interpretation.
const GIT_GLOBAL_FLAGS_SHIFTS_CWD = new Set(['-C', '--git-dir', '--work-tree']);
function parseGitInvocation(tokens: string[]): {
subcommand: string | undefined;
changesCwd: boolean;
} {
// Two-token global flags whose second token is consumed as a value.
const TAKES_VALUE = new Set([
'-c',
'-C',
'--git-dir',
'--work-tree',
'--namespace',
'--exec-path',
'--config-env',
'--super-prefix',
'--list-cmds',
]);
// Flags whose presence shifts cwd interpretation.
const SHIFTS_CWD = new Set(['-C', '--git-dir', '--work-tree']);
let i = 1; // skip 'git'
let changesCwd = false;
while (i < tokens.length) {
const t = tokens[i]!;
if (TAKES_VALUE.has(t)) {
if (SHIFTS_CWD.has(t)) changesCwd = true;
if (GIT_GLOBAL_FLAGS_TAKES_VALUE.has(t)) {
if (GIT_GLOBAL_FLAGS_SHIFTS_CWD.has(t)) changesCwd = true;
i += 2;
continue;
}
@ -295,12 +296,13 @@ function gitCommitContext(command: string): {
* `parseGitInvocation`: a fixed-position check at index 1 misses
* `gh --repo owner/repo pr create ...`, which is a common form.
*/
const GH_GLOBAL_FLAGS_TAKES_VALUE = new Set(['--repo', '-R', '--hostname']);
function parseGhInvocation(tokens: string[]): string[] {
const TAKES_VALUE = new Set(['--repo', '-R', '--hostname']);
let i = 1; // skip 'gh'
while (i < tokens.length) {
const t = tokens[i]!;
if (TAKES_VALUE.has(t)) {
if (GH_GLOBAL_FLAGS_TAKES_VALUE.has(t)) {
i += 2;
continue;
}
@ -321,6 +323,46 @@ function parseGhInvocation(tokens: string[]): string[] {
return [];
}
/**
* Locate the character range of the *first* attributable
* `git commit` invocation in the (potentially compound) command, or
* `null` if none is attributable in the current cwd. The range
* covers the segment as `splitCommands` tokenised it i.e. just
* the `git commit ...` part, NOT later `&& git tag -m ...` or
* earlier `git status &&` segments.
*
* Used by `addCoAuthorToGitCommit` to scope the `-m` regex rewrite
* so a later `git tag -m "..."` (different sub-command in the same
* compound) can't be mistaken for the commit message.
*/
function findAttributableCommitSegment(
command: string,
): { start: number; end: number } | null {
let cursor = 0;
let cwdShifted = false;
for (const sub of splitCommands(command)) {
const start = command.indexOf(sub, cursor);
if (start < 0) continue;
const end = start + sub.length;
cursor = end;
const tokens = tokeniseSegment(sub);
if (!tokens || tokens.length === 0) continue;
const program = tokens[0]!;
if (program === 'cd') {
if (!cwdShifted) cwdShifted = true;
continue;
}
if (program === 'git') {
const { subcommand, changesCwd } = parseGitInvocation(tokens);
if (subcommand === 'commit' && !cwdShifted && !changesCwd) {
return { start, end };
}
if (changesCwd && !cwdShifted) cwdShifted = true;
}
}
return null;
}
/**
* Detect whether `command` invokes `gh pr create` at the top level
* same shell-aware shape as `gitCommitContext` so quoted text like
@ -1570,7 +1612,8 @@ export class ShellToolInvocation extends BaseToolInvocation<
// (with the trailer mid-string) back to the executor. The stricter
// `attributableInCwd` is what we want here: only inject the
// trailer when we're confident the commit lands in our cwd.
if (!gitCommitContext(command).attributableInCwd) {
const segmentRange = findAttributableCommitSegment(command);
if (!segmentRange) {
return command;
}
@ -1581,6 +1624,10 @@ export class ShellToolInvocation extends BaseToolInvocation<
// both `-m foo` and `-mfoo`, and we shouldn't silently skip the
// shorthand form.
//
// The regex is scoped to the actual `git commit` segment (not the
// whole compound command) so a later `git tag -a v1 -m "..."` in
// the same chain can't be mistaken for the commit message.
//
// Pattern breakdown:
// -[a-zA-Z]*m matches -m, -am, -nm, etc. (combined short flags)
// \s* matches optional whitespace after the flag
@ -1595,6 +1642,7 @@ export class ShellToolInvocation extends BaseToolInvocation<
// and a wrong rewrite would mid-insert the trailer and break the
// command's quoting.
const singleQuotePattern = /(-[a-zA-Z]*m\s*)'([^']*)'(?!\\'')/g;
const segment = command.slice(segmentRange.start, segmentRange.end);
// Git concatenates multiple `-m` values with a blank line, so the
// co-author trailer has to land in the *last* `-m` value to be
// recognised by `git interpret-trailers`. matchAll → take the
@ -1606,26 +1654,44 @@ export class ShellToolInvocation extends BaseToolInvocation<
for (const m of matches) result = m;
return result;
};
const doubleMatch = lastMatch(command.matchAll(doubleQuotePattern));
const singleMatch = lastMatch(command.matchAll(singleQuotePattern));
// Pick whichever match appears LAST in the command string,
// regardless of quote style. For `-m "Title" -m 'Body'`, a
// simple `doubleMatch ?? singleMatch` would target the title
// (last double match) and bury the trailer in the wrong field —
// git interpret-trailers only recognises a trailer at the end
// of the final `-m` value.
const match =
doubleMatch && singleMatch
? (doubleMatch.index ?? 0) > (singleMatch.index ?? 0)
? doubleMatch
: singleMatch
: (doubleMatch ?? singleMatch);
const doubleMatch = lastMatch(segment.matchAll(doubleQuotePattern));
const singleMatch = lastMatch(segment.matchAll(singleQuotePattern));
// Pick whichever match appears LAST in the segment, regardless of
// quote style — but reject any candidate that's nested inside the
// other's range. For `git commit -m "docs mention -m 'flag'"` the
// single-quoted `-m 'flag'` lives INSIDE the double-quoted real
// message; without a nesting check the later (inner) `-m` would
// win and the trailer would be spliced into the body text.
const matchRange = (m: RegExpMatchArray | null) =>
m ? { start: m.index ?? 0, end: (m.index ?? 0) + m[0].length } : null;
const isInside = (
inner: RegExpMatchArray | null,
outer: RegExpMatchArray | null,
): boolean => {
const i = matchRange(inner);
const o = matchRange(outer);
return !!(i && o && i.start >= o.start && i.end <= o.end);
};
let match: RegExpMatchArray | null;
if (doubleMatch && singleMatch) {
if (isInside(singleMatch, doubleMatch)) {
match = doubleMatch;
} else if (isInside(doubleMatch, singleMatch)) {
match = singleMatch;
} else {
match =
(doubleMatch.index ?? 0) > (singleMatch.index ?? 0)
? doubleMatch
: singleMatch;
}
} else {
match = doubleMatch ?? singleMatch;
}
const quote = match === doubleMatch ? '"' : "'";
// Escape the configured name/email for the surrounding quote
// style — has to follow the actually-selected match, not just
// whether a double-quoted -m exists somewhere earlier (a later
// single-quoted -m wins on index comparison above).
// style — has to follow the actually-selected match.
const escape =
match === doubleMatch
? escapeForBashDoubleQuote
@ -1639,16 +1705,15 @@ export class ShellToolInvocation extends BaseToolInvocation<
const newMessage = existingMessage + coAuthor;
const replacement = prefix + quote + newMessage + quote;
// Use match.index + slice (rather than indexOf) so multiple
// `-m` flags don't collide — we want the position of the
// *last* match, not the first occurrence of a string that
// could appear earlier in the command.
const idx = match.index ?? command.indexOf(fullMatch);
if (idx >= 0) {
// Splice the modified segment back into the original command,
// preserving everything outside the commit segment exactly as
// the caller had it.
const matchStart = (match.index ?? 0) + segmentRange.start;
if (matchStart >= segmentRange.start) {
return (
command.slice(0, idx) +
command.slice(0, matchStart) +
replacement +
command.slice(idx + fullMatch.length)
command.slice(matchStart + fullMatch.length)
);
}
}