Fix streaming dedup: keep last occurrence of each message.id within session files

Claude Code writes the same message.id multiple times during streaming.
The first write has partial tokens (often 1) and no tool_use blocks.
The last write has authoritative token counts and all tool_use/MCP blocks.

Old behavior kept the first occurrence (keep-first), silently dropping
real output tokens (+6.3% undercount) and all MCP tool calls.

New behavior keeps the last occurrence's content but preserves the first
occurrence's timestamp for correct date bucketing.

Validated against 21,390 real session files: 40.5% had duplicate IDs,
output tokens were understated by up to 78% per session.
This commit is contained in:
iamtoruk 2026-05-02 22:30:17 -07:00
parent 341aa46f78
commit 800c106250

View file

@ -121,6 +121,30 @@ function parseApiCall(entry: JournalEntry): ParsedApiCall | null {
}
}
function dedupeStreamingMessageIds(entries: JournalEntry[]): JournalEntry[] {
const firstIdxById = new Map<string, number>()
const lastIdxById = new Map<string, number>()
for (let i = 0; i < entries.length; i++) {
const id = getMessageId(entries[i]!)
if (!id) continue
if (!firstIdxById.has(id)) firstIdxById.set(id, i)
lastIdxById.set(id, i)
}
if (lastIdxById.size === 0) return entries
const result: JournalEntry[] = []
for (let i = 0; i < entries.length; i++) {
const id = getMessageId(entries[i]!)
if (id && lastIdxById.get(id) !== i) continue
if (id && firstIdxById.get(id) !== i) {
const firstTs = entries[firstIdxById.get(id)!]!.timestamp
result.push({ ...entries[i]!, timestamp: firstTs ?? entries[i]!.timestamp })
continue
}
result.push(entries[i]!)
}
return result
}
function groupIntoTurns(entries: JournalEntry[], seenMsgIds: Set<string>): ParsedTurn[] {
const turns: ParsedTurn[] = []
let currentUserMessage = ''
@ -291,7 +315,8 @@ async function parseSessionFile(
if (entries.length === 0) return null
const sessionId = basename(filePath, '.jsonl')
let turns = groupIntoTurns(entries, seenMsgIds)
const dedupedEntries = dedupeStreamingMessageIds(entries)
let turns = groupIntoTurns(dedupedEntries, seenMsgIds)
if (dateRange) {
// Bucket a turn by the timestamp of its first assistant call (when the cost was
// actually incurred). Filtering entries directly produced orphan assistant calls