diff --git a/packages/cli/src/services/insight/generators/DataProcessor.test.ts b/packages/cli/src/services/insight/generators/DataProcessor.test.ts index 25ebd2033..85f0e6fba 100644 --- a/packages/cli/src/services/insight/generators/DataProcessor.test.ts +++ b/packages/cli/src/services/insight/generators/DataProcessor.test.ts @@ -1126,4 +1126,82 @@ describe('DataProcessor', () => { expect(hasNonEmptyFrictionDetail).toBe(false); }); }); + + describe('generateFacets', () => { + it('should skip non-conversational sessions', async () => { + const userOnlyRecords: ChatRecord[] = [ + { + sessionId: 'user-only', + timestamp: '2025-01-15T10:00:00Z', + type: 'user', + message: { role: 'user', parts: [{ text: 'Hello' }] }, + uuid: '', + parentUuid: null, + cwd: '', + version: '', + }, + ]; + + const conversationalRecords: ChatRecord[] = [ + { + sessionId: 'conversational', + timestamp: '2025-01-15T10:00:00Z', + type: 'user', + message: { role: 'user', parts: [{ text: 'Hello' }] }, + uuid: '', + parentUuid: null, + cwd: '', + version: '', + }, + { + sessionId: 'conversational', + timestamp: '2025-01-15T10:01:00Z', + type: 'assistant', + message: { role: 'assistant', parts: [{ text: 'Hi' }] }, + uuid: '', + parentUuid: null, + cwd: '', + version: '', + }, + ]; + + // First file is user-only, second is conversational + mockedReadJsonlFile + .mockResolvedValueOnce(userOnlyRecords) + .mockResolvedValueOnce(conversationalRecords); + + const mockFacet = { + underlying_goal: 'Test', + goal_categories: {}, + outcome: 'fully_achieved', + user_satisfaction_counts: {}, + Qwen_helpfulness: 'very_helpful', + session_type: 'single_task', + friction_counts: {}, + friction_detail: '', + primary_success: 'none', + brief_summary: 'Test', + }; + mockGenerateJson.mockResolvedValue(mockFacet); + + const files = [ + { path: '/test/user-only.jsonl', mtime: 2000 }, + { path: '/test/conversational.jsonl', mtime: 1000 }, + ]; + + const result = await ( + dataProcessor as unknown as { + 
generateFacets( + files: Array<{ path: string; mtime: number }>, + facetsOutputDir?: string, + ): Promise<SessionFacet[]>; + } + ).generateFacets(files); + + // Only the conversational session should be analyzed + expect(mockGenerateJson).toHaveBeenCalledTimes(1); + expect(result).toHaveLength(1); + expect(result[0].session_id).toBe('conversational'); + }); + }); }); diff --git a/packages/cli/src/services/insight/generators/DataProcessor.ts b/packages/cli/src/services/insight/generators/DataProcessor.ts index 6a1d03aaf..8e7211d20 100644 --- a/packages/cli/src/services/insight/generators/DataProcessor.ts +++ b/packages/cli/src/services/insight/generators/DataProcessor.ts @@ -82,6 +82,26 @@ export class DataProcessor { return output; } + // Only analyze conversational sessions for facets (skip system-only logs). + private hasUserAndAssistantRecords(records: ChatRecord[]): boolean { + let hasUser = false; + let hasAssistant = false; + + for (const record of records) { + if (record.type === 'user') { + hasUser = true; + } else if (record.type === 'assistant') { + hasAssistant = true; + } + + if (hasUser && hasAssistant) { + return true; + } + } + + return false; + } + // Analyze a single session using LLM private async analyzeSession( records: ChatRecord[], @@ -975,25 +995,49 @@ None captured`; facetsOutputDir?: string, onProgress?: InsightProgressCallback, ): Promise<SessionFacet[]> { - // Sort files by recency (descending) and take top 50 - const recentFiles = [...allFiles] - .sort((a, b) => b.mtime - a.mtime) - .slice(0, 50); + const MAX_ELIGIBLE_SESSIONS = 50; - logger.info(`Analyzing ${recentFiles.length} recent sessions with LLM...`); + // Sort files by recency (descending), then select up to 50 conversational + // sessions (must contain both user and assistant records). 
+ const sortedFiles = [...allFiles].sort((a, b) => b.mtime - a.mtime); + const eligibleSessions: Array<{ + fileInfo: { path: string; mtime: number }; + records: ChatRecord[]; + }> = []; + + for (const fileInfo of sortedFiles) { + if (eligibleSessions.length >= MAX_ELIGIBLE_SESSIONS) { + break; + } + + try { + const records = await readJsonlFile(fileInfo.path); + if (!this.hasUserAndAssistantRecords(records)) { + continue; + } + eligibleSessions.push({ fileInfo, records }); + } catch (e) { + logger.error( + `Error reading session file ${fileInfo.path} for facet eligibility:`, + e, + ); + } + } + + logger.info( + `Analyzing ${eligibleSessions.length} eligible recent sessions with LLM...`, + ); // Create a limit function with concurrency of 4 to avoid 429 errors const limit = pLimit(CONCURRENCY_LIMIT); let completed = 0; - const total = recentFiles.length; + const total = eligibleSessions.length; // Analyze sessions concurrently with limit - const analysisPromises = recentFiles.map((fileInfo) => + const analysisPromises = eligibleSessions.map(({ fileInfo, records }) => limit(async () => { try { - const records = await readJsonlFile(fileInfo.path); - // Check if we already have this session analyzed if (records.length > 0 && facetsOutputDir) { const sessionId = records[0].sessionId;