fix(insight): only analyze conversational sessions for facets

Filter sessions to only include those with both user and assistant
records when generating facets. This prevents system-only logs from
being analyzed, ensuring more accurate session insights.

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
This commit is contained in:
tanzhenxin 2026-02-26 11:26:04 +08:00
parent 2421bb185b
commit c948e0c6e9
2 changed files with 131 additions and 9 deletions

View file

@ -1126,4 +1126,82 @@ describe('DataProcessor', () => {
expect(hasNonEmptyFrictionDetail).toBe(false);
});
});
describe('generateFacets', () => {
  it('should skip non-conversational sessions', async () => {
    // Small factory so each fixture record only spells out what differs.
    const makeRecord = (
      sessionId: string,
      type: 'user' | 'assistant',
      timestamp: string,
      text: string,
    ): ChatRecord => ({
      sessionId,
      timestamp,
      type,
      message: { role: type, parts: [{ text }] },
      uuid: '',
      parentUuid: null,
      cwd: '',
      version: '',
    });

    // Session with only a user turn — must be filtered out.
    const userOnlyRecords: ChatRecord[] = [
      makeRecord('user-only', 'user', '2025-01-15T10:00:00Z', 'Hello'),
    ];
    // Session with both user and assistant turns — must be analyzed.
    const conversationalRecords: ChatRecord[] = [
      makeRecord('conversational', 'user', '2025-01-15T10:00:00Z', 'Hello'),
      makeRecord('conversational', 'assistant', '2025-01-15T10:01:00Z', 'Hi'),
    ];

    // First file read resolves the user-only session, second the conversational one.
    mockedReadJsonlFile
      .mockResolvedValueOnce(userOnlyRecords)
      .mockResolvedValueOnce(conversationalRecords);

    // Minimal facet payload the mocked LLM returns for any analyzed session.
    const mockFacet = {
      underlying_goal: 'Test',
      goal_categories: {},
      outcome: 'fully_achieved',
      user_satisfaction_counts: {},
      Qwen_helpfulness: 'very_helpful',
      session_type: 'single_task',
      friction_counts: {},
      friction_detail: '',
      primary_success: 'none',
      brief_summary: 'Test',
    };
    mockGenerateJson.mockResolvedValue(mockFacet);

    // user-only.jsonl is newer (mtime 2000), so it is considered first
    // and must still be skipped.
    const sessionFiles = [
      { path: '/test/user-only.jsonl', mtime: 2000 },
      { path: '/test/conversational.jsonl', mtime: 1000 },
    ];

    // generateFacets is private; expose it through a narrow structural cast.
    const processorWithPrivates = dataProcessor as unknown as {
      generateFacets(
        files: Array<{ path: string; mtime: number }>,
        facetsOutputDir?: string,
      ): Promise<SessionFacets[]>;
    };
    const result = await processorWithPrivates.generateFacets(sessionFiles);

    // Only the conversational session should reach the LLM and the output.
    expect(mockGenerateJson).toHaveBeenCalledTimes(1);
    expect(result).toHaveLength(1);
    expect(result[0].session_id).toBe('conversational');
  });
});
});

View file

@ -82,6 +82,26 @@ export class DataProcessor {
return output;
}
/**
 * A session is "conversational" only when it contains at least one user
 * record AND at least one assistant record. System-only logs (no real
 * exchange) are excluded from facet analysis.
 */
private hasUserAndAssistantRecords(records: ChatRecord[]): boolean {
  const containsType = (wanted: ChatRecord['type']): boolean =>
    records.some((record) => record.type === wanted);
  return containsType('user') && containsType('assistant');
}
// Analyze a single session using LLM
private async analyzeSession(
records: ChatRecord[],
@ -975,25 +995,49 @@ None captured`;
facetsOutputDir?: string,
onProgress?: InsightProgressCallback,
): Promise<SessionFacets[]> {
// Sort files by recency (descending) and take top 50
const recentFiles = [...allFiles]
.sort((a, b) => b.mtime - a.mtime)
.slice(0, 50);
const MAX_ELIGIBLE_SESSIONS = 50;
logger.info(`Analyzing ${recentFiles.length} recent sessions with LLM...`);
// Sort files by recency (descending), then select up to 50 conversational
// sessions (must contain both user and assistant records).
const sortedFiles = [...allFiles].sort((a, b) => b.mtime - a.mtime);
const eligibleSessions: Array<{
fileInfo: { path: string; mtime: number };
records: ChatRecord[];
}> = [];
for (const fileInfo of sortedFiles) {
if (eligibleSessions.length >= MAX_ELIGIBLE_SESSIONS) {
break;
}
try {
const records = await readJsonlFile<ChatRecord>(fileInfo.path);
if (!this.hasUserAndAssistantRecords(records)) {
continue;
}
eligibleSessions.push({ fileInfo, records });
} catch (e) {
logger.error(
`Error reading session file ${fileInfo.path} for facet eligibility:`,
e,
);
}
}
logger.info(
`Analyzing ${eligibleSessions.length} eligible recent sessions with LLM...`,
);
// Create a limit function with concurrency of 4 to avoid 429 errors
const limit = pLimit(CONCURRENCY_LIMIT);
let completed = 0;
const total = recentFiles.length;
const total = eligibleSessions.length;
// Analyze sessions concurrently with limit
const analysisPromises = recentFiles.map((fileInfo) =>
const analysisPromises = eligibleSessions.map(({ fileInfo, records }) =>
limit(async () => {
try {
const records = await readJsonlFile<ChatRecord>(fileInfo.path);
// Check if we already have this session analyzed
if (records.length > 0 && facetsOutputDir) {
const sessionId = records[0].sessionId;