fix(insight): only analyze conversational sessions for facets

Filter sessions to only include those with both user and assistant records when generating facets. This prevents system-only logs from being analyzed, ensuring more accurate session insights.

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-04-30 12:40:44 +00:00 · 2026-02-26 11:26:04 +08:00 · 2026-02-26 11:26:04 +08:00 · c948e0c6e9
commit c948e0c6e9
parent 2421bb185b
2 changed files with 131 additions and 9 deletions
--- a/packages/cli/src/services/insight/generators/DataProcessor.test.ts
+++ b/packages/cli/src/services/insight/generators/DataProcessor.test.ts
@ -1126,4 +1126,82 @@ describe('DataProcessor', () => {
      expect(hasNonEmptyFrictionDetail).toBe(false);
    });
  });
  describe('generateFacets', () => {
    // Feeds generateFacets one user-only session and one session that has
    // both a user and an assistant record, then checks that only the latter
    // reaches the LLM mock and appears in the returned facets.
    it('should skip non-conversational sessions', async () => {
      // Minimal ChatRecord factory; fields this test does not care about are
      // left blank.
      const buildRecord = (
        sessionId: string,
        timestamp: string,
        role: 'user' | 'assistant',
        text: string,
      ): ChatRecord => ({
        sessionId,
        timestamp,
        type: role,
        message: { role, parts: [{ text }] },
        uuid: '',
        parentUuid: null,
        cwd: '',
        version: '',
      });

      const userOnlySession: ChatRecord[] = [
        buildRecord('user-only', '2025-01-15T10:00:00Z', 'user', 'Hello'),
      ];
      const twoSidedSession: ChatRecord[] = [
        buildRecord('conversational', '2025-01-15T10:00:00Z', 'user', 'Hello'),
        buildRecord('conversational', '2025-01-15T10:01:00Z', 'assistant', 'Hi'),
      ];

      // Reads are answered in call order: the first read yields the user-only
      // session, the second yields the conversational one.
      mockedReadJsonlFile.mockResolvedValueOnce(userOnlySession);
      mockedReadJsonlFile.mockResolvedValueOnce(twoSidedSession);

      // Canned LLM response returned for every analysis call.
      const cannedFacet = {
        underlying_goal: 'Test',
        goal_categories: {},
        outcome: 'fully_achieved',
        user_satisfaction_counts: {},
        Qwen_helpfulness: 'very_helpful',
        session_type: 'single_task',
        friction_counts: {},
        friction_detail: '',
        primary_success: 'none',
        brief_summary: 'Test',
      };
      mockGenerateJson.mockResolvedValue(cannedFacet);

      // The user-only file carries the larger mtime of the two.
      const sessionFiles = [
        { path: '/test/user-only.jsonl', mtime: 2000 },
        { path: '/test/conversational.jsonl', mtime: 1000 },
      ];

      // generateFacets is not part of the public surface, so it is reached
      // through a structural cast.
      type FacetGenerator = {
        generateFacets(
          files: Array<{ path: string; mtime: number }>,
          facetsOutputDir?: string,
        ): Promise<SessionFacets[]>;
      };
      const processor = dataProcessor as unknown as FacetGenerator;
      const facets = await processor.generateFacets(sessionFiles);

      // Exactly one LLM call and one facet, both for the conversational session.
      expect(mockGenerateJson).toHaveBeenCalledTimes(1);
      expect(facets).toHaveLength(1);
      expect(facets[0].session_id).toBe('conversational');
    });
  });
 });
--- a/packages/cli/src/services/insight/generators/DataProcessor.ts
+++ b/packages/cli/src/services/insight/generators/DataProcessor.ts
@ -82,6 +82,26 @@ export class DataProcessor {
    return output;
  }
  // Facet eligibility check: returns true when the given records include at
  // least one record of type 'user' and at least one of type 'assistant',
  // and false otherwise (including for an empty list).
  private hasUserAndAssistantRecords(records: ChatRecord[]): boolean {
    // Each `some` stops at its first match, so no record beyond the later of
    // the two first matches is inspected.
    const includesUser = records.some((entry) => entry.type === 'user');
    return (
      includesUser && records.some((entry) => entry.type === 'assistant')
    );
  }
  // Analyze a single session using LLM
  private async analyzeSession(
    records: ChatRecord[],
@ -975,25 +995,49 @@ None captured`;
    facetsOutputDir?: string,
    onProgress?: InsightProgressCallback,
  ): Promise<SessionFacets[]> {
-    // Sort files by recency (descending) and take top 50
+    const MAX_ELIGIBLE_SESSIONS = 50;
    const recentFiles = [...allFiles]
      .sort((a, b) => b.mtime - a.mtime)
      .slice(0, 50);
-    logger.info(`Analyzing ${recentFiles.length} recent sessions with LLM...`);
+    // Sort files by recency (descending), then select up to 50 conversational
    // sessions (must contain both user and assistant records).
    const sortedFiles = [...allFiles].sort((a, b) => b.mtime - a.mtime);
    const eligibleSessions: Array<{
      fileInfo: { path: string; mtime: number };
      records: ChatRecord[];
    }> = [];
    for (const fileInfo of sortedFiles) {
      if (eligibleSessions.length >= MAX_ELIGIBLE_SESSIONS) {
        break;
      }
      try {
        const records = await readJsonlFile<ChatRecord>(fileInfo.path);
        if (!this.hasUserAndAssistantRecords(records)) {
          continue;
        }
        eligibleSessions.push({ fileInfo, records });
      } catch (e) {
        logger.error(
          `Error reading session file ${fileInfo.path} for facet eligibility:`,
          e,
        );
      }
    }
    logger.info(
      `Analyzing ${eligibleSessions.length} eligible recent sessions with LLM...`,
    );
    // Create a limit function with concurrency of 4 to avoid 429 errors
    const limit = pLimit(CONCURRENCY_LIMIT);
    let completed = 0;
-    const total = recentFiles.length;
+    const total = eligibleSessions.length;
    // Analyze sessions concurrently with limit
-    const analysisPromises = recentFiles.map((fileInfo) =>
+    const analysisPromises = eligibleSessions.map(({ fileInfo, records }) =>
      limit(async () => {
        try {
          const records = await readJsonlFile<ChatRecord>(fileInfo.path);
          // Check if we already have this session analyzed
          if (records.length > 0 && facetsOutputDir) {
            const sessionId = records[0].sessionId;