fix(insight): only analyze conversational sessions for facets

Filter sessions to only include those with both user and assistant records when generating facets. This prevents system-only logs from being analyzed, ensuring more accurate session insights. Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-05-02 05:31:02 +00:00 · 2026-02-26 11:26:04 +08:00 · 2026-02-26 11:26:04 +08:00 · c948e0c6e9
commit c948e0c6e9
parent 2421bb185b
2 changed files with 131 additions and 9 deletions
--- a/packages/cli/src/services/insight/generators/DataProcessor.ts
+++ b/packages/cli/src/services/insight/generators/DataProcessor.ts
@ -82,6 +82,26 @@ export class DataProcessor {
    return output;
  }

+  // Only analyze conversational sessions for facets (skip system-only logs).
+  private hasUserAndAssistantRecords(records: ChatRecord[]): boolean {
+    let hasUser = false;
+    let hasAssistant = false;
+
+    for (const record of records) {
+      if (record.type === 'user') {
+        hasUser = true;
+      } else if (record.type === 'assistant') {
+        hasAssistant = true;
+      }
+
+      if (hasUser && hasAssistant) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
  // Analyze a single session using LLM
  private async analyzeSession(
    records: ChatRecord[],
@ -975,25 +995,49 @@ None captured`;
    facetsOutputDir?: string,
    onProgress?: InsightProgressCallback,
  ): Promise<SessionFacets[]> {
-    // Sort files by recency (descending) and take top 50
-    const recentFiles = [...allFiles]
-      .sort((a, b) => b.mtime - a.mtime)
-      .slice(0, 50);
+    const MAX_ELIGIBLE_SESSIONS = 50;

-    logger.info(`Analyzing ${recentFiles.length} recent sessions with LLM...`);
+    // Sort files by recency (descending), then select up to 50 conversational
+    // sessions (must contain both user and assistant records).
+    const sortedFiles = [...allFiles].sort((a, b) => b.mtime - a.mtime);
+    const eligibleSessions: Array<{
+      fileInfo: { path: string; mtime: number };
+      records: ChatRecord[];
+    }> = [];
+
+    for (const fileInfo of sortedFiles) {
+      if (eligibleSessions.length >= MAX_ELIGIBLE_SESSIONS) {
+        break;
+      }
+
+      try {
+        const records = await readJsonlFile<ChatRecord>(fileInfo.path);
+        if (!this.hasUserAndAssistantRecords(records)) {
+          continue;
+        }
+        eligibleSessions.push({ fileInfo, records });
+      } catch (e) {
+        logger.error(
+          `Error reading session file ${fileInfo.path} for facet eligibility:`,
+          e,
+        );
+      }
+    }
+
+    logger.info(
+      `Analyzing ${eligibleSessions.length} eligible recent sessions with LLM...`,
+    );

    // Create a limit function with concurrency of 4 to avoid 429 errors
    const limit = pLimit(CONCURRENCY_LIMIT);

    let completed = 0;
-    const total = recentFiles.length;
+    const total = eligibleSessions.length;

    // Analyze sessions concurrently with limit
-    const analysisPromises = recentFiles.map((fileInfo) =>
+    const analysisPromises = eligibleSessions.map(({ fileInfo, records }) =>
      limit(async () => {
        try {
-          const records = await readJsonlFile<ChatRecord>(fileInfo.path);
-
          // Check if we already have this session analyzed
          if (records.length > 0 && facetsOutputDir) {
            const sessionId = records[0].sessionId;