/** * @license * Copyright 2025 Qwen Code * SPDX-License-Identifier: Apache-2.0 */ import fs from 'fs/promises'; import path from 'path'; import { read as readJsonlFile, createDebugLogger, } from '@qwen-code/qwen-code-core'; import pLimit from 'p-limit'; import type { Config, ChatRecord } from '@qwen-code/qwen-code-core'; import type { InsightData, HeatMapData, StreakData, SessionFacets, InsightProgressCallback, } from '../types/StaticInsightTypes.js'; import type { QualitativeInsights, InsightImpressiveWorkflows, InsightProjectAreas, InsightFutureOpportunities, InsightFrictionPoints, InsightMemorableMoment, InsightImprovements, InsightInteractionStyle, InsightAtAGlance, } from '../types/QualitativeInsightTypes.js'; import { PROMPT_IMPRESSIVE_WORKFLOWS, PROMPT_PROJECT_AREAS, PROMPT_FUTURE_OPPORTUNITIES, PROMPT_FRICTION_POINTS, PROMPT_MEMORABLE_MOMENT, PROMPT_IMPROVEMENTS, PROMPT_INTERACTION_STYLE, PROMPT_AT_A_GLANCE, ANALYSIS_PROMPT, } from '../prompts/InsightPrompts.js'; const logger = createDebugLogger('DataProcessor'); const CONCURRENCY_LIMIT = 2; export class DataProcessor { constructor(private config: Config) {} // Helper function to format date as YYYY-MM-DD private formatDate(date: Date): string { return date.toISOString().split('T')[0]; } // Format chat records for LLM analysis private formatRecordsForAnalysis(records: ChatRecord[]): string { let output = ''; const sessionStart = records.length > 0 ? new Date(records[0].timestamp) : new Date(); output += `Session: ${records[0]?.sessionId || 'unknown'}\n`; output += `Date: ${sessionStart.toISOString()}\n`; output += `Duration: ${records.length} turns\n\n`; for (const record of records) { if (record.type === 'user') { const text = record.message?.parts ?.map((p) => ('text' in p ? p.text : '')) .join('') || ''; output += `[User]: ${text}\n`; } else if (record.type === 'assistant') { if (record.message?.parts) { for (const part of record.message.parts) { if ('text' in part && part.text) { output += `[Assistant]: ${part.text}\n`; } else if ('functionCall' in part) { const call = part.functionCall; if (call) { output += `[Tool: ${call.name}]\n`; } } } } } } return output; } // Analyze a single session using LLM private async analyzeSession( records: ChatRecord[], ): Promise { if (records.length === 0) return null; const INSIGHT_SCHEMA = { type: 'object', properties: { underlying_goal: { type: 'string', description: 'What the user fundamentally wanted to achieve', }, goal_categories: { type: 'object', additionalProperties: { type: 'number' }, }, outcome: { type: 'string', enum: [ 'fully_achieved', 'mostly_achieved', 'partially_achieved', 'not_achieved', 'unclear_from_transcript', ], }, user_satisfaction_counts: { type: 'object', additionalProperties: { type: 'number' }, }, Qwen_helpfulness: { type: 'string', enum: [ 'unhelpful', 'slightly_helpful', 'moderately_helpful', 'very_helpful', 'essential', ], }, session_type: { type: 'string', enum: [ 'single_task', 'multi_task', 'iterative_refinement', 'exploration', 'quick_question', ], }, friction_counts: { type: 'object', additionalProperties: { type: 'number' }, }, friction_detail: { type: 'string', description: 'One sentence describing friction or empty', }, primary_success: { type: 'string', enum: [ 'none', 'fast_accurate_search', 'correct_code_edits', 'good_explanations', 'proactive_help', 'multi_file_changes', 'good_debugging', ], }, brief_summary: { type: 'string', description: 'One sentence: what user wanted and whether they got it', }, }, required: [ 'underlying_goal', 'goal_categories', 'outcome', 'user_satisfaction_counts', 'Qwen_helpfulness', 'session_type', 'friction_counts', 'friction_detail', 'primary_success', 'brief_summary', ], }; const sessionText = this.formatRecordsForAnalysis(records); const prompt = `${ANALYSIS_PROMPT}\n\nSESSION:\n${sessionText}`; try { const result = await this.config.getBaseLlmClient().generateJson({ // Use the configured model model: this.config.getModel(), contents: [{ role: 'user', parts: [{ text: prompt }] }], schema: INSIGHT_SCHEMA, abortSignal: AbortSignal.timeout(60000), // 1 minute timeout per session }); if (!result || Object.keys(result).length === 0) { return null; } return { ...(result as unknown as SessionFacets), session_id: records[0].sessionId, }; } catch (error) { logger.error( `Failed to analyze session ${records[0]?.sessionId}:`, error, ); return null; } } // Calculate streaks from activity dates private calculateStreaks(dates: string[]): StreakData { if (dates.length === 0) { return { currentStreak: 0, longestStreak: 0, dates: [] }; } // Convert string dates to Date objects and sort them const dateObjects = dates.map((dateStr) => new Date(dateStr)); dateObjects.sort((a, b) => a.getTime() - b.getTime()); let currentStreak = 1; let maxStreak = 1; let currentDate = new Date(dateObjects[0]); currentDate.setHours(0, 0, 0, 0); // Normalize to start of day for (let i = 1; i < dateObjects.length; i++) { const nextDate = new Date(dateObjects[i]); nextDate.setHours(0, 0, 0, 0); // Normalize to start of day // Calculate difference in days const diffDays = Math.floor( (nextDate.getTime() - currentDate.getTime()) / (1000 * 60 * 60 * 24), ); if (diffDays === 1) { // Consecutive day currentStreak++; maxStreak = Math.max(maxStreak, currentStreak); } else if (diffDays > 1) { // Gap in streak currentStreak = 1; } // If diffDays === 0, same day, so streak continues currentDate = nextDate; } // Check if the streak is still ongoing (if last activity was yesterday or today) const today = new Date(); today.setHours(0, 0, 0, 0); const yesterday = new Date(today); yesterday.setDate(yesterday.getDate() - 1); if ( currentDate.getTime() === today.getTime() || currentDate.getTime() === yesterday.getTime() ) { // The streak might still be active, so we don't reset it } return { currentStreak, longestStreak: maxStreak, dates, }; } // Process chat files from all projects in the base directory and generate insights async generateInsights( baseDir: string, facetsOutputDir?: string, onProgress?: InsightProgressCallback, ): Promise { if (onProgress) onProgress('Scanning chat files', 0); const allChatFiles = await this.scanChatFiles(baseDir); if (onProgress) onProgress('Generating metrics', 10); const metrics = await this.generateMetrics(allChatFiles, onProgress); if (onProgress) onProgress('Analyzing sessions', 20); const facets = await this.generateFacets( allChatFiles, facetsOutputDir, onProgress, ); if (onProgress) onProgress('Generating qualitative insights', 80); const qualitative = await this.generateQualitativeInsights(metrics, facets); // Aggregate satisfaction, friction, success and outcome data from facets const { satisfactionAgg, frictionAgg, primarySuccessAgg, outcomesAgg, goalsAgg, } = this.aggregateFacetsData(facets); if (onProgress) onProgress('Finalizing report', 100); return { ...metrics, qualitative, satisfaction: satisfactionAgg, friction: frictionAgg, primarySuccess: primarySuccessAgg, outcomes: outcomesAgg, topGoals: goalsAgg, }; } // Aggregate satisfaction and friction data from facets private aggregateFacetsData(facets: SessionFacets[]): { satisfactionAgg: Record; frictionAgg: Record; primarySuccessAgg: Record; outcomesAgg: Record; goalsAgg: Record; } { const satisfactionAgg: Record = {}; const frictionAgg: Record = {}; const primarySuccessAgg: Record = {}; const outcomesAgg: Record = {}; const goalsAgg: Record = {}; facets.forEach((facet) => { // Aggregate satisfaction Object.entries(facet.user_satisfaction_counts).forEach(([sat, count]) => { satisfactionAgg[sat] = (satisfactionAgg[sat] || 0) + count; }); // Aggregate friction Object.entries(facet.friction_counts).forEach(([fric, count]) => { frictionAgg[fric] = (frictionAgg[fric] || 0) + count; }); // Aggregate primary success if (facet.primary_success && facet.primary_success !== 'none') { primarySuccessAgg[facet.primary_success] = (primarySuccessAgg[facet.primary_success] || 0) + 1; } // Aggregate outcomes if (facet.outcome) { outcomesAgg[facet.outcome] = (outcomesAgg[facet.outcome] || 0) + 1; } // Aggregate goals Object.entries(facet.goal_categories).forEach(([goal, count]) => { goalsAgg[goal] = (goalsAgg[goal] || 0) + count; }); }); return { satisfactionAgg, frictionAgg, primarySuccessAgg, outcomesAgg, goalsAgg, }; } private async generateQualitativeInsights( metrics: Omit, facets: SessionFacets[], ): Promise { if (facets.length === 0) { return undefined; } logger.info('Generating qualitative insights...'); const commonData = this.prepareCommonPromptData(metrics, facets); const generate = async ( promptTemplate: string, schema: Record, ): Promise => { const prompt = `${promptTemplate}\n\n${commonData}`; try { const result = await this.config.getBaseLlmClient().generateJson({ model: this.config.getModel(), contents: [{ role: 'user', parts: [{ text: prompt }] }], schema, abortSignal: AbortSignal.timeout(60000), }); return result as T; } catch (error) { logger.error('Failed to generate insight:', error); throw error; } }; // Schemas for each insight type // We define simplified schemas here to guide the LLM. // The types are already defined in QualitativeInsightTypes.ts // 1. Impressive Workflows const schemaImpressiveWorkflows = { type: 'object', properties: { intro: { type: 'string' }, impressive_workflows: { type: 'array', items: { type: 'object', properties: { title: { type: 'string' }, description: { type: 'string' }, }, required: ['title', 'description'], }, }, }, required: ['intro', 'impressive_workflows'], }; // 2. Project Areas const schemaProjectAreas = { type: 'object', properties: { areas: { type: 'array', items: { type: 'object', properties: { name: { type: 'string' }, session_count: { type: 'number' }, description: { type: 'string' }, }, required: ['name', 'session_count', 'description'], }, }, }, required: ['areas'], }; // 3. Future Opportunities const schemaFutureOpportunities = { type: 'object', properties: { intro: { type: 'string' }, opportunities: { type: 'array', items: { type: 'object', properties: { title: { type: 'string' }, whats_possible: { type: 'string' }, how_to_try: { type: 'string' }, copyable_prompt: { type: 'string' }, }, required: [ 'title', 'whats_possible', 'how_to_try', 'copyable_prompt', ], }, }, }, required: ['intro', 'opportunities'], }; // 4. Friction Points const schemaFrictionPoints = { type: 'object', properties: { intro: { type: 'string' }, categories: { type: 'array', items: { type: 'object', properties: { category: { type: 'string' }, description: { type: 'string' }, examples: { type: 'array', items: { type: 'string' } }, }, required: ['category', 'description', 'examples'], }, }, }, required: ['intro', 'categories'], }; // 5. Memorable Moment const schemaMemorableMoment = { type: 'object', properties: { headline: { type: 'string' }, detail: { type: 'string' }, }, required: ['headline', 'detail'], }; // 6. Improvements const schemaImprovements = { type: 'object', properties: { Qwen_md_additions: { type: 'array', items: { type: 'object', properties: { addition: { type: 'string' }, why: { type: 'string' }, prompt_scaffold: { type: 'string' }, }, required: ['addition', 'why', 'prompt_scaffold'], }, }, features_to_try: { type: 'array', items: { type: 'object', properties: { feature: { type: 'string' }, one_liner: { type: 'string' }, why_for_you: { type: 'string' }, example_code: { type: 'string' }, }, required: ['feature', 'one_liner', 'why_for_you', 'example_code'], }, }, usage_patterns: { type: 'array', items: { type: 'object', properties: { title: { type: 'string' }, suggestion: { type: 'string' }, detail: { type: 'string' }, copyable_prompt: { type: 'string' }, }, required: ['title', 'suggestion', 'detail', 'copyable_prompt'], }, }, }, required: ['Qwen_md_additions', 'features_to_try', 'usage_patterns'], }; // 7. Interaction Style const schemaInteractionStyle = { type: 'object', properties: { narrative: { type: 'string' }, key_pattern: { type: 'string' }, }, required: ['narrative', 'key_pattern'], }; // 8. At A Glance const schemaAtAGlance = { type: 'object', properties: { whats_working: { type: 'string' }, whats_hindering: { type: 'string' }, quick_wins: { type: 'string' }, ambitious_workflows: { type: 'string' }, }, required: [ 'whats_working', 'whats_hindering', 'quick_wins', 'ambitious_workflows', ], }; const limit = pLimit(CONCURRENCY_LIMIT); try { const [ impressiveWorkflows, projectAreas, futureOpportunities, frictionPoints, memorableMoment, improvements, interactionStyle, atAGlance, ] = await Promise.all([ limit(() => generate( PROMPT_IMPRESSIVE_WORKFLOWS, schemaImpressiveWorkflows, ), ), limit(() => generate( PROMPT_PROJECT_AREAS, schemaProjectAreas, ), ), limit(() => generate( PROMPT_FUTURE_OPPORTUNITIES, schemaFutureOpportunities, ), ), limit(() => generate( PROMPT_FRICTION_POINTS, schemaFrictionPoints, ), ), limit(() => generate( PROMPT_MEMORABLE_MOMENT, schemaMemorableMoment, ), ), limit(() => generate( PROMPT_IMPROVEMENTS, schemaImprovements, ), ), limit(() => generate( PROMPT_INTERACTION_STYLE, schemaInteractionStyle, ), ), limit(() => generate(PROMPT_AT_A_GLANCE, schemaAtAGlance), ), ]); return { impressiveWorkflows, projectAreas, futureOpportunities, frictionPoints, memorableMoment, improvements, interactionStyle, atAGlance, }; } catch (e) { logger.error('Error generating qualitative insights:', e); return undefined; } } private prepareCommonPromptData( metrics: Omit, facets: SessionFacets[], ): string { // 1. DATA section const goalsAgg: Record = {}; const outcomesAgg: Record = {}; const satisfactionAgg: Record = {}; const frictionAgg: Record = {}; const successAgg: Record = {}; facets.forEach((facet) => { // Aggregate goals Object.entries(facet.goal_categories).forEach(([goal, count]) => { goalsAgg[goal] = (goalsAgg[goal] || 0) + count; }); // Aggregate outcomes outcomesAgg[facet.outcome] = (outcomesAgg[facet.outcome] || 0) + 1; // Aggregate satisfaction Object.entries(facet.user_satisfaction_counts).forEach(([sat, count]) => { satisfactionAgg[sat] = (satisfactionAgg[sat] || 0) + count; }); // Aggregate friction Object.entries(facet.friction_counts).forEach(([fric, count]) => { frictionAgg[fric] = (frictionAgg[fric] || 0) + count; }); // Aggregate success (primary_success) if (facet.primary_success && facet.primary_success !== 'none') { successAgg[facet.primary_success] = (successAgg[facet.primary_success] || 0) + 1; } }); const topGoals = Object.entries(goalsAgg) .sort((a, b) => b[1] - a[1]) .slice(0, 8); const dataObj = { sessions: metrics.totalSessions || facets.length, analyzed: facets.length, date_range: { start: Object.keys(metrics.heatmap).sort()[0] || 'N/A', end: Object.keys(metrics.heatmap).sort().pop() || 'N/A', }, messages: metrics.totalMessages || 0, hours: metrics.totalHours || 0, commits: 0, // Not tracked yet top_tools: metrics.topTools || [], top_goals: topGoals, outcomes: outcomesAgg, satisfaction: satisfactionAgg, friction: frictionAgg, success: successAgg, }; // 2. SESSION SUMMARIES section const sessionSummaries = facets .map((f) => `- ${f.brief_summary}`) .join('\n'); // 3. FRICTION DETAILS section const frictionDetails = facets .filter((f) => f.friction_detail && f.friction_detail.trim().length > 0) .map((f) => `- ${f.friction_detail}`) .join('\n'); return `DATA: ${JSON.stringify(dataObj, null, 2)} SESSION SUMMARIES: ${sessionSummaries} FRICTION DETAILS: ${frictionDetails} USER INSTRUCTIONS TO Qwen: None captured`; } private async scanChatFiles( baseDir: string, ): Promise> { const allChatFiles: Array<{ path: string; mtime: number }> = []; try { // Get all project directories in the base directory const projectDirs = await fs.readdir(baseDir); // Process each project directory for (const projectDir of projectDirs) { const projectPath = path.join(baseDir, projectDir); const stats = await fs.stat(projectPath); // Only process if it's a directory if (stats.isDirectory()) { const chatsDir = path.join(projectPath, 'chats'); try { // Get all chat files in the chats directory const files = await fs.readdir(chatsDir); const chatFiles = files.filter((file) => file.endsWith('.jsonl')); for (const file of chatFiles) { const filePath = path.join(chatsDir, file); // Get file stats for sorting by recency try { const fileStats = await fs.stat(filePath); allChatFiles.push({ path: filePath, mtime: fileStats.mtimeMs }); } catch (e) { logger.error(`Failed to stat file ${filePath}:`, e); } } } catch (error) { if ((error as NodeJS.ErrnoException).code !== 'ENOENT') { logger.error( `Error reading chats directory for project ${projectDir}: ${error}`, ); } // Continue to next project if chats directory doesn't exist continue; } } } } catch (error) { if ((error as NodeJS.ErrnoException).code === 'ENOENT') { // Base directory doesn't exist, return empty logger.info(`Base directory does not exist: ${baseDir}`); } else { logger.error(`Error reading base directory: ${error}`); } } return allChatFiles; } private async generateMetrics( files: Array<{ path: string; mtime: number }>, onProgress?: InsightProgressCallback, ): Promise> { // Initialize data structures const heatmap: HeatMapData = {}; const activeHours: { [hour: number]: number } = {}; const sessionStartTimes: { [sessionId: string]: Date } = {}; const sessionEndTimes: { [sessionId: string]: Date } = {}; let totalMessages = 0; let totalLinesAdded = 0; let totalLinesRemoved = 0; const uniqueFiles = new Set(); const toolUsage: Record = {}; // Process files in batches to avoid OOM and blocking the event loop const BATCH_SIZE = 50; const totalFiles = files.length; for (let i = 0; i < totalFiles; i += BATCH_SIZE) { const batchEnd = Math.min(i + BATCH_SIZE, totalFiles); const batch = files.slice(i, batchEnd); // Process batch sequentially to minimize memory usage for (const fileInfo of batch) { try { const records = await readJsonlFile(fileInfo.path); totalMessages += records.length; // Process each record for (const record of records) { const timestamp = new Date(record.timestamp); const dateKey = this.formatDate(timestamp); const hour = timestamp.getHours(); // Update heatmap (count of interactions per day) heatmap[dateKey] = (heatmap[dateKey] || 0) + 1; // Update active hours activeHours[hour] = (activeHours[hour] || 0) + 1; // Track session times if (!sessionStartTimes[record.sessionId]) { sessionStartTimes[record.sessionId] = timestamp; } sessionEndTimes[record.sessionId] = timestamp; // Track tool usage if (record.type === 'assistant' && record.message?.parts) { for (const part of record.message.parts) { if ('functionCall' in part) { const name = part.functionCall!.name!; toolUsage[name] = (toolUsage[name] || 0) + 1; } } } // Track lines and files from tool results if ( record.type === 'tool_result' && record.toolCallResult?.resultDisplay ) { const display = record.toolCallResult.resultDisplay; // Check if it matches FileDiff shape if ( typeof display === 'object' && display !== null && 'fileName' in display ) { // Cast to any to avoid importing FileDiff type which might not be available here const diff = display as { fileName: unknown; diffStat?: { model_added_lines?: number; model_removed_lines?: number; }; }; if (typeof diff.fileName === 'string') { uniqueFiles.add(diff.fileName); } if (diff.diffStat) { totalLinesAdded += diff.diffStat.model_added_lines || 0; totalLinesRemoved += diff.diffStat.model_removed_lines || 0; } } } } } catch (error) { logger.error( `Failed to process metrics for file ${fileInfo.path}:`, error, ); // Continue to next file } } // Update progress (mapped to 10-20% range of total progress) if (onProgress) { const percentComplete = batchEnd / totalFiles; const overallProgress = 10 + Math.round(percentComplete * 10); onProgress( `Generating metrics (${batchEnd}/${totalFiles})`, overallProgress, ); } // Yield to event loop to allow GC and UI updates await new Promise((resolve) => setTimeout(resolve, 0)); } // Calculate streak data const streakData = this.calculateStreaks(Object.keys(heatmap)); // Calculate longest work session and total hours let longestWorkDuration = 0; let longestWorkDate: string | null = null; let totalDurationMs = 0; const sessionIds = Object.keys(sessionStartTimes); const totalSessions = sessionIds.length; for (const sessionId of sessionIds) { const start = sessionStartTimes[sessionId]; const end = sessionEndTimes[sessionId]; const durationMs = end.getTime() - start.getTime(); const durationMinutes = Math.round(durationMs / (1000 * 60)); totalDurationMs += durationMs; if (durationMinutes > longestWorkDuration) { longestWorkDuration = durationMinutes; longestWorkDate = this.formatDate(start); } } const totalHours = Math.round(totalDurationMs / (1000 * 60 * 60)); // Calculate latest active time let latestActiveTime: string | null = null; let latestTimestamp = new Date(0); for (const dateStr in heatmap) { const date = new Date(dateStr); if (date > latestTimestamp) { latestTimestamp = date; latestActiveTime = date.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit', }); } } // Calculate top tools const topTools = Object.entries(toolUsage) .sort((a, b) => b[1] - a[1]) .slice(0, 10); return { heatmap, currentStreak: streakData.currentStreak, longestStreak: streakData.longestStreak, longestWorkDate, longestWorkDuration, activeHours, latestActiveTime, totalSessions, totalMessages, totalHours, topTools, totalLinesAdded, totalLinesRemoved, totalFiles: uniqueFiles.size, }; } private async generateFacets( allFiles: Array<{ path: string; mtime: number }>, facetsOutputDir?: string, onProgress?: InsightProgressCallback, ): Promise { // Sort files by recency (descending) and take top 50 const recentFiles = [...allFiles] .sort((a, b) => b.mtime - a.mtime) .slice(0, 50); logger.info(`Analyzing ${recentFiles.length} recent sessions with LLM...`); // Create a limit function with concurrency of 4 to avoid 429 errors const limit = pLimit(CONCURRENCY_LIMIT); let completed = 0; const total = recentFiles.length; // Analyze sessions concurrently with limit const analysisPromises = recentFiles.map((fileInfo) => limit(async () => { try { const records = await readJsonlFile(fileInfo.path); // Check if we already have this session analyzed if (records.length > 0 && facetsOutputDir) { const sessionId = records[0].sessionId; if (sessionId) { const existingFacetPath = path.join( facetsOutputDir, `${sessionId}.json`, ); try { // Check if file exists and is readable const existingData = await fs.readFile( existingFacetPath, 'utf-8', ); const existingFacet = JSON.parse(existingData); completed++; if (onProgress) { const percent = 20 + Math.round((completed / total) * 60); onProgress( 'Analyzing sessions', percent, `${completed}/${total}`, ); } return existingFacet; } catch (readError) { // File doesn't exist or is invalid, proceed to analyze if ((readError as NodeJS.ErrnoException).code !== 'ENOENT') { logger.warn( `Failed to read existing facet for ${sessionId}, regenerating:`, readError, ); } } } } const facet = await this.analyzeSession(records); if (facet && facetsOutputDir) { try { const facetPath = path.join( facetsOutputDir, `${facet.session_id}.json`, ); await fs.writeFile( facetPath, JSON.stringify(facet, null, 2), 'utf-8', ); } catch (writeError) { logger.error( `Failed to write facet file for session ${facet.session_id}:`, writeError, ); } } completed++; if (onProgress) { const percent = 20 + Math.round((completed / total) * 60); onProgress('Analyzing sessions', percent, `${completed}/${total}`); } return facet; } catch (e) { logger.error(`Error analyzing session file ${fileInfo.path}:`, e); completed++; if (onProgress) { const percent = 20 + Math.round((completed / total) * 60); onProgress('Analyzing sessions', percent, `${completed}/${total}`); } return null; } }), ); const sessionFacetsWithNulls = await Promise.all(analysisPromises); const facets = sessionFacetsWithNulls.filter( (f): f is SessionFacets => f !== null, ); return facets; } }