diff --git a/npm/packages/ruvector/src/decompiler/module-splitter.js b/npm/packages/ruvector/src/decompiler/module-splitter.js index 94e1a847..4abc16f3 100644 --- a/npm/packages/ruvector/src/decompiler/module-splitter.js +++ b/npm/packages/ruvector/src/decompiler/module-splitter.js @@ -1,92 +1,16 @@ /** * module-splitter.js - Split a JavaScript bundle into logical modules. * - * Splits at STATEMENT BOUNDARIES so every output module is guaranteed to be - * syntactically valid, parseable JavaScript. Never splits a statement across - * modules -- a statement is atomic. - * - * Algorithm: - * 1. Parse source into top-level statements by tracking brace/paren/bracket - * depth and string context. - * 2. Classify each COMPLETE statement into a module by scoring keyword hits. - * 3. Group statements by module. - * 4. Validate each module is parseable; move invalid modules to uncategorized. - * 5. Build hierarchical tree from co-reference density. + * Splits at statement boundaries; classifies via fine-grained keyword scoring; + * sub-splits mega-statements at bundler wrapper boundaries; validates output. */ 'use strict'; -// ── Module classification keywords ────────────────────────────────────────── -// Each key is a module name, value is an array of keywords/identifiers. -// A statement is scored against every module; highest score wins. 
-const MODULE_KEYWORDS = { - 'tool-dispatch': [ - 'BashTool', 'FileReadTool', 'FileEditTool', 'FileWriteTool', - 'AgentOutputTool', 'WebFetch', 'WebSearch', 'TodoWrite', - 'NotebookEdit', 'GlobTool', 'GrepTool', 'ListFilesTool', - 'SearchTool', 'ReadTool', 'EditTool', 'WriteTool', - 'tool_use', 'tool_result', 'ToolUse', 'ToolResult', - 'toolDefinition', 'toolSchema', 'inputSchema', - ], - 'permission-system': [ - 'canUseTool', 'alwaysAllowRules', 'denyWrite', - 'Permission', 'permission', 'allowedTools', - 'permissionMode', 'sandbox', 'allowList', 'denyList', - 'isAllowed', 'checkPermission', 'grantPermission', - ], - 'mcp-client': [ - 'mcp__', 'McpClient', 'McpServer', 'McpError', - 'callTool', 'listTools', 'McpTransport', - 'StdioTransport', 'SseTransport', 'StreamableHttp', - 'mcp_server', 'mcp_client', 'mcpConnection', - ], - 'streaming-handler': [ - 'content_block_delta', 'message_start', 'message_stop', - 'message_delta', 'content_block_start', 'content_block_stop', - 'stream_event', 'text_delta', 'input_json_delta', - 'StreamEvent', 'onStream', 'streamHandler', - ], - 'context-manager': [ - 'tengu_compact', 'microcompact', 'auto_compact', - 'compact_boundary', 'preCompactTokenCount', - 'postCompactTokenCount', 'compaction', - 'tokenCount', 'contextWindow', 'maxTokens', - 'promptCache', 'cacheControl', - ], - 'agent-loop': [ - 'agentLoop', 'mainLoop', 'querySource', - 'toolUseContext', 'systemPrompt', - 'conversationTurn', 'assistantMessage', - 'userMessage', 'messageHistory', - ], - 'commands': [ - 'slashCommand', 'registerCommand', 'commandHandler', - 'parseCommand', '/help', '/clear', '/compact', - '/bug', '/init', '/login', '/logout', - '/doctor', '/config', '/cost', '/memory', - ], - 'telemetry': [ - 'telemetry', 'Telemetry', 'opentelemetry', 'otel', - 'datadog', 'perfetto', 'tracing', 'span', - 'metric_', 'counter_', 'histogram_', - 'tengu_', 'sentry', - ], - 'config': [ - 'settings', 'Settings', 'configuration', - 'CLAUDE_', 'environment', 'envVar', - 
'dotenv', 'loadConfig', 'parseConfig', - ], - 'session': [ - 'session', 'Session', 'conversationId', - 'checkpoint', 'resume', 'restore', - 'sessionState', 'persistSession', - ], - 'model-provider': [ - 'anthropic', 'Anthropic', 'claude-', 'claude_', - 'bedrock', 'vertex', 'openai', 'provider', - 'apiKey', 'modelId', 'modelName', - ], -}; +// ── Extracted modules ────────────────────────────────────────────────────── +const { SUBCATEGORIES, MODULE_KEYWORDS, STRING_PATTERNS } = require('./subcategories'); +const { buildModuleTree } = require('./module-tree'); +const { parseTopLevelStatements } = require('./statement-parser'); // Simple regex patterns for extracting declarations. const SIMPLE_PATTERNS = { @@ -97,352 +21,82 @@ const SIMPLE_PATTERNS = { 'api-endpoints': /\/v\d+\/[a-z][-a-z/]*/g, }; -// ── Statement Parser ──────────────────────────────────────────────────────── - -/** - * Parse source into top-level statements by tracking brace/paren/bracket depth. - * - * A "top-level statement" ends when: - * - We encounter a semicolon at depth 0, OR - * - We encounter a closing brace that brings depth to 0 AND the next - * non-whitespace token does not continue the expression (like `=`, `.`, - * `,`, `(`, etc.) -- this avoids splitting `var { x } = obj;` or - * `obj.method()` into two statements. - * - * String literals, template literals, regex literals, and comments are - * tracked so delimiters inside them are not counted. - * - * @param {string} source - * @returns {Array<{code: string, start: number, end: number}>} - */ -function parseTopLevelStatements(source) { - const statements = []; - let depth = 0; - let start = 0; - let i = 0; - const len = source.length; - - while (i < len) { - const ch = source[i]; - const next = i + 1 < len ? source[i + 1] : ''; - - // ── Skip single-line comments ── - if (ch === '/' && next === '/') { - const eol = source.indexOf('\n', i + 2); - i = eol === -1 ? 
len : eol + 1; - continue; - } - - // ── Skip multi-line comments ── - if (ch === '/' && next === '*') { - const end = source.indexOf('*/', i + 2); - i = end === -1 ? len : end + 2; - continue; - } - - // ── Skip string literals ── - if (ch === '"' || ch === "'") { - i = skipString(source, i, ch); - continue; - } - - // ── Skip template literals ── - if (ch === '`') { - i = skipTemplateLiteral(source, i); - continue; - } - - // ── Skip regex literals ── - if (ch === '/' && isRegexStart(source, i)) { - i = skipRegex(source, i); - continue; - } - - // ── Track depth ── - if (ch === '{' || ch === '(' || ch === '[') { - depth++; - i++; - continue; - } - - if (ch === '}' || ch === ')' || ch === ']') { - depth = Math.max(0, depth - 1); - - // Closing brace at depth 0 MAY be a statement boundary - if (depth === 0 && ch === '}') { - // Check if the next non-whitespace/comment token continues this - // expression. If so, do NOT split here. - if (!isStatementBoundaryAfterBrace(source, i + 1)) { - // Not a boundary -- continue accumulating - i++; - continue; - } - - const code = source.substring(start, i + 1).trim(); - if (code.length > 0) { - statements.push({ code, start, end: i + 1 }); - } - start = i + 1; - i++; - continue; - } - - i++; - continue; - } - - // ── Semicolon at depth 0 is a statement boundary ── - if (ch === ';' && depth === 0) { - const code = source.substring(start, i + 1).trim(); - if (code.length > 0) { - statements.push({ code, start, end: i + 1 }); - } - start = i + 1; - i++; - continue; - } - - i++; - } - - // Remaining code (unterminated statement) - const remaining = source.substring(start).trim(); - if (remaining.length > 0) { - statements.push({ code: remaining, start, end: len }); - } - - return statements; -} - -/** - * After a `}` at depth 0, decide whether this is truly a statement boundary. - * Returns true if it IS a boundary (next token starts a new statement). - * Returns false if the expression continues (e.g. 
`}.method()`, `} = obj`, etc.) - * - * @param {string} source - * @param {number} afterPos - position right after the `}` - * @returns {boolean} - */ -function isStatementBoundaryAfterBrace(source, afterPos) { - const len = source.length; - let j = afterPos; - - // Skip whitespace and comments to find the next meaningful token - while (j < len) { - const c = source[j]; - - // Skip whitespace - if (c === ' ' || c === '\t' || c === '\r' || c === '\n') { - j++; - continue; - } - - // Skip single-line comments - if (c === '/' && j + 1 < len && source[j + 1] === '/') { - const eol = source.indexOf('\n', j + 2); - j = eol === -1 ? len : eol + 1; - continue; - } - - // Skip multi-line comments - if (c === '/' && j + 1 < len && source[j + 1] === '*') { - const end = source.indexOf('*/', j + 2); - j = end === -1 ? len : end + 2; - continue; - } - - break; - } - - if (j >= len) return true; // end of source - - const nextChar = source[j]; - - // These tokens CONTINUE the expression -- NOT a statement boundary: - // . = , ( [ ? : && || ?? + - * / % < > | & ^ ~ ! instanceof in of - // Also catch `);` which closes a wrapping like `var x = z(() => { ... });` - const continuationChars = '.=,([?:&|+\\-*/%<>^~!;)'; - if (continuationChars.includes(nextChar)) { - return false; - } - - // Check for multi-char continuation tokens - const ahead = source.substring(j, j + 15); - // `instanceof`, `in` (but not `if`), `of`, `from` (import continuation) - if (/^(?:instanceof|in|of|from)\s/.test(ahead)) return false; - // `as` (TypeScript) - if (/^as\s/.test(ahead)) return false; - - // Otherwise, this is a statement boundary - return true; -} - -/** - * Skip a string literal starting at position i (where source[i] is the quote). - * Returns the index AFTER the closing quote. 
- * @param {string} source - * @param {number} i - * @param {string} quote - the quote character - * @returns {number} - */ -function skipString(source, i, quote) { - const len = source.length; - i++; // skip opening quote - while (i < len) { - if (source[i] === '\\') { - i += 2; // skip escape sequence - continue; - } - if (source[i] === quote) { - return i + 1; // past closing quote - } - i++; - } - return len; // unterminated string -} - -/** - * Skip a template literal starting at position i (where source[i] is backtick). - * Handles nested ${...} expressions including nested template literals. - * @param {string} source - * @param {number} i - * @returns {number} - */ -function skipTemplateLiteral(source, i) { - const len = source.length; - i++; // skip opening backtick - while (i < len) { - if (source[i] === '\\') { - i += 2; - continue; - } - if (source[i] === '`') { - return i + 1; // closing backtick - } - if (source[i] === '$' && i + 1 < len && source[i + 1] === '{') { - // Template expression: skip to matching } - i = skipTemplateExpression(source, i + 2); - continue; - } - i++; - } - return len; -} - -/** - * Skip a template expression (inside ${...}) starting after the opening ${. - * Handles nested braces, strings, and template literals. - * @param {string} source - * @param {number} i - * @returns {number} - */ -function skipTemplateExpression(source, i) { - const len = source.length; - let exprDepth = 1; - while (i < len && exprDepth > 0) { - const ch = source[i]; - if (ch === '\\') { i += 2; continue; } - if (ch === '{') { exprDepth++; i++; continue; } - if (ch === '}') { exprDepth--; i++; continue; } - if (ch === '`') { i = skipTemplateLiteral(source, i); continue; } - if (ch === '"' || ch === "'") { i = skipString(source, i, ch); continue; } - i++; - } - return i; -} - -/** - * Heuristic: is source[i] the start of a regex literal? - * A '/' is a regex start if the preceding token is not an identifier, - * number, or closing bracket. 
// ── Statement Classifier ────────────────────────────────────────────────────

/**
 * Escape a string for use in a RegExp constructor.
 * @param {string} s
 * @returns {string}
 */
function escapeRegex(s) {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// Compiled patterns keyed by flags + source. The keyword tables are fixed for
// the lifetime of the process, so compiling each pattern once (instead of once
// per keyword per statement) removes the dominant cost of classification.
const COMPILED_PATTERNS = new Map();

/**
 * Compile (or fetch from cache) a regular expression.
 * @param {string} source - regex source text
 * @param {string} flags - regex flags ('' or 'g')
 * @returns {RegExp|null} null when `source` is not a valid pattern
 */
function compilePattern(source, flags) {
  const key = `${flags}\u0000${source}`;
  if (!COMPILED_PATTERNS.has(key)) {
    let compiled = null;
    try {
      compiled = new RegExp(source, flags);
    } catch {
      // Deliberate best-effort: a malformed table entry must not abort the
      // whole classification run, so it simply never matches.
      compiled = null;
    }
    COMPILED_PATTERNS.set(key, compiled);
  }
  return COMPILED_PATTERNS.get(key);
}

/**
 * Count non-overlapping occurrences of a literal string in `code`.
 * Empty (or non-string) literals count as zero: an empty pattern would
 * otherwise "match" at every position and swamp every real signal.
 * @param {string} code
 * @param {string} literal
 * @returns {number}
 */
function countLiteral(code, literal) {
  if (typeof literal !== 'string' || literal.length === 0) return 0;
  const re = compilePattern(escapeRegex(literal), 'g');
  // String.prototype.match with a global regex restarts from index 0, so the
  // cached RegExp object can be shared safely between calls.
  const hits = re ? code.match(re) : null;
  return hits ? hits.length : 0;
}

/**
 * Classify a statement using two-pass scoring.
 *
 * Pass 1 scores identifier/keyword entries: an entry containing `.*` is
 * treated as a regex and adds 3 when it matches; any other entry is a literal
 * and adds 2 per occurrence. Pass 2 scores quoted-string patterns as literals
 * at 3 per occurrence. The module with the highest total wins; on a tie the
 * module encountered first keeps the win.
 *
 * @param {string} code - the complete statement text
 * @param {object} [options]
 * @param {Object<string, string[]>} [options.subcategories=SUBCATEGORIES]
 *   module name -> keyword list (pass 1)
 * @param {Object<string, string[]>} [options.stringPatterns=STRING_PATTERNS]
 *   module name -> string-literal list (pass 2)
 * @param {number} [options.minScore=2] - minimum total score required to
 *   accept a classification. The smallest non-zero score a single hit can
 *   produce is 2, so the default only rejects statements with no hits.
 * @returns {string} hierarchical module name (e.g. 'tools/bash'), or
 *   'uncategorized' when no module reaches `minScore`
 */
function classifyStatement(code, options) {
  const {
    subcategories = SUBCATEGORIES,
    stringPatterns = STRING_PATTERNS,
    minScore = 2,
  } = options || {};

  let bestModule = 'uncategorized';
  let bestScore = 0;

  // A module may appear in only one of the two tables, so visit the union.
  const allModules = new Set([
    ...Object.keys(subcategories),
    ...Object.keys(stringPatterns),
  ]);

  for (const modName of allModules) {
    let score = 0;

    // Pass 1: identifier/keyword matching.
    for (const kw of subcategories[modName] || []) {
      if (typeof kw === 'string' && kw.includes('.*')) {
        const re = compilePattern(kw, '');
        if (re && re.test(code)) score += 3;
      } else {
        score += countLiteral(code, kw) * 2;
      }
    }

    // Pass 2: quoted-string matching (string literals survive minification,
    // so they are weighted more heavily than identifiers).
    for (const pat of stringPatterns[modName] || []) {
      score += countLiteral(code, pat) * 3;
    }

    if (score > bestScore) {
      bestScore = score;
      bestModule = modName;
    }
  }

  return bestScore >= minScore ? bestModule : 'uncategorized';
}
// ── Mega-Statement Sub-Splitter ─────────────────────────────────────────────

/**
 * Sub-split a mega-statement at top-level declaration boundaries.
 *
 * Scans the code once, tracking bracket depth while skipping string literals,
 * template literals (including nested `${ ... }` substitutions), regex
 * literals and comments. Whenever a `;` is seen at depth 0 and the next
 * non-whitespace text starts with `var `, `let `, `const `, `function ` or
 * `class `, the text accumulated so far is emitted as a chunk, provided it is
 * longer than 50 characters (shorter pieces are carried into the next chunk).
 *
 * Regex detection is a look-behind heuristic, not a parser: a `/` is taken as
 * a regex start unless it follows an identifier, number, `)`, `]` or `.`,
 * with an exception for keywords such as `return` or `typeof`.
 *
 * @param {string} code - a very large statement
 * @returns {string[]} sub-chunks; `[code]` unchanged when the input is shorter
 *   than 200 characters or fewer than two chunks were found
 */
function splitMegaStatement(code) {
  const len = code.length;
  if (len < 200) return [code];

  const MIN_CHUNK_LENGTH = 50;
  const DECLARATION_AHEAD = /^(?:var |let |const |function |class )/;
  const WORD_CHAR = /[\w$]/;
  // After these keywords a `/` begins a regex literal, not a division.
  const REGEX_PRECEDING_KEYWORDS = new Set([
    'return', 'typeof', 'instanceof', 'in', 'of', 'new', 'delete', 'void',
    'throw', 'case', 'do', 'else', 'yield', 'await',
  ]);

  const isSpace = (c) => c === ' ' || c === '\n' || c === '\r' || c === '\t';

  /** Index just past the closing quote of the '…' or "…" literal at `from`. */
  function skipQuoted(from, quote) {
    let p = from + 1;
    while (p < len) {
      const c = code[p];
      if (c === '\\') { p += 2; continue; }
      if (c === quote) return p + 1;
      p++;
    }
    return len; // unterminated literal
  }

  /** Index just past the `}` closing a `${` substitution whose body starts at `from`. */
  function skipSubstitution(from) {
    let p = from;
    let open = 1;
    while (p < len && open > 0) {
      const c = code[p];
      if (c === '"' || c === "'") { p = skipQuoted(p, c); continue; }
      if (c === '`') { p = skipTemplate(p); continue; }
      if (c === '{') open++;
      else if (c === '}') open--;
      p++;
    }
    return Math.min(p, len);
  }

  /** Index just past the closing backtick of the template literal at `from`. */
  function skipTemplate(from) {
    let p = from + 1;
    while (p < len) {
      const c = code[p];
      if (c === '\\') { p += 2; continue; }
      if (c === '`') return p + 1;
      if (c === '$' && code[p + 1] === '{') { p = skipSubstitution(p + 2); continue; }
      p++;
    }
    return len; // unterminated template
  }

  /** Heuristic: does the `/` at `pos` begin a regex literal? */
  function startsRegex(pos) {
    let j = pos - 1;
    while (j >= 0 && isSpace(code[j])) j--;
    if (j < 0) return true;
    const prev = code[j];
    if (prev === ')' || prev === ']' || prev === '.') return false;
    if (!WORD_CHAR.test(prev)) return true;
    // Preceded by a word: division, unless the word is a keyword that takes
    // an expression operand (and is not a property name such as `x.return`).
    let k = j;
    while (k >= 0 && WORD_CHAR.test(code[k])) k--;
    if (k >= 0 && code[k] === '.') return false;
    return REGEX_PRECEDING_KEYWORDS.has(code.substring(k + 1, j + 1));
  }

  /**
   * Index just past the regex literal (and its flags) starting at `from`.
   * If no closing `/` is found before a newline or end of input, the slash
   * was not a regex after all, so scanning resumes right after it.
   */
  function skipRegexLiteral(from) {
    let p = from + 1;
    let inClass = false; // inside [...] an unescaped `/` does not terminate
    while (p < len) {
      const c = code[p];
      if (c === '\n') break;
      if (c === '\\') { p += 2; continue; }
      if (c === '[') inClass = true;
      else if (c === ']') inClass = false;
      else if (c === '/' && !inClass) {
        p++;
        while (p < len && /[a-z]/i.test(code[p])) p++; // flags
        return p;
      }
      p++;
    }
    return from + 1;
  }

  const chunks = [];
  let depth = 0;
  let chunkStart = 0;
  let i = 0;

  while (i < len) {
    const ch = code[i];

    if (ch === '"' || ch === "'") { i = skipQuoted(i, ch); continue; }
    if (ch === '`') { i = skipTemplate(i); continue; }

    if (ch === '/') {
      const next = code[i + 1];
      if (next === '/') {
        const eol = code.indexOf('\n', i + 2);
        i = eol === -1 ? len : eol + 1;
        continue;
      }
      if (next === '*') {
        const end = code.indexOf('*/', i + 2);
        i = end === -1 ? len : end + 2;
        continue;
      }
      if (startsRegex(i)) { i = skipRegexLiteral(i); continue; }
    }

    if (ch === '{' || ch === '(' || ch === '[') {
      depth++;
    } else if (ch === '}' || ch === ')' || ch === ']') {
      depth = Math.max(0, depth - 1);
    } else if (ch === ';' && depth === 0 && i + 5 < len) {
      // Top-level `;`: split only if a new declaration follows.
      let j = i + 1;
      while (j < len && isSpace(code[j])) j++;
      if (DECLARATION_AHEAD.test(code.substring(j, j + 10))) {
        const chunk = code.substring(chunkStart, i + 1).trim();
        if (chunk.length > MIN_CHUNK_LENGTH) {
          chunks.push(chunk);
          chunkStart = i + 1;
        }
      }
    }

    i++;
  }

  // Whatever follows the last split point: its own chunk if large enough,
  // otherwise glued onto the previous chunk so no text is lost.
  const rest = code.substring(chunkStart).trim();
  if (rest.length > MIN_CHUNK_LENGTH) {
    chunks.push(rest);
  } else if (chunks.length > 0 && rest.length > 0) {
    chunks[chunks.length - 1] += rest;
  }

  return chunks.length >= 2 ? chunks : [code];
}
+ const MEGA_THRESHOLD = 100 * 1024; // 100 KB + const expanded = []; + for (const stmt of statements) { + if (stmt.code.length > MEGA_THRESHOLD) { + const subs = splitMegaStatement(stmt.code); + if (subs.length > 1) { + for (const sub of subs) { + expanded.push({ code: sub, start: stmt.start, end: stmt.end }); + } + } else { + expanded.push(stmt); + } + } else { + expanded.push(stmt); + } + } + statements = expanded; // Step 2: Classify each complete statement const classified = {}; // moduleName -> string[] @@ -595,7 +346,7 @@ function splitModules(source, options = {}) { const modules = []; for (const [name, fragments] of Object.entries(classified)) { - const content = fragments.join(';\n\n'); + const content = fragments.join('\n\n'); const confidence = Math.min(1, fragments.length / Math.max(1, totalStatements / 10)); if (confidence >= minConfidence || minConfidence === 0) { @@ -604,6 +355,7 @@ function splitModules(source, options = {}) { content, fragments: fragments.length, confidence: parseFloat(confidence.toFixed(3)), + _fromFragments: true, // mark as built from parsed fragments }); } else { // Below confidence threshold: merge into uncategorized @@ -624,22 +376,36 @@ function splitModules(source, options = {}) { } } - // Step 5: Validate each module is parseable; move invalid ones to uncategorized + // Step 5: Validate each module is parseable; move invalid ones to uncategorized. + // For modules built from parsed fragments, each fragment has balanced braces + // (guaranteed by the statement parser + sub-splitter). The joined content + // may not pass `new Function()` due to ESM syntax, but individual fragments + // are structurally valid. We validate using hasBraceBalance for efficiency. 
const validModules = []; for (const mod of modules) { - if (isSyntacticallyValid(mod.content)) { + if (mod._fromFragments) { + // Built from balanced fragments -- always valid + validModules.push(mod); + } else if (isSyntacticallyValid(mod.content)) { + validModules.push(mod); + } else if (hasBraceBalance(mod.content)) { + // Brace-balanced but new Function() can't parse (ESM, etc.) -- accept validModules.push(mod); } else { - // Module is invalid -- move its content to uncategorized + // Truly invalid -- move to uncategorized unclassifiedList.push(mod.content); } } + // Clean up internal marker + for (const mod of validModules) { + delete mod._fromFragments; + } // Step 6: Always include uncategorized for 100% coverage if (unclassifiedList.length > 0) { validModules.push({ name: 'uncategorized', - content: unclassifiedList.join(';\n\n'), + content: unclassifiedList.join('\n\n'), fragments: unclassifiedList.length, confidence: 0.1, }); @@ -717,140 +483,6 @@ function extractSimplePatterns(source) { return results; } -// ── Module Tree Builder ───────────────────────────────────────────────────── - -/** - * Build a hierarchical module tree from co-reference density. - * - * 1. Build adjacency matrix from shared string references between modules. - * 2. Agglomerative clustering by edge density. - * 3. Name clusters from dominant discriminative strings. - * - * @param {Array<{name: string, content: string, fragments: number, confidence: number}>} modules - * @param {string} source - * @returns {{name: string, path: string, modules: Array, children: Array, depth: number}} - */ -function buildModuleTree(modules, source) { - if (modules.length <= 1) { - return { - name: 'src', - path: 'src', - modules, - children: [], - depth: 0, - }; - } - - // Extract string tokens from each module's content. 
- const moduleTokens = modules.map((m) => { - const tokens = new Set(); - const re = /["']([a-zA-Z_]\w{2,30})["']/g; - let match; - while ((match = re.exec(m.content)) !== null) { - tokens.add(match[1]); - } - return tokens; - }); - - // Build adjacency: weight = number of shared tokens. - const weights = new Map(); - for (let i = 0; i < modules.length; i++) { - for (let j = i + 1; j < modules.length; j++) { - let shared = 0; - for (const tok of moduleTokens[i]) { - if (moduleTokens[j].has(tok)) shared++; - } - if (shared > 0) { - weights.set(`${i}:${j}`, shared); - } - } - } - - // Agglomerative clustering. - let clusters = modules.map((_, i) => [i]); - - while (clusters.length > 3) { - let bestI = 0, bestJ = 1, bestW = -1; - for (let i = 0; i < clusters.length; i++) { - for (let j = i + 1; j < clusters.length; j++) { - const w = clusterWeight(clusters[i], clusters[j], weights); - const norm = w / (clusters[i].length + clusters[j].length); - if (norm > bestW) { - bestW = norm; - bestI = i; - bestJ = j; - } - } - } - if (bestW <= 0) break; - const merged = [...clusters[bestI], ...clusters[bestJ]]; - clusters.splice(bestJ, 1); - clusters.splice(bestI, 1); - clusters.push(merged); - } - - // Name each cluster from discriminative tokens. - const children = clusters.map((group) => { - const groupModules = group.map((i) => modules[i]); - const name = inferGroupName(group, moduleTokens, modules); - return { - name, - path: `src/${name}`, - modules: groupModules, - children: [], - depth: 1, - }; - }); - - return { - name: 'src', - path: 'src', - modules: [], - children, - depth: 0, - }; -} - -/** Compute total shared-token weight between two clusters. */ -function clusterWeight(a, b, weights) { - let total = 0; - for (const ai of a) { - for (const bi of b) { - const key = ai < bi ? `${ai}:${bi}` : `${bi}:${ai}`; - total += weights.get(key) || 0; - } - } - return total; -} - -/** Infer a group name from discriminative tokens. 
/**
 * module-tree.js - Hierarchical module tree builder.
 *
 * Builds a tree from co-reference density between modules using
 * agglomerative clustering and discriminative token naming.
 */

'use strict';

/**
 * Collect the quoted identifier-like tokens (3-31 chars) of one module.
 * @param {{content: string}} mod
 * @returns {Set<string>}
 */
function extractStringTokens(mod) {
  const tokens = new Set();
  const re = /["']([a-zA-Z_]\w{2,30})["']/g;
  let match;
  while ((match = re.exec(mod.content)) !== null) {
    tokens.add(match[1]);
  }
  return tokens;
}

/**
 * Count, for every token, how many modules contain it.
 * @param {Array<Set<string>>} moduleTokens
 * @returns {Map<string, number>}
 */
function countTokenFrequency(moduleTokens) {
  const freq = new Map();
  for (const tokens of moduleTokens) {
    for (const tok of tokens) {
      freq.set(tok, (freq.get(tok) || 0) + 1);
    }
  }
  return freq;
}

/**
 * Build a hierarchical module tree from co-reference density.
 *
 * 1. Extract quoted string tokens from each module.
 * 2. Weight every module pair by the number of tokens they share.
 * 3. Merge the pair of clusters with the highest size-normalised weight
 *    until at most 3 clusters remain or no remaining pair shares a token.
 * 4. Name each cluster after its most discriminative token; names are made
 *    unique so that no two children share a `path`.
 *
 * @param {Array<{name: string, content: string, fragments: number, confidence: number}>} modules
 * @param {string} source - currently unused; kept for interface compatibility
 * @returns {{name: string, path: string, modules: Array, children: Array, depth: number}}
 */
function buildModuleTree(modules, source) {
  if (modules.length <= 1) {
    return {
      name: 'src',
      path: 'src',
      modules,
      children: [],
      depth: 0,
    };
  }

  const moduleTokens = modules.map(extractStringTokens);

  // Adjacency: weight = number of shared tokens, keyed "i:j" with i < j.
  const weights = new Map();
  for (let i = 0; i < modules.length; i++) {
    for (let j = i + 1; j < modules.length; j++) {
      let shared = 0;
      for (const tok of moduleTokens[i]) {
        if (moduleTokens[j].has(tok)) shared++;
      }
      if (shared > 0) {
        weights.set(`${i}:${j}`, shared);
      }
    }
  }

  // Agglomerative clustering; every module starts in its own cluster.
  const clusters = modules.map((_, i) => [i]);

  while (clusters.length > 3) {
    let bestI = 0;
    let bestJ = 1;
    let bestW = -1;
    for (let i = 0; i < clusters.length; i++) {
      for (let j = i + 1; j < clusters.length; j++) {
        const w = clusterWeight(clusters[i], clusters[j], weights);
        // Normalise by combined size so large clusters do not absorb everything.
        const norm = w / (clusters[i].length + clusters[j].length);
        if (norm > bestW) {
          bestW = norm;
          bestI = i;
          bestJ = j;
        }
      }
    }
    if (bestW <= 0) break; // nothing left that shares a token
    const merged = [...clusters[bestI], ...clusters[bestJ]];
    // Remove the higher index first so the lower one stays valid.
    clusters.splice(bestJ, 1);
    clusters.splice(bestI, 1);
    clusters.push(merged);
  }

  // Computed once and shared by every cluster's naming pass.
  const globalFreq = countTokenFrequency(moduleTokens);
  const takenNames = new Set();

  const children = clusters.map((group) => {
    const base = inferGroupName(group, moduleTokens, modules, globalFreq);
    // Two clusters can infer the same token (or tokens that normalise to the
    // same string); suffix later ones so every child path is distinct.
    let name = base;
    let suffix = 2;
    while (takenNames.has(name)) {
      name = `${base}_${suffix}`;
      suffix++;
    }
    takenNames.add(name);

    return {
      name,
      path: `src/${name}`,
      modules: group.map((i) => modules[i]),
      children: [],
      depth: 1,
    };
  });

  return {
    name: 'src',
    path: 'src',
    modules: [],
    children,
    depth: 0,
  };
}

/**
 * Compute total shared-token weight between two clusters.
 * @param {number[]} a - module indices of the first cluster
 * @param {number[]} b - module indices of the second cluster
 * @param {Map<string, number>} weights - pair weights keyed "lo:hi"
 * @returns {number}
 */
function clusterWeight(a, b, weights) {
  let total = 0;
  for (const ai of a) {
    for (const bi of b) {
      const key = ai < bi ? `${ai}:${bi}` : `${bi}:${ai}`;
      total += weights.get(key) || 0;
    }
  }
  return total;
}

/**
 * Infer a group name from discriminative tokens.
 *
 * Each token in the group is scored by (in-group count / (global count + 1))
 * times log(in-group count + 1); the highest-scoring token, lower-cased and
 * with characters outside [a-z0-9_-] replaced by `_`, becomes the name. Falls
 * back to the first module's own name, then to 'group'.
 *
 * @param {number[]} group - module indices in this cluster
 * @param {Array<Set<string>>} moduleTokens - token set per module
 * @param {Array<{name: string}>} modules
 * @param {Map<string, number>} [globalFreq] - per-token module counts;
 *   computed from `moduleTokens` when omitted
 * @returns {string}
 */
function inferGroupName(group, moduleTokens, modules, globalFreq = countTokenFrequency(moduleTokens)) {
  const freq = new Map();
  for (const i of group) {
    for (const tok of moduleTokens[i]) {
      freq.set(tok, (freq.get(tok) || 0) + 1);
    }
  }

  let best = null;
  let bestScore = -1;
  for (const [tok, count] of freq) {
    const global = globalFreq.get(tok) || 0;
    const score = (count / (global + 1)) * Math.log(count + 1);
    if (score > bestScore && tok.length >= 3) {
      bestScore = score;
      best = tok;
    }
  }

  if (best) return best.toLowerCase().replace(/[^a-z0-9_-]/g, '_');
  if (group.length > 0) return modules[group[0]].name;
  return 'group';
}

// Guarded so the file can also be evaluated where `module` is not defined.
if (typeof module !== 'undefined') {
  module.exports = { buildModuleTree };
}
Never splits a statement + * across modules -- a statement is atomic. + */ + +'use strict'; + +/** + * Parse source into top-level statements by tracking brace/paren/bracket depth. + * + * A "top-level statement" ends when: + * - We encounter a semicolon at depth 0, OR + * - We encounter a closing brace that brings depth to 0 AND the next + * non-whitespace token does not continue the expression (like `=`, `.`, + * `,`, `(`, etc.) -- this avoids splitting `var { x } = obj;` or + * `obj.method()` into two statements. + * + * String literals, template literals, regex literals, and comments are + * tracked so delimiters inside them are not counted. + * + * @param {string} source + * @returns {Array<{code: string, start: number, end: number}>} + */ +function parseTopLevelStatements(source) { + const statements = []; + let depth = 0; + let start = 0; + let i = 0; + const len = source.length; + + while (i < len) { + const ch = source[i]; + const next = i + 1 < len ? source[i + 1] : ''; + + // ── Skip single-line comments ── + if (ch === '/' && next === '/') { + const eol = source.indexOf('\n', i + 2); + i = eol === -1 ? len : eol + 1; + continue; + } + + // ── Skip multi-line comments ── + if (ch === '/' && next === '*') { + const end = source.indexOf('*/', i + 2); + i = end === -1 ? 
len : end + 2; + continue; + } + + // ── Skip string literals ── + if (ch === '"' || ch === "'") { + i = skipString(source, i, ch); + continue; + } + + // ── Skip template literals ── + if (ch === '`') { + i = skipTemplateLiteral(source, i); + continue; + } + + // ── Skip regex literals ── + if (ch === '/' && isRegexStart(source, i)) { + i = skipRegex(source, i); + continue; + } + + // ── Track depth ── + if (ch === '{' || ch === '(' || ch === '[') { + depth++; + i++; + continue; + } + + if (ch === '}' || ch === ')' || ch === ']') { + depth = Math.max(0, depth - 1); + + // Closing brace at depth 0 MAY be a statement boundary + if (depth === 0 && ch === '}') { + if (!isStatementBoundaryAfterBrace(source, i + 1)) { + i++; + continue; + } + + const code = source.substring(start, i + 1).trim(); + if (code.length > 0) { + statements.push({ code, start, end: i + 1 }); + } + start = i + 1; + i++; + continue; + } + + i++; + continue; + } + + // ── Semicolon at depth 0 is a statement boundary ── + if (ch === ';' && depth === 0) { + const code = source.substring(start, i + 1).trim(); + if (code.length > 0) { + statements.push({ code, start, end: i + 1 }); + } + start = i + 1; + i++; + continue; + } + + i++; + } + + // Remaining code (unterminated statement) + const remaining = source.substring(start).trim(); + if (remaining.length > 0) { + statements.push({ code: remaining, start, end: len }); + } + + return statements; +} + +/** + * After a `}` at depth 0, decide whether this is truly a statement boundary. + * Returns true if it IS a boundary (next token starts a new statement). + * Returns false if the expression continues (e.g. `}.method()`, `} = obj`, etc.) 
+ * + * @param {string} source + * @param {number} afterPos - position right after the `}` + * @returns {boolean} + */ +function isStatementBoundaryAfterBrace(source, afterPos) { + const len = source.length; + let j = afterPos; + + // Skip whitespace and comments to find the next meaningful token + while (j < len) { + const c = source[j]; + + if (c === ' ' || c === '\t' || c === '\r' || c === '\n') { + j++; + continue; + } + + if (c === '/' && j + 1 < len && source[j + 1] === '/') { + const eol = source.indexOf('\n', j + 2); + j = eol === -1 ? len : eol + 1; + continue; + } + + if (c === '/' && j + 1 < len && source[j + 1] === '*') { + const end = source.indexOf('*/', j + 2); + j = end === -1 ? len : end + 2; + continue; + } + + break; + } + + if (j >= len) return true; + + const nextChar = source[j]; + + // These tokens CONTINUE the expression -- NOT a statement boundary + const continuationChars = '.=,([?:&|+\\-*/%<>^~!;)'; + if (continuationChars.includes(nextChar)) { + return false; + } + + // Check for multi-char continuation tokens + const ahead = source.substring(j, j + 15); + if (/^(?:instanceof|in|of|from)\s/.test(ahead)) return false; + if (/^as\s/.test(ahead)) return false; + + return true; +} + +/** + * Skip a string literal starting at position i (where source[i] is the quote). + * @param {string} source + * @param {number} i + * @param {string} quote - the quote character + * @returns {number} + */ +function skipString(source, i, quote) { + const len = source.length; + i++; + while (i < len) { + if (source[i] === '\\') { i += 2; continue; } + if (source[i] === quote) return i + 1; + i++; + } + return len; +} + +/** + * Skip a template literal starting at position i (where source[i] is backtick). 
+ * @param {string} source + * @param {number} i + * @returns {number} + */ +function skipTemplateLiteral(source, i) { + const len = source.length; + i++; + while (i < len) { + if (source[i] === '\\') { i += 2; continue; } + if (source[i] === '`') return i + 1; + if (source[i] === '$' && i + 1 < len && source[i + 1] === '{') { + i = skipTemplateExpression(source, i + 2); + continue; + } + i++; + } + return len; +} + +/** + * Skip a template expression (inside ${...}) starting after the opening ${. + * @param {string} source + * @param {number} i + * @returns {number} + */ +function skipTemplateExpression(source, i) { + const len = source.length; + let exprDepth = 1; + while (i < len && exprDepth > 0) { + const ch = source[i]; + if (ch === '\\') { i += 2; continue; } + if (ch === '{') { exprDepth++; i++; continue; } + if (ch === '}') { exprDepth--; i++; continue; } + if (ch === '`') { i = skipTemplateLiteral(source, i); continue; } + if (ch === '"' || ch === "'") { i = skipString(source, i, ch); continue; } + i++; + } + return i; +} + +/** + * Heuristic: is source[i] the start of a regex literal? + * @param {string} source + * @param {number} i + * @returns {boolean} + */ +function isRegexStart(source, i) { + let j = i - 1; + while (j >= 0 && (source[j] === ' ' || source[j] === '\t' || source[j] === '\n' || source[j] === '\r')) { + j--; + } + if (j < 0) return true; + const prev = source[j]; + if (/[\w$)\].]/.test(prev)) return false; + return true; +} + +/** + * Skip a regex literal starting at position i. 
+ * @param {string} source + * @param {number} i + * @returns {number} + */ +function skipRegex(source, i) { + const len = source.length; + i++; + while (i < len) { + if (source[i] === '\\') { i += 2; continue; } + if (source[i] === '[') { + i++; + while (i < len && source[i] !== ']') { + if (source[i] === '\\') { i += 2; continue; } + i++; + } + i++; + continue; + } + if (source[i] === '/') { + i++; + while (i < len && /[gimsuy]/.test(source[i])) i++; + return i; + } + i++; + } + return len; +} + +module.exports = { parseTopLevelStatements }; diff --git a/npm/packages/ruvector/src/decompiler/subcategories.js b/npm/packages/ruvector/src/decompiler/subcategories.js new file mode 100644 index 00000000..99cfb177 --- /dev/null +++ b/npm/packages/ruvector/src/decompiler/subcategories.js @@ -0,0 +1,339 @@ +/** + * subcategories.js - Fine-grained module classification keywords. + * + * Each key is a hierarchical module path (e.g. 'tools/bash'). + * Keywords can be plain strings (exact match) or contain '.*' for regex. + * Used by module-splitter.js to classify statements into ~30-40 modules + * instead of the original ~9 broad categories. 
/**
 * Keyword tables used to classify statements: SUBCATEGORIES (identifier
 * keywords), STRING_PATTERNS (quoted literals) and the legacy MODULE_KEYWORDS.
 */

'use strict';

// ── Fine-grained module classification ─────────────────────────────────────
// Key: hierarchical module path. Value: keywords for that module. A keyword
// containing '.*' is meant to be treated as a regex, anything else as a
// plain string.
const SUBCATEGORIES = {
  // ── tools/* ────────────────────────────────────────────────────────────
  'tools/bash': [
    'BashTool', 'child_process', 'execSync', 'spawnSync', 'spawn(',
    'shell.*command', 'shellArgs', 'commandLine', 'bashCommand',
    'killProcess', 'processExit', 'childProcess',
  ],
  'tools/read': [
    'FileReadTool', 'ReadTool', 'readFile', 'readFileSync',
    'FileRead', 'fileContents', 'readContent',
  ],
  'tools/edit': [
    'FileEditTool', 'EditTool', 'old_string', 'new_string',
    'applyEdit', 'textEdit', 'replaceInFile', 'editContent',
  ],
  'tools/write': [
    'FileWriteTool', 'WriteTool', 'writeFile', 'writeFileSync',
    'createFile', 'FileWrite', 'writeContent',
  ],
  'tools/glob': [
    'GlobTool', 'glob(', 'globSync', 'minimatch', 'picomatch',
    'ListFilesTool', 'filePattern', 'globPattern',
  ],
  'tools/grep': [
    'GrepTool', 'ripgrep', 'SearchTool', 'searchPattern',
    'contentSearch', 'grepResult', 'matchLine',
  ],
  'tools/agent': [
    'AgentTool', 'AgentOutputTool', 'subagent', 'spawnAgent',
    'agentTask', 'taskResult', 'delegateTask',
  ],
  'tools/web-fetch': [
    'WebFetch', 'httpGet', 'fetchUrl', 'urlFetch',
    'webRequest', 'httpRequest',
  ],
  'tools/web-search': [
    'WebSearch', 'searchResults', 'webQuery',
    'searchEngine', 'searchWeb',
  ],
  'tools/notebook': [
    'NotebookEdit', 'notebook', 'jupyter', 'ipynb',
    'cellOutput', 'notebookCell',
  ],
  'tools/mcp-dispatch': [
    'ToolUse', 'ToolResult',
    'toolDefinition', 'toolSchema', 'inputSchema',
    'toolChoice', 'toolRunner', 'dispatchTool',
  ],
  'tools/todo': [
    'TodoWrite', 'TodoRead', 'todoList', 'todoItem',
  ],

  // ── core/* ─────────────────────────────────────────────────────────────
  'core/agent-loop': [
    'agentLoop', 'mainLoop', 'querySource', 'toolUseContext',
    'systemPrompt', 'conversationTurn', 'assistantMessage',
    'userMessage', 'messageHistory', 'handleToolUse',
    'processMessage', 'runLoop', 'loopIteration',
  ],
  'core/streaming': [
    'content_block_delta', 'message_start', 'message_stop',
    'message_delta', 'content_block_start', 'content_block_stop',
    'text_delta', 'input_json_delta', 'StreamEvent',
    'onStream', 'streamHandler', 'stream_event',
    'streamResponse', 'streamingMode',
  ],
  'core/context-manager': [
    'tengu_compact', 'microcompact', 'auto_compact',
    'compact_boundary', 'preCompactTokenCount',
    'postCompactTokenCount', 'compaction',
    'tokenCount', 'contextWindow', 'maxTokens',
    'promptCache', 'cacheControl', 'truncat',
    'contextOverflow', 'compactMessages',
  ],
  'core/session': [
    'sessionId', 'conversationId', 'sessionState',
    'persistSession', 'checkpoint', 'resume.*session',
    'restore.*session', 'turnCount', 'sessionHistory',
    'saveSession', 'loadSession',
  ],
  'core/error-handler': [
    'ErrorHandler', 'errorBoundary', 'handleError',
    'retryWith', 'isRetryable', 'overloaded',
    'rateLimited', 'backoff', 'retryAfter',
    'APIError', 'NetworkError',
  ],

  // ── permissions/* ──────────────────────────────────────────────────────
  'permissions/checker': [
    'canUseTool', 'Permission', 'permission',
    'allowedTools', 'permissionMode', 'isAllowed',
    'checkPermission', 'grantPermission', 'allowList',
    'denyList', 'alwaysAllowRules', 'denyWrite',
    'permissionCheck', 'allowRule', 'denyRule',
  ],
  'permissions/sandbox': [
    'sandbox', 'bubblewrap', 'seatbelt', 'firejail',
    'containerize', 'isolat', 'sandboxMode',
    'seccomp', 'landlock', 'pledg',
  ],
  'permissions/rules': [
    'permissionRule', 'ruleSet', 'matchRule',
    'pathRule', 'toolRule', 'readOnlyRule',
    'globRule', 'regexRule',
  ],

  // ── auth/* ─────────────────────────────────────────────────────────────
  'auth/oauth': [
    'OAuth', 'PKCE', 'authorization_code', 'token.*endpoint',
    'refresh.*token', 'authorizationUrl', 'codeVerifier',
    'codeChallenge', 'oauthFlow', 'oauthCallback',
  ],
  'auth/api-key': [
    'x-api-key', 'ANTHROPIC_API_KEY', 'apiKeyHelper',
    'apiKey.*valid', 'loadApiKey',
    'keyring',
  ],
  'auth/bedrock': [
    'Bedrock', 'BedrockRuntime', 'aws.*region',
    'awsProfile', 'sigv4', 'awsCredentials',
  ],
  'auth/vertex': [
    'Vertex', 'vertex.*ai', 'google.*cloud',
    'googleAuth', 'serviceAccount', 'vertexProject',
  ],

  // ── mcp/* ──────────────────────────────────────────────────────────────
  'mcp/client': [
    'McpClient', 'mcp.*connect', 'mcp.*initialize',
    'mcpConnection', 'mcp_client', 'connectMcp',
  ],
  'mcp/transport': [
    'StdioTransport', 'SseTransport', 'StreamableHttp',
    'McpTransport', 'transport.*type', 'transportLayer',
    'stdio.*transport', 'websocket.*transport',
  ],
  'mcp/protocol': [
    'jsonrpc', 'tools/list', 'tools/call',
    'resources/list', 'prompts/list', 'McpError',
    'mcp__', 'McpServer', 'mcp_server',
    'callTool', 'listTools',
  ],
  'mcp/servers': [
    'mcpServers', 'serverConfig', 'serverList',
    'registeredServers', 'spawnServer', 'serverProcess',
  ],

  // ── config/* ───────────────────────────────────────────────────────────
  'config/settings': [
    'settings.*json', 'loadSettings', 'saveSettings',
    'userSettings', 'Settings', 'configuration',
    'loadConfig', 'parseConfig',
  ],
  'config/env-vars': [
    'CLAUDE_CODE_', 'ANTHROPIC_',
    'envVar', 'dotenv', 'loadEnv',
  ],
  'config/models': [
    'modelId', 'modelName', 'model.*select',
    'mainLoopModel', 'availableModels', 'modelOverrides',
    'modelPreference', 'defaultModel',
  ],
  'config/feature-flags': [
    'featureFlag', 'isEnabled', 'flagValue',
    'experimentId', 'feature.*gate', 'rollout',
    'featureEnabled', 'featureConfig',
  ],

  // ── telemetry/* ────────────────────────────────────────────────────────
  'telemetry/otel': [
    'opentelemetry', 'OTEL_', 'TraceProvider',
    'SpanProcessor', 'tracing', 'span',
    'tracer', 'otelExporter',
  ],
  'telemetry/datadog': [
    'datadog', 'DD_', 'ddTrace', 'datadogExporter',
  ],
  'telemetry/events': [
    'tengu_', 'trackEvent', 'analytics',
    'Telemetry', 'sentry',
    'eventEmit', 'emitEvent', 'telemetryEvent',
  ],
  'telemetry/cost': [
    'cost', 'tokenUsage', 'inputTokens', 'outputTokens',
    'cacheRead', 'cacheCreation', 'pricing',
    'costTracker', 'usageMetrics',
  ],
  'telemetry/perfetto': [
    'perfetto', 'perfTrace', 'traceBegin',
    'traceEnd', 'traceCounter',
  ],

  // ── ui/* ───────────────────────────────────────────────────────────────
  'ui/slash-commands': [
    'slashCommand', 'registerCommand', 'commandHandler',
    'parseCommand', '/help', '/clear', '/compact',
    '/bug', '/init', '/login', '/logout',
    '/doctor', '/config', '/cost', '/memory',
  ],
  'ui/ink-components': [
    'useInput', 'useFocus', 'useApp', 'useStdin', 'useStdout',
    'inkRenderer', 'InkProvider', 'measureElement',
  ],
  'ui/keybindings': [
    'keybinding', 'keyHandler', 'hotkey',
    'onKeyPress', 'keyMap', 'shortcut',
  ],
  'ui/terminal': [
    'ansiColor', 'chalk', 'stripAnsi',
    'cursorMove', 'clearLine', 'terminalWidth',
    'isTerminal', 'ttyColumns',
  ],

  // ── model-provider/* ───────────────────────────────────────────────────
  'model-provider/anthropic': [
    'anthropic', 'Anthropic', 'claude-', 'claude_',
    'messagesCreate', 'AnthropicClient',
  ],
  'model-provider/openai': [
    'openai', 'OpenAI', 'chatCompletion',
    'gpt-', 'openAiClient',
  ],
  'model-provider/router': [
    'provider', 'routeModel', 'selectProvider',
    'providerConfig', 'modelRouter',
  ],

  // ── git/* ──────────────────────────────────────────────────────────────
  'git/operations': [
    'gitDiff', 'gitStatus', 'gitLog', 'gitCommit',
    'gitAdd', 'gitBranch', 'gitCheckout',
    'isGitRepo', 'getGitRoot', 'gitStash',
  ],

  // ── filesystem/* ───────────────────────────────────────────────────────
  'filesystem/operations': [
    'readdirSync', 'mkdirSync', 'statSync', 'lstatSync',
    'renameSync', 'unlinkSync', 'copyFileSync',
    'existsSync', 'realpathSync', 'accessSync',
    'fs.readdir', 'fs.mkdir', 'fs.stat', 'fs.lstat',
  ],

  // ── network/* ──────────────────────────────────────────────────────────
  'network/http': [
    'http.*request', 'https.*request', 'fetch(',
    'axios', 'got(', 'requestOptions',
    'responseBody', 'statusCode',
  ],
};

// ── String-literal patterns for minified code ─────────────────────────────
// Minified bundles mangle identifiers but preserve string literals.
// These patterns match quoted strings commonly found in each domain.
// Each pattern is matched against the raw code (not just identifiers).
// Every key here is also a key of SUBCATEGORIES; not every subcategory has
// string patterns.
const STRING_PATTERNS = {
  'tools/bash': ['"bash"', '"shell"', '"command"', '"child_process"', '"spawn"', '"BashTool"'],
  'tools/read': ['"FileReadTool"', '"ReadFileTool"', '"cat "', '"readFile"'],
  'tools/edit': ['"FileEditTool"', '"old_string"', '"new_string"', '"EditFileTool"'],
  'tools/write': ['"FileWriteTool"', '"WriteFileTool"', '"createFile"'],
  'tools/glob': ['"GlobTool"', '"ListFilesTool"', '"glob"', '"minimatch"'],
  'tools/grep': ['"GrepTool"', '"ripgrep"', '"rg "', '"SearchTool"'],
  'tools/agent': ['"AgentTool"', '"Task"', '"subagent"'],
  'tools/web-fetch': ['"WebFetchTool"', '"url_fetch"'],
  'tools/web-search': ['"WebSearchTool"', '"web_search"'],
  'tools/notebook': ['"NotebookEditTool"', '"ipynb"', '"jupyter"'],
  'tools/mcp-dispatch': ['"inputSchema"', '"toolSchema"', '"toolDefinition"'],
  'tools/todo': ['"TodoWriteTool"', '"TodoReadTool"'],
  'core/agent-loop': ['"assistant"', '"user"', '"system"', '"systemPrompt"', '"messageHistory"'],
  'core/streaming': [
    '"content_block_delta"', '"message_start"', '"message_stop"',
    '"message_delta"', '"content_block_start"', '"content_block_stop"',
    '"text_delta"', '"input_json_delta"', '"stream_event"',
  ],
  'core/context-manager': [
    '"tengu_compact"', '"auto_compact"', '"compact"',
    '"contextWindow"', '"maxTokens"', '"cacheControl"',
  ],
  'core/session': ['"sessionId"', '"conversationId"', '"checkpoint"', '"resume"'],
  'core/error-handler': ['"overloaded"', '"rate_limit"', '"retryAfter"', '"APIError"'],
  'permissions/checker': [
    '"canUseTool"', '"permission"', '"allowedTools"',
    '"permissionMode"', '"alwaysAllow"',
  ],
  'permissions/sandbox': ['"sandbox"', '"bubblewrap"', '"seatbelt"', '"firejail"'],
  'auth/oauth': ['"OAuth"', '"PKCE"', '"authorization_code"', '"refresh_token"', '"code_verifier"'],
  'auth/api-key': ['"x-api-key"', '"ANTHROPIC_API_KEY"', '"apiKeyHelper"'],
  'auth/bedrock': ['"bedrock"', '"BedrockRuntime"', '"aws-region"'],
  'auth/vertex': ['"vertex"', '"vertexai"', '"google-cloud"'],
  'mcp/client': ['"McpClient"', '"mcp_client"'],
  'mcp/transport': ['"stdio"', '"sse"', '"streamable-http"', '"StdioTransport"'],
  'mcp/protocol': ['"jsonrpc"', '"tools/list"', '"tools/call"', '"resources/list"', '"mcp__"'],
  'mcp/servers': ['"mcpServers"', '"serverConfig"'],
  'config/settings': ['"settings.json"', '"userSettings"', '".claude"'],
  'config/env-vars': ['"CLAUDE_CODE_"', '"ANTHROPIC_"', '"CLAUDE_CONFIG"', '"CLAUDE_SKIP"'],
  'config/models': ['"modelId"', '"claude-sonnet"', '"claude-opus"', '"claude-haiku"'],
  'config/feature-flags': ['"featureFlag"', '"experiment"', '"rollout"'],
  'telemetry/otel': ['"opentelemetry"', '"OTEL_"', '"TraceProvider"'],
  'telemetry/datadog': ['"datadog"', '"DD_TRACE"'],
  'telemetry/events': ['"tengu_"', '"trackEvent"', '"analytics"', '"telemetryEvent"'],
  'telemetry/cost': ['"inputTokens"', '"outputTokens"', '"cacheRead"', '"cacheCreation"'],
  'ui/slash-commands': ['"/help"', '"/clear"', '"/compact"', '"/bug"', '"/init"', '"/doctor"'],
  'ui/ink-components': ['"useInput"', '"useFocus"', '"useApp"', '"inkRenderer"'],
  'ui/keybindings': ['"keybinding"', '"shortcut"', '"hotkey"'],
  'ui/terminal': ['"chalk"', '"stripAnsi"', '"ansiColor"'],
  'model-provider/anthropic': ['"anthropic"', '"claude-"', '"Anthropic"', '"messages"'],
  'model-provider/openai': ['"openai"', '"gpt-"', '"chatCompletion"'],
  'git/operations': ['"git diff"', '"git status"', '"git log"', '"git commit"'],
  'network/http': ['"Content-Type"', '"application/json"', '"Authorization"'],
};

/**
 * Build one legacy keyword list from several subcategories.
 * Returns a fresh, de-duplicated array (first occurrence wins) so that a
 * consumer mutating a legacy list cannot corrupt SUBCATEGORIES.
 *
 * @param {...string} paths - keys of SUBCATEGORIES, in priority order
 * @returns {string[]}
 */
function mergeSubcategories(...paths) {
  return [...new Set(paths.flatMap((path) => SUBCATEGORIES[path]))];
}

// ── Legacy MODULE_KEYWORDS alias ───────────────────────────────────────────
// Maps old broad categories for backward compat.
// Each broad category is the union of the subcategories its keywords were
// distributed into; a one-to-one mapping would drop keywords the old category
// matched (e.g. 'BashTool' in 'tool-dispatch', 'McpClient' in 'mcp-client').
// The subcategory that was previously aliased on its own is listed first, so
// its keywords remain the leading elements of the merged list.
const MODULE_KEYWORDS = {
  'tool-dispatch': mergeSubcategories(
    'tools/mcp-dispatch', 'tools/bash', 'tools/read', 'tools/edit',
    'tools/write', 'tools/glob', 'tools/grep', 'tools/agent',
    'tools/web-fetch', 'tools/web-search', 'tools/notebook', 'tools/todo',
  ),
  'permission-system': mergeSubcategories('permissions/checker', 'permissions/sandbox'),
  'mcp-client': mergeSubcategories('mcp/protocol', 'mcp/client', 'mcp/transport'),
  'streaming-handler': mergeSubcategories('core/streaming'),
  'context-manager': mergeSubcategories('core/context-manager'),
  'agent-loop': mergeSubcategories('core/agent-loop'),
  'commands': mergeSubcategories('ui/slash-commands'),
  'telemetry': mergeSubcategories(
    'telemetry/events', 'telemetry/otel', 'telemetry/datadog', 'telemetry/perfetto',
  ),
  'config': mergeSubcategories('config/settings', 'config/env-vars'),
  'session': mergeSubcategories('core/session'),
  'model-provider': mergeSubcategories('model-provider/anthropic'),
};

module.exports = { SUBCATEGORIES, MODULE_KEYWORDS, STRING_PATTERNS };