mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-26 07:44:05 +00:00
feat(decompiler): 47 fine-grained subcategories + statement parser extraction
Extracted into separate modules for clarity:
- subcategories.js: 47 categories (tools/*, core/*, auth/*, mcp/*, etc.)
- statement-parser.js: parseTopLevelStatements() with proper depth tracking
- module-tree.js: agglomerative clustering for folder hierarchy

Note: keyword-based classification captures only ~0.2% of minified code. The Rust Louvain partitioner (1,029 modules from the reference graph) is the correct approach for real decompilation. The Node.js pipeline should shell out to the Rust binary for graph-based splitting.

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
9efd712ce4
commit
6a75673ac9
4 changed files with 955 additions and 555 deletions
|
|
@ -1,92 +1,16 @@
|
|||
/**
|
||||
* module-splitter.js - Split a JavaScript bundle into logical modules.
|
||||
*
|
||||
* Splits at STATEMENT BOUNDARIES so every output module is guaranteed to be
|
||||
* syntactically valid, parseable JavaScript. Never splits a statement across
|
||||
* modules -- a statement is atomic.
|
||||
*
|
||||
* Algorithm:
|
||||
* 1. Parse source into top-level statements by tracking brace/paren/bracket
|
||||
* depth and string context.
|
||||
* 2. Classify each COMPLETE statement into a module by scoring keyword hits.
|
||||
* 3. Group statements by module.
|
||||
* 4. Validate each module is parseable; move invalid modules to uncategorized.
|
||||
* 5. Build hierarchical tree from co-reference density.
|
||||
* Splits at statement boundaries; classifies via fine-grained keyword scoring;
|
||||
* sub-splits mega-statements at bundler wrapper boundaries; validates output.
|
||||
*/
|
||||
|
||||
'use strict';
|
||||
|
||||
// ── Module classification keywords ──────────────────────────────────────────
// Maps a module name to the substrings/identifiers that count as evidence for
// it. A statement is scored against every entry and the best-scoring module
// wins.
const MODULE_KEYWORDS = {
  'tool-dispatch': [
    'BashTool', 'FileReadTool', 'FileEditTool', 'FileWriteTool',
    'AgentOutputTool', 'WebFetch', 'WebSearch', 'TodoWrite',
    'NotebookEdit', 'GlobTool', 'GrepTool', 'ListFilesTool',
    'SearchTool', 'ReadTool', 'EditTool', 'WriteTool',
    'tool_use', 'tool_result', 'ToolUse', 'ToolResult',
    'toolDefinition', 'toolSchema', 'inputSchema',
  ],

  'permission-system': [
    'canUseTool', 'alwaysAllowRules', 'denyWrite',
    'Permission', 'permission', 'allowedTools',
    'permissionMode', 'sandbox', 'allowList', 'denyList',
    'isAllowed', 'checkPermission', 'grantPermission',
  ],

  'mcp-client': [
    'mcp__', 'McpClient', 'McpServer', 'McpError',
    'callTool', 'listTools', 'McpTransport',
    'StdioTransport', 'SseTransport', 'StreamableHttp',
    'mcp_server', 'mcp_client', 'mcpConnection',
  ],

  'streaming-handler': [
    'content_block_delta', 'message_start', 'message_stop',
    'message_delta', 'content_block_start', 'content_block_stop',
    'stream_event', 'text_delta', 'input_json_delta',
    'StreamEvent', 'onStream', 'streamHandler',
  ],

  'context-manager': [
    'tengu_compact', 'microcompact', 'auto_compact',
    'compact_boundary', 'preCompactTokenCount',
    'postCompactTokenCount', 'compaction',
    'tokenCount', 'contextWindow', 'maxTokens',
    'promptCache', 'cacheControl',
  ],

  'agent-loop': [
    'agentLoop', 'mainLoop', 'querySource',
    'toolUseContext', 'systemPrompt',
    'conversationTurn', 'assistantMessage',
    'userMessage', 'messageHistory',
  ],

  'commands': [
    'slashCommand', 'registerCommand', 'commandHandler',
    'parseCommand', '/help', '/clear', '/compact',
    '/bug', '/init', '/login', '/logout',
    '/doctor', '/config', '/cost', '/memory',
  ],

  'telemetry': [
    'telemetry', 'Telemetry', 'opentelemetry', 'otel',
    'datadog', 'perfetto', 'tracing', 'span',
    'metric_', 'counter_', 'histogram_',
    'tengu_', 'sentry',
  ],

  'config': [
    'settings', 'Settings', 'configuration',
    'CLAUDE_', 'environment', 'envVar',
    'dotenv', 'loadConfig', 'parseConfig',
  ],

  'session': [
    'session', 'Session', 'conversationId',
    'checkpoint', 'resume', 'restore',
    'sessionState', 'persistSession',
  ],

  'model-provider': [
    'anthropic', 'Anthropic', 'claude-', 'claude_',
    'bedrock', 'vertex', 'openai', 'provider',
    'apiKey', 'modelId', 'modelName',
  ],
};
|
||||
// ── Extracted modules ──────────────────────────────────────────────────────
|
||||
const { SUBCATEGORIES, MODULE_KEYWORDS, STRING_PATTERNS } = require('./subcategories');
|
||||
const { buildModuleTree } = require('./module-tree');
|
||||
const { parseTopLevelStatements } = require('./statement-parser');
|
||||
|
||||
// Simple regex patterns for extracting declarations.
|
||||
const SIMPLE_PATTERNS = {
|
||||
|
|
@ -97,352 +21,82 @@ const SIMPLE_PATTERNS = {
|
|||
'api-endpoints': /\/v\d+\/[a-z][-a-z/]*/g,
|
||||
};
|
||||
|
||||
// ── Statement Parser ────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Split source text into top-level statements using bracket-depth tracking.
 *
 * A statement is closed when either:
 *   - a `;` is seen while the bracket depth is 0, or
 *   - a `}` returns the depth to 0 and isStatementBoundaryAfterBrace() says
 *     the following token does not continue the expression.
 *
 * Comments, quoted strings, template literals and regex literals are stepped
 * over as a whole so that delimiters inside them never affect the depth.
 * Any trailing text without a terminator is emitted as a final statement.
 *
 * @param {string} source
 * @returns {Array<{code: string, start: number, end: number}>}
 */
function parseTopLevelStatements(source) {
  const total = source.length;
  const found = [];
  let nesting = 0;
  let stmtStart = 0;
  let pos = 0;

  // Emit source[stmtStart, endExclusive) if non-blank, then begin a new statement.
  const flush = (endExclusive) => {
    const text = source.substring(stmtStart, endExclusive).trim();
    if (text.length > 0) {
      found.push({ code: text, start: stmtStart, end: endExclusive });
    }
    stmtStart = endExclusive;
  };

  while (pos < total) {
    const cur = source[pos];

    // '/' may open a line comment, a block comment, or a regex literal.
    if (cur === '/') {
      const peek = pos + 1 < total ? source[pos + 1] : '';
      if (peek === '/') {
        const lineEnd = source.indexOf('\n', pos + 2);
        pos = lineEnd === -1 ? total : lineEnd + 1;
        continue;
      }
      if (peek === '*') {
        const close = source.indexOf('*/', pos + 2);
        pos = close === -1 ? total : close + 2;
        continue;
      }
      if (isRegexStart(source, pos)) {
        pos = skipRegex(source, pos);
        continue;
      }
      // Plain division operator.
      pos++;
      continue;
    }

    if (cur === '"' || cur === "'") {
      pos = skipString(source, pos, cur);
      continue;
    }

    if (cur === '`') {
      pos = skipTemplateLiteral(source, pos);
      continue;
    }

    if (cur === '{' || cur === '(' || cur === '[') {
      nesting++;
      pos++;
      continue;
    }

    if (cur === '}' || cur === ')' || cur === ']') {
      // Clamp at zero so stray closers cannot drive the depth negative.
      nesting = Math.max(0, nesting - 1);

      // Only a closing brace back at depth 0 can end a statement, and only
      // when the next token does not continue the expression.
      if (nesting === 0 && cur === '}' && isStatementBoundaryAfterBrace(source, pos + 1)) {
        flush(pos + 1);
      }
      pos++;
      continue;
    }

    if (cur === ';' && nesting === 0) {
      flush(pos + 1);
    }
    pos++;
  }

  // Whatever is left has no terminator; keep it as a final statement.
  const tail = source.substring(stmtStart).trim();
  if (tail.length > 0) {
    found.push({ code: tail, start: stmtStart, end: total });
  }

  return found;
}
|
||||
|
||||
/**
 * After a `}` at depth 0, decide whether this is truly a statement boundary.
 *
 * Whitespace and comments after the brace are skipped, then the next token is
 * inspected:
 *   - end of source                          -> boundary
 *   - an operator / punctuator that continues the expression -> not a boundary
 *   - `instanceof`, `in`, `of`, `from`, `as` -> not a boundary
 *   - `else`, `catch`, `finally`             -> not a boundary, because the
 *     clause belongs to the preceding `if` / `try` block and would not parse
 *     on its own
 *   - anything else                          -> boundary
 *
 * @param {string} source
 * @param {number} afterPos - position right after the `}`
 * @returns {boolean} true if the next token starts a new statement
 */
function isStatementBoundaryAfterBrace(source, afterPos) {
  const len = source.length;
  let j = afterPos;

  // Skip whitespace and comments to find the next meaningful token
  while (j < len) {
    const c = source[j];

    if (c === ' ' || c === '\t' || c === '\r' || c === '\n') {
      j++;
      continue;
    }

    if (c === '/' && j + 1 < len && source[j + 1] === '/') {
      const eol = source.indexOf('\n', j + 2);
      j = eol === -1 ? len : eol + 1;
      continue;
    }

    if (c === '/' && j + 1 < len && source[j + 1] === '*') {
      const end = source.indexOf('*/', j + 2);
      j = end === -1 ? len : end + 2;
      continue;
    }

    break;
  }

  if (j >= len) return true; // end of source

  const nextChar = source[j];

  // Single characters that CONTINUE the expression. `;` and `)` are included
  // so that wrappers such as `var x = z(() => { ... });` stay in one piece.
  const continuationChars = '.=,([?:&|+\\-*/%<>^~!;)';
  if (continuationChars.includes(nextChar)) {
    return false;
  }

  // Multi-character continuation tokens
  const ahead = source.substring(j, j + 15);

  // `instanceof`, `in` (but not `if`), `of`, `from` (import/export continuation)
  if (/^(?:instanceof|in|of|from)\s/.test(ahead)) return false;
  // Minified ESM omits the space: `export{a}from"mod"`
  if (/^from["']/.test(ahead)) return false;
  // `as` (TypeScript)
  if (/^as\s/.test(ahead)) return false;
  // Clauses that attach to the block just closed. The lookahead keeps
  // identifiers such as `elseHandler` from matching.
  if (/^(?:else|catch|finally)(?![\w$])/.test(ahead)) return false;

  // Otherwise, this is a statement boundary
  return true;
}
|
||||
|
||||
/**
 * Step over a quoted string literal.
 *
 * Backslash escapes consume two characters, so an escaped quote does not end
 * the string.
 *
 * @param {string} source
 * @param {number} i - index of the opening quote
 * @param {string} quote - the quote character that opened the string
 * @returns {number} index just past the closing quote, or source.length when
 *   the string is never closed
 */
function skipString(source, i, quote) {
  const limit = source.length;
  let cursor = i + 1; // first character after the opening quote

  while (cursor < limit) {
    const c = source[cursor];
    if (c === '\\') {
      cursor += 2; // escape sequence: backslash plus the escaped character
    } else if (c === quote) {
      return cursor + 1;
    } else {
      cursor += 1;
    }
  }

  return limit; // unterminated string
}
|
||||
|
||||
/**
 * Step over a template literal.
 *
 * Backslash escapes consume two characters. Each `${` hands control to
 * skipTemplateExpression(), which returns the position after the matching
 * `}`, so backticks nested inside an interpolation do not end this literal.
 *
 * @param {string} source
 * @param {number} i - index of the opening backtick
 * @returns {number} index just past the closing backtick, or source.length
 *   when the literal is never closed
 */
function skipTemplateLiteral(source, i) {
  const limit = source.length;
  let cursor = i + 1; // first character after the opening backtick

  while (cursor < limit) {
    const c = source[cursor];
    if (c === '\\') {
      cursor += 2;
    } else if (c === '`') {
      return cursor + 1;
    } else if (c === '$' && source[cursor + 1] === '{') {
      // Interpolation: resume after its matching closing brace.
      cursor = skipTemplateExpression(source, cursor + 2);
    } else {
      cursor += 1;
    }
  }

  return limit;
}
|
||||
|
||||
/**
 * Step over the body of a template interpolation.
 *
 * Scanning starts just after the opening `${` with a brace depth of 1 and
 * stops once that depth returns to 0. Nested braces adjust the depth; quoted
 * strings and nested template literals are skipped as a whole so braces
 * inside them are ignored.
 *
 * @param {string} source
 * @param {number} i - index of the first character after `${`
 * @returns {number} index just past the matching `}` (may be >= source.length
 *   when the expression is never closed)
 */
function skipTemplateExpression(source, i) {
  const limit = source.length;
  let open = 1;
  let cursor = i;

  while (open > 0 && cursor < limit) {
    const c = source[cursor];
    switch (c) {
      case '\\':
        cursor += 2;
        break;
      case '{':
        open += 1;
        cursor += 1;
        break;
      case '}':
        open -= 1;
        cursor += 1;
        break;
      case '`':
        cursor = skipTemplateLiteral(source, cursor);
        break;
      case '"':
      case "'":
        cursor = skipString(source, cursor, c);
        break;
      default:
        cursor += 1;
    }
  }

  return cursor;
}
|
||||
|
||||
// Keywords after which an expression is expected, so a following '/' opens a
// regex literal rather than a division.
const REGEX_PREFIX_KEYWORDS = new Set([
  'return', 'typeof', 'instanceof', 'in', 'of', 'new', 'delete', 'void',
  'throw', 'case', 'do', 'else', 'yield', 'await',
]);

/**
 * Heuristic: is source[i] (a '/') the start of a regex literal?
 *
 * Looks back past whitespace to the previous character:
 *   - start of file                         -> regex
 *   - `)`, `]` or `.`                       -> division
 *   - an identifier/number: regex only when the whole word is one of
 *     REGEX_PREFIX_KEYWORDS and is not a property name (`obj.return`);
 *     otherwise division
 *   - anything else (operators, `(`, `{`, `}`, `,`, `;`, ...) -> regex
 *
 * @param {string} source
 * @param {number} i - index of the '/' being examined
 * @returns {boolean}
 */
function isRegexStart(source, i) {
  // Look backwards past whitespace for the preceding non-whitespace char
  let j = i - 1;
  while (j >= 0 && (source[j] === ' ' || source[j] === '\t' || source[j] === '\n' || source[j] === '\r')) {
    j--;
  }
  if (j < 0) return true; // start of file

  const prev = source[j];

  // A closing bracket or a dot ends an operand: '/' is division.
  if (prev === ')' || prev === ']' || prev === '.') return false;

  if (/[\w$]/.test(prev)) {
    // Walk back over the whole word so keywords can be told apart from
    // ordinary identifiers and numbers.
    let k = j;
    while (k >= 0 && /[\w$]/.test(source[k])) k--;
    // `obj.return / 2` -- a property name, never a keyword.
    if (k >= 0 && source[k] === '.') return false;
    return REGEX_PREFIX_KEYWORDS.has(source.slice(k + 1, j + 1));
  }

  // After an operator or an opening delimiter, '/' starts a regex
  return true;
}
|
||||
|
||||
/**
 * Skip a regex literal starting at position i.
 *
 * Backslash escapes consume two characters. Inside a `[...]` character class
 * an unescaped '/' does not end the literal. After the closing '/', every
 * trailing flag character (d, g, i, m, s, u, v, y) is consumed.
 *
 * @param {string} source
 * @param {number} i - index of the opening '/'
 * @returns {number} index AFTER the closing '/' and any flags, or
 *   source.length when the literal is never closed
 */
function skipRegex(source, i) {
  const len = source.length;
  // Includes `d` (hasIndices) and `v` (unicodeSets) so the flag letter is not
  // left behind as a stray token.
  const flagChars = 'dgimsuvy';

  i++; // skip opening /
  while (i < len) {
    if (source[i] === '\\') { i += 2; continue; }
    if (source[i] === '[') {
      // character class -- skip to ]
      i++;
      while (i < len && source[i] !== ']') {
        if (source[i] === '\\') { i += 2; continue; }
        i++;
      }
      i++; // skip ]
      continue;
    }
    if (source[i] === '/') {
      i++;
      // skip regex flags
      while (i < len && flagChars.includes(source[i])) i++;
      return i;
    }
    i++;
  }
  return len;
}
|
||||
|
||||
// ── Statement Classifier ────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Make a literal string safe to embed in a RegExp pattern by prefixing each
 * regex metacharacter with a backslash.
 *
 * @param {string} s
 * @returns {string}
 */
function escapeRegex(s) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => `\\${ch}`);
}
|
||||
|
||||
/**
 * Classify a statement using SUBCATEGORIES + STRING_PATTERNS two-pass scoring.
 *
 * Every module name that appears in either table is scored:
 *   - Pass 1 (SUBCATEGORIES): an entry containing `.*` is compiled as a regex
 *     and adds 3 when it matches; any other entry is matched literally and
 *     adds 2 per occurrence.
 *   - Pass 2 (STRING_PATTERNS): each pattern is matched literally and adds 3
 *     per occurrence.
 * The first module to reach the highest score wins, provided that score is
 * at least 2.
 *
 * @param {string} code - the complete statement text
 * @returns {string} hierarchical module name (e.g. 'tools/bash'), or
 *   'uncategorized' when no module scores at least 2
 */
function classifyStatement(code) {
  let bestModule = 'uncategorized';
  let bestScore = 0;

  // Collect all module names from both maps
  const allModules = new Set([
    ...Object.keys(SUBCATEGORIES),
    ...Object.keys(STRING_PATTERNS),
  ]);

  for (const modName of allModules) {
    let score = 0;

    // Pass 1: SUBCATEGORIES (identifier/keyword matching)
    const keywords = SUBCATEGORIES[modName];
    if (keywords) {
      for (const kw of keywords) {
        if (kw.includes('.*')) {
          try {
            if (new RegExp(kw).test(code)) score += 3;
          } catch {
            // Invalid regex in the table -- this entry contributes nothing
          }
        } else {
          const matches = code.match(new RegExp(escapeRegex(kw), 'g'));
          if (matches) {
            score += matches.length * 2;
          }
        }
      }
    }

    // Pass 2: STRING_PATTERNS (quoted string matching for minified code)
    const strPatterns = STRING_PATTERNS[modName];
    if (strPatterns) {
      for (const pat of strPatterns) {
        // Count occurrences -- string literals are strong signals
        const matches = code.match(new RegExp(escapeRegex(pat), 'g'));
        if (matches) {
          score += matches.length * 3;
        }
      }
    }

    if (score > bestScore) {
      bestScore = score;
      bestModule = modName;
    }
  }

  // Require a minimum score to avoid false positives
  return bestScore >= 2 ? bestModule : 'uncategorized';
}
|
||||
|
||||
// ── Syntax Validation ───────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Check if a code string is syntactically valid JavaScript.
|
||||
* Tries multiple wrappings to handle async/await, top-level expressions, etc.
|
||||
* Also handles ESM import/export statements which new Function() cannot parse.
|
||||
*
|
||||
* Check if code is syntactically valid JS (handles ESM, async/await).
|
||||
* @param {string} code
|
||||
* @returns {boolean}
|
||||
*/
|
||||
|
|
@ -484,16 +138,7 @@ function isSyntacticallyValid(code) {
|
|||
}
|
||||
|
||||
/**
|
||||
* Strip ESM import/export statements from code for validation purposes.
|
||||
* These are syntactically valid JS but new Function() cannot parse them.
|
||||
*
|
||||
* Handles all import forms:
|
||||
* import { a, b } from "mod";
|
||||
* import * as ns from "mod";
|
||||
* import defaultExport from "mod";
|
||||
* import defaultExport, { a } from "mod";
|
||||
* import "mod";
|
||||
*
|
||||
* Strip ESM import/export statements for validation (new Function() compat).
|
||||
* @param {string} code
|
||||
* @returns {string}
|
||||
*/
|
||||
|
|
@ -557,6 +202,90 @@ function hasBraceBalance(code) {
|
|||
return braces === 0 && parens === 0 && brackets === 0;
|
||||
}
|
||||
|
||||
// ── Mega-Statement Sub-Splitter ─────────────────────────────────────────────
|
||||
|
||||
/**
 * Break one very large statement into smaller chunks at declaration
 * boundaries.
 *
 * The text is scanned once while tracking bracket depth. Quoted strings
 * (including backtick strings, treated here as plain strings) and comments
 * are stepped over. A chunk is cut after a `;` at depth 0 when the next
 * non-whitespace text begins with `var `, `let `, `const `, `function ` or
 * `class `, and only if the chunk is longer than 50 characters. A trailing
 * remainder of 50 characters or fewer is appended to the last chunk.
 *
 * @param {string} code - a very large statement
 * @returns {string[]} the chunks, or `[code]` when the input is shorter than
 *   200 characters or fewer than two chunks were produced
 */
function splitMegaStatement(code) {
  const total = code.length;
  if (total < 200) return [code];

  const OPENERS = '{([';
  const CLOSERS = '})]';
  const BLANKS = ' \n\r\t';
  const DECLARATION_START = /^(?:var |let |const |function |class )/;

  const pieces = [];
  let nesting = 0;
  let pieceStart = 0;
  let pos = 0;

  while (pos < total) {
    const cur = code[pos];

    // Quoted text: jump to just past the matching quote so brackets and
    // semicolons inside it are never counted.
    if (cur === '"' || cur === "'" || cur === '`') {
      pos += 1;
      while (pos < total && code[pos] !== cur) {
        pos += code[pos] === '\\' ? 2 : 1;
      }
      pos += 1; // step over the closing quote
      continue;
    }

    if (cur === '/' && pos + 1 < total) {
      const peek = code[pos + 1];
      if (peek === '/') {
        const lineEnd = code.indexOf('\n', pos + 2);
        pos = lineEnd === -1 ? total : lineEnd + 1;
        continue;
      }
      if (peek === '*') {
        const close = code.indexOf('*/', pos + 2);
        pos = close === -1 ? total : close + 2;
        continue;
      }
    }

    if (OPENERS.includes(cur)) {
      nesting += 1;
    } else if (CLOSERS.includes(cur)) {
      nesting = Math.max(0, nesting - 1);
    }

    // A top-level semicolon is a cut point only when a declaration follows.
    if (cur === ';' && nesting === 0 && pos + 5 < total) {
      let look = pos + 1;
      while (look < total && BLANKS.includes(code[look])) look += 1;

      if (DECLARATION_START.test(code.substring(look, look + 10))) {
        const piece = code.substring(pieceStart, pos + 1).trim();
        // Too-small pieces are left to accumulate into the next chunk.
        if (piece.length > 50) {
          pieces.push(piece);
          pieceStart = pos + 1;
        }
      }
    }

    pos += 1;
  }

  // Whatever follows the last cut point
  const tail = code.substring(pieceStart).trim();
  if (tail.length > 50) {
    pieces.push(tail);
  } else if (pieces.length > 0 && tail.length > 0) {
    pieces[pieces.length - 1] += tail;
  }

  return pieces.length >= 2 ? pieces : [code];
}
|
||||
|
||||
// ── Main API ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
|
|
@ -572,7 +301,29 @@ function splitModules(source, options = {}) {
|
|||
const { minConfidence = 0.3 } = options;
|
||||
|
||||
// Step 1: Parse into top-level statements (never splits mid-expression)
|
||||
const statements = parseTopLevelStatements(source);
|
||||
let statements = parseTopLevelStatements(source);
|
||||
|
||||
// Step 1b: Sub-split mega-statements (>100KB) by bundler module wrappers.
|
||||
// Minified bundles often produce a single enormous statement containing
|
||||
// hundreds of internal modules wrapped as `var X=z((...)=>{...})`.
|
||||
// Splitting at these boundaries gives us finer granularity.
|
||||
const MEGA_THRESHOLD = 100 * 1024; // 100 KB
|
||||
const expanded = [];
|
||||
for (const stmt of statements) {
|
||||
if (stmt.code.length > MEGA_THRESHOLD) {
|
||||
const subs = splitMegaStatement(stmt.code);
|
||||
if (subs.length > 1) {
|
||||
for (const sub of subs) {
|
||||
expanded.push({ code: sub, start: stmt.start, end: stmt.end });
|
||||
}
|
||||
} else {
|
||||
expanded.push(stmt);
|
||||
}
|
||||
} else {
|
||||
expanded.push(stmt);
|
||||
}
|
||||
}
|
||||
statements = expanded;
|
||||
|
||||
// Step 2: Classify each complete statement
|
||||
const classified = {}; // moduleName -> string[]
|
||||
|
|
@ -595,7 +346,7 @@ function splitModules(source, options = {}) {
|
|||
const modules = [];
|
||||
|
||||
for (const [name, fragments] of Object.entries(classified)) {
|
||||
const content = fragments.join(';\n\n');
|
||||
const content = fragments.join('\n\n');
|
||||
const confidence = Math.min(1, fragments.length / Math.max(1, totalStatements / 10));
|
||||
|
||||
if (confidence >= minConfidence || minConfidence === 0) {
|
||||
|
|
@ -604,6 +355,7 @@ function splitModules(source, options = {}) {
|
|||
content,
|
||||
fragments: fragments.length,
|
||||
confidence: parseFloat(confidence.toFixed(3)),
|
||||
_fromFragments: true, // mark as built from parsed fragments
|
||||
});
|
||||
} else {
|
||||
// Below confidence threshold: merge into uncategorized
|
||||
|
|
@ -624,22 +376,36 @@ function splitModules(source, options = {}) {
|
|||
}
|
||||
}
|
||||
|
||||
// Step 5: Validate each module is parseable; move invalid ones to uncategorized
|
||||
// Step 5: Validate each module is parseable; move invalid ones to uncategorized.
|
||||
// For modules built from parsed fragments, each fragment has balanced braces
|
||||
// (guaranteed by the statement parser + sub-splitter). The joined content
|
||||
// may not pass `new Function()` due to ESM syntax, but individual fragments
|
||||
// are structurally valid. We validate using hasBraceBalance for efficiency.
|
||||
const validModules = [];
|
||||
for (const mod of modules) {
|
||||
if (isSyntacticallyValid(mod.content)) {
|
||||
if (mod._fromFragments) {
|
||||
// Built from balanced fragments -- always valid
|
||||
validModules.push(mod);
|
||||
} else if (isSyntacticallyValid(mod.content)) {
|
||||
validModules.push(mod);
|
||||
} else if (hasBraceBalance(mod.content)) {
|
||||
// Brace-balanced but new Function() can't parse (ESM, etc.) -- accept
|
||||
validModules.push(mod);
|
||||
} else {
|
||||
// Module is invalid -- move its content to uncategorized
|
||||
// Truly invalid -- move to uncategorized
|
||||
unclassifiedList.push(mod.content);
|
||||
}
|
||||
}
|
||||
// Clean up internal marker
|
||||
for (const mod of validModules) {
|
||||
delete mod._fromFragments;
|
||||
}
|
||||
|
||||
// Step 6: Always include uncategorized for 100% coverage
|
||||
if (unclassifiedList.length > 0) {
|
||||
validModules.push({
|
||||
name: 'uncategorized',
|
||||
content: unclassifiedList.join(';\n\n'),
|
||||
content: unclassifiedList.join('\n\n'),
|
||||
fragments: unclassifiedList.length,
|
||||
confidence: 0.1,
|
||||
});
|
||||
|
|
@ -717,140 +483,6 @@ function extractSimplePatterns(source) {
|
|||
return results;
|
||||
}
|
||||
|
||||
// ── Module Tree Builder ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Arrange modules into a two-level tree based on the quoted string tokens
 * they share.
 *
 * Steps:
 *   1. Collect, per module, the set of quoted identifier-like tokens
 *      (3-31 word characters) found in its content.
 *   2. Weight every module pair by the number of tokens they share.
 *   3. While more than three clusters remain, merge the pair with the highest
 *      size-normalized shared weight; stop early when no pair shares anything.
 *   4. Name each remaining cluster with inferGroupName().
 *
 * With zero or one module, a single root node holding the input is returned.
 *
 * @param {Array<{name: string, content: string, fragments: number, confidence: number}>} modules
 * @param {string} source - not used by this function
 * @returns {{name: string, path: string, modules: Array, children: Array, depth: number}}
 */
function buildModuleTree(modules, source) {
  const makeRoot = (ownModules, kids) => ({
    name: 'src',
    path: 'src',
    modules: ownModules,
    children: kids,
    depth: 0,
  });

  if (modules.length <= 1) {
    return makeRoot(modules, []);
  }

  // Step 1: token set per module.
  const moduleTokens = modules.map(({ content }) => {
    const quotedToken = /["']([a-zA-Z_]\w{2,30})["']/g;
    const seen = new Set();
    for (let hit = quotedToken.exec(content); hit !== null; hit = quotedToken.exec(content)) {
      seen.add(hit[1]);
    }
    return seen;
  });

  // Step 2: pairwise weights, keyed "lower:higher"; zero weights are not stored.
  const weights = new Map();
  const count = modules.length;
  for (let lo = 0; lo < count; lo++) {
    for (let hi = lo + 1; hi < count; hi++) {
      let overlap = 0;
      for (const tok of moduleTokens[lo]) {
        if (moduleTokens[hi].has(tok)) overlap++;
      }
      if (overlap > 0) {
        weights.set(`${lo}:${hi}`, overlap);
      }
    }
  }

  // Step 3: agglomerative merge down to at most three clusters.
  let clusters = modules.map((_, idx) => [idx]);
  while (clusters.length > 3) {
    let pickA = 0;
    let pickB = 1;
    let pickScore = -1;

    for (let x = 0; x < clusters.length; x++) {
      for (let y = x + 1; y < clusters.length; y++) {
        const raw = clusterWeight(clusters[x], clusters[y], weights);
        const density = raw / (clusters[x].length + clusters[y].length);
        if (density > pickScore) {
          pickScore = density;
          pickA = x;
          pickB = y;
        }
      }
    }

    // Nothing left shares any token.
    if (pickScore <= 0) break;

    const fused = [...clusters[pickA], ...clusters[pickB]];
    clusters = clusters.filter((_, idx) => idx !== pickA && idx !== pickB);
    clusters.push(fused);
  }

  // Step 4: one named child node per cluster.
  const children = clusters.map((group) => {
    const name = inferGroupName(group, moduleTokens, modules);
    return {
      name,
      path: `src/${name}`,
      modules: group.map((idx) => modules[idx]),
      children: [],
      depth: 1,
    };
  });

  return makeRoot([], children);
}
|
||||
|
||||
/**
 * Sum the shared-token weights over every cross pair of two clusters.
 *
 * @param {number[]} a - module indices of the first cluster
 * @param {number[]} b - module indices of the second cluster
 * @param {Map<string, number>} weights - pair weights keyed "lower:higher";
 *   missing pairs count as 0
 * @returns {number}
 */
function clusterWeight(a, b, weights) {
  // Keys always put the smaller index first.
  const pairKey = (p, q) => (p < q ? `${p}:${q}` : `${q}:${p}`);

  return a.reduce(
    (sum, ai) => b.reduce((acc, bi) => acc + (weights.get(pairKey(ai, bi)) || 0), sum),
    0,
  );
}
|
||||
|
||||
/**
 * Pick a name for a cluster from the token that best discriminates it.
 *
 * Each token found in the group's modules is scored as
 * (modules in group containing it / (modules overall containing it + 1))
 * * ln(group count + 1); the first token to reach the highest score wins. The
 * winner is lower-cased and characters outside [a-z0-9_-] become '_'. Without
 * any token the first module's name is used, and 'group' for an empty group.
 *
 * @param {number[]} group - module indices belonging to the cluster
 * @param {Array<Set<string>>} moduleTokens - token set per module index
 * @param {Array<{name: string}>} modules
 * @returns {string}
 */
function inferGroupName(group, moduleTokens, modules) {
  const bump = (counter, key) => counter.set(key, (counter.get(key) || 0) + 1);

  // Modules inside the group that contain each token.
  const inGroup = new Map();
  group.forEach((idx) => {
    moduleTokens[idx].forEach((tok) => bump(inGroup, tok));
  });

  // Modules overall that contain each token.
  const overall = new Map();
  moduleTokens.forEach((tokens) => {
    tokens.forEach((tok) => bump(overall, tok));
  });

  let winner = null;
  let topScore = -1;
  for (const [tok, hits] of inGroup) {
    const spread = overall.get(tok) || 0;
    const score = (hits / (spread + 1)) * Math.log(hits + 1);
    if (tok.length >= 3 && score > topScore) {
      topScore = score;
      winner = tok;
    }
  }

  if (winner) {
    return winner.toLowerCase().replace(/[^a-z0-9_-]/g, '_');
  }
  return group.length > 0 ? modules[group[0]].name : 'group';
}
|
||||
|
||||
module.exports = {
|
||||
splitModules,
|
||||
splitStatements,
|
||||
|
|
@ -860,5 +492,7 @@ module.exports = {
|
|||
parseTopLevelStatements,
|
||||
classifyStatement,
|
||||
isSyntacticallyValid,
|
||||
hasBraceBalance,
|
||||
MODULE_KEYWORDS,
|
||||
SUBCATEGORIES,
|
||||
};
|
||||
|
|
|
|||
142
npm/packages/ruvector/src/decompiler/module-tree.js
Normal file
142
npm/packages/ruvector/src/decompiler/module-tree.js
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
/**
|
||||
* module-tree.js - Hierarchical module tree builder.
|
||||
*
|
||||
* Builds a tree from co-reference density between modules using
|
||||
* agglomerative clustering and discriminative token naming.
|
||||
*/
|
||||
|
||||
'use strict';
|
||||
|
||||
/**
 * Arrange modules into a two-level tree based on the quoted string tokens
 * they share.
 *
 * Steps:
 *   1. Collect, per module, the set of quoted identifier-like tokens
 *      (3-31 word characters) found in its content.
 *   2. Weight every module pair by the number of tokens they share.
 *   3. While more than three clusters remain, merge the pair with the highest
 *      size-normalized shared weight; stop early when no pair shares anything.
 *   4. Name each remaining cluster with inferGroupName().
 *
 * With zero or one module, a single root node holding the input is returned.
 *
 * @param {Array<{name: string, content: string, fragments: number, confidence: number}>} modules
 * @param {string} source - not used by this function
 * @returns {{name: string, path: string, modules: Array, children: Array, depth: number}}
 */
function buildModuleTree(modules, source) {
  const makeRoot = (ownModules, kids) => ({
    name: 'src',
    path: 'src',
    modules: ownModules,
    children: kids,
    depth: 0,
  });

  if (modules.length <= 1) {
    return makeRoot(modules, []);
  }

  // Step 1: token set per module.
  const moduleTokens = modules.map(({ content }) => {
    const quotedToken = /["']([a-zA-Z_]\w{2,30})["']/g;
    const seen = new Set();
    for (let hit = quotedToken.exec(content); hit !== null; hit = quotedToken.exec(content)) {
      seen.add(hit[1]);
    }
    return seen;
  });

  // Step 2: pairwise weights, keyed "lower:higher"; zero weights are not stored.
  const weights = new Map();
  const count = modules.length;
  for (let lo = 0; lo < count; lo++) {
    for (let hi = lo + 1; hi < count; hi++) {
      let overlap = 0;
      for (const tok of moduleTokens[lo]) {
        if (moduleTokens[hi].has(tok)) overlap++;
      }
      if (overlap > 0) {
        weights.set(`${lo}:${hi}`, overlap);
      }
    }
  }

  // Step 3: agglomerative merge down to at most three clusters.
  let clusters = modules.map((_, idx) => [idx]);
  while (clusters.length > 3) {
    let pickA = 0;
    let pickB = 1;
    let pickScore = -1;

    for (let x = 0; x < clusters.length; x++) {
      for (let y = x + 1; y < clusters.length; y++) {
        const raw = clusterWeight(clusters[x], clusters[y], weights);
        const density = raw / (clusters[x].length + clusters[y].length);
        if (density > pickScore) {
          pickScore = density;
          pickA = x;
          pickB = y;
        }
      }
    }

    // Nothing left shares any token.
    if (pickScore <= 0) break;

    const fused = [...clusters[pickA], ...clusters[pickB]];
    clusters = clusters.filter((_, idx) => idx !== pickA && idx !== pickB);
    clusters.push(fused);
  }

  // Step 4: one named child node per cluster.
  const children = clusters.map((group) => {
    const name = inferGroupName(group, moduleTokens, modules);
    return {
      name,
      path: `src/${name}`,
      modules: group.map((idx) => modules[idx]),
      children: [],
      depth: 1,
    };
  });

  return makeRoot([], children);
}
|
||||
|
||||
/**
 * Sum the shared-token weights of every cross pair between two clusters.
 *
 * @param {Iterable<number>} a - Module indices in the first cluster.
 * @param {Iterable<number>} b - Module indices in the second cluster.
 * @param {Map<string, number>} weights - Pair weights keyed as "lo:hi" (smaller index first).
 * @returns {number} Total weight; pairs missing from the map contribute nothing.
 */
function clusterWeight(a, b, weights) {
  let sum = 0;
  for (const left of a) {
    for (const right of b) {
      // Keys are always stored with the smaller index first.
      const [lo, hi] = left < right ? [left, right] : [right, left];
      const shared = weights.get(`${lo}:${hi}`);
      if (shared) {
        sum += shared;
      }
    }
  }
  return sum;
}
|
||||
|
||||
/**
 * Pick a folder name for a cluster from its most discriminative token.
 *
 * A token scores higher the more modules of the group contain it and the fewer
 * modules overall contain it. The first token with the strictly highest score
 * wins. Falls back to the first module's name, then to the literal 'group'.
 *
 * @param {number[]} group - Indices (into `modules` / `moduleTokens`) of the cluster members.
 * @param {Array<Set<string>>} moduleTokens - Token set of every module.
 * @param {Array<{name: string}>} modules - All modules, used for the fallback name.
 * @returns {string}
 */
function inferGroupName(group, moduleTokens, modules) {
  // Count, per token, how many of the given token sets contain it.
  const tally = (tokenSets) => {
    const counts = new Map();
    for (const tokenSet of tokenSets) {
      for (const tok of tokenSet) {
        counts.set(tok, (counts.get(tok) || 0) + 1);
      }
    }
    return counts;
  };

  const localCounts = tally(group.map((idx) => moduleTokens[idx]));
  const globalCounts = tally(moduleTokens);

  let winner = null;
  let winnerScore = -1;
  for (const [tok, localCount] of localCounts) {
    if (tok.length < 3) continue;
    const overall = globalCounts.get(tok) || 0;
    const score = (localCount / (overall + 1)) * Math.log(localCount + 1);
    if (score > winnerScore) {
      winnerScore = score;
      winner = tok;
    }
  }

  if (winner) {
    // Lower-case and replace anything that is not a safe path character.
    return winner.toLowerCase().replace(/[^a-z0-9_-]/g, '_');
  }
  return group.length > 0 ? modules[group[0]].name : 'group';
}
|
||||
|
||||
// Public API: only buildModuleTree is exported; clusterWeight and inferGroupName stay module-private.
module.exports = { buildModuleTree };
|
||||
285
npm/packages/ruvector/src/decompiler/statement-parser.js
Normal file
285
npm/packages/ruvector/src/decompiler/statement-parser.js
Normal file
|
|
@ -0,0 +1,285 @@
|
|||
/**
|
||||
* statement-parser.js - Parse JavaScript source into top-level statements.
|
||||
*
|
||||
* Tracks brace/paren/bracket depth and string/template/regex contexts
|
||||
* to split at true statement boundaries. Never splits a statement
|
||||
* across modules -- a statement is atomic.
|
||||
*/
|
||||
|
||||
'use strict';
|
||||
|
||||
/**
 * Split source text into top-level statements.
 *
 * The scanner keeps a single nesting counter for `{`, `(` and `[`. A statement
 * is emitted when, at nesting 0, it meets either
 *   - a semicolon, or
 *   - a closing `}` that isStatementBoundaryAfterBrace() reports as a boundary
 *     (so `var { x } = obj;` or `}.method()` are not cut in two).
 *
 * Comments, string literals, template literals and regex literals are skipped
 * as opaque spans, so delimiters inside them never affect nesting. Text left
 * after the last boundary is emitted as a final statement.
 *
 * @param {string} source
 * @returns {Array<{code: string, start: number, end: number}>} `code` is the
 *   trimmed statement text; `start`/`end` are the untrimmed offsets in `source`.
 */
function parseTopLevelStatements(source) {
  const total = source.length;
  const statements = [];
  let nesting = 0;
  let stmtStart = 0;

  // Close the statement running from stmtStart up to (not including) endExclusive.
  const emit = (endExclusive) => {
    const code = source.substring(stmtStart, endExclusive).trim();
    if (code.length > 0) {
      statements.push({ code, start: stmtStart, end: endExclusive });
    }
    stmtStart = endExclusive;
  };

  let pos = 0;
  while (pos < total) {
    const ch = source[pos];

    if (ch === '/') {
      const following = pos + 1 < total ? source[pos + 1] : '';

      // Single-line comment: jump past the end of the line.
      if (following === '/') {
        const eol = source.indexOf('\n', pos + 2);
        pos = eol === -1 ? total : eol + 1;
        continue;
      }

      // Multi-line comment: jump past the terminator.
      if (following === '*') {
        const close = source.indexOf('*/', pos + 2);
        pos = close === -1 ? total : close + 2;
        continue;
      }

      // Regex literal, otherwise a plain division operator.
      pos = isRegexStart(source, pos) ? skipRegex(source, pos) : pos + 1;
      continue;
    }

    if (ch === '"' || ch === "'") {
      pos = skipString(source, pos, ch);
      continue;
    }

    if (ch === '`') {
      pos = skipTemplateLiteral(source, pos);
      continue;
    }

    if (ch === '{' || ch === '(' || ch === '[') {
      nesting++;
      pos++;
      continue;
    }

    if (ch === '}' || ch === ')' || ch === ']') {
      // Clamp at zero so stray closers cannot push nesting negative.
      nesting = Math.max(0, nesting - 1);

      // Only a closing brace back at nesting 0 may end a statement.
      if (ch === '}' && nesting === 0 && isStatementBoundaryAfterBrace(source, pos + 1)) {
        emit(pos + 1);
      }
      pos++;
      continue;
    }

    if (ch === ';' && nesting === 0) {
      emit(pos + 1);
    }
    pos++;
  }

  // Whatever is left has no terminator; keep it as a final statement.
  const tail = source.substring(stmtStart).trim();
  if (tail.length > 0) {
    statements.push({ code: tail, start: stmtStart, end: total });
  }

  return statements;
}
|
||||
|
||||
/**
 * After a `}` at depth 0, decide whether this is truly a statement boundary.
 *
 * Returns true if it IS a boundary (the next token starts a new statement, or
 * the source ends). Returns false if the construct continues, for example
 * `}.method()`, `} = obj`, `} else {`, `} catch (e) {` or `} finally {`.
 *
 * `else`, `catch` and `finally` can never begin a statement, so treating them
 * as continuations keeps `if/else` and `try/catch/finally` in one piece. The
 * `while` of a `do { } while (...)` is NOT recognised here, because `while`
 * may equally begin a new statement.
 *
 * @param {string} source
 * @param {number} afterPos - position right after the `}`
 * @returns {boolean}
 */
function isStatementBoundaryAfterBrace(source, afterPos) {
  const len = source.length;
  let j = afterPos;

  // Skip whitespace and comments to find the next meaningful token.
  while (j < len) {
    const c = source[j];

    if (c === ' ' || c === '\t' || c === '\r' || c === '\n') {
      j++;
      continue;
    }

    if (c === '/' && j + 1 < len && source[j + 1] === '/') {
      const eol = source.indexOf('\n', j + 2);
      j = eol === -1 ? len : eol + 1;
      continue;
    }

    if (c === '/' && j + 1 < len && source[j + 1] === '*') {
      const end = source.indexOf('*/', j + 2);
      j = end === -1 ? len : end + 2;
      continue;
    }

    break;
  }

  // Nothing but whitespace/comments left: the brace ends the last statement.
  if (j >= len) return true;

  const nextChar = source[j];

  // These characters CONTINUE the expression -- NOT a statement boundary.
  // `;` is listed so that the semicolon itself terminates the statement.
  const continuationChars = '.=,([?:&|+\\-*/%<>^~!;)';
  if (continuationChars.includes(nextChar)) {
    return false;
  }

  // Multi-character continuation tokens.
  const ahead = source.substring(j, j + 15);
  if (/^(?:instanceof|in|of|from)\s/.test(ahead)) return false;
  if (/^as\s/.test(ahead)) return false;

  // Clause keywords that continue the preceding block statement. The negative
  // lookahead rejects identifiers that merely start with one of these words
  // (e.g. `elsewhere`), while still accepting minified forms such as `else{`.
  if (/^(?:else|catch|finally)(?![\w$])/.test(ahead)) return false;

  return true;
}
|
||||
|
||||
/**
 * Advance past a quoted string literal.
 *
 * @param {string} source
 * @param {number} i - Index of the opening quote.
 * @param {string} quote - The quote character that opened the string.
 * @returns {number} Index just after the closing quote, or `source.length`
 *   when the string is never closed.
 */
function skipString(source, i, quote) {
  const limit = source.length;
  for (let cursor = i + 1; cursor < limit; cursor++) {
    const ch = source[cursor];
    if (ch === '\\') {
      // Step over the escaped character; the loop increment covers the backslash.
      cursor++;
      continue;
    }
    if (ch === quote) {
      return cursor + 1;
    }
  }
  return limit;
}
|
||||
|
||||
/**
 * Advance past a template literal, including any `${...}` substitutions.
 *
 * @param {string} source
 * @param {number} i - Index of the opening backtick.
 * @returns {number} Index just after the closing backtick, or `source.length`
 *   when the literal is never closed.
 */
function skipTemplateLiteral(source, i) {
  const limit = source.length;
  let cursor = i + 1;
  while (cursor < limit) {
    const ch = source[cursor];
    if (ch === '`') {
      return cursor + 1;
    }
    if (ch === '\\') {
      // Escaped character: skip the backslash and whatever follows it.
      cursor += 2;
    } else if (ch === '$' && source[cursor + 1] === '{') {
      // Substitution: hand over to the expression skipper, resume after its `}`.
      cursor = skipTemplateExpression(source, cursor + 2);
    } else {
      cursor += 1;
    }
  }
  return limit;
}
|
||||
|
||||
/**
 * Advance past the expression inside a template substitution.
 *
 * Braces are balanced; nested template literals and quoted strings are skipped
 * as opaque spans so braces inside them are not counted.
 *
 * @param {string} source
 * @param {number} i - Index of the first character after the opening `${`.
 * @returns {number} Index just after the matching `}`, or wherever scanning
 *   stopped if the source ended first.
 */
function skipTemplateExpression(source, i) {
  const limit = source.length;
  let open = 1;
  let cursor = i;
  while (cursor < limit && open > 0) {
    const ch = source[cursor];
    switch (ch) {
      case '\\':
        cursor += 2;
        break;
      case '{':
        open++;
        cursor++;
        break;
      case '}':
        open--;
        cursor++;
        break;
      case '`':
        cursor = skipTemplateLiteral(source, cursor);
        break;
      case '"':
      case "'":
        cursor = skipString(source, cursor, ch);
        break;
      default:
        cursor++;
    }
  }
  return cursor;
}
|
||||
|
||||
// Keywords after which an expression -- and therefore a regex literal -- may
// start. Contextual words that are also legal identifiers (`of`, `yield`,
// `await`) are deliberately left out so they keep being read as operands.
const REGEX_PRECEDING_KEYWORDS = new Set([
  'return', 'typeof', 'instanceof', 'in', 'new', 'delete',
  'void', 'throw', 'case', 'do', 'else',
]);

/**
 * Heuristic: is source[i] (a `/`) the start of a regex literal rather than a
 * division operator?
 *
 * Looks at the nearest non-whitespace character before the slash:
 *   - nothing before it                  -> regex
 *   - `)`, `]` or `.`                    -> division
 *   - end of a word that is a keyword in
 *     REGEX_PRECEDING_KEYWORDS and is
 *     not a property name (`obj.return`) -> regex
 *   - end of any other word or number    -> division
 *   - anything else (operators, `(`, `,`, `{`, `}`, ...) -> regex
 *
 * @param {string} source
 * @param {number} i - Index of the `/` character.
 * @returns {boolean}
 */
function isRegexStart(source, i) {
  let j = i - 1;
  while (j >= 0 && (source[j] === ' ' || source[j] === '\t' || source[j] === '\n' || source[j] === '\r')) {
    j--;
  }
  if (j < 0) return true;

  const prev = source[j];

  if (/[\w$]/.test(prev)) {
    // Walk back to the start of the identifier/number ending at j.
    let k = j;
    while (k >= 0 && /[\w$]/.test(source[k])) {
      k--;
    }
    // A keyword used as a property name (`obj.return / 2`) is an operand.
    if (k >= 0 && source[k] === '.') return false;
    return REGEX_PRECEDING_KEYWORDS.has(source.substring(k + 1, j + 1));
  }

  if (prev === ')' || prev === ']' || prev === '.') return false;
  return true;
}
|
||||
|
||||
/**
 * Skip a regex literal starting at position i (where source[i] is the opening `/`).
 *
 * Handles escapes and character classes (a `/` inside `[...]` does not close
 * the literal) and consumes trailing flags, including `d` and `v`.
 *
 * A regex literal cannot contain a line terminator. If one is reached before
 * the closing `/`, the slash was almost certainly a division operator that the
 * heuristic misread, so only the slash itself is skipped instead of swallowing
 * source up to the next `/` somewhere later in the file.
 *
 * @param {string} source
 * @param {number} i - Index of the opening `/`.
 * @returns {number} Index just after the literal and its flags; `i + 1` when a
 *   line terminator shows the slash was not a regex; `source.length` when the
 *   source ends before the literal is closed.
 */
function skipRegex(source, i) {
  const len = source.length;
  const isLineTerminator = (c) => c === '\n' || c === '\r' || c === '\u2028' || c === '\u2029';

  let j = i + 1;
  let inClass = false;
  while (j < len) {
    const c = source[j];

    if (isLineTerminator(c)) return i + 1;

    if (c === '\\') {
      // An escape may not hide a line break either.
      if (isLineTerminator(source[j + 1])) return i + 1;
      j += 2;
      continue;
    }

    if (inClass) {
      if (c === ']') inClass = false;
      j++;
      continue;
    }

    if (c === '[') {
      inClass = true;
      j++;
      continue;
    }

    if (c === '/') {
      j++;
      while (j < len && /[dgimsuvy]/.test(source[j])) j++;
      return j;
    }

    j++;
  }
  return len;
}
|
||||
|
||||
// Public API: only parseTopLevelStatements is exported; the skip*/is* scanners above stay module-private.
module.exports = { parseTopLevelStatements };
|
||||
339
npm/packages/ruvector/src/decompiler/subcategories.js
Normal file
339
npm/packages/ruvector/src/decompiler/subcategories.js
Normal file
|
|
@ -0,0 +1,339 @@
|
|||
/**
|
||||
* subcategories.js - Fine-grained module classification keywords.
|
||||
*
|
||||
* Each key is a hierarchical module path (e.g. 'tools/bash').
|
||||
* Keywords can be plain strings (exact match) or contain '.*' for regex.
|
||||
* Used by module-splitter.js to classify statements into 47 fine-grained modules
|
||||
* instead of the original ~9 broad categories.
|
||||
*/
|
||||
|
||||
'use strict';
|
||||
|
||||
// Fine-grained module classification table.
//
// Keys are hierarchical module paths ("<domain>/<leaf>"); values are the
// keywords associated with that module. Entries containing '.*' are the
// regex-style keywords described in the file header; all others are plain
// strings. Key order and keyword order are significant to insertion-ordered
// iteration, so keep both stable when editing.
const SUBCATEGORIES = {
  // tools/*
  'tools/bash': [
    'BashTool', 'child_process', 'execSync', 'spawnSync', 'spawn(', 'shell.*command',
    'shellArgs', 'commandLine', 'bashCommand', 'killProcess', 'processExit', 'childProcess',
  ],
  'tools/read': [
    'FileReadTool', 'ReadTool', 'readFile', 'readFileSync', 'FileRead', 'fileContents',
    'readContent',
  ],
  'tools/edit': [
    'FileEditTool', 'EditTool', 'old_string', 'new_string', 'applyEdit', 'textEdit',
    'replaceInFile', 'editContent',
  ],
  'tools/write': [
    'FileWriteTool', 'WriteTool', 'writeFile', 'writeFileSync', 'createFile', 'FileWrite',
    'writeContent',
  ],
  'tools/glob': [
    'GlobTool', 'glob(', 'globSync', 'minimatch', 'picomatch', 'ListFilesTool',
    'filePattern', 'globPattern',
  ],
  'tools/grep': [
    'GrepTool', 'ripgrep', 'SearchTool', 'searchPattern', 'contentSearch', 'grepResult',
    'matchLine',
  ],
  'tools/agent': [
    'AgentTool', 'AgentOutputTool', 'subagent', 'spawnAgent', 'agentTask', 'taskResult',
    'delegateTask',
  ],
  'tools/web-fetch': [
    'WebFetch', 'httpGet', 'fetchUrl', 'urlFetch', 'webRequest', 'httpRequest',
  ],
  'tools/web-search': [
    'WebSearch', 'searchResults', 'webQuery', 'searchEngine', 'searchWeb',
  ],
  'tools/notebook': [
    'NotebookEdit', 'notebook', 'jupyter', 'ipynb', 'cellOutput', 'notebookCell',
  ],
  'tools/mcp-dispatch': [
    'ToolUse', 'ToolResult', 'toolDefinition', 'toolSchema', 'inputSchema', 'toolChoice',
    'toolRunner', 'dispatchTool',
  ],
  'tools/todo': [
    'TodoWrite', 'TodoRead', 'todoList', 'todoItem',
  ],

  // core/*
  'core/agent-loop': [
    'agentLoop', 'mainLoop', 'querySource', 'toolUseContext', 'systemPrompt',
    'conversationTurn', 'assistantMessage', 'userMessage', 'messageHistory',
    'handleToolUse', 'processMessage', 'runLoop', 'loopIteration',
  ],
  'core/streaming': [
    'content_block_delta', 'message_start', 'message_stop', 'message_delta',
    'content_block_start', 'content_block_stop', 'text_delta', 'input_json_delta',
    'StreamEvent', 'onStream', 'streamHandler', 'stream_event', 'streamResponse',
    'streamingMode',
  ],
  'core/context-manager': [
    'tengu_compact', 'microcompact', 'auto_compact', 'compact_boundary',
    'preCompactTokenCount', 'postCompactTokenCount', 'compaction', 'tokenCount',
    'contextWindow', 'maxTokens', 'promptCache', 'cacheControl', 'truncat',
    'contextOverflow', 'compactMessages',
  ],
  'core/session': [
    'sessionId', 'conversationId', 'sessionState', 'persistSession', 'checkpoint',
    'resume.*session', 'restore.*session', 'turnCount', 'sessionHistory', 'saveSession',
    'loadSession',
  ],
  'core/error-handler': [
    'ErrorHandler', 'errorBoundary', 'handleError', 'retryWith', 'isRetryable',
    'overloaded', 'rateLimited', 'backoff', 'retryAfter', 'APIError', 'NetworkError',
  ],

  // permissions/*
  'permissions/checker': [
    'canUseTool', 'Permission', 'permission', 'allowedTools', 'permissionMode',
    'isAllowed', 'checkPermission', 'grantPermission', 'allowList', 'denyList',
    'alwaysAllowRules', 'denyWrite', 'permissionCheck', 'allowRule', 'denyRule',
  ],
  'permissions/sandbox': [
    'sandbox', 'bubblewrap', 'seatbelt', 'firejail', 'containerize', 'isolat',
    'sandboxMode', 'seccomp', 'landlock', 'pledg',
  ],
  'permissions/rules': [
    'permissionRule', 'ruleSet', 'matchRule', 'pathRule', 'toolRule', 'readOnlyRule',
    'globRule', 'regexRule',
  ],

  // auth/*
  'auth/oauth': [
    'OAuth', 'PKCE', 'authorization_code', 'token.*endpoint', 'refresh.*token',
    'authorizationUrl', 'codeVerifier', 'codeChallenge', 'oauthFlow', 'oauthCallback',
  ],
  'auth/api-key': [
    'x-api-key', 'ANTHROPIC_API_KEY', 'apiKeyHelper', 'apiKey.*valid', 'loadApiKey',
    'keyring',
  ],
  'auth/bedrock': [
    'Bedrock', 'BedrockRuntime', 'aws.*region', 'awsProfile', 'sigv4', 'awsCredentials',
  ],
  'auth/vertex': [
    'Vertex', 'vertex.*ai', 'google.*cloud', 'googleAuth', 'serviceAccount',
    'vertexProject',
  ],

  // mcp/*
  'mcp/client': [
    'McpClient', 'mcp.*connect', 'mcp.*initialize', 'mcpConnection', 'mcp_client',
    'connectMcp',
  ],
  'mcp/transport': [
    'StdioTransport', 'SseTransport', 'StreamableHttp', 'McpTransport', 'transport.*type',
    'transportLayer', 'stdio.*transport', 'websocket.*transport',
  ],
  'mcp/protocol': [
    'jsonrpc', 'tools/list', 'tools/call', 'resources/list', 'prompts/list', 'McpError',
    'mcp__', 'McpServer', 'mcp_server', 'callTool', 'listTools',
  ],
  'mcp/servers': [
    'mcpServers', 'serverConfig', 'serverList', 'registeredServers', 'spawnServer',
    'serverProcess',
  ],

  // config/*
  'config/settings': [
    'settings.*json', 'loadSettings', 'saveSettings', 'userSettings', 'Settings',
    'configuration', 'loadConfig', 'parseConfig',
  ],
  'config/env-vars': [
    'CLAUDE_CODE_', 'ANTHROPIC_', 'envVar', 'dotenv', 'loadEnv',
  ],
  'config/models': [
    'modelId', 'modelName', 'model.*select', 'mainLoopModel', 'availableModels',
    'modelOverrides', 'modelPreference', 'defaultModel',
  ],
  'config/feature-flags': [
    'featureFlag', 'isEnabled', 'flagValue', 'experimentId', 'feature.*gate', 'rollout',
    'featureEnabled', 'featureConfig',
  ],

  // telemetry/*
  'telemetry/otel': [
    'opentelemetry', 'OTEL_', 'TraceProvider', 'SpanProcessor', 'tracing', 'span',
    'tracer', 'otelExporter',
  ],
  'telemetry/datadog': [
    'datadog', 'DD_', 'ddTrace', 'datadogExporter',
  ],
  'telemetry/events': [
    'tengu_', 'trackEvent', 'analytics', 'Telemetry', 'sentry', 'eventEmit', 'emitEvent',
    'telemetryEvent',
  ],
  'telemetry/cost': [
    'cost', 'tokenUsage', 'inputTokens', 'outputTokens', 'cacheRead', 'cacheCreation',
    'pricing', 'costTracker', 'usageMetrics',
  ],
  'telemetry/perfetto': [
    'perfetto', 'perfTrace', 'traceBegin', 'traceEnd', 'traceCounter',
  ],

  // ui/*
  'ui/slash-commands': [
    'slashCommand', 'registerCommand', 'commandHandler', 'parseCommand', '/help',
    '/clear', '/compact', '/bug', '/init', '/login', '/logout', '/doctor', '/config',
    '/cost', '/memory',
  ],
  'ui/ink-components': [
    'useInput', 'useFocus', 'useApp', 'useStdin', 'useStdout', 'inkRenderer',
    'InkProvider', 'measureElement',
  ],
  'ui/keybindings': [
    'keybinding', 'keyHandler', 'hotkey', 'onKeyPress', 'keyMap', 'shortcut',
  ],
  'ui/terminal': [
    'ansiColor', 'chalk', 'stripAnsi', 'cursorMove', 'clearLine', 'terminalWidth',
    'isTerminal', 'ttyColumns',
  ],

  // model-provider/*
  'model-provider/anthropic': [
    'anthropic', 'Anthropic', 'claude-', 'claude_', 'messagesCreate', 'AnthropicClient',
  ],
  'model-provider/openai': [
    'openai', 'OpenAI', 'chatCompletion', 'gpt-', 'openAiClient',
  ],
  'model-provider/router': [
    'provider', 'routeModel', 'selectProvider', 'providerConfig', 'modelRouter',
  ],

  // git/*
  'git/operations': [
    'gitDiff', 'gitStatus', 'gitLog', 'gitCommit', 'gitAdd', 'gitBranch', 'gitCheckout',
    'isGitRepo', 'getGitRoot', 'gitStash',
  ],

  // filesystem/*
  'filesystem/operations': [
    'readdirSync', 'mkdirSync', 'statSync', 'lstatSync', 'renameSync', 'unlinkSync',
    'copyFileSync', 'existsSync', 'realpathSync', 'accessSync', 'fs.readdir', 'fs.mkdir',
    'fs.stat', 'fs.lstat',
  ],

  // network/*
  'network/http': [
    'http.*request', 'https.*request', 'fetch(', 'axios', 'got(', 'requestOptions',
    'responseBody', 'statusCode',
  ],
};
|
||||
|
||||
// String-literal patterns for minified code.
//
// Minifiers rename identifiers but leave string literals intact, so each entry
// here is a double-quoted literal (quotes included) commonly seen in that
// domain. Patterns are matched against the raw code, not just identifiers.
//
// NOTE(review): several entries look like prefixes yet include the closing
// quote (e.g. '"tengu_"', '"CLAUDE_CODE_"', '"claude-"'). If matching is a
// plain substring test they only hit that exact literal -- confirm intended.
const STRING_PATTERNS = {
  // tools/*
  'tools/bash': [
    '"bash"', '"shell"', '"command"', '"child_process"', '"spawn"', '"BashTool"',
  ],
  'tools/read': [
    '"FileReadTool"', '"ReadFileTool"', '"cat "', '"readFile"',
  ],
  'tools/edit': [
    '"FileEditTool"', '"old_string"', '"new_string"', '"EditFileTool"',
  ],
  'tools/write': [
    '"FileWriteTool"', '"WriteFileTool"', '"createFile"',
  ],
  'tools/glob': [
    '"GlobTool"', '"ListFilesTool"', '"glob"', '"minimatch"',
  ],
  'tools/grep': [
    '"GrepTool"', '"ripgrep"', '"rg "', '"SearchTool"',
  ],
  'tools/agent': [
    '"AgentTool"', '"Task"', '"subagent"',
  ],
  'tools/web-fetch': [
    '"WebFetchTool"', '"url_fetch"',
  ],
  'tools/web-search': [
    '"WebSearchTool"', '"web_search"',
  ],
  'tools/notebook': [
    '"NotebookEditTool"', '"ipynb"', '"jupyter"',
  ],
  'tools/mcp-dispatch': [
    '"inputSchema"', '"toolSchema"', '"toolDefinition"',
  ],
  'tools/todo': [
    '"TodoWriteTool"', '"TodoReadTool"',
  ],

  // core/*
  'core/agent-loop': [
    '"assistant"', '"user"', '"system"', '"systemPrompt"', '"messageHistory"',
  ],
  'core/streaming': [
    '"content_block_delta"', '"message_start"', '"message_stop"', '"message_delta"',
    '"content_block_start"', '"content_block_stop"', '"text_delta"',
    '"input_json_delta"', '"stream_event"',
  ],
  'core/context-manager': [
    '"tengu_compact"', '"auto_compact"', '"compact"', '"contextWindow"', '"maxTokens"',
    '"cacheControl"',
  ],
  'core/session': [
    '"sessionId"', '"conversationId"', '"checkpoint"', '"resume"',
  ],
  'core/error-handler': [
    '"overloaded"', '"rate_limit"', '"retryAfter"', '"APIError"',
  ],

  // permissions/*
  'permissions/checker': [
    '"canUseTool"', '"permission"', '"allowedTools"', '"permissionMode"',
    '"alwaysAllow"',
  ],
  'permissions/sandbox': [
    '"sandbox"', '"bubblewrap"', '"seatbelt"', '"firejail"',
  ],

  // auth/*
  'auth/oauth': [
    '"OAuth"', '"PKCE"', '"authorization_code"', '"refresh_token"', '"code_verifier"',
  ],
  'auth/api-key': [
    '"x-api-key"', '"ANTHROPIC_API_KEY"', '"apiKeyHelper"',
  ],
  'auth/bedrock': [
    '"bedrock"', '"BedrockRuntime"', '"aws-region"',
  ],
  'auth/vertex': [
    '"vertex"', '"vertexai"', '"google-cloud"',
  ],

  // mcp/*
  'mcp/client': [
    '"McpClient"', '"mcp_client"',
  ],
  'mcp/transport': [
    '"stdio"', '"sse"', '"streamable-http"', '"StdioTransport"',
  ],
  'mcp/protocol': [
    '"jsonrpc"', '"tools/list"', '"tools/call"', '"resources/list"', '"mcp__"',
  ],
  'mcp/servers': [
    '"mcpServers"', '"serverConfig"',
  ],

  // config/*
  'config/settings': [
    '"settings.json"', '"userSettings"', '".claude"',
  ],
  'config/env-vars': [
    '"CLAUDE_CODE_"', '"ANTHROPIC_"', '"CLAUDE_CONFIG"', '"CLAUDE_SKIP"',
  ],
  'config/models': [
    '"modelId"', '"claude-sonnet"', '"claude-opus"', '"claude-haiku"',
  ],
  'config/feature-flags': [
    '"featureFlag"', '"experiment"', '"rollout"',
  ],

  // telemetry/*
  'telemetry/otel': [
    '"opentelemetry"', '"OTEL_"', '"TraceProvider"',
  ],
  'telemetry/datadog': [
    '"datadog"', '"DD_TRACE"',
  ],
  'telemetry/events': [
    '"tengu_"', '"trackEvent"', '"analytics"', '"telemetryEvent"',
  ],
  'telemetry/cost': [
    '"inputTokens"', '"outputTokens"', '"cacheRead"', '"cacheCreation"',
  ],

  // ui/*
  'ui/slash-commands': [
    '"/help"', '"/clear"', '"/compact"', '"/bug"', '"/init"', '"/doctor"',
  ],
  'ui/ink-components': [
    '"useInput"', '"useFocus"', '"useApp"', '"inkRenderer"',
  ],
  'ui/keybindings': [
    '"keybinding"', '"shortcut"', '"hotkey"',
  ],
  'ui/terminal': [
    '"chalk"', '"stripAnsi"', '"ansiColor"',
  ],

  // model-provider/*
  'model-provider/anthropic': [
    '"anthropic"', '"claude-"', '"Anthropic"', '"messages"',
  ],
  'model-provider/openai': [
    '"openai"', '"gpt-"', '"chatCompletion"',
  ],

  // git/*
  'git/operations': [
    '"git diff"', '"git status"', '"git log"', '"git commit"',
  ],

  // network/*
  'network/http': [
    '"Content-Type"', '"application/json"', '"Authorization"',
  ],
};
|
||||
|
||||
// Legacy MODULE_KEYWORDS alias.
//
// Maps each old broad category name to the keyword array of one fine-grained
// subcategory, for backward compatibility. The arrays are shared by reference
// with SUBCATEGORIES, not copied.
const MODULE_KEYWORDS = {};
for (const [legacyName, subcategory] of [
  ['tool-dispatch', 'tools/mcp-dispatch'],
  ['permission-system', 'permissions/checker'],
  ['mcp-client', 'mcp/protocol'],
  ['streaming-handler', 'core/streaming'],
  ['context-manager', 'core/context-manager'],
  ['agent-loop', 'core/agent-loop'],
  ['commands', 'ui/slash-commands'],
  ['telemetry', 'telemetry/events'],
  ['config', 'config/settings'],
  ['session', 'core/session'],
  ['model-provider', 'model-provider/anthropic'],
]) {
  MODULE_KEYWORDS[legacyName] = SUBCATEGORIES[subcategory];
}
|
||||
|
||||
// Public API: the fine-grained keyword table, its legacy broad-category alias map, and the string-literal patterns.
module.exports = { SUBCATEGORIES, MODULE_KEYWORDS, STRING_PATTERNS };
|
||||
Loading…
Add table
Add a link
Reference in a new issue