ruvector/scripts/training/extract-sourcemap-pairs.mjs
rUv 36f2599774 feat(training): source map extraction + v2 model (83.67% val accuracy)
- Extract 14,198 training pairs from 6,941 source maps in node_modules
- Train v2 model (4-layer, 192-dim, 6-head transformer, 1.9M params)
- Val accuracy: 83.67% (up from 75.72%), exact match: 12.3% (up from 0.1%)
- Export weights.bin (7.3MB) for Rust runtime inference
- Add decompiler dashboard (React + Tailwind + Vite)
- Add runnable RVF (7,350 vectors, 49 segments, witness chain)
- Update evaluate-model.py to support configurable model architectures
- All 13 Rust tests pass, all 45 RVF files have valid SFVR headers

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-04-03 04:57:47 +00:00

425 lines
13 KiB
JavaScript

#!/usr/bin/env node
/**
* Extract training pairs from source maps in node_modules.
*
* Source maps contain `names` arrays with original identifiers and VLQ-encoded
* mappings that tell us exactly which minified name maps to which original.
*
* For each source map:
* 1. Parse the .js.map JSON
* 2. Decode VLQ mappings to get (line, col, nameIdx) tuples
* 3. Read the corresponding .js file
* 4. Extract the minified identifier at each (line, col) position
* 5. Extract surrounding context (string literals, property accesses)
* 6. Output (minified, original, context, properties, kind) pairs
*
* Usage:
* node scripts/training/extract-sourcemap-pairs.mjs [--output training-data-sourcemaps.jsonl]
*/
import { readFileSync, writeFileSync, existsSync } from "fs";
import { resolve, dirname, basename, join } from "path";
import { execSync } from "child_process";
import { parseArgs } from "util";
const { values: args } = parseArgs({
options: {
output: { type: "string", default: "training-data-sourcemaps.jsonl" },
help: { type: "boolean", short: "h", default: false },
},
});
if (args.help) {
console.log("Usage: extract-sourcemap-pairs.mjs [--output FILE]");
process.exit(0);
}
const OUTPUT_PATH = resolve(args.output);
const ROOT = resolve(import.meta.dirname, "../..");
// ---------------------------------------------------------------------------
// VLQ Decoder
// ---------------------------------------------------------------------------
// Each base64 digit carries 5 payload bits; the 6th bit flags "more digits follow".
const VLQ_BASE_SHIFT = 5;
const VLQ_BASE = 1 << VLQ_BASE_SHIFT;
const VLQ_BASE_MASK = VLQ_BASE - 1;
const VLQ_CONTINUATION_BIT = VLQ_BASE;
const BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// Lookup table: base64 character -> its 6-bit value (the character's index in BASE64_CHARS).
const BASE64_MAP = new Map([...BASE64_CHARS].map((ch, value) => [ch, value]));

/**
 * Decode one base64 VLQ number from `str`, starting at index `pos`.
 *
 * Reading stops after the first digit without the continuation bit, at the end
 * of the string, or at the first character that is not a base64 digit (that
 * character is consumed but contributes nothing).
 *
 * @param {string} str - Text containing the VLQ digits.
 * @param {number} pos - Index of the first digit to read.
 * @returns {{ value: number, pos: number }} The signed value and the index just
 *   past the last character consumed.
 */
function decodeVLQ(str, pos) {
  let accumulated = 0;
  let cursor = pos;
  for (let shift = 0; cursor < str.length; shift += VLQ_BASE_SHIFT) {
    const sextet = BASE64_MAP.get(str[cursor]);
    cursor += 1;
    if (sextet === undefined) break;
    accumulated += (sextet & VLQ_BASE_MASK) << shift;
    if ((sextet & VLQ_CONTINUATION_BIT) === 0) break;
  }
  // The lowest bit is the sign flag; the remaining bits are the magnitude.
  const isNegative = (accumulated & 1) === 1;
  const magnitude = accumulated >> 1;
  return { value: isNegative ? -magnitude : magnitude, pos: cursor };
}
/**
 * Decode a source map `mappings` string into per-line segment arrays.
 *
 * Lines are separated by ";" and segments by ",". Every field is stored as a
 * delta; the generated column restarts at 0 on each new line while the other
 * four running totals carry across lines.
 *
 * @param {string} mappingsStr - The raw `mappings` value of a source map.
 * @returns {number[][][]} One array per generated line; each segment is
 *   [genCol] or [genCol, sourceIdx, sourceLine, sourceCol] with an optional
 *   trailing nameIdx, all as absolute values.
 */
function decodeMappings(mappingsStr) {
  const decodedLines = [];
  let segments = [];
  const running = { genCol: 0, source: 0, srcLine: 0, srcCol: 0, name: 0 };
  let cursor = 0;

  // True when the cursor sits on a separator or has run off the end of the input.
  const atSegmentEnd = () =>
    cursor >= mappingsStr.length || mappingsStr[cursor] === "," || mappingsStr[cursor] === ";";

  // Read the next VLQ field and advance the cursor past it.
  const readDelta = () => {
    const { value, pos } = decodeVLQ(mappingsStr, cursor);
    cursor = pos;
    return value;
  };

  while (cursor < mappingsStr.length) {
    switch (mappingsStr[cursor]) {
      case ";":
        // End of a generated line: flush it and restart the column counter.
        decodedLines.push(segments);
        segments = [];
        running.genCol = 0;
        cursor += 1;
        break;
      case ",":
        cursor += 1;
        break;
      default: {
        running.genCol += readDelta();
        const segment = [running.genCol];
        if (!atSegmentEnd()) {
          running.source += readDelta();
          running.srcLine += readDelta();
          running.srcCol += readDelta();
          segment.push(running.source, running.srcLine, running.srcCol);
          // The fifth field (name index) is optional.
          if (!atSegmentEnd()) {
            running.name += readDelta();
            segment.push(running.name);
          }
        }
        segments.push(segment);
      }
    }
  }

  // A trailing line without a closing ";" is kept only if it has segments.
  if (segments.length > 0) {
    decodedLines.push(segments);
  }
  return decodedLines;
}
// ---------------------------------------------------------------------------
// Identifier extraction from minified JS at a given position
// ---------------------------------------------------------------------------
// Matches a JavaScript-style identifier (ASCII letters, digits, "_" and "$")
// anchored at the start of the string.
const IDENT_RE = /^[a-zA-Z_$][a-zA-Z0-9_$]*/;

/**
 * Read the identifier that begins exactly at (lineIdx, colIdx).
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - Zero-based line index.
 * @param {number} colIdx - Zero-based column index within that line.
 * @returns {string|null} The identifier text, or null when the position is
 *   outside the file or no identifier starts there.
 */
function extractIdentifierAt(lines, lineIdx, colIdx) {
  if (lineIdx >= lines.length || colIdx >= lines[lineIdx].length) {
    return null;
  }
  const found = IDENT_RE.exec(lines[lineIdx].substring(colIdx));
  return found === null ? null : found[0];
}
// ---------------------------------------------------------------------------
// Context extraction
// ---------------------------------------------------------------------------
/**
 * Collect nearby string literals and property names as context hints.
 *
 * Scans `windowLines` lines on each side of `lineIdx` (clamped to the file).
 * On every line, quoted strings are gathered first, then `.property` names.
 * Duplicates are dropped and at most 10 entries are returned, in order of
 * first appearance.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - Zero-based line the identifier was found on.
 * @param {number} colIdx - Accepted for signature symmetry; not used here.
 * @param {number} [windowLines=3] - Lines to scan before and after `lineIdx`.
 * @returns {string[]} Up to 10 unique context strings.
 */
function extractContext(lines, lineIdx, colIdx, windowLines = 3) {
  const stringLiteral = /["']([a-zA-Z][a-zA-Z0-9_./-]{2,})["']/g;
  const propertyAccess = /\.([a-zA-Z_$][a-zA-Z0-9_$]{2,})/g;
  const unique = new Set();
  const first = Math.max(0, lineIdx - windowLines);
  const last = Math.min(lines.length, lineIdx + windowLines + 1);

  for (let row = first; row < last; row += 1) {
    const text = lines[row];
    // Quoted strings of 3..29 characters.
    for (const [, literal] of text.matchAll(stringLiteral)) {
      if (literal.length < 30) unique.add(literal);
    }
    // Property names of 3..24 characters.
    for (const [, prop] of text.matchAll(propertyAccess)) {
      if (prop.length < 25) unique.add(prop);
    }
  }
  return [...unique].slice(0, 10);
}
/**
 * Collect the property names accessed on `identifier` near `lineIdx`.
 *
 * Scans `windowLines` lines on each side of `lineIdx` (clamped to the file) for
 * `identifier.prop` and returns up to 8 unique property names of 2..24
 * characters, in order of first appearance.
 *
 * The identifier must not be preceded by another identifier character. A plain
 * `\b` anchor is not enough for JavaScript: "$" is a valid identifier character
 * but not a regex word character, so `\b` would never match `$.ajax` and would
 * wrongly match `a` inside `$a.foo`.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - Zero-based line the identifier was found on.
 * @param {string} identifier - Identifier whose property accesses are wanted.
 * @param {number} [windowLines=5] - Lines to scan before and after `lineIdx`.
 * @returns {string[]} Up to 8 unique property names.
 */
function extractProperties(lines, lineIdx, identifier, windowLines = 5) {
  const props = new Set();
  const startLine = Math.max(0, lineIdx - windowLines);
  const endLine = Math.min(lines.length, lineIdx + windowLines + 1);
  // Built once: the pattern does not depend on the line being scanned.
  const accessRe = new RegExp(
    `(?<![a-zA-Z0-9_$])${escapeRegex(identifier)}\\.([a-zA-Z_$][a-zA-Z0-9_$]{1,})`,
    "g",
  );
  for (let i = startLine; i < endLine; i++) {
    for (const [, prop] of lines[i].matchAll(accessRe)) {
      if (prop.length < 25) props.add(prop);
    }
  }
  return [...props].slice(0, 8);
}

/**
 * Escape every regular-expression metacharacter in `s` so the result matches
 * `s` literally when embedded in a RegExp source.
 *
 * @param {string} s - Text to escape.
 * @returns {string} The escaped text.
 */
function escapeRegex(s) {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
// ---------------------------------------------------------------------------
// Determine kind from context
// ---------------------------------------------------------------------------
/**
 * Guess whether the binding at (lineIdx, colIdx) is a function, class or var.
 *
 * Heuristics, in order:
 *  1. the text just before the position ends in `function`, `class`, or
 *     `const` / `let` / `var`;
 *  2. `identifier` starts with an uppercase letter followed by a lowercase one
 *     (treated as a class);
 *  3. the text just after the token starts with "(", "=function" or "=async"
 *     (treated as a function).
 *
 * `identifier` may be the original (pre-minification) name, whose length
 * differs from the token that is actually in the line. The "after" text is
 * therefore measured from the end of the token found at `colIdx`, and
 * `identifier.length` is only a fallback when no identifier starts there.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - Zero-based line index of the token.
 * @param {number} colIdx - Zero-based column index of the token.
 * @param {string} identifier - Name used for the capitalisation heuristic.
 * @returns {"function"|"class"|"var"} The inferred kind; "var" when nothing matches
 *   or the line does not exist.
 */
function inferKind(lines, lineIdx, colIdx, identifier) {
  if (lineIdx >= lines.length) return "var";
  const line = lines[lineIdx];

  // Check what precedes the identifier
  const before = line.substring(0, colIdx).trimEnd();
  if (/\bfunction\s*$/.test(before)) return "function";
  if (/\bclass\s*$/.test(before)) return "class";
  if (/\b(?:const|let|var)\s*$/.test(before)) return "var";

  // Check if identifier starts with uppercase (likely class)
  if (/^[A-Z][a-z]/.test(identifier)) return "class";

  // Skip over the token that is really present in the generated line.
  const token = line.substring(colIdx).match(/^[a-zA-Z_$][a-zA-Z0-9_$]*/);
  const tokenLength = token ? token[0].length : identifier.length;

  // Check if followed by ( → likely function
  const after = line.substring(colIdx + tokenLength).trimStart();
  if (after.startsWith("(") || after.startsWith("=function") || after.startsWith("=async")) {
    return "function";
  }
  return "var";
}
// ---------------------------------------------------------------------------
// Process a single source map
// ---------------------------------------------------------------------------
/**
 * Extract (minified, original) rename pairs from one source map file.
 *
 * The generated file is expected next to the map, at the same path without the
 * trailing ".map". For every mapping segment that carries a name index, the
 * identifier found at that generated position is paired with the original
 * name. A pair is skipped when the original name is shorter than 3 characters
 * or listed in SKIP_NAMES, when no identifier starts at the position, when the
 * two names are equal, when the generated identifier is longer than 6
 * characters, or when the same pair was already seen in this file.
 *
 * This is best-effort: an unreadable or malformed map (including JSON that
 * parses to something other than an object), a missing generated file, or a
 * failure while decoding all yield an empty result instead of throwing.
 *
 * @param {string} mapPath - Path to a `.js.map` file.
 * @returns {Array<{minified: string, original: string, context_strings: string[],
 *   properties: string[], kind: string}>} The extracted pairs, possibly empty.
 */
function processSourceMap(mapPath) {
  const pairs = [];
  let mapJson;
  try {
    mapJson = JSON.parse(readFileSync(mapPath, "utf8"));
  } catch {
    return pairs;
  }
  // JSON.parse may legitimately return null or a primitive; reading `.names`
  // from null would throw and abort the whole run.
  if (mapJson === null || typeof mapJson !== "object") return pairs;
  const names = Array.isArray(mapJson.names) ? mapJson.names : [];
  const mappings = mapJson.mappings;
  if (names.length === 0 || typeof mappings !== "string" || mappings.length === 0) {
    return pairs;
  }
  // Find the corresponding JS file
  const jsPath = mapPath.replace(/\.map$/, "");
  if (!existsSync(jsPath)) return pairs;
  let jsContent;
  try {
    jsContent = readFileSync(jsPath, "utf8");
  } catch {
    return pairs;
  }
  const jsLines = jsContent.split("\n");
  // Decode mappings
  let decodedLines;
  try {
    decodedLines = decodeMappings(mappings);
  } catch {
    return pairs;
  }
  // Process each segment that has a name index
  const seen = new Set();
  for (let lineIdx = 0; lineIdx < decodedLines.length; lineIdx++) {
    for (const seg of decodedLines[lineIdx]) {
      if (seg.length < 5) continue; // No name index
      const genCol = seg[0];
      const nameIdx = seg[4];
      if (nameIdx < 0 || nameIdx >= names.length) continue;
      const originalName = names[nameIdx];
      // Non-string entries cannot be identifiers; drop them with the short names.
      if (typeof originalName !== "string" || originalName.length < 3) continue;
      // Skip common keywords
      if (SKIP_NAMES.has(originalName)) continue;
      // Extract the minified identifier at this position
      const minified = extractIdentifierAt(jsLines, lineIdx, genCol);
      if (!minified) continue;
      // Skip if minified === original (no renaming happened)
      if (minified === originalName) continue;
      // Skip if minified is too long (probably not actually minified)
      if (minified.length > 6) continue;
      // Deduplicate
      const key = `${minified}|${originalName}`;
      if (seen.has(key)) continue;
      seen.add(key);
      // Extract context and properties
      pairs.push({
        minified,
        original: originalName,
        context_strings: extractContext(jsLines, lineIdx, genCol),
        properties: extractProperties(jsLines, lineIdx, minified),
        kind: inferKind(jsLines, lineIdx, genCol, originalName),
      });
    }
  }
  return pairs;
}
// Original names that are never emitted as training targets, grouped by category.
const SKIP_NAMES = new Set(
  [
    // Keywords, literals and other reserved-looking words
    [
      "if", "else", "for", "while", "do", "switch", "case", "break",
      "continue", "return", "try", "catch", "finally", "throw", "new",
      "delete", "typeof", "void", "instanceof", "in", "of", "with",
      "this", "super", "true", "false", "null", "undefined", "NaN",
      "Infinity", "arguments", "eval", "constructor", "prototype",
    ],
    // Directive and module-system words
    ["use", "strict", "exports", "module", "require"],
    // Built-in constructors and namespaces
    [
      "Object", "Array", "String", "Number", "Boolean", "Function",
      "Symbol", "BigInt", "Map", "Set", "WeakMap", "WeakSet",
      "Promise", "Error", "TypeError", "RangeError", "SyntaxError",
      "Math", "Date", "JSON", "RegExp", "Proxy", "Reflect",
    ],
    // Host globals
    [
      "console", "document", "window", "global", "globalThis",
      "process", "Buffer", "setTimeout", "setInterval", "clearTimeout",
    ],
    // Ubiquitous property and method names
    [
      "length", "push", "pop", "shift", "unshift",
      "call", "apply", "bind", "toString", "valueOf",
      "hasOwnProperty", "propertyIsEnumerable", "isPrototypeOf",
      "__proto__", "__defineGetter__", "__defineSetter__",
    ],
  ].flat(),
);
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
console.log("=== Extracting training pairs from source maps ===\n");

// Find all source map files larger than 1 KiB, ignoring packages nested inside
// another package's node_modules. The search root is single-quoted (embedded
// quotes escaped) so a checkout path with spaces or shell metacharacters
// reaches `find` verbatim.
const nodeModulesDir = join(ROOT, "node_modules");
const quotedDir = `'${nodeModulesDir.replaceAll("'", "'\\''")}'`;
const findCmd = `find ${quotedDir} -name "*.js.map" -not -path "*/node_modules/*/node_modules/*" -size +1k 2>/dev/null`;
// Raise maxBuffer explicitly: the listing for a large dependency tree can
// exceed execSync's default output limit, which would make the call throw.
const mapFiles = execSync(findCmd, { encoding: "utf8", maxBuffer: 64 * 1024 * 1024 })
  .trim()
  .split("\n")
  .filter(Boolean);
console.log(`Found ${mapFiles.length} source map files\n`);

let totalPairs = 0;
let filesWithPairs = 0;
const allPairs = [];
for (const [index, mapFile] of mapFiles.entries()) {
  const pairs = processSourceMap(mapFile);
  if (pairs.length > 0) {
    // Append one by one: spreading a very large array into push() can exceed
    // the engine's argument limit.
    for (const pair of pairs) {
      allPairs.push(pair);
    }
    totalPairs += pairs.length;
    filesWithPairs++;
    // Only call out the files that contributed a meaningful number of pairs.
    if (pairs.length >= 10) {
      const rel = mapFile.replace(ROOT + "/node_modules/", "");
      console.log(` [${pairs.length} pairs] ${rel}`);
    }
  }
  // Progress every 500 files
  if ((index + 1) % 500 === 0) {
    console.log(` ... processed ${index + 1}/${mapFiles.length} files, ${totalPairs} pairs so far`);
  }
}
console.log(`\nProcessed ${mapFiles.length} files`);
console.log(`Files with pairs: ${filesWithPairs}`);
console.log(`Total pairs: ${totalPairs}`);

// Deduplicate globally: the first occurrence of each minified|original pair wins.
const globalSeen = new Set();
const deduped = allPairs.filter((p) => {
  const key = `${p.minified}|${p.original}`;
  if (globalSeen.has(key)) return false;
  globalSeen.add(key);
  return true;
});
console.log(`After dedup: ${deduped.length} unique pairs`);

// Shuffle in place (Fisher-Yates). Math.random is unseeded, so the output order
// differs from run to run.
for (let i = deduped.length - 1; i > 0; i--) {
  const j = Math.floor(Math.random() * (i + 1));
  [deduped[i], deduped[j]] = [deduped[j], deduped[i]];
}

// Write one JSON object per line. With no pairs the file is left empty rather
// than containing a single blank line, which is not a valid JSONL record.
const jsonl = deduped.map((p) => JSON.stringify(p)).join("\n");
writeFileSync(OUTPUT_PATH, deduped.length > 0 ? `${jsonl}\n` : "", "utf8");
console.log(`\nWrote ${deduped.length} pairs to ${OUTPUT_PATH}`);

// Stats
const kindCounts = {};
for (const p of deduped) {
  kindCounts[p.kind] = (kindCounts[p.kind] || 0) + 1;
}
console.log("\nBreakdown by kind:");
for (const [kind, count] of Object.entries(kindCounts).sort((a, b) => b[1] - a[1])) {
  console.log(` ${kind}: ${count}`);
}
// Math.max guards the division when nothing was extracted.
const avgCtx = deduped.reduce((s, p) => s + p.context_strings.length, 0) / Math.max(deduped.length, 1);
console.log(`\nAvg context strings: ${avgCtx.toFixed(1)}`);