mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-22 19:56:25 +00:00
- Extract 14,198 training pairs from 6,941 source maps in node_modules
- Train v2 model (4-layer, 192-dim, 6-head transformer, 1.9M params)
- Val accuracy: 83.67% (up from 75.72%), exact match: 12.3% (up from 0.1%)
- Export weights.bin (7.3MB) for Rust runtime inference
- Add decompiler dashboard (React + Tailwind + Vite)
- Add runnable RVF (7,350 vectors, 49 segments, witness chain)
- Update evaluate-model.py to support configurable model architectures
- All 13 Rust tests pass, all 45 RVF files have valid SFVR headers

Co-Authored-By: claude-flow <ruv@ruv.net>
425 lines
13 KiB
JavaScript
425 lines
13 KiB
JavaScript
#!/usr/bin/env node
/**
 * Extract training pairs from source maps in node_modules.
 *
 * Source maps contain `names` arrays with original identifiers and VLQ-encoded
 * mappings that tell us exactly which minified name maps to which original.
 *
 * For each source map:
 *   1. Parse the .js.map JSON
 *   2. Decode VLQ mappings to get (line, col, nameIdx) tuples
 *   3. Read the corresponding .js file
 *   4. Extract the minified identifier at each (line, col) position
 *   5. Extract surrounding context (string literals, property accesses)
 *   6. Output (minified, original, context, properties, kind) pairs
 *
 * Usage:
 *   node scripts/training/extract-sourcemap-pairs.mjs [--output training-data-sourcemaps.jsonl]
 */

import { execFileSync, execSync } from "child_process";
import { readFileSync, writeFileSync, existsSync } from "fs";
import { resolve, dirname, basename, join } from "path";
import { fileURLToPath } from "url";
import { parseArgs } from "util";

// ---------------------------------------------------------------------------
// CLI options and paths
// ---------------------------------------------------------------------------

// Supported flags: --output FILE (defaults to training-data-sourcemaps.jsonl)
// and --help / -h.
const { values: args } = parseArgs({
  options: {
    output: { type: "string", default: "training-data-sourcemaps.jsonl" },
    help: { type: "boolean", short: "h", default: false },
  },
});

if (args.help) {
  console.log("Usage: extract-sourcemap-pairs.mjs [--output FILE]");
  process.exit(0);
}

// Output file, resolved against the current working directory.
const OUTPUT_PATH = resolve(args.output);

// Two directory levels above this script. `import.meta.dirname` is only
// defined on newer Node releases; on older ones it is undefined and
// resolve() would throw, so fall back to deriving the directory from
// `import.meta.url`.
const ROOT = resolve(
  import.meta.dirname ?? dirname(fileURLToPath(import.meta.url)),
  "../..",
);

// ---------------------------------------------------------------------------
// VLQ Decoder
// ---------------------------------------------------------------------------

// Each Base64 digit carries 5 payload bits; the 6th bit flags "more digits follow".
const VLQ_BASE_SHIFT = 5;
const VLQ_BASE = 1 << VLQ_BASE_SHIFT;
const VLQ_BASE_MASK = VLQ_BASE - 1;
const VLQ_CONTINUATION_BIT = VLQ_BASE;

const BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Reverse lookup: Base64 character -> 6-bit digit value.
const BASE64_MAP = new Map();
for (let i = 0; i < BASE64_CHARS.length; i++) {
  BASE64_MAP.set(BASE64_CHARS[i], i);
}

/**
 * Decode a single Base64 VLQ value from `str`, starting at index `pos`.
 *
 * @param {string} str - Text containing the encoded value.
 * @param {number} pos - Index of the first digit of the value.
 * @returns {{ value: number, pos: number }} The signed value and the index of
 *   the first character after it.
 * @throws {Error} If a character is not a Base64 digit, or the string ends
 *   while the continuation bit is still set. Failing loudly keeps a corrupt
 *   map from being decoded into plausible-looking garbage.
 */
function decodeVLQ(str, pos) {
  let result = 0;
  let shift = 0;
  let continuation = true;

  while (continuation) {
    if (pos >= str.length) {
      throw new Error(`Truncated VLQ value at offset ${pos}`);
    }
    const digit = BASE64_MAP.get(str[pos]);
    if (digit === undefined) {
      throw new Error(
        `Invalid Base64 VLQ character ${JSON.stringify(str[pos])} at offset ${pos}`,
      );
    }
    pos++;
    continuation = (digit & VLQ_CONTINUATION_BIT) !== 0;
    result += (digit & VLQ_BASE_MASK) << shift;
    shift += VLQ_BASE_SHIFT;
  }

  // Sign is in the least significant bit
  const negate = result & 1;
  result >>= 1;
  return { value: negate ? -result : result, pos };
}

/**
 * Decode a source map `mappings` string into one array of segments per
 * generated line (`;` separates lines, `,` separates segments).
 *
 * Each segment is [genCol], [genCol, sourceIdx, sourceLine, sourceCol] or
 * [genCol, sourceIdx, sourceLine, sourceCol, nameIdx]. All values are
 * absolute: the relative deltas in the encoding are accumulated here, with
 * the generated column restarting at 0 on every new line.
 *
 * @param {string} mappingsStr - The `mappings` field of a source map.
 * @returns {number[][][]} Segments grouped by generated line. A trailing line
 *   with no segments is not included.
 * @throws {Error} If a segment contains an invalid character or does not have
 *   1, 4 or 5 fields.
 */
function decodeMappings(mappingsStr) {
  const lines = [];
  let currentLine = [];

  let generatedColumn = 0;
  let sourceIndex = 0;
  let sourceLine = 0;
  let sourceColumn = 0;
  let nameIndex = 0;

  let pos = 0;

  // True when `pos` sits on a separator or past the end of the string.
  const atSegmentEnd = () =>
    pos >= mappingsStr.length || mappingsStr[pos] === "," || mappingsStr[pos] === ";";

  while (pos < mappingsStr.length) {
    const ch = mappingsStr[pos];

    if (ch === ";") {
      lines.push(currentLine);
      currentLine = [];
      generatedColumn = 0;
      pos++;
      continue;
    }

    if (ch === ",") {
      pos++;
      continue;
    }

    const segment = [];

    // Field 1: generated column (relative)
    let decoded = decodeVLQ(mappingsStr, pos);
    generatedColumn += decoded.value;
    segment.push(generatedColumn);
    pos = decoded.pos;

    if (!atSegmentEnd()) {
      // Field 2: source index (relative)
      decoded = decodeVLQ(mappingsStr, pos);
      sourceIndex += decoded.value;
      segment.push(sourceIndex);
      pos = decoded.pos;

      // Field 3: source line (relative). A separator here means the segment
      // is malformed, and decodeVLQ rejects it as an invalid character.
      decoded = decodeVLQ(mappingsStr, pos);
      sourceLine += decoded.value;
      segment.push(sourceLine);
      pos = decoded.pos;

      // Field 4: source column (relative)
      decoded = decodeVLQ(mappingsStr, pos);
      sourceColumn += decoded.value;
      segment.push(sourceColumn);
      pos = decoded.pos;

      // Field 5: name index (optional, relative)
      if (!atSegmentEnd()) {
        decoded = decodeVLQ(mappingsStr, pos);
        nameIndex += decoded.value;
        segment.push(nameIndex);
        pos = decoded.pos;
      }
    }

    if (!atSegmentEnd()) {
      throw new Error(`Mapping segment has more than 5 fields at offset ${pos}`);
    }

    currentLine.push(segment);
  }

  if (currentLine.length > 0) {
    lines.push(currentLine);
  }

  return lines;
}

// ---------------------------------------------------------------------------
// Identifier extraction from minified JS at a given position
// ---------------------------------------------------------------------------

// ASCII identifier anchored at the start of the tested string.
const IDENT_RE = /^[a-zA-Z_$][a-zA-Z0-9_$]*/;

/**
 * Return the identifier that starts exactly at (lineIdx, colIdx).
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - Zero-based line index.
 * @param {number} colIdx - Zero-based column index within that line.
 * @returns {string|null} The identifier text, or null when the position is
 *   outside the file or the character there cannot start an identifier.
 *   The match begins at colIdx even if that falls in the middle of a longer
 *   identifier.
 */
function extractIdentifierAt(lines, lineIdx, colIdx) {
  // Negative positions can come out of malformed relative mappings. Without
  // this guard substring() would clamp a negative column to 0 and return the
  // identifier at the start of the line instead.
  if (lineIdx < 0 || lineIdx >= lines.length) return null;
  const line = lines[lineIdx];
  if (colIdx < 0 || colIdx >= line.length) return null;

  const match = IDENT_RE.exec(line.substring(colIdx));
  return match ? match[0] : null;
}

// ---------------------------------------------------------------------------
// Context extraction
// ---------------------------------------------------------------------------

/**
 * Collect up to 10 distinct context tokens from the lines around `lineIdx`.
 *
 * For each line in the window, quoted strings are gathered first (starting
 * with a letter, at least 3 characters from [a-zA-Z0-9_./-], shorter than 30),
 * then `.property` accesses (at least 3 characters, shorter than 25).
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - Line the identifier is on.
 * @param {number} colIdx - Column of the identifier. Currently unused: whole
 *   lines are scanned, so every position on the same line yields the same
 *   result.
 * @param {number} [windowLines=3] - Lines to include before and after.
 * @returns {string[]} Unique tokens in order of first appearance.
 */
function extractContext(lines, lineIdx, colIdx, windowLines = 3) {
  const MAX_ITEMS = 10;
  const stringLiteralRe = /["']([a-zA-Z][a-zA-Z0-9_./-]{2,})["']/g;
  const propertyAccessRe = /\.([a-zA-Z_$][a-zA-Z0-9_$]{2,})/g;

  const found = new Set();
  const startLine = Math.max(0, lineIdx - windowLines);
  const endLine = Math.min(lines.length, lineIdx + windowLines + 1);

  // Adds capture group 1 of each match; returns true once the cap is reached.
  // matchAll is lazy, so returning early stops scanning the rest of the line.
  // That matters for single-line bundles, where the window is the entire file.
  const collect = (line, re, maxLength) => {
    for (const match of line.matchAll(re)) {
      if (match[1].length >= maxLength) continue;
      found.add(match[1]);
      if (found.size >= MAX_ITEMS) return true;
    }
    return false;
  };

  for (let i = startLine; i < endLine; i++) {
    const line = lines[i];
    if (collect(line, stringLiteralRe, 30) || collect(line, propertyAccessRe, 25)) {
      break;
    }
  }

  return [...found];
}

/**
 * Collect up to 8 distinct property names accessed on `identifier`
 * (`identifier.prop`) in the lines around `lineIdx`.
 *
 * A property name needs at least 2 characters and fewer than 25.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - Line the identifier is on.
 * @param {string} identifier - Identifier whose member accesses are wanted.
 * @param {number} [windowLines=5] - Lines to include before and after.
 * @returns {string[]} Unique property names in order of first appearance.
 */
function extractProperties(lines, lineIdx, identifier, windowLines = 5) {
  const MAX_PROPS = 8;
  const props = new Set();
  const startLine = Math.max(0, lineIdx - windowLines);
  const endLine = Math.min(lines.length, lineIdx + windowLines + 1);

  // Built once per call, not once per line. The lookbehind replaces `\b`,
  // which treats `$` as a non-word character: `\b` never matched `$e` after
  // punctuation, and it wrongly matched `e` inside `$e.foo`.
  const accessRe = new RegExp(
    `(?<![a-zA-Z0-9_$])${escapeRegex(identifier)}\\.([a-zA-Z_$][a-zA-Z0-9_$]{1,})`,
    "g",
  );

  for (let i = startLine; i < endLine && props.size < MAX_PROPS; i++) {
    for (const match of lines[i].matchAll(accessRe)) {
      if (match[1].length >= 25) continue;
      props.add(match[1]);
      // Stop as soon as the cap is hit, without scanning the rest of the line.
      if (props.size >= MAX_PROPS) break;
    }
  }

  return [...props];
}

/**
 * Escape every regular-expression metacharacter in `s` so it can be embedded
 * in a RegExp source and match literally.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(s) {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

// ---------------------------------------------------------------------------
// Determine kind from context
// ---------------------------------------------------------------------------

/**
 * Guess whether the name at (lineIdx, colIdx) denotes a "function", a
 * "class" or a "var", using the text immediately around it.
 *
 * Checks run in this order:
 *   1. a preceding `function`, `class`, or `const`/`let`/`var` keyword,
 *   2. `identifier` starting with an uppercase letter followed by a lowercase
 *      one is treated as a class,
 *   3. a following `(`, `=function` or `=async` is treated as a function.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - Line of the identifier.
 * @param {number} colIdx - Column where the identifier starts.
 * @param {string} identifier - Name used for the capitalisation heuristic.
 *   It does not have to be the text present in the line; processSourceMap
 *   passes the original, un-minified name.
 * @returns {"function"|"class"|"var"} The inferred kind; "var" is the fallback.
 */
function inferKind(lines, lineIdx, colIdx, identifier) {
  if (lineIdx >= lines.length) return "var";
  const line = lines[lineIdx];

  // Check what precedes the identifier
  const before = line.substring(0, colIdx).trimEnd();
  if (/\bfunction\s*$/.test(before)) return "function";
  if (/\bclass\s*$/.test(before)) return "class";
  if (/\b(?:const|let|var)\s*$/.test(before)) return "var";

  // Check if identifier starts with uppercase (likely class)
  if (/^[A-Z][a-z]/.test(identifier)) return "class";

  // Measure the token that is really in the line at colIdx. `identifier` may
  // be a different, longer name, and skipping identifier.length characters
  // would land past the `(` or `=` that this check needs to see.
  const tokenAtCol = /^[a-zA-Z_$][a-zA-Z0-9_$]*/.exec(line.substring(colIdx));
  const tokenLength = tokenAtCol ? tokenAtCol[0].length : identifier.length;

  // Check if followed by ( -> likely function
  const after = line.substring(colIdx + tokenLength).trimStart();
  if (after.startsWith("(") || after.startsWith("=function") || after.startsWith("=async")) {
    return "function";
  }

  return "var";
}

// ---------------------------------------------------------------------------
// Process a single source map
// ---------------------------------------------------------------------------

/**
 * Extract (minified, original) training pairs from one source map.
 *
 * The generated file is assumed to sit next to the map, at the map's path
 * with the trailing `.map` removed. Every failure (unreadable or malformed
 * map, missing generated file, undecodable mappings) is treated as "nothing
 * to extract": the function returns the pairs gathered so far, which is an
 * empty array in all of those cases, and does not throw.
 *
 * A mapping segment yields a pair only when it carries a name index, the
 * original name is a string of at least 3 characters not listed in
 * SKIP_NAMES, and the identifier found at the generated position differs
 * from the original and is at most 6 characters long. Pairs are de-duplicated
 * per file on `minified|original`.
 *
 * @param {string} mapPath - Path to a `.js.map` file.
 * @returns {Array<{minified: string, original: string, context_strings: string[], properties: string[], kind: string}>}
 */
function processSourceMap(mapPath) {
  const pairs = [];

  let mapJson;
  try {
    mapJson = JSON.parse(readFileSync(mapPath, "utf8"));
  } catch {
    // Unreadable file or invalid JSON: skip this map.
    return pairs;
  }

  // JSON.parse accepts `null`, numbers and strings too. Reading `.names` on
  // those would throw outside any try block and abort the whole run.
  if (mapJson === null || typeof mapJson !== "object") return pairs;

  const names = Array.isArray(mapJson.names) ? mapJson.names : [];
  const mappings = mapJson.mappings;
  if (names.length === 0 || typeof mappings !== "string" || mappings.length === 0) {
    return pairs;
  }

  // Find the corresponding JS file
  const jsPath = mapPath.replace(/\.map$/, "");
  if (!existsSync(jsPath)) return pairs;

  let jsContent;
  try {
    jsContent = readFileSync(jsPath, "utf8");
  } catch {
    return pairs;
  }

  const jsLines = jsContent.split("\n");

  // Decode mappings
  let decodedLines;
  try {
    decodedLines = decodeMappings(mappings);
  } catch {
    return pairs;
  }

  // Process each segment that has a name index
  const seen = new Set();

  for (let lineIdx = 0; lineIdx < decodedLines.length; lineIdx++) {
    for (const seg of decodedLines[lineIdx]) {
      if (seg.length < 5) continue; // No name index

      const genCol = seg[0];
      const nameIdx = seg[4];
      if (nameIdx < 0 || nameIdx >= names.length) continue;

      const originalName = names[nameIdx];
      // Non-string entries would otherwise slip past the length check and be
      // written out as the `original` of a pair.
      if (typeof originalName !== "string" || originalName.length < 3) continue;

      // Skip common keywords
      if (SKIP_NAMES.has(originalName)) continue;

      // Extract the minified identifier at this position
      const minified = extractIdentifierAt(jsLines, lineIdx, genCol);
      if (!minified) continue;

      // Skip if minified === original (no renaming happened)
      if (minified === originalName) continue;

      // Skip if minified is too long (probably not actually minified)
      if (minified.length > 6) continue;

      // Deduplicate
      const key = `${minified}|${originalName}`;
      if (seen.has(key)) continue;
      seen.add(key);

      // Context and properties come from the generated code; the kind
      // heuristic is given the original name.
      pairs.push({
        minified,
        original: originalName,
        context_strings: extractContext(jsLines, lineIdx, genCol),
        properties: extractProperties(jsLines, lineIdx, minified),
        kind: inferKind(jsLines, lineIdx, genCol, originalName),
      });
    }
  }

  return pairs;
}

/**
 * Original names that are never emitted as training pairs: language keywords
 * and literals, module plumbing, built-in globals, host objects and very
 * common method/property names. processSourceMap checks each original name
 * against this set.
 */
const SKIP_NAMES = new Set([
  // Keywords, operators and literals
  "if", "else", "for", "while", "do", "switch", "case", "break", "continue", "return",
  "try", "catch", "finally", "throw", "new", "delete", "typeof", "void", "instanceof",
  "in", "of", "with", "this", "super", "true", "false", "null", "undefined", "NaN",
  "Infinity", "arguments", "eval", "constructor", "prototype",

  // Directive fragments and module plumbing
  "use", "strict", "exports", "module", "require",

  // Built-in constructors and namespaces
  "Object", "Array", "String", "Number", "Boolean", "Function", "Symbol", "BigInt",
  "Map", "Set", "WeakMap", "WeakSet", "Promise",
  "Error", "TypeError", "RangeError", "SyntaxError",
  "Math", "Date", "JSON", "RegExp", "Proxy", "Reflect",

  // Host globals and timers
  "console", "document", "window", "global", "globalThis", "process", "Buffer",
  "setTimeout", "setInterval", "clearTimeout",

  // Ubiquitous properties and methods
  "length", "push", "pop", "shift", "unshift",
  "call", "apply", "bind", "toString", "valueOf",
  "hasOwnProperty", "propertyIsEnumerable", "isPrototypeOf",
  "__proto__", "__defineGetter__", "__defineSetter__",
]);

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

console.log("=== Extracting training pairs from source maps ===\n");

// Find all source map files larger than 1k under ROOT/node_modules, skipping
// packages nested inside another package's node_modules. `find` is run
// without a shell so a ROOT containing spaces or shell metacharacters is
// passed through verbatim.
const findArgs = [
  join(ROOT, "node_modules"),
  "-name", "*.js.map",
  "-not", "-path", "*/node_modules/*/node_modules/*",
  "-size", "+1k",
];

let findOutput;
try {
  findOutput = execFileSync("find", findArgs, {
    encoding: "utf8",
    // Thousands of absolute paths are expected; raise the output cap well
    // above the default so the listing is not cut off.
    maxBuffer: 256 * 1024 * 1024,
    stdio: ["ignore", "pipe", "ignore"],
  });
} catch (err) {
  // find exits non-zero when part of the tree is unreadable or the directory
  // is missing, but it still prints what it found. Keep that partial listing.
  // Anything else (find not installed, output cap exceeded) is rethrown.
  if (typeof err.status !== "number" || typeof err.stdout !== "string") throw err;
  console.warn(`find exited with status ${err.status}; continuing with partial results`);
  findOutput = err.stdout;
}
const mapFiles = findOutput.trim().split("\n").filter(Boolean);

console.log(`Found ${mapFiles.length} source map files\n`);

let totalPairs = 0;
let filesWithPairs = 0;
const allPairs = [];

for (let i = 0; i < mapFiles.length; i++) {
  const mapFile = mapFiles[i];
  const pairs = processSourceMap(mapFile);
  if (pairs.length > 0) {
    // Append one at a time: spreading a very large array into push() passes
    // every element as a call argument and can overflow the call stack.
    for (const pair of pairs) {
      allPairs.push(pair);
    }
    totalPairs += pairs.length;
    filesWithPairs++;

    if (pairs.length >= 10) {
      const rel = mapFile.replace(ROOT + "/node_modules/", "");
      console.log(`  [${pairs.length} pairs] ${rel}`);
    }
  }

  // Progress every 500 files
  if ((i + 1) % 500 === 0) {
    console.log(`  ... processed ${i + 1}/${mapFiles.length} files, ${totalPairs} pairs so far`);
  }
}

console.log(`\nProcessed ${mapFiles.length} files`);
console.log(`Files with pairs: ${filesWithPairs}`);
console.log(`Total pairs: ${totalPairs}`);

// Deduplicate globally, keeping the first occurrence of each minified|original key
const globalSeen = new Set();
const deduped = allPairs.filter((p) => {
  const key = `${p.minified}|${p.original}`;
  if (globalSeen.has(key)) return false;
  globalSeen.add(key);
  return true;
});

console.log(`After dedup: ${deduped.length} unique pairs`);

// Shuffle in place (Fisher-Yates). Math.random is unseeded, so the order
// differs from run to run.
for (let i = deduped.length - 1; i > 0; i--) {
  const j = Math.floor(Math.random() * (i + 1));
  [deduped[i], deduped[j]] = [deduped[j], deduped[i]];
}

// Write one JSON object per line. Each record carries its own newline, so an
// empty result produces an empty file and not a single blank line.
const lines = deduped.map((p) => `${JSON.stringify(p)}\n`).join("");
writeFileSync(OUTPUT_PATH, lines, "utf8");
console.log(`\nWrote ${deduped.length} pairs to ${OUTPUT_PATH}`);

// Stats
const kindCounts = {};
for (const p of deduped) {
  kindCounts[p.kind] = (kindCounts[p.kind] || 0) + 1;
}
console.log("\nBreakdown by kind:");
for (const [kind, count] of Object.entries(kindCounts).sort((a, b) => b[1] - a[1])) {
  console.log(`  ${kind}: ${count}`);
}

const avgCtx = deduped.reduce((s, p) => s + p.context_strings.length, 0) / Math.max(deduped.length, 1);
console.log(`\nAvg context strings: ${avgCtx.toFixed(1)}`);