#!/usr/bin/env node
/**
 * Extract training pairs from source maps in node_modules.
 *
 * Source maps contain `names` arrays with original identifiers and VLQ-encoded
 * mappings that tell us exactly which minified name maps to which original.
 *
 * For each source map:
 * 1. Parse the .js.map JSON
 * 2. Decode VLQ mappings to get (line, col, nameIdx) tuples
 * 3. Read the corresponding .js file
 * 4. Extract the minified identifier at each (line, col) position
 * 5. Extract surrounding context (string literals, property accesses)
 * 6. Output (minified, original, context, properties, kind) pairs
 *
 * Usage:
 *   node scripts/training/extract-sourcemap-pairs.mjs [--output training-data-sourcemaps.jsonl]
 */

import { readFileSync, writeFileSync, existsSync } from "fs";
import { resolve, dirname, basename, join } from "path";
import { execSync } from "child_process";
import { parseArgs } from "util";
import { fileURLToPath } from "url";

const { values: args } = parseArgs({
  options: {
    output: { type: "string", default: "training-data-sourcemaps.jsonl" },
    help: { type: "boolean", short: "h", default: false },
  },
});

if (args.help) {
  console.log("Usage: extract-sourcemap-pairs.mjs [--output FILE]");
  process.exit(0);
}

const OUTPUT_PATH = resolve(args.output);
// Two directories above this script. Derived from import.meta.url rather than
// import.meta.dirname so the script also starts on Node releases that predate
// import.meta.dirname.
const ROOT = resolve(dirname(fileURLToPath(import.meta.url)), "../..");

// ---------------------------------------------------------------------------
// VLQ Decoder
// ---------------------------------------------------------------------------

const VLQ_BASE_SHIFT = 5;
const VLQ_BASE = 1 << VLQ_BASE_SHIFT;
const VLQ_BASE_MASK = VLQ_BASE - 1;
const VLQ_CONTINUATION_BIT = VLQ_BASE;

const BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
const BASE64_MAP = new Map();
for (let i = 0; i < BASE64_CHARS.length; i++) {
  BASE64_MAP.set(BASE64_CHARS[i], i);
}

/**
 * Decode one Base64 VLQ value from `str` starting at `pos`.
 *
 * Reading stops after the first digit without the continuation bit, at the
 * end of the string, or in front of the first character that is not a Base64
 * digit. Such a character is NOT consumed, so a separator that follows a
 * truncated segment is still seen by the caller.
 *
 * @param {string} str - The mappings string.
 * @param {number} pos - Index of the first digit to read.
 * @returns {{ value: number, pos: number }} The signed value and the index of
 *   the first unread character (equal to the input `pos` if nothing was read).
 */
function decodeVLQ(str, pos) {
  let result = 0;
  let shift = 0;
  let cursor = pos;

  while (cursor < str.length) {
    const digit = BASE64_MAP.get(str[cursor]);
    if (digit === undefined) break; // separator or junk: leave it for the caller
    cursor++;
    result += (digit & VLQ_BASE_MASK) << shift;
    shift += VLQ_BASE_SHIFT;
    if ((digit & VLQ_CONTINUATION_BIT) === 0) break;
  }

  // Sign is in the least significant bit
  const isNegative = (result & 1) === 1;
  const magnitude = result >> 1;
  return { value: isNegative ? -magnitude : magnitude, pos: cursor };
}

/**
 * Decode source map mappings string into an array of segments.
 * Each segment: [genCol, sourceIdx, sourceLine, sourceCol, nameIdx?]
 *
 * The result has one entry per generated line (lines are separated by ";"),
 * each holding that line's segments (separated by ","). All fields are stored
 * as absolute values: the generated column restarts at 0 on every line, the
 * other fields accumulate across the whole string.
 *
 * Characters that are neither separators nor Base64 digits are skipped, and a
 * truncated segment contributes zero deltas for its missing fields instead of
 * reading into the next segment.
 *
 * @param {string} mappingsStr - The `mappings` field of a source map.
 * @returns {number[][][]} Segments grouped by generated line.
 */
function decodeMappings(mappingsStr) {
  const lines = [];
  let currentLine = [];
  let generatedColumn = 0;
  let sourceIndex = 0;
  let sourceLine = 0;
  let sourceColumn = 0;
  let nameIndex = 0;
  let pos = 0;

  // True when the next unread character is another VLQ digit of this segment.
  const hasField = () => pos < mappingsStr.length && BASE64_MAP.has(mappingsStr[pos]);
  // Read one relative value and advance the cursor past it.
  const readDelta = () => {
    const decoded = decodeVLQ(mappingsStr, pos);
    pos = decoded.pos;
    return decoded.value;
  };

  while (pos < mappingsStr.length) {
    const ch = mappingsStr[pos];

    if (ch === ";") {
      lines.push(currentLine);
      currentLine = [];
      generatedColumn = 0;
      pos++;
      continue;
    }
    if (ch === ",") {
      pos++;
      continue;
    }
    if (!BASE64_MAP.has(ch)) {
      // Stray character (e.g. whitespace): skip it so the loop always advances.
      pos++;
      continue;
    }

    // Field 1: generated column (relative)
    generatedColumn += readDelta();
    const segment = [generatedColumn];

    if (hasField()) {
      // Fields 2-4: source index, source line, source column (all relative)
      sourceIndex += readDelta();
      sourceLine += readDelta();
      sourceColumn += readDelta();
      segment.push(sourceIndex, sourceLine, sourceColumn);

      // Field 5: name index (optional, relative)
      if (hasField()) {
        nameIndex += readDelta();
        segment.push(nameIndex);
      }
    }

    currentLine.push(segment);
  }

  if (currentLine.length > 0) {
    lines.push(currentLine);
  }

  return lines;
}

// ---------------------------------------------------------------------------
// Identifier extraction from
// minified JS at a given position
// ---------------------------------------------------------------------------

const IDENT_RE = /^[a-zA-Z_$][a-zA-Z0-9_$]*/;

/**
 * Return the identifier that starts at (lineIdx, colIdx) in `lines`.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - 0-based line index.
 * @param {number} colIdx - 0-based column index within that line.
 * @returns {string|null} The identifier, or null when the position is out of
 *   range or no identifier starts there.
 */
function extractIdentifierAt(lines, lineIdx, colIdx) {
  if (lineIdx < 0 || lineIdx >= lines.length) return null;
  const line = lines[lineIdx];
  // A negative column would be clamped to 0 by substring() and return the
  // identifier at the start of the line instead, so reject it explicitly.
  if (colIdx < 0 || colIdx >= line.length) return null;
  const m = line.substring(colIdx).match(IDENT_RE);
  return m ? m[0] : null;
}

// ---------------------------------------------------------------------------
// Context extraction
// ---------------------------------------------------------------------------

// Quoted strings of 3+ characters that start with a letter.
const CONTEXT_STRING_RE = /["']([a-zA-Z][a-zA-Z0-9_./-]{2,})["']/g;
// `.name` property accesses with names of 3+ characters.
const CONTEXT_PROPERTY_RE = /\.([a-zA-Z_$][a-zA-Z0-9_$]{2,})/g;
// Lines longer than this are clipped before scanning (see clipContextLine).
const CONTEXT_MAX_LINE_CHARS = 2000;

/**
 * Limit an over-long line to the part closest to the target position.
 * Lines of at most CONTEXT_MAX_LINE_CHARS characters are returned unchanged.
 *
 * @param {string} line - The line to clip.
 * @param {number} lineOffset - Line index minus target line index
 *   (negative: before the target, 0: the target line, positive: after).
 * @param {number} colIdx - Column of the target on the target line.
 * @returns {string} The text to scan.
 */
function clipContextLine(line, lineOffset, colIdx) {
  if (line.length <= CONTEXT_MAX_LINE_CHARS) return line;
  if (lineOffset < 0) return line.slice(-CONTEXT_MAX_LINE_CHARS);
  if (lineOffset > 0) return line.slice(0, CONTEXT_MAX_LINE_CHARS);
  const half = CONTEXT_MAX_LINE_CHARS / 2;
  const center = Number.isFinite(colIdx) ? colIdx : 0;
  return line.slice(Math.max(0, center - half), center + half);
}

/**
 * Collect string literals and property names found near a position.
 *
 * Scans `windowLines` lines on either side of `lineIdx`. Because a minified
 * bundle is often one enormous line, lines longer than
 * CONTEXT_MAX_LINE_CHARS are clipped around `colIdx` first; without that every
 * identifier on such a line would receive the same first ten matches of the
 * line, and every call would rescan the whole line.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - 0-based line of the identifier.
 * @param {number} colIdx - 0-based column of the identifier.
 * @param {number} [windowLines=3] - Lines to include before and after.
 * @returns {string[]} Up to 10 unique values in order of first appearance
 *   (per line: string literals first, then property names).
 */
function extractContext(lines, lineIdx, colIdx, windowLines = 3) {
  const found = new Set();
  const startLine = Math.max(0, lineIdx - windowLines);
  const endLine = Math.min(lines.length, lineIdx + windowLines + 1);

  for (let i = startLine; i < endLine; i++) {
    const text = clipContextLine(lines[i], i - lineIdx, colIdx);

    // Extract string literals
    for (const m of text.matchAll(CONTEXT_STRING_RE)) {
      if (m[1].length < 30) found.add(m[1]);
    }
    // Extract property accesses
    for (const m of text.matchAll(CONTEXT_PROPERTY_RE)) {
      if (m[1].length < 25) found.add(m[1]);
    }
  }

  return [...found].slice(0, 10);
}

/**
 * Collect property names accessed directly on `identifier` (`identifier.prop`)
 * within `windowLines` lines of `lineIdx`.
 *
 * The identifier must not be preceded by another identifier character, so
 * `e` does not match inside `$e.foo`, while identifiers that themselves start
 * with `$` (where a plain `\b` can never match) are still found.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - 0-based line of the identifier.
 * @param {string} identifier - The identifier whose properties are wanted.
 * @param {number} [windowLines=5] - Lines to include before and after.
 * @returns {string[]} Up to 8 unique property names (2-24 characters each).
 */
function extractProperties(lines, lineIdx, identifier, windowLines = 5) {
  const props = new Set();
  const startLine = Math.max(0, lineIdx - windowLines);
  const endLine = Math.min(lines.length, lineIdx + windowLines + 1);
  // Built once per call; the pattern does not depend on the line.
  const accessRe = new RegExp(
    `(?<![a-zA-Z0-9_$])${escapeRegex(identifier)}\\.([a-zA-Z_$][a-zA-Z0-9_$]{1,})`,
    "g",
  );

  for (let i = startLine; i < endLine; i++) {
    for (const m of lines[i].matchAll(accessRe)) {
      if (m[1].length < 25) props.add(m[1]);
    }
  }

  return [...props].slice(0, 8);
}

/**
 * Escape every regular-expression metacharacter in `s` so the result matches
 * `s` literally when embedded in a RegExp source.
 *
 * @param {string} s - Raw text.
 * @returns {string} Escaped text.
 */
function escapeRegex(s) {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

// ---------------------------------------------------------------------------
// Determine kind from context
// ---------------------------------------------------------------------------

/**
 * Guess whether the binding at (lineIdx, colIdx) is a function, class or
 * plain variable.
 *
 * Checks, in order: the keyword directly before the position
 * (`function`, `class`, `const`/`let`/`var`), whether `identifier` starts
 * with an uppercase letter followed by a lowercase one, and whether the text
 * after the token starts with `(`, `=function` or `=async`.
 *
 * `identifier` may be the ORIGINAL name rather than the text present in the
 * generated line, so the length of the token is measured on the line itself
 * and `identifier.length` is only a fallback when no identifier starts at
 * `colIdx`.
 *
 * @param {string[]} lines - Generated file split into lines.
 * @param {number} lineIdx - 0-based line of the token.
 * @param {number} colIdx - 0-based column of the token.
 * @param {string} identifier - Name used for the capitalisation heuristic.
 * @returns {"function"|"class"|"var"} The inferred kind; "var" when nothing
 *   else applies or the line does not exist.
 */
function inferKind(lines, lineIdx, colIdx, identifier) {
  if (lineIdx < 0 || lineIdx >= lines.length) return "var";
  const line = lines[lineIdx];

  // Check what precedes the identifier
  const before = line.substring(0, colIdx).trimEnd();
  if (/\bfunction\s*$/.test(before)) return "function";
  if (/\bclass\s*$/.test(before)) return "class";
  if (/\b(?:const|let|var)\s*$/.test(before)) return "var";

  // Check if identifier starts with uppercase (likely class)
  if (/^[A-Z][a-z]/.test(identifier)) return "class";

  // Check if followed by ( → likely function. Skip the token that is really
  // on the line, not `identifier.length` characters.
  const tokenAtCol = /^[a-zA-Z_$][a-zA-Z0-9_$]*/.exec(line.substring(colIdx));
  const tokenLength = tokenAtCol ? tokenAtCol[0].length : identifier.length;
  const after = line.substring(colIdx + tokenLength).trimStart();
  if (after.startsWith("(") || after.startsWith("=function") || after.startsWith("=async")) {
    return "function";
  }

  return "var";
}

// ---------------------------------------------------------------------------
// Process a single source map
// ---------------------------------------------------------------------------

// Original names that are never emitted as training targets: keywords,
// literals, well-known globals and common built-in member names.
const SKIP_NAMES = new Set([
  "if", "else", "for", "while", "do", "switch", "case", "break", "continue",
  "return", "try", "catch", "finally", "throw", "new", "delete", "typeof",
  "void", "instanceof", "in", "of", "with", "this", "super",
  "true", "false", "null", "undefined", "NaN", "Infinity",
  "arguments", "eval", "constructor", "prototype",
  "use", "strict", "exports", "module", "require",
  "Object", "Array", "String", "Number", "Boolean", "Function", "Symbol",
  "BigInt", "Map", "Set", "WeakMap", "WeakSet", "Promise",
  "Error", "TypeError", "RangeError", "SyntaxError",
  "Math", "Date", "JSON", "RegExp", "Proxy", "Reflect",
  "console", "document", "window", "global", "globalThis", "process", "Buffer",
  "setTimeout", "setInterval", "clearTimeout",
  "length", "push", "pop", "shift", "unshift",
  "call", "apply", "bind", "toString", "valueOf",
  "hasOwnProperty", "propertyIsEnumerable", "isPrototypeOf",
  "__proto__", "__defineGetter__", "__defineSetter__",
]);

/**
 * Extract (minified, original) name pairs from one source map.
 *
 * The generated file is expected next to the map, at the map's path without
 * the trailing ".map". Every problem with the inputs (unreadable or invalid
 * JSON, a map that is not an object, no names, no mappings string, missing
 * or unreadable generated file, decoder failure) yields an empty result
 * instead of an exception, so one bad file cannot abort a whole run.
 *
 * A pair is emitted for a named segment when the original name is a string of
 * at least 3 characters that is not in SKIP_NAMES, an identifier of at most 6
 * characters starts at the generated position, and it differs from the
 * original name. Each (minified, original) combination is emitted once.
 *
 * @param {string} mapPath - Path to a `.js.map` file.
 * @returns {Array<{minified: string, original: string, context_strings: string[], properties: string[], kind: string}>}
 */
function processSourceMap(mapPath) {
  const pairs = [];

  let mapJson;
  try {
    mapJson = JSON.parse(readFileSync(mapPath, "utf8"));
  } catch {
    return pairs;
  }
  // Valid JSON is not necessarily an object (`null`, a number, ...).
  if (mapJson === null || typeof mapJson !== "object") return pairs;

  const names = Array.isArray(mapJson.names) ? mapJson.names : [];
  const mappings = mapJson.mappings;
  if (names.length === 0 || typeof mappings !== "string" || mappings.length === 0) {
    return pairs;
  }

  // Find the corresponding JS file
  const jsPath = mapPath.replace(/\.map$/, "");
  if (!existsSync(jsPath)) return pairs;

  let jsContent;
  try {
    jsContent = readFileSync(jsPath, "utf8");
  } catch {
    return pairs;
  }

  const jsLines = jsContent.split("\n");

  // Decode mappings
  let decodedLines;
  try {
    decodedLines = decodeMappings(mappings);
  } catch {
    return pairs;
  }

  // Process each segment that has a name index
  const seen = new Set();

  for (let lineIdx = 0; lineIdx < decodedLines.length; lineIdx++) {
    for (const seg of decodedLines[lineIdx]) {
      if (seg.length < 5) continue; // No name index

      const genCol = seg[0];
      const nameIdx = seg[4];
      if (nameIdx < 0 || nameIdx >= names.length) continue;

      const originalName = names[nameIdx];
      if (typeof originalName !== "string" || originalName.length < 3) continue;

      // Skip common keywords
      if (SKIP_NAMES.has(originalName)) continue;

      // Extract the minified identifier at this position
      const minified = extractIdentifierAt(jsLines, lineIdx, genCol);
      if (!minified) continue;

      // Skip if minified === original (no renaming happened)
      if (minified === originalName) continue;

      // Skip if minified is too long (probably not actually minified)
      if (minified.length > 6) continue;

      // Deduplicate
      const key = `${minified}|${originalName}`;
      if (seen.has(key)) continue;
      seen.add(key);

      // Extract context and properties
      const context = extractContext(jsLines, lineIdx, genCol);
      const properties = extractProperties(jsLines, lineIdx, minified);
      // The original name drives the capitalisation heuristic; inferKind
      // measures the generated token on the line itself.
      const kind = inferKind(jsLines, lineIdx, genCol, originalName);

      pairs.push({
        minified,
        original: originalName,
        context_strings: context,
        properties,
        kind,
      });
    }
  }

  return pairs;
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

console.log("=== Extracting training pairs from source maps ===\n");

// Find all source map files (larger than 1k, not inside a nested
// node_modules directory).
const nodeModulesDir = join(ROOT, "node_modules");
// Single-quote the directory for the shell so spaces or metacharacters in the
// checkout path cannot split or alter the command.
const quotedDir = `'${nodeModulesDir.replace(/'/g, "'\\''")}'`;
const findCmd = `find ${quotedDir} -name "*.js.map" -not -path "*/node_modules/*/node_modules/*" -size +1k 2>/dev/null`;

let findOutput;
try {
  // The default maxBuffer is too small for the path list of a large
  // node_modules tree, so raise it explicitly.
  findOutput = execSync(findCmd, { encoding: "utf8", maxBuffer: 256 * 1024 * 1024 });
} catch (err) {
  // A truncated listing (ENOBUFS) or an error without captured output cannot
  // be used; anything else is find reporting a problem (stderr is already
  // discarded by the command) after printing the paths it could reach.
  if (err.code === "ENOBUFS" || typeof err.stdout !== "string") throw err;
  console.warn(`find exited with status ${err.status}; continuing with the paths it printed`);
  findOutput = err.stdout;
}
const mapFiles = findOutput.trim().split("\n").filter(Boolean);

console.log(`Found ${mapFiles.length} source map files\n`);

let totalPairs = 0;
let filesWithPairs = 0;
const allPairs = [];

for (let i = 0; i < mapFiles.length; i++) {
  const mapFile = mapFiles[i];
  const pairs = processSourceMap(mapFile);

  if (pairs.length > 0) {
    // Appended one by one: spreading a very large array into push() can
    // exceed the engine's argument limit.
    for (const pair of pairs) {
      allPairs.push(pair);
    }
    totalPairs += pairs.length;
    filesWithPairs++;

    if (pairs.length >= 10) {
      const rel = mapFile.replace(`${ROOT}/node_modules/`, "");
      console.log(`  [${pairs.length} pairs] ${rel}`);
    }
  }

  // Progress every 500 files
  if ((i + 1) % 500 === 0) {
    console.log(`  ... processed ${i + 1}/${mapFiles.length} files, ${totalPairs} pairs so far`);
  }
}

console.log(`\nProcessed ${mapFiles.length} files`);
console.log(`Files with pairs: ${filesWithPairs}`);
console.log(`Total pairs: ${totalPairs}`);

// Deduplicate globally: keep the first occurrence of each
// (minified, original) combination across all files.
const globalSeen = new Set();
const deduped = allPairs.filter((p) => {
  const key = `${p.minified}|${p.original}`;
  if (globalSeen.has(key)) return false;
  globalSeen.add(key);
  return true;
});

console.log(`After dedup: ${deduped.length} unique pairs`);

// Shuffle (in-place Fisher-Yates using Math.random)
for (let i = deduped.length - 1; i > 0; i--) {
  const j = Math.floor(Math.random() * (i + 1));
  [deduped[i], deduped[j]] = [deduped[j], deduped[i]];
}

// Write one JSON object per line
const lines = deduped.map((p) => JSON.stringify(p)).join("\n");
writeFileSync(OUTPUT_PATH, lines + "\n", "utf8");
console.log(`\nWrote ${deduped.length} pairs to ${OUTPUT_PATH}`);

// Stats
const kindCounts = {};
for (const p of deduped) {
  kindCounts[p.kind] = (kindCounts[p.kind] || 0) + 1;
}
console.log("\nBreakdown by kind:");
for (const [kind, count] of Object.entries(kindCounts).sort((a, b) => b[1] - a[1])) {
  console.log(`  ${kind}: ${count}`);
}

// Math.max guards the division when no pairs were found.
const avgCtx =
  deduped.reduce((s, p) => s + p.context_strings.length, 0) / Math.max(deduped.length, 1);
console.log(`\nAvg context strings: ${avgCtx.toFixed(1)}`);