ruvector/scripts/training/generate-data-v2.mjs

#!/usr/bin/env node
/**
 * Generate expanded training data for JS deobfuscation model (v2).
 *
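 * Sources:
 *   1. Existing training-data.jsonl (merge)
 *   2. Real JS files from node_modules (identifier extraction)
 *   3. Synthetic augmentation with context diversity (cross-version variants)
 *   4. Additional synthetic identifiers for coverage (common web/Node.js names)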
 *
 * Targets 15,000+ unique pairs for SOTA training.
 *
 * Usage:
 *   node scripts/training/generate-data-v2.mjs [--output training-data-v2.jsonl]
 */

import { readFileSync, writeFileSync, readdirSync, statSync, existsSync } from "fs";
import { dirname, join, resolve, basename } from "path";
import { fileURLToPath } from "url";
import { parseArgs } from "util";

// Command-line options: --output FILE (default "training-data-v2.jsonl") and -h/--help.
const { values: args } = parseArgs({
  options: {
    output: { type: "string", default: "training-data-v2.jsonl" },
    help: { type: "boolean", short: "h", default: false },
  },
});

if (args.help) {
  console.log("Usage: generate-data-v2.mjs [--output FILE]");
  process.exit(0);
}

/** Absolute path of the JSONL file to write; relative values resolve against the cwd. */
const OUTPUT_PATH = resolve(args.output);

/**
 * Directory two levels above this script's own directory. mergeExisting and
 * extractFromNodeModules look for training-data.jsonl and node_modules here.
 *
 * import.meta.dirname is only defined on newer Node releases; on older ones it
 * is undefined and resolve() would throw, so fall back to deriving the
 * directory from import.meta.url.
 */
const ROOT = resolve(
  import.meta.dirname ?? dirname(fileURLToPath(import.meta.url)),
  "../.."
);

/** @type {Map<string, object>} key -> pair object, for dedup */
const pairMap = new Map();

/**
 * Record one (minified -> original) training pair in pairMap.
 *
 * The pair is dropped when either name is not a non-empty string, when the
 * original is a single character, when the original is one or two characters
 * and does not start with an uppercase letter, or when the same
 * minified/original combination is already stored (first writer wins).
 *
 * @param {string} minified - obfuscated identifier
 * @param {string} original - readable identifier it stands for
 * @param {string[]} contextStrings - context tokens; at most the first 8 are kept
 * @param {string[]} properties - property names; at most the first 8 are kept
 * @param {string} kind - stored as-is on the pair (callers pass "var", "function" or "class")
 */
function addPair(minified, original, contextStrings, properties, kind) {
  // Inputs also come from unvalidated JSONL lines, so a name may be a number,
  // object, etc. Such values must not end up in the dataset.
  if (typeof minified !== "string" || typeof original !== "string") return;
  if (!minified || original.length <= 1) return;
  // Skip if original looks minified itself
  if (original.length <= 2 && !/^[A-Z]/.test(original)) return;

  const key = `${minified}|${original}`;
  if (pairMap.has(key)) return;

  // A non-array here (e.g. a JSON string) used to be sliced character-wise or
  // throw; normalise it to an empty list so every stored record has the same shape.
  const firstEight = (list) => (Array.isArray(list) ? list.slice(0, 8) : []);

  pairMap.set(key, {
    minified,
    original,
    context_strings: firstEight(contextStrings),
    properties: firstEight(properties),
    kind,
  });
}

// ---------------------------------------------------------------------------
// Source 1: Merge existing training data
// ---------------------------------------------------------------------------

/**
 * Feed every record of ROOT/training-data.jsonl to addPair.
 *
 * Blank lines are ignored. A line that fails to parse, or whose record makes
 * addPair throw, is skipped; the number of skipped lines is logged so that
 * corrupt input does not disappear unnoticed.
 *
 * @returns {number} how many lines were parsed and handed to addPair
 *   (0 when the file does not exist)
 */
function mergeExisting() {
  const existingPath = join(ROOT, "training-data.jsonl");
  if (!existsSync(existingPath)) {
    console.log("  [existing] no training-data.jsonl found, skipping");
    return 0;
  }
  const lines = readFileSync(existingPath, "utf8").trim().split("\n");
  let count = 0;
  let skipped = 0;
  for (const line of lines) {
    if (!line.trim()) continue;
    try {
      const obj = JSON.parse(line);
      addPair(
        obj.minified,
        obj.original,
        obj.context_strings || [],
        obj.properties || [],
        obj.kind || "var"
      );
      count++;
    } catch {
      // Best effort: one bad line must not abort the merge, but keep a tally.
      skipped++;
    }
  }
  console.log(`  [existing] merged ${count} pairs`);
  if (skipped > 0) {
    console.log(`  [existing] skipped ${skipped} malformed lines`);
  }
  return count;
}

// ---------------------------------------------------------------------------
// Source 2: Extract identifiers from real JS files in node_modules
// ---------------------------------------------------------------------------

/**
 * Gather the paths of `.js` files below `dir`, depth-first in directory-listing order.
 *
 * Skipped along the way: entries whose name starts with ".", any directory
 * named "node_modules" that is found below the starting level, directories
 * that cannot be listed, entries that cannot be stat'ed, and files whose size
 * is not strictly between 1000 and 200000 bytes.
 *
 * @param {string} dir - directory to start from
 * @param {number} [maxDepth=3] - deepest level that is still listed
 * @param {number} [depth=0] - level assigned to `dir` itself
 * @returns {string[]} matching file paths
 */
function collectJsFiles(dir, maxDepth = 3, depth = 0) {
  const found = [];

  const visit = (folder, level) => {
    if (level > maxDepth) return;

    let names;
    try {
      names = readdirSync(folder);
    } catch {
      return; // unreadable directory contributes nothing
    }

    for (const name of names) {
      const isNestedNodeModules = name === "node_modules" && level > 0;
      if (isNestedNodeModules || name.startsWith(".")) continue;

      const target = join(folder, name);
      let info;
      try {
        info = statSync(target);
      } catch {
        continue;
      }

      if (info.isDirectory()) {
        visit(target, level + 1);
        continue;
      }

      const sizeInRange = info.size > 1000 && info.size < 200000;
      if (name.endsWith(".js") && sizeInRange) found.push(target);
    }
  };

  visit(dir, depth);
  return found;
}

/**
 * Collect declared names from JS source text using a fixed list of regexes.
 *
 * The patterns run in the order listed below, each over the whole source. A
 * name is reported once, by the first pattern that finds it. Only the generic
 * "name(...) {" method pattern is filtered through SKIP_NAMES here.
 *
 * @param {string} source - JS source text
 * @returns {Array<{name: string, kind: string, ctx: string[]}>} one entry per
 *   distinct name; `ctx` is extractNearbyContext's result for a 200-character
 *   window around the match
 */
function extractIdentifiers(source) {
  const patterns = [
    // function declarations
    { kind: "function", skipReserved: false, re: /\bfunction\s+([a-zA-Z_$][a-zA-Z0-9_$]{2,})\s*\(/g },
    // const/let/var declarations with meaningful names
    { kind: "var", skipReserved: false, re: /\b(?:const|let|var)\s+([a-zA-Z_$][a-zA-Z0-9_$]{2,})\s*=/g },
    // class declarations
    { kind: "class", skipReserved: false, re: /\bclass\s+([a-zA-Z_$][a-zA-Z0-9_$]{2,})\b/g },
    // method definitions (object/class methods); also matches `if (...) {` etc., hence the filter
    { kind: "function", skipReserved: true, re: /\b([a-zA-Z_$][a-zA-Z0-9_$]{2,})\s*\([^)]*\)\s*\{/g },
    // exports.X = or module.exports.X =
    { kind: "var", skipReserved: false, re: /(?:exports|module\.exports)\.([a-zA-Z_$][a-zA-Z0-9_$]{2,})\s*=/g },
    // prototype methods
    { kind: "function", skipReserved: false, re: /\.prototype\.([a-zA-Z_$][a-zA-Z0-9_$]{2,})\s*=/g },
  ];

  const found = [];
  const reported = new Set();

  for (const { kind, skipReserved, re } of patterns) {
    for (const hit of source.matchAll(re)) {
      const name = hit[1];
      if (reported.has(name)) continue;
      if (skipReserved && SKIP_NAMES.has(name)) continue;
      reported.add(name);
      found.push({ name, kind, ctx: extractNearbyContext(source, hit.index, 200) });
    }
  }

  return found;
}

/**
 * Words that must never be treated as a meaningful identifier or context token.
 *
 * The extraction regexes are purely textual, so language keywords can land in
 * a capture group: `function (x) {` looks like a method named "function",
 * `class extends Base` like a class named "extends", and `exports.default =`
 * like an export named "default". Listing the reserved words here keeps those
 * out of the generated pairs.
 */
const SKIP_NAMES = new Set([
  "if", "else", "for", "while", "do", "switch", "case", "break",
  "continue", "return", "try", "catch", "finally", "throw", "new",
  "delete", "typeof", "void", "instanceof", "in", "of", "with",
  "this", "super", "true", "false", "null", "undefined", "NaN",
  "Infinity", "arguments", "eval", "constructor", "prototype",
  "use", "strict", "exports", "module", "require",
  // Declaration / module keywords that were previously missing.
  "function", "class", "extends", "const", "let", "var",
  "import", "export", "default", "static", "async", "await",
  "yield", "debugger", "enum",
]);

/**
 * Collect context tokens from the text surrounding a match position.
 *
 * Looks at `source` from `pos - window` to `pos + window` (clamped to the
 * string) and gathers, in this order: quoted string literals shorter than 30
 * characters, then property names following a "." shorter than 25 characters.
 * Tokens listed in SKIP_NAMES are dropped.
 *
 * @param {string} source - full source text
 * @param {number} pos - index the window is centred on
 * @param {number} window - number of characters taken on each side
 * @returns {string[]} up to 10 distinct tokens, first occurrence first
 */
function extractNearbyContext(source, pos, window) {
  const from = Math.max(0, pos - window);
  const to = Math.min(source.length, pos + window);
  const snippet = source.slice(from, to);

  // Capture group 1 of every match, minus skipped names and over-long tokens.
  const harvest = (pattern, lengthLimit) =>
    Array.from(snippet.matchAll(pattern), (hit) => hit[1]).filter(
      (token) => !SKIP_NAMES.has(token) && token.length < lengthLimit
    );

  const literals = harvest(/["']([a-zA-Z][a-zA-Z0-9_.-]{2,})["']/g, 30);
  const members = harvest(/\.([a-zA-Z_$][a-zA-Z0-9_$]{2,})/g, 25);

  return Array.from(new Set([...literals, ...members])).slice(0, 10);
}

/**
 * List the property names accessed directly on an identifier (`name.prop`).
 *
 * @param {string} source - JS source text to scan
 * @param {string} name - identifier whose member accesses are wanted
 * @returns {string[]} up to 8 distinct property names (2 to 24 characters
 *   long), in order of first appearance
 */
function extractProperties(source, name) {
  const props = new Set();
  // Look for name.property patterns. A lookbehind is used rather than `\b`
  // because `$` is a valid identifier character but not a regex word
  // character: `\b` never matches in front of "$scope", and it does match
  // between "$" and "foo" in "my$foo", attributing properties to the wrong name.
  const re = new RegExp(
    `(?<![a-zA-Z0-9_$])${escapeRegex(name)}\\.([a-zA-Z_$][a-zA-Z0-9_$]{1,})`,
    "g"
  );
  for (const m of source.matchAll(re)) {
    if (m[1].length < 25) props.add(m[1]);
  }
  return [...props].slice(0, 8);
}

/**
 * Backslash-escape the regex metacharacters in `s` so it can be embedded in a
 * RegExp source and match literally.
 *
 * @param {string} s - raw text
 * @returns {string} text with each of . * + ? ^ $ { } ( ) | [ ] \ prefixed by a backslash
 */
function escapeRegex(s) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => `\\${ch}`);
}

/**
 * Minifier name generators.
 *
 * Each entry maps an integer to a short identifier in one naming style
 * (single letter, `_0x` hex, letter+digit, ...). Letters cycle through the
 * alphabet via `i % 26`; numeric suffixes wrap at 10 or 100 where shown.
 */
const MINIFIER_STYLES = (() => {
  const lower = (n) => String.fromCharCode(97 + (n % 26));
  const upper = (n) => String.fromCharCode(65 + (n % 26));

  return [
    (i) => lower(i),
    (i) => lower(i) + "$",
    (i) => "_" + lower(i),
    (i) => "_0x" + (0x1a2b + i).toString(16),
    (i) => lower(i) + (i % 10).toString(),
    (i) => "__" + lower(i),
    (i) => "$" + lower(i),
    (i) => upper(i),
    (i) => lower(i) + lower(i + 1),
    (i) => "$" + (i % 100).toString(),
    (i) => "_" + (i % 100).toString(),
    (i) => "t" + i,
    (i) => "e$" + lower(i),
    (i) => "n" + (i % 100),
    (i) => "r" + lower(i),
    (i) => lower(i) + lower(i * 7),
  ];
})();

/**
 * Scan JS files under ROOT/node_modules and register synthetic pairs for the
 * identifiers found in them.
 *
 * Files that cannot be read, that look minified (more than 5000 characters on
 * fewer than 10 lines) or that yield no identifiers are passed over. For each
 * remaining identifier (at least 3 characters, not in SKIP_NAMES) up to four
 * minified variants are offered to addPair, rotating through MINIFIER_STYLES.
 *
 * @returns {number} number of addPair calls made (0 when node_modules is
 *   missing); addPair may still discard some of them as duplicates
 */
function extractFromNodeModules() {
  const modulesDir = join(ROOT, "node_modules");
  if (!existsSync(modulesDir)) {
    console.log("  [node_modules] directory not found");
    return 0;
  }

  const candidates = collectJsFiles(modulesDir, 4);
  console.log(`  [node_modules] found ${candidates.length} JS files to scan`);

  const variantsPerName = Math.min(4, MINIFIER_STYLES.length);
  let offered = 0;
  // Counts only files that actually produced identifiers; it seeds the name styles.
  let productiveFiles = 0;

  for (const filePath of candidates) {
    let text;
    try {
      text = readFileSync(filePath, "utf8");
    } catch {
      continue;
    }

    // Skip minified files (low ratio of newlines to content)
    const looksMinified = text.length > 5000 && text.split("\n").length < 10;
    if (looksMinified) continue;

    const found = extractIdentifiers(text);
    if (found.length === 0) continue;

    found.forEach(({ name, kind, ctx }, position) => {
      if (name.length < 3 || SKIP_NAMES.has(name)) return;

      const properties = extractProperties(text, name);
      const seed = productiveFiles + position;

      // Several minified spellings per identifier, each with a slightly varied context.
      for (let variant = 0; variant < variantsPerName; variant += 1) {
        const makeName = MINIFIER_STYLES[(seed + variant) % MINIFIER_STYLES.length];
        addPair(makeName(seed), name, varySyntheticContext(ctx, variant), properties, kind);
        offered += 1;
      }
    });

    productiveFiles += 1;
  }

  console.log(`  [node_modules] extracted ${offered} pairs`);
  return offered;
}

// ---------------------------------------------------------------------------
// Source 3: Augmentation -- camelCase splitting + semantic context
// ---------------------------------------------------------------------------

/**
 * Split an identifier into lowercase word tokens.
 *
 * Handles camelCase / PascalCase, runs of capitals (acronyms such as "URL" in
 * "baseURL" or "URLSearchParams") and separator characters such as "_" or "$"
 * ("NODE_ENV", "__webpack_require__"). Splitting before every capital letter
 * instead would shred acronyms into single letters and leave fragments like
 * "e_" for snake-case constants. Digits stay attached to the word or acronym
 * they follow.
 *
 * @param {string} name - identifier to split
 * @returns {string[]} lowercase tokens of at least two characters, in order
 */
function splitCamelCase(name) {
  // Alternative 1: an acronym (capitals, optional trailing digits) that is not
  // the start of a capitalised word. Alternative 2: an ordinary word with an
  // optional leading capital.
  const words = name.match(/[A-Z]+[0-9]*(?![a-z])|[A-Z]?[a-z0-9]+/g) ?? [];
  return words.map((word) => word.toLowerCase()).filter((word) => word.length > 1);
}

/**
 * Derive context hints from an identifier's own spelling.
 *
 * Starts with the first four tokens returned by splitCamelCase, then appends
 * two hint words for every naming rule the identifier matches (prefix rules
 * first, suffix rules after, in table order).
 *
 * @param {string} name - identifier to describe
 * @returns {string[]} up to 8 distinct hints, first occurrence first
 */
function generateSemanticContext(name) {
  // [pattern, hint, hint] — evaluated top to bottom.
  const rules = [
    // Type hints based on common prefixes
    [/^is[A-Z]/, "boolean", "check"],
    [/^has[A-Z]/, "boolean", "exists"],
    [/^get[A-Z]/, "getter", "return"],
    [/^set[A-Z]/, "setter", "assign"],
    [/^on[A-Z]/, "event", "handler"],
    [/^handle[A-Z]/, "event", "callback"],
    [/^create[A-Z]/, "factory", "new"],
    [/^parse[A-Z]/, "parse", "input"],
    [/^format[A-Z]/, "format", "output"],
    [/^validate[A-Z]/, "validate", "check"],
    [/^render[A-Z]/, "render", "display"],
    [/^fetch[A-Z]/, "async", "request"],
    [/^load[A-Z]/, "async", "data"],
    [/^save[A-Z]/, "persist", "store"],
    [/^delete[A-Z]/, "remove", "destroy"],
    [/^update[A-Z]/, "modify", "change"],
    [/^init/, "initialize", "setup"],
    [/^process/, "transform", "pipeline"],
    // Suffix-based hints
    [/Error$/, "error", "exception"],
    [/Handler$/, "handler", "callback"],
    [/Manager$/, "manager", "lifecycle"],
    [/Service$/, "service", "business"],
    [/Controller$/, "controller", "http"],
    [/Factory$/, "factory", "create"],
    [/Builder$/, "builder", "construct"],
    [/Adapter$/, "adapter", "convert"],
    [/Provider$/, "provider", "inject"],
    [/Listener$/, "listener", "event"],
    [/Config$/, "config", "settings"],
    [/Options$/, "options", "settings"],
    [/Result$/, "result", "output"],
    [/Callback$/, "callback", "async"],
  ];

  // The identifier's own words come first so they survive the 8-item cap.
  const hints = splitCamelCase(name).slice(0, 4);
  for (const [pattern, first, second] of rules) {
    if (pattern.test(name)) hints.push(first, second);
  }

  return Array.from(new Set(hints)).slice(0, 8);
}

/**
 * Produce one of five deterministic variations of a context list, selected by
 * `variant % 5`, so that the same identifier is seen with differing context.
 *
 *   0: the list itself (same array)
 *   1: rotated left by one (only when it has more than two items)
 *   2: the leading half, but at least two items
 *   3: with "prototype" and "constructor" appended, capped at 8
 *   4: the first three items plus "undefined" and "null", capped at 8
 *
 * @param {string[]} ctx - base context tokens
 * @param {number} variant - variation selector
 * @returns {string[]} the varied list; ["unknown"] when `ctx` is missing or
 *   empty; `ctx` itself when the selector matches none of the five cases
 */
function varySyntheticContext(ctx, variant) {
  if (!ctx || ctx.length === 0) return ["unknown"];

  const recipes = [
    () => ctx,
    () => (ctx.length > 2 ? [...ctx.slice(1), ctx[0]] : ctx),
    () => ctx.slice(0, Math.max(2, Math.ceil(ctx.length / 2))),
    () => [...ctx, "prototype", "constructor"].slice(0, 8),
    () => [...ctx.slice(0, 3), "undefined", "null"].slice(0, 8),
  ];

  const recipe = recipes[variant % 5];
  return recipe ? recipe() : ctx;
}

/**
 * Add extra pairs for every original name already in pairMap, as if the same
 * code had been minified again with different short names.
 *
 * For each distinct original (using the first pair stored for it as the
 * template) two or three random minified names are drawn; each one that does
 * not already exist for that original is registered through addPair with a
 * varied copy of the template's context.
 *
 * @returns {number} number of pairs handed to addPair
 */
function generateCrossVersionAugmentation() {
  // Snapshot the templates first: addPair grows pairMap while we loop below.
  const templateByOriginal = new Map();
  for (const pair of pairMap.values()) {
    if (templateByOriginal.has(pair.original)) continue;
    templateByOriginal.set(pair.original, pair);
  }
  const templates = Array.from(templateByOriginal);

  let added = 0;
  templates.forEach(([originalName, template]) => {
    // 2 or 3 extra "version" variants
    const versionCount = 2 + Math.floor(Math.random() * 2);
    for (let version = 0; version < versionCount; version += 1) {
      const candidate = randomMinifiedName();
      if (pairMap.has(`${candidate}|${originalName}`)) continue;

      addPair(
        candidate,
        originalName,
        varySyntheticContext(template.context_strings, version),
        template.properties,
        template.kind
      );
      added += 1;
    }
  });

  console.log(`  [cross-version] augmented ${added} pairs`);
  return added;
}

/**
 * Build a random short identifier in one of eight minifier-like shapes
 * (letter+number, `_0x` hex, two letters, `$`+letter, `t`/`n`/`_`+number,
 * two letters+digit). The shape and every component are drawn with rand().
 *
 * @returns {string} the generated name
 */
function randomMinifiedName() {
  const letter = () => String.fromCharCode(97 + rand(26));

  const shapes = [
    () => letter() + rand(100),
    () => "_0x" + rand(0xffff).toString(16),
    () => letter() + letter(),
    () => "$" + letter(),
    () => "t" + rand(200),
    () => "n" + rand(100),
    () => "_" + rand(200),
    () => letter() + letter() + rand(10),
  ];

  const pick = rand(shapes.length);
  return shapes[pick]();
}

/**
 * Draw a pseudo-random integer from Math.random().
 *
 * @param {number} max - exclusive upper bound
 * @returns {number} Math.random() scaled by `max` and rounded down
 */
function rand(max) {
  const scaled = Math.random() * max;
  return Math.floor(scaled);
}

// ---------------------------------------------------------------------------
// Source 4: Additional synthetic names for coverage
// ---------------------------------------------------------------------------

/**
 * Register pairs for a hand-picked list of well-known identifiers.
 *
 * Every name gets four minified spellings (rotating through MINIFIER_STYLES),
 * a context derived from the name via generateSemanticContext and varied per
 * spelling, and a fixed property list chosen by kind.
 *
 * @returns {number} number of addPair calls made (four per listed name)
 */
function generateAdditionalSynthetic() {
  // Common web/Node.js identifiers not likely in node_modules source
  const EXTRA_NAMES = {
    function: [
      // Webpack/bundler internals
      "__webpack_require__", "__webpack_modules__", "__webpack_exports__",
      // React internals
      "createElement", "cloneElement", "createRef", "forwardRef",
      "memo", "lazy", "Suspense", "Fragment",
      "useId", "useSyncExternalStore", "useInsertionEffect",
      // Next.js patterns
      "getServerSideProps", "getStaticProps", "getStaticPaths",
      "generateMetadata", "generateStaticParams",
      // Express patterns
      "createApplication", "createMiddleware", "createRoute",
      "useRouter", "useParams", "useSearchParams",
      // Testing
      "beforeEach", "afterEach", "beforeAll", "afterAll",
      "spyOn", "mockImplementation", "mockReturnValue",
      // Utilities
      "cloneDeep", "mergeWith", "assignIn", "defaultsDeep",
      "flattenDeep", "uniqBy", "groupBy", "sortBy", "orderBy",
      "pickBy", "omitBy", "mapKeys", "mapValues",
      // Crypto/Security
      "createHash", "createCipher", "createDecipher", "createSign",
      "randomBytes", "scrypt", "pbkdf2",
      // Stream
      "createReadStream", "createWriteStream", "pipeline", "finished",
      "Transform", "Readable", "Writable", "Duplex", "PassThrough",
    ],
    class: [
      "AbortController", "AbortSignal", "TextEncoder", "TextDecoder",
      "URLSearchParams", "FormData", "Headers", "ReadableStream",
      "WritableStream", "TransformStream", "BroadcastChannel",
      "IntersectionObserver", "MutationObserver", "ResizeObserver",
      "PerformanceObserver", "MessageChannel", "MessagePort",
      "WeakRef", "FinalizationRegistry", "SharedArrayBuffer",
      // Framework classes
      "EventTarget", "CustomEvent", "DOMParser", "XMLSerializer",
      "WebSocket", "Worker", "ServiceWorker", "SharedWorker",
    ],
    var: [
      // Common config keys
      "baseURL", "timeout", "maxRedirects", "maxContentLength",
      "validateStatus", "transformRequest", "transformResponse",
      "paramsSerializer", "withCredentials", "responseEncoding",
      // State patterns
      "initialState", "rootReducer", "rootSaga", "rootEpic",
      "storeEnhancers", "middlewares", "devTools",
      // Build tools
      "webpackConfig", "rollupConfig", "viteConfig", "babelConfig",
      "tsConfig", "eslintConfig", "prettierConfig",
      // Environment
      "NODE_ENV", "API_URL", "BASE_PATH", "PUBLIC_URL",
    ],
  };

  // Fixed property list attached to every name of a given kind.
  const propsForKind = (kind) => {
    switch (kind) {
      case "function":
        return ["length", "name", "call", "apply", "bind"];
      case "class":
        return ["prototype", "constructor", "name"];
      default:
        return ["toString", "valueOf"];
    }
  };

  const variantsPerName = 4;
  let offered = 0;

  for (const kind of Object.keys(EXTRA_NAMES)) {
    EXTRA_NAMES[kind].forEach((original, position) => {
      const semanticCtx = generateSemanticContext(original);
      const props = propsForKind(kind);

      for (let variant = 0; variant < variantsPerName; variant += 1) {
        const makeName = MINIFIER_STYLES[(position + variant) % MINIFIER_STYLES.length];
        addPair(
          makeName(position),
          original,
          varySyntheticContext(semanticCtx, variant),
          props,
          kind
        );
        offered += 1;
      }
    });
  }

  console.log(`  [extra-synthetic] generated ${offered} pairs`);
  return offered;
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

console.log("=== Generating expanded training data (v2) ===\n");

// Generators run in this order; each one adds to the shared pairMap, and the
// last step builds on whatever the earlier ones stored.
const steps = [
  ["Step 1: Merging existing training data", mergeExisting],
  ["\nStep 2: Extracting identifiers from node_modules", extractFromNodeModules],
  ["\nStep 3: Additional synthetic identifiers", generateAdditionalSynthetic],
  ["\nStep 4: Cross-version augmentation", generateCrossVersionAugmentation],
];
for (const [banner, runStep] of steps) {
  console.log(banner);
  runStep();
}

// Convert to array and shuffle (Fisher-Yates, walking from the last slot down).
const allPairs = Array.from(pairMap.values());
let slot = allPairs.length;
while (slot > 1) {
  slot -= 1;
  const other = Math.floor(Math.random() * (slot + 1));
  const held = allPairs[slot];
  allPairs[slot] = allPairs[other];
  allPairs[other] = held;
}

console.log(`\n=== Total unique pairs: ${allPairs.length} ===`);

// Write JSONL: one JSON object per line, newline-terminated.
const serialized = allPairs.map((pair) => JSON.stringify(pair));
writeFileSync(OUTPUT_PATH, `${serialized.join("\n")}\n`, "utf8");
console.log(`Wrote ${allPairs.length} pairs to ${OUTPUT_PATH}`);

// Print stats: pair count per kind, most frequent first.
const kindCounts = {};
allPairs.forEach(({ kind }) => {
  kindCounts[kind] = (kindCounts[kind] || 0) + 1;
});
console.log("\nBreakdown by kind:");
Object.entries(kindCounts)
  .sort(([, left], [, right]) => right - left)
  .forEach(([kind, total]) => {
    console.log(`  ${kind}: ${total}`);
  });

// Print average list lengths across all pairs.
const meanLength = (field) =>
  allPairs.reduce((sum, pair) => sum + pair[field].length, 0) / allPairs.length;
const avgCtx = meanLength("context_strings");
const avgProps = meanLength("properties");
console.log(`\nAverage context strings per pair: ${avgCtx.toFixed(1)}`);
console.log(`Average properties per pair: ${avgProps.toFixed(1)}`);