ruvector/scripts/training/build-optimal-dataset.py
rUv 36f2599774 feat(training): source map extraction + v2 model (83.67% val accuracy)
- Extract 14,198 training pairs from 6,941 source maps in node_modules
- Train v2 model (4-layer, 192-dim, 6-head transformer, 1.9M params)
- Val accuracy: 83.67% (up from 75.72%), exact match: 12.3% (up from 0.1%)
- Export weights.bin (7.3MB) for Rust runtime inference
- Add decompiler dashboard (React + Tailwind + Vite)
- Add runnable RVF (7,350 vectors, 49 segments, witness chain)
- Update evaluate-model.py to support configurable model architectures
- All 13 Rust tests pass, all 45 RVF files have valid SFVR headers

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-04-03 04:57:47 +00:00

231 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
Build an optimal training dataset that balances learnability with diversity.
Strategy:
1. Keep all original synthetic pairs (high signal-to-noise)
2. Add real identifiers from node_modules but with enriched context
3. Add many minified variants per original (the model needs to learn
that context predicts the name, not the minified form)
4. Ensure each original name has 5-10 variants with different minified names
5. Target ~20K pairs for fast CPU training with good accuracy
Key insight: The model's job is context -> original_name. The minified
name is mostly noise (it's random). So we need many different minified
names mapping to the same original, with consistent context. This teaches
the model to rely on context, not the minified form.
"""
import json
import random
import sys
from collections import defaultdict
# Fixed seed: every run draws the same pseudo-random sequence, so the
# generated minified names, shuffles and samples are reproducible.
random.seed(42)

# Destination file: the first command-line argument when one is given,
# otherwise a default name in the current working directory.
if len(sys.argv) > 1:
    OUTPUT = sys.argv[1]
else:
    OUTPUT = "training-data-optimal.jsonl"
# Minifier styles
# Each entry turns an integer into a short identifier. Letters cycle through
# the alphabet via ``i % 26``; the trailing comment shows the result for i = 0.
_LOWER = "abcdefghijklmnopqrstuvwxyz"
_UPPER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
STYLES = [
    lambda i: _LOWER[i % 26],                            # a
    lambda i: f"{_LOWER[i % 26]}$",                      # a$
    lambda i: f"_{_LOWER[i % 26]}",                      # _a
    lambda i: f"_0x{hex(0x1a2b + i)[2:]}",               # _0x1a2b
    lambda i: f"{_LOWER[i % 26]}{i % 10}",               # a0
    lambda i: f"__{_LOWER[i % 26]}",                     # __a
    lambda i: f"${_LOWER[i % 26]}",                      # $a
    lambda i: _UPPER[i % 26],                            # A
    lambda i: f"{_LOWER[i % 26]}{_LOWER[(i + 1) % 26]}",  # ab
    lambda i: f"${i % 100}",                             # $0
    lambda i: f"_{i % 100}",                             # _0
    lambda i: f"t{i}",                                   # t0
    lambda i: f"e${_LOWER[i % 26]}",                     # e$a
    lambda i: f"n{i}",                                   # n0
    lambda i: f"r{_LOWER[i % 26]}",                      # ra
]
def random_minified():
    """Return a random minified-looking identifier.

    Draws an integer in [0, 500] and then a naming style from ``STYLES``
    (in that order, which fixes the sequence produced under a given seed),
    and applies the style to the integer.
    """
    index = random.randint(0, 500)
    return random.choice(STYLES)(index)
# Role tags keyed by leading word. A group contributes its tag at most once.
_PREFIX_TAGS = (
    (("is", "has"), "boolean"),
    (("get", "fetch"), "getter"),
    (("set",), "setter"),
    (("on", "handle"), "event"),
    (("create",), "factory"),
    (("parse",), "parse"),
    (("format",), "format"),
    (("validate",), "validate"),
)
# Role tags keyed by trailing (capitalised) word.
_SUFFIX_TAGS = (
    ("Error", "error"),
    ("Service", "service"),
    ("Handler", "handler"),
)


def semantic_context(name):
    """Generate context tokens from an identifier name.

    The name is split before every uppercase letter (camelCase / PascalCase
    boundaries). The pieces are lowercased and single-character pieces are
    dropped. Role tags are then appended, in a fixed order, for recognised
    leading and trailing words.

    A prefix only counts when it forms a whole leading word: the name is
    exactly the prefix, or the character after the prefix is not a lowercase
    letter (``isValid``, ``get_user``, ``on2``). A plain ``startswith`` check
    would tag unrelated names such as ``issue`` (boolean), ``settings``
    (setter), ``once`` (event) or ``createdAt`` (factory).

    Args:
        name: Identifier to analyse.

    Returns:
        A list of at most eight strings: the word pieces followed by any
        role tags. Empty when the name yields no piece longer than one
        character and no tag applies.
    """
    words = []
    current = ""
    for ch in name:
        # An uppercase letter closes the word collected so far.
        if ch.isupper() and current:
            words.append(current.lower())
            current = ch
        else:
            current += ch
    if current:
        words.append(current.lower())

    result = [word for word in words if len(word) > 1]

    def leads_with(prefix):
        # ``"".islower()`` is False, so a name equal to the prefix qualifies.
        return name.startswith(prefix) and not name[len(prefix):][:1].islower()

    for prefixes, tag in _PREFIX_TAGS:
        if any(leads_with(prefix) for prefix in prefixes):
            result.append(tag)
    for suffix, tag in _SUFFIX_TAGS:
        if name.endswith(suffix):
            result.append(tag)

    return result[:8]
def vary_context(ctx, variant):
    """Return a slightly altered copy of a context list.

    The alteration is selected by ``variant % 6``:

    * 0 - unchanged copy
    * 1 - rotated left by one element
    * 2 - leading ``max(2, len // 2)`` elements
    * 3 - copy with ``"prototype"`` appended
    * 4 - copy with ``"constructor"`` appended
    * 5 - reversed

    An empty (falsy) ``ctx`` yields ``["unknown"]``. The input itself is
    never modified.
    """
    if not ctx:
        return ["unknown"]

    items = list(ctx)
    mode = variant % 6

    extra_token = {3: "prototype", 4: "constructor"}
    if mode in extra_token:
        return items + [extra_token[mode]]
    if mode == 1:
        # For a single element the rotation is the list itself.
        return items[1:] + items[:1]
    if mode == 2:
        keep = max(2, len(items) // 2)
        return items[:keep]
    if mode == 5:
        return items[::-1]
    return items
# Step 1: Load and analyze existing training data
print("Loading existing training data...")
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (JSON text is UTF-8).
with open("training-data.jsonl", encoding="utf-8") as f:
    # One JSON object per line; blank lines are skipped.
    existing_pairs = [json.loads(line) for line in f if line.strip()]
print(f" Existing: {len(existing_pairs)} pairs")
# Step 2: Load real identifiers from v2 data
print("Loading real identifier pairs from v2...")
real_pairs = []
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (JSON text is UTF-8).
with open("training-data-v2.jsonl", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        p = json.loads(line)
        # Keep only records with at least two context strings and an
        # original name of three or more characters.
        if len(p.get("context_strings", [])) >= 2 and len(p["original"]) >= 3:
            real_pairs.append(p)
print(f" Real pairs with good context: {len(real_pairs)}")
# Step 3: Group by original name
# Existing pairs are added first, then the real pairs, so each name's list
# keeps that order.
by_original = defaultdict(list)
for source in (existing_pairs, real_pairs):
    for record in source:
        by_original[record["original"]].append(record)
print(f" Unique original names: {len(by_original)}")
# Step 4: Build optimal dataset
final_pairs = []
seen_keys = set()
for original, variants in by_original.items():
    # Names shorter than three characters are skipped.
    if len(original) < 3:
        continue

    # The record with the most context strings supplies the base context,
    # properties and kind for every variant of this name.
    best = max(variants, key=lambda rec: len(rec.get("context_strings", [])))
    base_ctx = best.get("context_strings", [])
    if len(base_ctx) == 0:
        # No recorded context at all: derive one from the name itself.
        base_ctx = semantic_context(original)
    base_props = best.get("properties", [])
    kind = best.get("kind", "var")

    # Generate 8 variants, each with its own minified name.
    for variant_no in range(8):
        # One initial draw plus up to 20 retries to find a
        # (minified, original) combination that has not been used yet.
        for _ in range(21):
            minified = random_minified()
            key = f"{minified}|{original}"
            if key not in seen_keys:
                break
        else:
            # Every draw collided; skip this variant.
            continue

        seen_keys.add(key)
        ctx = vary_context(base_ctx, variant_no)
        final_pairs.append({
            "minified": minified,
            "original": original,
            "context_strings": ctx[:8],
            "properties": base_props[:6],
            "kind": kind,
        })
print(f"\nGenerated {len(final_pairs)} pairs from {len(by_original)} unique names")
# Step 5: Augment
# Both probability draws happen for every pair, in this order, so the
# random sequence is stable under the fixed seed.
aug_pairs = []
for pair in final_pairs:
    strings = pair["context_strings"]

    # With probability 0.25: same context in shuffled order, new minified name.
    if random.random() < 0.25:
        shuffled = list(strings)
        random.shuffle(shuffled)
        aug_pairs.append(
            dict(pair, minified=random_minified(), context_strings=shuffled)
        )

    # With probability 0.15 (contexts longer than two only): a random subset
    # of max(2, len // 2) context strings, new minified name.
    if random.random() < 0.15 and len(strings) > 2:
        subset_size = max(2, len(strings) // 2)
        subset = random.sample(strings, subset_size)
        aug_pairs.append(
            dict(pair, minified=random_minified(), context_strings=subset)
        )
final_pairs.extend(aug_pairs)
print(f"After augmentation: {len(final_pairs)} pairs")
# Deduplicate
# A dict keeps insertion order, so setdefault retains the first pair seen for
# each "minified|original" key, in the order the pairs were first met.
first_by_key = {}
for pair in final_pairs:
    first_by_key.setdefault(f"{pair['minified']}|{pair['original']}", pair)
deduped = list(first_by_key.values())
seen2 = set(first_by_key)

random.shuffle(deduped)

# Cap at 25K
if len(deduped) > 25000:
    del deduped[25000:]
print(f"Final: {len(deduped)} pairs")
# Write the dataset as JSON Lines: one serialised pair per line.
with open(OUTPUT, "w") as out:
    out.writelines(json.dumps(pair) + "\n" for pair in deduped)
print(f"Wrote to {OUTPUT}")
# Summary: number of pairs per "kind" value, printed in sorted key order.
kinds = defaultdict(int)
for p in deduped:
    kinds[p["kind"]] += 1
for k, v in sorted(kinds.items()):
    print(f" {k}: {v}")

# Mean number of context strings per pair. An empty dataset (no usable input
# names) reports 0.0 instead of raising ZeroDivisionError.
if deduped:
    avg_ctx = sum(len(p["context_strings"]) for p in deduped) / len(deduped)
else:
    avg_ctx = 0.0
print(f"Avg context: {avg_ctx:.1f}")