ruvector/scripts/training/build-optimal-dataset-v2.py
rUv 36f2599774 feat(training): source map extraction + v2 model (83.67% val accuracy)
- Extract 14,198 training pairs from 6,941 source maps in node_modules
- Train v2 model (4-layer, 192-dim, 6-head transformer, 1.9M params)
- Val accuracy: 83.67% (up from 75.72%), exact match: 12.3% (up from 0.1%)
- Export weights.bin (7.3MB) for Rust runtime inference
- Add decompiler dashboard (React + Tailwind + Vite)
- Add runnable RVF (7,350 vectors, 49 segments, witness chain)
- Update evaluate-model.py to support configurable model architectures
- All 13 Rust tests pass, all 45 RVF files have valid SFVR headers

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-04-03 04:57:47 +00:00

266 lines
8 KiB
Python

#!/usr/bin/env python3
"""
Build optimal dataset v2: focus on fixing weaknesses from Round 1.
Round 1 analysis:
- Short names: 70.6% (good)
- Medium names: 56.9% (ok)
- Long names: 21.7% (bad - need more long name training data)
- Classes: 34.4% (bad - need more class examples with rich context)
- Model gets stuck repeating chars for long names
Fixes:
1. Increase variants for long names (13+ chars) from 8 to 15
2. Increase variants for class names from 8 to 12
3. Add character-level hints in context (first few chars of original)
   (NOTE: not implemented in the code below; no step injects such hints)
4. More diverse context for each name
"""
import json
import random
import sys
from collections import defaultdict
# Seed the RNG with a fixed value so runs are repeatable for the same inputs.
random.seed(42)

# Destination JSONL path: first CLI argument, or the default file name.
OUTPUT = sys.argv[1] if len(sys.argv) > 1 else "training-data-optimal-v2.jsonl"


def _lower(i):
    """Return the lowercase ASCII letter at index ``i``, wrapping every 26."""
    return chr(97 + (i % 26))


# Generators of minifier-style identifiers. Each entry maps an integer to a
# short name in one naming scheme; the example shows the output for i = 0.
STYLES = [
    _lower,                                   # a
    lambda i: f"{_lower(i)}$",                # a$
    lambda i: f"_{_lower(i)}",                # _a
    lambda i: "_0x" + hex(0x1a2b + i)[2:],    # _0x1a2b
    lambda i: f"{_lower(i)}{i % 10}",         # a0
    lambda i: f"__{_lower(i)}",               # __a
    lambda i: f"${_lower(i)}",                # $a
    lambda i: chr(65 + (i % 26)),             # A
    lambda i: _lower(i) + _lower(i + 1),      # ab
    lambda i: f"${i % 100}",                  # $0
    lambda i: f"_{i % 100}",                  # _0
    lambda i: f"t{i}",                        # t0
    lambda i: f"e${_lower(i)}",               # e$a
    lambda i: f"n{i}",                        # n0
    lambda i: f"r{_lower(i)}",                # ra
]
def random_minified():
    """Return a random minifier-style identifier.

    A naming scheme is drawn from ``STYLES`` first, then an index in
    ``[0, 500]``; the scheme is applied to that index.
    """
    style = random.choice(STYLES)
    index = random.randint(0, 500)
    return style(index)
# Leading verb/keyword -> role hint. Insertion order is the match priority.
_PREFIX_HINTS = {
    "is": "boolean", "has": "boolean", "can": "boolean",
    "get": "getter", "set": "setter", "fetch": "async",
    "on": "event", "handle": "handler", "create": "factory",
    "parse": "parser", "format": "formatter", "validate": "validator",
    "render": "component", "use": "hook", "with": "HOC",
    "init": "initialize", "load": "loader", "save": "persist",
    "update": "mutate", "delete": "remove", "find": "query",
    "connect": "connection", "send": "network", "receive": "network",
    "encode": "codec", "decode": "codec", "encrypt": "security",
    "hash": "crypto", "sign": "crypto", "verify": "auth",
    "emit": "event", "subscribe": "pubsub", "publish": "pubsub",
    "dispatch": "redux", "reduce": "reducer", "select": "selector",
}

# Trailing PascalCase word -> role hint. Insertion order is the match priority.
_SUFFIX_HINTS = {
    "Error": "error", "Exception": "exception",
    "Handler": "handler", "Listener": "listener",
    "Manager": "lifecycle", "Service": "service",
    "Controller": "controller", "Router": "routing",
    "Factory": "factory", "Builder": "builder",
    "Adapter": "adapter", "Wrapper": "wrapper",
    "Provider": "di", "Injector": "di",
    "Config": "configuration", "Options": "settings",
    "Result": "result", "Response": "http",
    "Request": "http", "Client": "client",
    "Server": "server", "Worker": "concurrent",
    "Queue": "datastructure", "Stack": "datastructure",
    "Cache": "caching", "Pool": "resource",
    "Stream": "streaming", "Buffer": "io",
    "Observer": "pattern", "Iterator": "pattern",
    "Validator": "validation", "Formatter": "formatting",
    "Serializer": "serialization", "Parser": "parsing",
}


def semantic_context(name):
    """Generate rich context from identifier name.

    The name is split at every uppercase letter, each piece is lowercased,
    and pieces of a single character are dropped. At most one prefix hint
    and one suffix hint are then appended.

    A prefix only counts when it is followed by an identifier boundary,
    i.e. the next character is not a lowercase letter (``getUser``,
    ``get_user``, ``get2``). Without that check a plain ``startswith``
    tags ordinary words (``island`` -> "boolean", ``user`` -> "hook") and
    lets ``"has"`` shadow ``"hash"`` so the latter can never match.

    Args:
        name: Identifier to analyse (camelCase or PascalCase expected).

    Returns:
        Up to 8 unique context strings, in first-seen order.
    """
    tokens = []
    current = ""
    for c in name:
        if c.isupper() and current:
            # An uppercase letter starts a new word; flush the previous one.
            tokens.append(current.lower())
            current = c
        else:
            current += c
    if current:
        tokens.append(current.lower())

    # Single-character pieces are dropped as noise.
    result = [t for t in tokens if len(t) > 1]

    # Pattern-based hints
    for prefix, hint in _PREFIX_HINTS.items():
        rest = name[len(prefix):]
        # Require something after the prefix, and a word boundary there.
        if name.startswith(prefix) and rest and not rest[0].islower():
            result.append(hint)
            break

    for suffix, hint in _SUFFIX_HINTS.items():
        if name.endswith(suffix):
            result.append(hint)
            break

    return list(dict.fromkeys(result))[:8]  # dedupe, keep order
# Number of distinct rewrite strategies implemented by vary_context.
_NUM_CONTEXT_STRATEGIES = 8


def vary_context(ctx, variant, num_variants=8):
    """Create diverse context variants.

    ``variant`` is reduced modulo ``num_variants`` and then mapped cyclically
    onto the eight strategies below. Without the second wrap, indices 8 and
    above (reachable when ``num_variants`` is 12 or 15) would all return the
    unmodified context.

    Strategies:
        0: unchanged copy
        1: rotate left by one
        2: leading half (at least two items)
        3: append "prototype"
        4: append "constructor"
        5: reversed
        6: append "function" and "object"
        7: first three items plus "module"

    Args:
        ctx: Base context strings; the input is never mutated.
        variant: Index of the variant being generated.
        num_variants: Number of variants the caller generates per name.

    Returns:
        A new list of context strings, or ``["unknown"]`` if ``ctx`` is empty.
    """
    if not ctx:
        return ["unknown"]
    ctx = list(ctx)
    v = (variant % num_variants) % _NUM_CONTEXT_STRATEGIES
    if v == 1:
        return ctx[1:] + ctx[:1] if len(ctx) > 1 else ctx
    if v == 2:
        return ctx[:max(2, len(ctx) // 2)]
    if v == 3:
        return ctx + ["prototype"]
    if v == 4:
        return ctx + ["constructor"]
    if v == 5:
        return list(reversed(ctx))
    if v == 6:
        # Add type hints
        return ctx + ["function", "object"]
    if v == 7:
        # Subset with extra semantic hint
        return ctx[:3] + ["module"]
    # v == 0: base context as-is
    return ctx
def _read_jsonl(path):
    """Return the parsed JSON object from every non-blank line of ``path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's locale encoding.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# Load data
print("Loading data...")
existing = _read_jsonl("training-data.jsonl")

# From the v2 file keep only pairs with at least two context strings and an
# original name of three or more characters.
real = [
    p
    for p in _read_jsonl("training-data-v2.jsonl")
    if len(p.get("context_strings", [])) >= 2 and len(p["original"]) >= 3
]
print(f" Existing: {len(existing)}, Real: {len(real)}")

# Group by original
by_original = defaultdict(list)
for p in existing + real:
    by_original[p["original"]].append(p)
print(f" Unique names: {len(by_original)}")
# Build dataset with emphasis on weaknesses: long names and classes get
# more variants per name than everything else.
final = []
seen = set()


def _context_count(record):
    """Return how many context strings a raw record carries."""
    return len(record.get("context_strings", []))


for original, variants in by_original.items():
    if len(original) < 3:
        continue

    # Use the record with the most context as the template for this name.
    best = max(variants, key=_context_count)
    kind = best.get("kind", "var")
    base_props = best.get("properties", [])
    base_ctx = best.get("context_strings", [])
    if len(base_ctx) == 0:
        # No recorded context: derive one from the name itself.
        base_ctx = semantic_context(original)

    # Determine number of variants based on difficulty
    if len(original) >= 13:
        num_variants = 15  # More examples for long names
    else:
        num_variants = 12 if kind == "class" else 8  # More for classes

    for variant_index in range(num_variants):
        # One initial draw plus up to 30 redraws to find an unused
        # (minified, original) pair; give up on this variant otherwise.
        for _ in range(31):
            minified = random_minified()
            key = f"{minified}|{original}"
            if key not in seen:
                break
        else:
            continue
        seen.add(key)

        ctx = vary_context(base_ctx, variant_index, num_variants)
        final.append({
            "minified": minified,
            "original": original,
            "context_strings": ctx[:8],
            "properties": base_props[:6],
            "kind": kind,
        })

print(f"\nBase pairs: {len(final)}")
# Augmentation: derive extra pairs from the base pairs. Each derived pair
# gets a freshly drawn minified name and a perturbed context list.
aug = []
for pair in final:
    contexts = pair["context_strings"]
    is_long = len(pair["original"]) >= 13

    # Context shuffle (30%)
    if random.random() < 0.3:
        shuffled = list(contexts)
        random.shuffle(shuffled)
        aug.append(dict(pair, minified=random_minified(), context_strings=shuffled))

    # Partial context (20%), only when more than two strings are available
    if random.random() < 0.2 and len(contexts) > 2:
        half = max(2, len(contexts) // 2)
        subset = random.sample(contexts, half)
        aug.append(dict(pair, minified=random_minified(), context_strings=subset))

    # For long names, extra augmentation (30%): shuffled and cut to 4 strings
    if is_long and random.random() < 0.3:
        reordered = list(contexts)
        random.shuffle(reordered)
        aug.append(dict(pair, minified=random_minified(), context_strings=reordered[:4]))

final.extend(aug)
# Deduplicate on the (minified, original) pair, keeping the first occurrence.
deduped = []
seen2 = set()
for record in final:
    pair_key = f"{record['minified']}|{record['original']}"
    if pair_key in seen2:
        continue
    seen2.add(pair_key)
    deduped.append(record)

random.shuffle(deduped)
print(f"Final: {len(deduped)} pairs")

# One JSON object per line.
with open(OUTPUT, "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in deduped)

# Summary counts by kind and by length of the original name.
kinds = defaultdict(int)
lengths = {"short": 0, "medium": 0, "long": 0}
for record in deduped:
    kinds[record["kind"]] += 1
    size = len(record["original"])
    bucket = "short" if size <= 5 else "medium" if size <= 12 else "long"
    lengths[bucket] += 1

print(f"\nBy kind: {dict(kinds)}")
print(f"By length: {lengths}")
print(f"Wrote to {OUTPUT}")