mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-05-25 23:24:03 +00:00
- Extract 14,198 training pairs from 6,941 source maps in node_modules
- Train v2 model (4-layer, 192-dim, 6-head transformer, 1.9M params)
- Val accuracy: 83.67% (up from 75.72%), exact match: 12.3% (up from 0.1%)
- Export weights.bin (7.3MB) for Rust runtime inference
- Add decompiler dashboard (React + Tailwind + Vite)
- Add runnable RVF (7,350 vectors, 49 segments, witness chain)
- Update evaluate-model.py to support configurable model architectures
- All 13 Rust tests pass, all 45 RVF files have valid SFVR headers

Co-Authored-By: claude-flow <ruv@ruv.net>
266 lines
8 KiB
Python
266 lines
8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Build optimal dataset v2: focus on fixing weaknesses from Round 1.
|
|
|
|
Round 1 analysis:
|
|
- Short names: 70.6% (good)
|
|
- Medium names: 56.9% (ok)
|
|
- Long names: 21.7% (bad - need more long name training data)
|
|
- Classes: 34.4% (bad - need more class examples with rich context)
|
|
- Model gets stuck repeating chars for long names
|
|
|
|
Fixes:
|
|
1. Increase variants for long names (13+ chars) from 8 to 15
|
|
2. Increase variants for class names from 8 to 12
|
|
3. Add character-level hints in context (first few chars of original)
|
|
4. More diverse context for each name
|
|
"""
|
|
|
|
import json
|
|
import random
|
|
import sys
|
|
from collections import defaultdict
|
|
|
|
# Seed the module-level RNG with a fixed value so repeated runs draw the same
# sequence of random numbers.
random.seed(42)

# Output path: first command-line argument when one is given, otherwise the
# default file name in the current directory.
try:
    OUTPUT = sys.argv[1]
except IndexError:
    OUTPUT = "training-data-optimal-v2.jsonl"
|
|
|
|
def _letter(i, shift=0):
    """Return the lowercase ASCII letter at index ``(i + shift) % 26``."""
    return chr(97 + ((i + shift) % 26))


# Name generators: each callable maps an integer to one flavour of short,
# minifier-looking identifier. Example outputs for small inputs are shown
# next to each entry.
STYLES = [
    lambda i: _letter(i),                          # a
    lambda i: f"{_letter(i)}$",                    # a$
    lambda i: f"_{_letter(i)}",                    # _a
    lambda i: f"_0x{hex(0x1a2b + i)[2:]}",         # _0x1a2b
    lambda i: f"{_letter(i)}{i % 10}",             # a0
    lambda i: f"__{_letter(i)}",                   # __a
    lambda i: f"${_letter(i)}",                    # $a
    lambda i: chr(65 + (i % 26)),                  # A
    lambda i: _letter(i) + _letter(i, shift=1),    # ab
    lambda i: f"${i % 100}",                       # $0
    lambda i: f"_{i % 100}",                       # _0
    lambda i: f"t{i}",                             # t0
    lambda i: f"e${_letter(i)}",                   # e$a
    lambda i: f"n{i}",                             # n0
    lambda i: f"r{_letter(i)}",                    # ra
]
|
|
|
|
|
|
def random_minified():
    """Return one randomly generated minifier-style identifier.

    A generator is picked at random from ``STYLES`` and then called with a
    random integer between 0 and 500 (both inclusive).
    """
    # Draw order is significant for seeded runs: generator first, then index.
    make_name = random.choice(STYLES)
    index = random.randint(0, 500)
    return make_name(index)
|
|
|
|
|
|
# Leading word of an identifier -> role hint. Iteration order is the match
# order used by semantic_context().
_PREFIX_HINTS = {
    "is": "boolean", "has": "boolean", "can": "boolean",
    "get": "getter", "set": "setter", "fetch": "async",
    "on": "event", "handle": "handler", "create": "factory",
    "parse": "parser", "format": "formatter", "validate": "validator",
    "render": "component", "use": "hook", "with": "HOC",
    "init": "initialize", "load": "loader", "save": "persist",
    "update": "mutate", "delete": "remove", "find": "query",
    "connect": "connection", "send": "network", "receive": "network",
    "encode": "codec", "decode": "codec", "encrypt": "security",
    "hash": "crypto", "sign": "crypto", "verify": "auth",
    "emit": "event", "subscribe": "pubsub", "publish": "pubsub",
    "dispatch": "redux", "reduce": "reducer", "select": "selector",
}

# Trailing word of an identifier -> role hint.
_SUFFIX_HINTS = {
    "Error": "error", "Exception": "exception",
    "Handler": "handler", "Listener": "listener",
    "Manager": "lifecycle", "Service": "service",
    "Controller": "controller", "Router": "routing",
    "Factory": "factory", "Builder": "builder",
    "Adapter": "adapter", "Wrapper": "wrapper",
    "Provider": "di", "Injector": "di",
    "Config": "configuration", "Options": "settings",
    "Result": "result", "Response": "http",
    "Request": "http", "Client": "client",
    "Server": "server", "Worker": "concurrent",
    "Queue": "datastructure", "Stack": "datastructure",
    "Cache": "caching", "Pool": "resource",
    "Stream": "streaming", "Buffer": "io",
    "Observer": "pattern", "Iterator": "pattern",
    "Validator": "validation", "Formatter": "formatting",
    "Serializer": "serialization", "Parser": "parsing",
}


def semantic_context(name):
    """Generate rich context from identifier name.

    The name is cut in front of every uppercase letter, each piece is
    lowercased, and single-character pieces are discarded. At most one hint
    from the prefix table and one from the suffix table are then appended.

    A prefix only matches when it is followed by at least one more character
    and that character is not a lowercase letter. This keeps ordinary words
    from being mistaken for verb prefixes (``island`` is not ``is`` + ...)
    and lets a longer prefix win over a shorter one it starts with
    (``hashPassword`` matches ``hash``, not ``has``).

    Args:
        name: Identifier string to analyse.

    Returns:
        A list of at most 8 unique strings, in first-seen order: the name's
        lowercase tokens followed by any matched hints.
    """
    tokens = []
    current = ""
    for c in name:
        if c.isupper() and current:
            # An uppercase letter closes the token collected so far.
            tokens.append(current.lower())
            current = c
        else:
            current += c
    if current:
        tokens.append(current.lower())
    result = [t for t in tokens if len(t) > 1]

    # Pattern-based hints
    for prefix, hint in _PREFIX_HINTS.items():
        if not name.startswith(prefix):
            continue
        # Empty when the name is exactly the prefix; such names get no hint.
        follower = name[len(prefix):len(prefix) + 1]
        if follower and not follower.islower():
            result.append(hint)
            break

    for suffix, hint in _SUFFIX_HINTS.items():
        if name.endswith(suffix):
            result.append(hint)
            break

    return list(dict.fromkeys(result))[:8]  # dedupe, keep order
|
|
|
|
|
|
# One transformation per variant slot; each receives a private copy of the
# context list and returns the list to use for that slot.
_CONTEXT_VARIATIONS = (
    lambda c: c,                              # 0: unchanged
    lambda c: c[1:] + c[:1],                  # 1: rotate left by one
    lambda c: c[:max(2, len(c) // 2)],        # 2: leading half (at least 2)
    lambda c: c + ["prototype"],              # 3
    lambda c: c + ["constructor"],            # 4
    lambda c: c[::-1],                        # 5: reversed
    lambda c: c + ["function", "object"],     # 6: add type hints
    lambda c: c[:3] + ["module"],             # 7: subset with extra hint
)


def vary_context(ctx, variant, num_variants=8):
    """Create diverse context variants.

    ``variant`` is first reduced modulo ``num_variants`` and then wrapped
    onto the 8 available transformations, so callers asking for more than 8
    variants cycle through the transformations again instead of receiving
    the unmodified context for every index from 8 upwards. Results for
    ``num_variants <= 8`` are unaffected by the wrap.

    Args:
        ctx: Sequence of context strings; it is copied, never mutated.
        variant: Integer index of the variant to produce.
        num_variants: Number of variants the caller generates per name.

    Returns:
        A new list of context strings, or ``["unknown"]`` when ``ctx`` is
        empty.
    """
    if not ctx:
        return ["unknown"]
    slot = (variant % num_variants) % len(_CONTEXT_VARIATIONS)
    return _CONTEXT_VARIATIONS[slot](list(ctx))
|
|
|
|
|
|
def _load_jsonl(path):
    """Return the JSON value parsed from each non-blank line of *path*."""
    # Read as UTF-8 explicitly: without it open() falls back to the locale
    # encoding, which can fail on non-ASCII text on some platforms.
    with open(path, encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


# Load data
print("Loading data...")
existing = _load_jsonl("training-data.jsonl")

# From the second file keep only records with at least two context strings
# and an original name of three or more characters.
real = [
    p
    for p in _load_jsonl("training-data-v2.jsonl")
    if len(p.get("context_strings", [])) >= 2 and len(p["original"]) >= 3
]

print(f" Existing: {len(existing)}, Real: {len(real)}")
|
|
|
|
# Group by original: collect every record under the original name it carries,
# walking the first data set and then the second.
by_original = defaultdict(list)
for source in (existing, real):
    for record in source:
        by_original[record["original"]].append(record)

print(f" Unique names: {len(by_original)}")
|
|
|
|
# Build dataset with emphasis on weaknesses
final = []
seen = set()

for original, candidates in by_original.items():
    # Names shorter than three characters are left out entirely.
    if len(original) < 3:
        continue

    # The record with the most context strings serves as the template.
    best = max(candidates, key=lambda rec: len(rec.get("context_strings", [])))
    base_props = best.get("properties", [])
    kind = best.get("kind", "var")
    # No recorded context: derive one from the name itself.
    base_ctx = best.get("context_strings", []) or semantic_context(original)

    # Long names get the most variants, then classes, then everything else.
    if len(original) >= 13:
        num_variants = 15
    else:
        num_variants = 12 if kind == "class" else 8

    for variant_index in range(num_variants):
        # One draw plus up to 30 redraws to find an unused (minified, original)
        # combination; give up on this variant if every draw is already taken.
        for _ in range(31):
            minified = random_minified()
            pair_key = f"{minified}|{original}"
            if pair_key not in seen:
                break
        else:
            continue
        seen.add(pair_key)

        varied = vary_context(base_ctx, variant_index, num_variants)
        final.append({
            "minified": minified,
            "original": original,
            "context_strings": varied[:8],
            "properties": base_props[:6],
            "kind": kind,
        })

print(f"\nBase pairs: {len(final)}")
|
|
|
|
# Augmentation
aug = []


def _with_fresh_name(pair, context):
    """Copy *pair*, replacing its minified name with a new draw and its context."""
    return dict(pair, minified=random_minified(), context_strings=context)


for pair in final:
    strings = pair["context_strings"]

    # Context shuffle (30%)
    if random.random() < 0.3:
        shuffled = list(strings)
        random.shuffle(shuffled)
        aug.append(_with_fresh_name(pair, shuffled))

    # Partial context (20%) -- the random draw happens even when the context
    # is too short to be sampled.
    roll = random.random()
    if roll < 0.2 and len(strings) > 2:
        sample_size = max(2, len(strings) // 2)
        aug.append(_with_fresh_name(pair, random.sample(strings, sample_size)))

    # For long names, extra augmentation (30%) -- no draw for shorter names.
    if len(pair["original"]) >= 13 and random.random() < 0.3:
        shuffled = list(strings)
        random.shuffle(shuffled)
        aug.append(_with_fresh_name(pair, shuffled[:4]))

final.extend(aug)
|
|
|
|
# Deduplicate: keep the first record seen for each (minified, original)
# combination, then randomise the order.
deduped = []
seen2 = set()
for candidate in final:
    identity = "{}|{}".format(candidate["minified"], candidate["original"])
    if identity in seen2:
        continue
    seen2.add(identity)
    deduped.append(candidate)

random.shuffle(deduped)

print(f"Final: {len(deduped)} pairs")
|
|
|
|
# Write one JSON object per line.
with open(OUTPUT, "w") as out:
    out.writelines(json.dumps(record) + "\n" for record in deduped)

# Summary counts: per kind, and per original-name length bucket
# (short: up to 5 chars, medium: 6-12, long: 13 or more).
kinds = defaultdict(int)
lengths = {"short": 0, "medium": 0, "long": 0}
for record in deduped:
    kinds[record["kind"]] += 1
    size = len(record["original"])
    bucket = "short" if size <= 5 else "medium" if size <= 12 else "long"
    lengths[bucket] += 1

print(f"\nBy kind: {dict(kinds)}")
print(f"By length: {lengths}")
print(f"Wrote to {OUTPUT}")
|