ruvector/scripts/training/release_gate.py
rUv f12e6c1584 feat: implement ADR-129 training pipeline and TurboQuant sidecar infra
Training tooling:
- release_gate.py: Automated 7-gate ship/no-ship checker (G1-G7)
- export_training_data.py: Dataset export with governance (schema,
  dedup, quality scoring, contamination check)
- contamination_check.py: 13-gram eval contamination detection
- run_calibration.py: Phase 1 imatrix + TurboQuant profiling
- run_sft.py: Phase 2 LoRA SFT + DPO training
- deploy_training.sh: Cloud Run job creation + Vertex AI setup
- Dockerfile: GPU training image (transformers + peft + trl)

Rust infrastructure:
- turboquant_profile.rs: .turboquant.json sidecar config loading,
  per-layer TQ config discovery, default profiles

Ref: ADR-129, #310

Co-Authored-By: claude-flow <ruv@ruv.net>
2026-03-28 02:27:32 +00:00

285 lines
9 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Release gate automation for RuvLTRA model training.
Implements the 7 ship/no-ship criteria from ADR-129 (Section 3.2).
A model version is approved for publishing only if ALL gates pass.
Usage:
python release_gate.py --model-path /path/to/model --results-dir /path/to/results
python release_gate.py --results-dir ./results # model-path is optional
Exit codes:
0 - All gates PASS (ship)
1 - One or more gates FAIL (no-ship)
"""
import argparse
import json
import sys
from pathlib import Path
# ---------------------------------------------------------------------------
# Threshold configuration per model size
# ---------------------------------------------------------------------------
def _size_thresholds(humaneval_pass1_absolute, tok_per_sec_min):
    """Build the gate limits for one model size.

    Only the absolute HumanEval target (G1) and the throughput floor (G7)
    differ between the supported sizes; every other limit is shared.
    """
    return {
        "humaneval_pass1_absolute": humaneval_pass1_absolute,  # G1: absolute target
        "humaneval_pass1_delta": 0.05,        # G1: >=5pp improvement
        "routing_accuracy_min": 0.80,         # G2: >=80%
        "wikitext2_ppl_increase_max": 0.05,   # G3: <5% increase
        "tq_compression_min": 8.0,            # G4: >=8x
        "tq_ppl_delta_max": 0.01,             # G4: <1%
        "long_context_ppl_max": 20.0,         # G5: <20 PPL at 16K
        "contamination_max": 0,               # G6: zero contamination
        "tok_per_sec_min": tok_per_sec_min,   # G7: minimum tok/s
    }


# Gate limits keyed by model size label.
THRESHOLDS = {
    # 0.5B: G1 >=45% absolute, G7 >=80 tok/s
    "0.5B": _size_thresholds(humaneval_pass1_absolute=0.45, tok_per_sec_min=80),
    # 3B: G1 >=55% absolute, G7 >=40 tok/s
    "3B": _size_thresholds(humaneval_pass1_absolute=0.55, tok_per_sec_min=40),
}
# ---------------------------------------------------------------------------
# Gate check functions
# ---------------------------------------------------------------------------
def check_g1(baseline, candidate, thresholds):
    """G1: Code quality - HumanEval pass@1.

    The gate passes when the candidate either reaches the absolute pass@1
    target OR improves on the baseline by at least the configured delta.

    Args:
        baseline: Mapping providing ``humaneval_pass1`` for the baseline model.
        candidate: Mapping providing ``humaneval_pass1`` for the candidate.
        thresholds: Mapping providing ``humaneval_pass1_absolute`` and
            ``humaneval_pass1_delta``.

    Returns:
        Tuple ``(passed, detail)``: a bool and a human-readable summary.

    Raises:
        KeyError: If a required key is missing from any of the mappings.
    """
    # Binary floats make an improvement of exactly the threshold come out
    # fractionally short (0.30 - 0.25 == 0.04999...), so allow a negligible
    # slack when comparing the computed difference.
    rounding_slack = 1e-9

    base_score = baseline["humaneval_pass1"]
    cand_score = candidate["humaneval_pass1"]
    delta = cand_score - base_score

    abs_threshold = thresholds["humaneval_pass1_absolute"]
    delta_threshold = thresholds["humaneval_pass1_delta"]

    # The absolute check compares two as-read values, so no slack is needed.
    meets_absolute = cand_score >= abs_threshold
    meets_delta = delta >= delta_threshold - rounding_slack
    passed = meets_absolute or meets_delta

    detail = (
        f"pass@1={cand_score:.1%} (baseline={base_score:.1%}, "
        f"delta={delta:+.1%}); "
        f"need >={abs_threshold:.0%} absolute OR >={delta_threshold:.0%} improvement"
    )
    return passed, detail
def check_g2(baseline, candidate, thresholds):
    """G2: Routing no-regression - candidate accuracy must reach the floor.

    Only the candidate's ``routing_accuracy`` is inspected; ``baseline`` is
    accepted so every gate check shares the same signature.

    Returns:
        Tuple ``(passed, detail)``.
    """
    floor = thresholds["routing_accuracy_min"]
    observed = candidate["routing_accuracy"]
    summary = f"routing_accuracy={observed:.1%}; need >={floor:.0%}"
    return observed >= floor, summary
def check_g3(baseline, candidate, thresholds):
    """G3: General no-regression - wikitext-2 perplexity increase < 5%.

    Computes the relative increase of the candidate's ``wikitext2_ppl`` over
    the baseline's and compares it to ``wikitext2_ppl_increase_max``.

    Args:
        baseline: Mapping providing the baseline ``wikitext2_ppl``.
        candidate: Mapping providing the candidate ``wikitext2_ppl``.
        thresholds: Mapping providing ``wikitext2_ppl_increase_max``.

    Returns:
        Tuple ``(passed, detail)``. The gate fails when the baseline
        perplexity is not a positive number, because no relative increase
        can be computed from it.

    Raises:
        KeyError: If a required key is missing from any of the mappings.
    """
    base_ppl = baseline["wikitext2_ppl"]
    cand_ppl = candidate["wikitext2_ppl"]
    max_increase = thresholds["wikitext2_ppl_increase_max"]

    # Fail closed: a zero, negative or NaN baseline must not be treated as
    # "no regression". ``not (x > 0)`` also catches NaN.
    if not (base_ppl > 0):
        detail = (
            f"wikitext2_ppl={cand_ppl:.2f} (baseline={base_ppl:.2f}); "
            f"baseline perplexity must be > 0 to measure the increase"
        )
        return False, detail

    pct_increase = (cand_ppl - base_ppl) / base_ppl
    passed = pct_increase < max_increase
    detail = (
        f"wikitext2_ppl={cand_ppl:.2f} (baseline={base_ppl:.2f}, "
        f"increase={pct_increase:+.2%}); "
        f"need <{max_increase:.0%} increase"
    )
    return passed, detail
def check_g4(baseline, candidate, thresholds):
    """G4: TurboQuant memory - compression ratio and perplexity delta.

    Passes only when the candidate's ``tq_compression`` reaches
    ``tq_compression_min`` and its ``tq_ppl_delta`` stays strictly below
    ``tq_ppl_delta_max``. ``baseline`` is not consulted.

    Returns:
        Tuple ``(passed, detail)``.
    """
    ratio = candidate["tq_compression"]
    drift = candidate["tq_ppl_delta"]
    ratio_floor = thresholds["tq_compression_min"]
    drift_ceiling = thresholds["tq_ppl_delta_max"]

    ratio_part = f"compression={ratio:.1f}x (need >={ratio_floor:.0f}x), "
    drift_part = f"ppl_delta={drift:.3%} (need <{drift_ceiling:.0%})"

    compact_enough = ratio >= ratio_floor
    accurate_enough = drift < drift_ceiling
    return (compact_enough and accurate_enough), ratio_part + drift_part
def check_g5(baseline, candidate, thresholds):
    """G5: Long context - candidate perplexity must stay under the cap.

    Compares the candidate's ``long_context_ppl`` against
    ``long_context_ppl_max`` (strictly less than). ``baseline`` is unused.

    Returns:
        Tuple ``(passed, detail)``.
    """
    cap = thresholds["long_context_ppl_max"]
    measured = candidate["long_context_ppl"]
    return (
        measured < cap,
        f"long_context_ppl={measured:.1f} PPL; need <{cap:.0f} PPL",
    )
def check_g6(baseline, candidate, thresholds):
    """G6: Contamination - contamination count must not exceed the limit.

    Reads the candidate's ``contamination_count`` and compares it with
    ``contamination_max``. ``baseline`` is unused.

    Returns:
        Tuple ``(passed, detail)``.
    """
    allowed = thresholds["contamination_max"]
    found = candidate["contamination_count"]
    within_limit = found <= allowed
    return within_limit, f"contamination_count={found}; need <={allowed}"
def check_g7(baseline, candidate, thresholds):
    """G7: Inference speed - candidate tok/s must reach the minimum.

    Compares the candidate's ``tok_per_sec`` with ``tok_per_sec_min``.
    ``baseline`` is unused.

    Returns:
        Tuple ``(passed, detail)``.
    """
    required = thresholds["tok_per_sec_min"]
    throughput = candidate["tok_per_sec"]
    fast_enough = throughput >= required
    return fast_enough, f"tok/s={throughput:.0f}; need >={required}"
# ---------------------------------------------------------------------------
# Gate runner
# ---------------------------------------------------------------------------
# Ordered registry of release gates as (gate id, display name, check function)
# tuples. Gate ids are derived from position, so G1..G7 always follow the
# listing order below.
GATES = [
    (f"G{position}", title, checker)
    for position, (title, checker) in enumerate(
        (
            ("Code quality (HumanEval pass@1)", check_g1),
            ("Routing no-regression", check_g2),
            ("General no-regression (wikitext-2 PPL)", check_g3),
            ("TurboQuant memory", check_g4),
            ("Long context", check_g5),
            ("Contamination", check_g6),
            ("Inference speed", check_g7),
        ),
        start=1,
    )
]
def run_gates(data):
    """Evaluate every registered gate against the supplied results.

    Args:
        data: Mapping with ``model_size``, ``baseline`` and ``candidate``
            entries.

    Returns:
        A list with one dict per gate, in ``GATES`` order, holding the keys
        ``gate``, ``name``, ``passed`` and ``detail``.

    Exits the process with status 1 (after printing an error to stderr) when
    ``model_size`` has no entry in ``THRESHOLDS``.
    """
    size = data["model_size"]
    if size in THRESHOLDS:
        limits = THRESHOLDS[size]
    else:
        known_sizes = ", ".join(sorted(THRESHOLDS.keys()))
        print(
            f"ERROR: Unknown model_size '{size}'. "
            f"Supported: {known_sizes}",
            file=sys.stderr,
        )
        sys.exit(1)

    reference = data["baseline"]
    contender = data["candidate"]

    # Lazily run each check in registry order, then shape the report rows.
    outcomes = (
        (gate_id, title, checker(reference, contender, limits))
        for gate_id, title, checker in GATES
    )
    return [
        {"gate": gate_id, "name": title, "passed": verdict, "detail": note}
        for gate_id, title, (verdict, note) in outcomes
    ]
def print_results(results, model_size):
    """Write the gate report to stdout and return the overall outcome.

    Args:
        results: Iterable of dicts with ``gate``, ``name``, ``passed`` and
            ``detail`` keys (the shape produced by the gate runner).
        model_size: Label shown in the report header.

    Returns:
        True when every entry passed (also for an empty list), else False.
    """
    heavy_rule = "=" * 72

    print(heavy_rule)
    print(f" RuvLTRA Release Gate Report | Model size: {model_size}")
    print(heavy_rule)

    for entry in results:
        # Failing gates are flagged with a ">" marker in the left margin.
        if entry["passed"]:
            marker, status = " ", "PASS"
        else:
            marker, status = ">", "FAIL"
        print(f" {marker} [{status}] {entry['gate']}: {entry['name']}")
        print(f" {entry['detail']}")

    shippable = all(entry["passed"] for entry in results)

    print("-" * 72)
    if shippable:
        verdict = "PASS -- ship approved"
    else:
        verdict = "FAIL -- do not ship"
    print(f" Verdict: {verdict}")
    print(heavy_rule)
    return shippable
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point for the release gate checker.

    Loads ``gate_results.json`` from ``--results-dir``, runs every gate,
    prints the report and optionally writes a JSON copy to ``--output-json``.

    Exits with status 0 when all gates pass and 1 otherwise, including when
    the results file is missing, unreadable, not valid JSON, or lacks the
    required top-level sections.
    """
    parser = argparse.ArgumentParser(
        description="RuvLTRA release gate checker (ADR-129 Section 3.2)",
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default=None,
        help="Path to the model directory (informational, logged in output)",
    )
    parser.add_argument(
        "--results-dir",
        type=str,
        required=True,
        help="Directory containing gate_results.json",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Optional path to write JSON report",
    )
    args = parser.parse_args()

    results_file = Path(args.results_dir) / "gate_results.json"
    if not results_file.exists():
        print(
            f"ERROR: {results_file} not found. "
            f"Run evaluation scripts first to generate gate results.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Read as UTF-8 explicitly so parsing does not depend on the host locale,
    # and report unreadable or malformed input as a clean no-ship error.
    # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
    try:
        with open(results_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        print(f"ERROR: Could not read {results_file}: {exc}", file=sys.stderr)
        sys.exit(1)

    # The gate runner indexes these sections directly; check them up front
    # so a truncated results file yields a readable message.
    required_sections = ("model_size", "baseline", "candidate")
    if not isinstance(data, dict):
        print(
            f"ERROR: {results_file} must contain a JSON object.",
            file=sys.stderr,
        )
        sys.exit(1)
    missing = [key for key in required_sections if key not in data]
    if missing:
        print(
            f"ERROR: {results_file} is missing required key(s): "
            f"{', '.join(missing)}",
            file=sys.stderr,
        )
        sys.exit(1)

    if args.model_path:
        print(f"Model: {args.model_path}")

    results = run_gates(data)
    all_passed = print_results(results, data["model_size"])

    if args.output_json:
        report = {
            "model_size": data["model_size"],
            "model_path": args.model_path,
            "verdict": "PASS" if all_passed else "FAIL",
            "gates": results,
        }
        output_path = Path(args.output_json)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"\nJSON report written to: {output_path}")

    sys.exit(0 if all_passed else 1)


if __name__ == "__main__":
    main()