# ruvector/scripts/training/release_gate.py

#!/usr/bin/env python3
"""
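Release gate automation for RuvLTRA model training.

Implements the 7 ship/no-ship criteria from ADR-129 (Section 3.2).
A model version is approved for publishing only if ALL gates pass.

Input:
    <results-dir>/gate_results.json, a JSON object with the top-level keys
    "model_size", "baseline", and "candidate".

Usage:
    python release_gate.py --model-path /path/to/model --results-dir /path/to/results
    python release_gate.py --results-dir ./results  # --model-path is optional
    python release_gate.py --results-dir ./results --output-json ./report.json

Exit codes:
    0 - All gates PASS (ship)
    1 - One or more gates FAIL (no-ship), or the results file is missing
        or names an unsupported model size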
"""

import argparse
import json
import sys
from pathlib import Path


# ---------------------------------------------------------------------------
# Threshold configuration per model size
# ---------------------------------------------------------------------------

def _thresholds_for(humaneval_abs, tok_per_sec):
    """Build the gate threshold table for a single model size.

    Only the G1 absolute pass@1 floor and the G7 throughput floor differ
    between sizes; every other limit is shared.
    """
    return {
        "humaneval_pass1_absolute": humaneval_abs,  # G1: absolute pass@1 floor
        "humaneval_pass1_delta": 0.05,              # G1: >=5pp improvement
        "routing_accuracy_min": 0.80,               # G2: >=80%
        "wikitext2_ppl_increase_max": 0.05,         # G3: <5% increase
        "tq_compression_min": 8.0,                  # G4: >=8x
        "tq_ppl_delta_max": 0.01,                   # G4: <1%
        "long_context_ppl_max": 20.0,               # G5: <20 PPL at 16K
        "contamination_max": 0,                     # G6: zero contamination
        "tok_per_sec_min": tok_per_sec,             # G7: tok/s floor
    }


# Per-size thresholds, keyed by the "model_size" value of the results file.
THRESHOLDS = {
    "0.5B": _thresholds_for(humaneval_abs=0.45, tok_per_sec=80),  # G1 >=45%, G7 >=80 tok/s
    "3B": _thresholds_for(humaneval_abs=0.55, tok_per_sec=40),    # G1 >=55%, G7 >=40 tok/s
}


# ---------------------------------------------------------------------------
# Gate check functions
# ---------------------------------------------------------------------------

def check_g1(baseline, candidate, thresholds):
    """G1: Code quality gate on HumanEval pass@1.

    The gate passes when the candidate either reaches the absolute pass@1
    floor or improves on the baseline by at least the required delta.

    Args:
        baseline: Mapping holding the baseline "humaneval_pass1" score.
        candidate: Mapping holding the candidate "humaneval_pass1" score.
        thresholds: Mapping with "humaneval_pass1_absolute" and
            "humaneval_pass1_delta".

    Returns:
        A ``(passed, detail)`` tuple: the verdict and a human-readable summary.
    """
    reference = baseline["humaneval_pass1"]
    score = candidate["humaneval_pass1"]
    gain = score - reference
    floor = thresholds["humaneval_pass1_absolute"]
    min_gain = thresholds["humaneval_pass1_delta"]

    # Either criterion is sufficient; the delta is consulted only when the
    # absolute floor is missed.
    if score >= floor:
        passed = True
    else:
        passed = gain >= min_gain

    observed = (
        f"pass@1={score:.1%} (baseline={reference:.1%}, "
        f"delta={gain:+.1%})"
    )
    requirement = (
        f"need >={floor:.0%} absolute OR >={min_gain:.0%} improvement"
    )
    return passed, f"{observed}; {requirement}"


def check_g2(baseline, candidate, thresholds):
    """G2: Routing no-regression gate.

    Args:
        baseline: Unused; accepted so every gate shares one signature.
        candidate: Mapping holding the candidate "routing_accuracy".
        thresholds: Mapping with "routing_accuracy_min".

    Returns:
        A ``(passed, detail)`` tuple; passes when the accuracy is at or
        above the minimum.
    """
    observed = candidate["routing_accuracy"]
    floor = thresholds["routing_accuracy_min"]
    summary = f"routing_accuracy={observed:.1%}; need >={floor:.0%}"
    return observed >= floor, summary


def check_g3(baseline, candidate, thresholds):
    """G3: General no-regression - wikitext-2 perplexity increase < 5%.

    Args:
        baseline: Mapping holding the baseline "wikitext2_ppl".
        candidate: Mapping holding the candidate "wikitext2_ppl".
        thresholds: Mapping with "wikitext2_ppl_increase_max" (a fraction).

    Returns:
        A ``(passed, detail)`` tuple. The gate fails closed: when either
        perplexity is not a positive number the relative increase cannot be
        verified, so the gate reports FAIL instead of silently passing.
    """
    base_ppl = baseline["wikitext2_ppl"]
    cand_ppl = candidate["wikitext2_ppl"]
    max_increase = thresholds["wikitext2_ppl_increase_max"]

    # Perplexity is strictly positive by definition, so a value <= 0 (or NaN)
    # means the evaluation data is broken. `not (x > 0)` is used instead of
    # `x <= 0` because it is also true for NaN.
    if not (base_ppl > 0) or not (cand_ppl > 0):
        detail = (
            f"wikitext2_ppl={cand_ppl:.2f} (baseline={base_ppl:.2f}); "
            f"invalid perplexity (must be >0), "
            f"cannot verify <{max_increase:.0%} increase"
        )
        return False, detail

    pct_increase = (cand_ppl - base_ppl) / base_ppl

    passed = pct_increase < max_increase
    detail = (
        f"wikitext2_ppl={cand_ppl:.2f} (baseline={base_ppl:.2f}, "
        f"increase={pct_increase:+.2%}); "
        f"need <{max_increase:.0%} increase"
    )
    return passed, detail


def check_g4(baseline, candidate, thresholds):
    """G4: TurboQuant memory gate.

    Requires both a compression ratio at or above the minimum and a
    perplexity delta strictly below the maximum.

    Args:
        baseline: Unused; accepted so every gate shares one signature.
        candidate: Mapping holding "tq_compression" and "tq_ppl_delta".
        thresholds: Mapping with "tq_compression_min" and "tq_ppl_delta_max".

    Returns:
        A ``(passed, detail)`` tuple.
    """
    ratio = candidate["tq_compression"]
    drift = candidate["tq_ppl_delta"]
    ratio_floor = thresholds["tq_compression_min"]
    drift_cap = thresholds["tq_ppl_delta_max"]

    # The perplexity delta only matters once the compression target is met.
    ratio_ok = ratio >= ratio_floor
    passed = drift < drift_cap if ratio_ok else False

    compression_part = f"compression={ratio:.1f}x (need >={ratio_floor:.0f}x)"
    drift_part = f"ppl_delta={drift:.3%} (need <{drift_cap:.0%})"
    return passed, f"{compression_part}, {drift_part}"


def check_g5(baseline, candidate, thresholds):
    """G5: Long-context gate - perplexity must stay below the ceiling.

    Args:
        baseline: Unused; accepted so every gate shares one signature.
        candidate: Mapping holding the candidate "long_context_ppl".
        thresholds: Mapping with "long_context_ppl_max".

    Returns:
        A ``(passed, detail)`` tuple; passes only when the perplexity is
        strictly below the maximum.
    """
    observed = candidate["long_context_ppl"]
    ceiling = thresholds["long_context_ppl_max"]
    return (
        observed < ceiling,
        f"long_context_ppl={observed:.1f} PPL; need <{ceiling:.0f} PPL",
    )


def check_g6(baseline, candidate, thresholds):
    """G6: Contamination gate - eval contamination must not exceed the cap.

    Args:
        baseline: Unused; accepted so every gate shares one signature.
        candidate: Mapping holding the candidate "contamination_count".
        thresholds: Mapping with "contamination_max".

    Returns:
        A ``(passed, detail)`` tuple.
    """
    hits = candidate["contamination_count"]
    allowed = thresholds["contamination_max"]
    within_limit = hits <= allowed
    return within_limit, f"contamination_count={hits}; need <={allowed}"


def check_g7(baseline, candidate, thresholds):
    """G7: Inference speed - tok/s above minimum.

    Args:
        baseline: Unused; accepted so every gate shares one signature.
        candidate: Mapping holding the candidate "tok_per_sec".
        thresholds: Mapping with "tok_per_sec_min".

    Returns:
        A ``(passed, detail)`` tuple; passes when the measured speed is at
        or above the minimum.
    """
    speed = candidate["tok_per_sec"]
    minimum = thresholds["tok_per_sec_min"]

    passed = speed >= minimum
    # One decimal place: rounding to an integer would print a failing
    # 39.6 tok/s as "40" right next to "need >=40", contradicting the verdict.
    detail = f"tok/s={speed:.1f}; need >={minimum}"
    return passed, detail


# ---------------------------------------------------------------------------
# Gate runner
# ---------------------------------------------------------------------------

# Ordered registry of (gate id, display name, check function). Ids G1..G7 are
# derived from each entry's position, so the order of this table is significant.
GATES = [
    (f"G{position}", title, check)
    for position, (title, check) in enumerate(
        (
            ("Code quality (HumanEval pass@1)", check_g1),
            ("Routing no-regression", check_g2),
            ("General no-regression (wikitext-2 PPL)", check_g3),
            ("TurboQuant memory", check_g4),
            ("Long context", check_g5),
            ("Contamination", check_g6),
            ("Inference speed", check_g7),
        ),
        start=1,
    )
]


def run_gates(data):
    """Evaluate every registered release gate against the loaded results.

    Args:
        data: Mapping with "model_size", "baseline", and "candidate" keys.

    Returns:
        A list with one dict per gate, in registry order, each carrying the
        keys "gate", "name", "passed", and "detail".

    Exits the process with status 1 (after printing an error to stderr) when
    the model size has no threshold table.
    """
    size = data["model_size"]
    limits = THRESHOLDS.get(size)
    if limits is None:
        known_sizes = ", ".join(sorted(THRESHOLDS))
        print(
            f"ERROR: Unknown model_size '{size}'. "
            f"Supported: {known_sizes}",
            file=sys.stderr,
        )
        sys.exit(1)

    reference = data["baseline"]
    contender = data["candidate"]

    outcomes = []
    for gate_id, gate_name, check_fn in GATES:
        verdict, summary = check_fn(reference, contender, limits)
        outcomes.append(
            dict(gate=gate_id, name=gate_name, passed=verdict, detail=summary)
        )
    return outcomes


def print_results(results, model_size):
    """Print the gate report to stdout and return the overall verdict.

    Args:
        results: Iterable of gate result dicts with the keys "gate", "name",
            "passed", and "detail".
        model_size: Model size label shown in the report header.

    Returns:
        True when no gate failed (including an empty result list), else False.
    """
    heavy_rule = "=" * 72
    print(heavy_rule)
    print(f"  RuvLTRA Release Gate Report  |  Model size: {model_size}")
    print(heavy_rule)

    failures = 0
    for entry in results:
        if entry["passed"]:
            status, marker = "PASS", " "
        else:
            # Failing gates get a ">" marker so they stand out in the listing.
            status, marker = "FAIL", ">"
            failures += 1
        print(f"  {marker} [{status}] {entry['gate']}: {entry['name']}")
        print(f"          {entry['detail']}")

    all_passed = failures == 0
    print("-" * 72)
    if all_passed:
        verdict = "PASS -- ship approved"
    else:
        verdict = "FAIL -- do not ship"
    print(f"  Verdict: {verdict}")
    print(heavy_rule)

    return all_passed


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: load gate results, run the gates, report.

    Reads ``gate_results.json`` from ``--results-dir``, evaluates all gates,
    prints the report, optionally writes a JSON report to ``--output-json``,
    and exits with status 0 when every gate passed or 1 otherwise. Input
    problems (missing, unreadable, or malformed results file) are reported
    on stderr and also exit with status 1.
    """
    parser = argparse.ArgumentParser(
        description="RuvLTRA release gate checker (ADR-129 Section 3.2)",
    )
    parser.add_argument(
        "--model-path",
        type=str,
        default=None,
        help="Path to the model directory (informational, logged in output)",
    )
    parser.add_argument(
        "--results-dir",
        type=str,
        required=True,
        help="Directory containing gate_results.json",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Optional path to write JSON report",
    )
    args = parser.parse_args()

    results_file = Path(args.results_dir) / "gate_results.json"
    if not results_file.exists():
        print(
            f"ERROR: {results_file} not found. "
            f"Run evaluation scripts first to generate gate results.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Explicit UTF-8 keeps parsing independent of the platform locale.
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
    try:
        with open(results_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        print(f"ERROR: could not read {results_file}: {exc}", file=sys.stderr)
        sys.exit(1)

    if args.model_path:
        print(f"Model: {args.model_path}")

    # A missing key, a non-mapping payload, or a non-numeric metric would
    # otherwise surface as a raw traceback; report it as a no-ship error.
    try:
        results = run_gates(data)
        model_size = data["model_size"]
    except (KeyError, TypeError, ValueError) as exc:
        print(
            f"ERROR: malformed gate results in {results_file}: {exc!r}",
            file=sys.stderr,
        )
        sys.exit(1)

    all_passed = print_results(results, model_size)

    if args.output_json:
        report = {
            "model_size": model_size,
            "model_path": args.model_path,
            "verdict": "PASS" if all_passed else "FAIL",
            "gates": results,
        }
        output_path = Path(args.output_json)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"\nJSON report written to: {output_path}")

    sys.exit(0 if all_passed else 1)


if __name__ == "__main__":
    main()