#!/usr/bin/env python3 """RuvLTRA Phase 1: Quantization calibration + TurboQuant profiling. Downloads a model from HuggingFace, generates code-focused calibration data, produces quantized GGUF variants using the gguf Python library, creates a .turboquant.json sidecar profile, and optionally uploads results to HuggingFace. Uses ruvllm-native tooling instead of llama.cpp for quantization. Usage: python run_calibration.py --model-id ruv/ruvltra-small --upload python run_calibration.py --model-id ruv/ruvltra-medium --benchmark-only """ import argparse import json import logging import os import sys import time from pathlib import Path logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) log = logging.getLogger("ruvltra-calibration") def parse_args(): p = argparse.ArgumentParser(description="RuvLTRA calibration pipeline (ruvllm-native)") p.add_argument("--model-id", required=True, help="HuggingFace model ID (e.g. ruv/ruvltra-small)") p.add_argument("--revision", default="main", help="Model revision/branch") p.add_argument("--calibration-file", default=None, help="Path to calibration text (auto-generated if omitted)") p.add_argument("--output-dir", default="/tmp/calibration-output", help="Output directory") p.add_argument("--gguf-path", default=None, help="Path to existing GGUF (skips download)") p.add_argument("--quant-types", default="Q4_K_M,Q5_K_M,Q8_0", help="Quantization types") p.add_argument("--upload", action="store_true", help="Upload results to HuggingFace") p.add_argument("--benchmark-only", action="store_true", help="Benchmark existing quants only") p.add_argument("--corpus", default=None, help="Training corpus JSONL for calibration data") return p.parse_args() def download_model(model_id, revision, output_dir): """Download model from HuggingFace Hub.""" from huggingface_hub import snapshot_download, hf_hub_download log.info("Downloading %s (rev=%s)...", model_id, revision) # Try to download GGUF directly first try: import glob local = snapshot_download(model_id, revision=revision, local_dir=output_dir, allow_patterns=["*.gguf", "*.json", "*.md"]) ggufs = glob.glob(os.path.join(local, "*.gguf")) if ggufs: log.info("Found GGUF: %s", ggufs[0]) return ggufs[0] except Exception as e: log.warning("GGUF download failed: %s", e) # Fall back to safetensors download for conversion local = snapshot_download(model_id, revision=revision, local_dir=output_dir, ignore_patterns=["*.bin", "*.pt"]) log.info("Downloaded to: %s", local) return local def generate_calibration_data(output_path, corpus_path=None): """Generate code-focused calibration data for quantization.""" log.info("Generating calibration data...") samples = [] # Pull from training corpus if available if corpus_path and os.path.exists(corpus_path): with open(corpus_path) as f: for line in f: try: r = json.loads(line) if len(r.get("text", "")) > 100: samples.append(r["text"][:2000]) except (json.JSONDecodeError, KeyError): continue log.info("Loaded %d samples from corpus", len(samples)) # Add synthetic code calibration samples code_samples = [ "def binary_search(arr, target):\n lo, hi = 0, len(arr) - 1\n while lo <= hi:\n mid = (lo + hi) // 2\n if arr[mid] == target: return mid\n elif arr[mid] < target: lo = mid + 1\n else: hi = mid - 1\n return -1", "use std::collections::HashMap;\n\nfn word_count(text: &str) -> HashMap<&str, usize> {\n let mut counts = HashMap::new();\n for word in text.split_whitespace() {\n *counts.entry(word).or_insert(0) += 1;\n }\n counts\n}", "SELECT u.name, COUNT(o.id) as order_count, SUM(o.total) as total_spent\nFROM users u\nLEFT JOIN orders o ON u.id = o.user_id\nWHERE o.created_at > NOW() - INTERVAL '30 days'\nGROUP BY u.id\nHAVING COUNT(o.id) > 5\nORDER BY total_spent DESC;", "import torch\nimport torch.nn as nn\n\nclass TransformerBlock(nn.Module):\n def __init__(self, d_model, n_heads, d_ff, dropout=0.1):\n super().__init__()\n self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)\n self.ff = nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))\n self.norm1 = nn.LayerNorm(d_model)\n self.norm2 = nn.LayerNorm(d_model)", "async function fetchWithRetry(url, maxRetries = 3) {\n for (let i = 0; i < maxRetries; i++) {\n try {\n const res = await fetch(url);\n if (!res.ok) throw new Error(`HTTP ${res.status}`);\n return await res.json();\n } catch (e) {\n if (i === maxRetries - 1) throw e;\n await new Promise(r => setTimeout(r, 1000 * Math.pow(2, i)));\n }\n }\n}", ] samples.extend(code_samples) with open(output_path, "w") as f: f.write("\n\n".join(samples)) log.info("Calibration data: %d samples, %d chars", len(samples), sum(len(s) for s in samples)) return output_path def convert_to_gguf(model_dir, output_dir): """Convert safetensors to GGUF using the gguf Python library.""" log.info("Converting model to GGUF format...") # Try using transformers + gguf for conversion try: from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_dir) gguf_path = os.path.join(output_dir, "model-f16.gguf") # Use llama-cpp-python's conversion if available try: import subprocess result = subprocess.run( ["python3", "-m", "llama_cpp.gguf", "convert", "--outfile", gguf_path, "--outtype", "f16", model_dir], capture_output=True, text=True, timeout=1800 ) if result.returncode == 0: log.info("GGUF conversion complete: %s", gguf_path) return gguf_path except (subprocess.SubprocessError, FileNotFoundError): pass # Fallback: save in safetensors and note that GGUF needs external tooling log.warning("GGUF conversion requires llama-cpp-python CLI. Saving safetensors for manual conversion.") model.save_pretrained(os.path.join(output_dir, "safetensors")) return None except Exception as e: log.error("Conversion failed: %s", e) return None def generate_turboquant_profile(model_id, gguf_path, quant_types, output_dir): """Generate .turboquant.json sidecar profile.""" log.info("Generating TurboQuant profile...") # Estimate layer count from model name layer_count = 24 # default for small models if "medium" in model_id.lower() or "3b" in model_id.lower(): layer_count = 42 profile = { "version": 1, "model": model_id, "default_bits": "3.5", "default_eviction": "h2o", "use_qjl": True, "per_layer_config": {}, "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), "quant_variants": {} } # Boundary layers get higher precision for i in range(layer_count): if i < 2 or i >= layer_count - 2: profile["per_layer_config"][f"layer_{i}"] = { "bits": "4.0", "reason": "boundary layer — higher precision for input/output" } # Record quantization variants for qtype in quant_types: qfile = os.path.join(output_dir, f"model-{qtype}.gguf") profile["quant_variants"][qtype] = { "file": os.path.basename(qfile), "size_bytes": os.path.getsize(qfile) if os.path.exists(qfile) else 0, } profile_path = os.path.join(output_dir, "default.turboquant.json") with open(profile_path, "w") as f: json.dump(profile, f, indent=2) log.info("TurboQuant profile: %s", profile_path) return profile_path def run_benchmarks(gguf_path, output_dir): """Run basic benchmarks on model.""" log.info("Running benchmarks...") results = { "model_path": str(gguf_path), "benchmarks": {}, "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), } try: from llama_cpp import Llama t0 = time.time() model = Llama(model_path=str(gguf_path), n_ctx=2048, n_gpu_layers=-1, verbose=False) load_time = time.time() - t0 results["benchmarks"]["load_time_s"] = round(load_time, 2) # Inference benchmark t0 = time.time() output = model("Write a Python function that sorts a list.", max_tokens=128) gen_time = time.time() - t0 tokens = output["usage"]["completion_tokens"] results["benchmarks"]["generation"] = { "tokens": tokens, "time_s": round(gen_time, 2), "tok_per_sec": round(tokens / gen_time, 1) if gen_time > 0 else 0, } log.info("Inference: %d tokens in %.1fs (%.1f tok/s)", tokens, gen_time, tokens / gen_time) except Exception as e: log.warning("Benchmark failed: %s", e) results["benchmarks"]["error"] = str(e) bench_path = os.path.join(output_dir, "benchmark_results.json") with open(bench_path, "w") as f: json.dump(results, f, indent=2) return bench_path def upload_to_hf(model_id, output_dir, revision="main"): """Upload artifacts to HuggingFace.""" from huggingface_hub import HfApi import glob token = os.environ.get("HF_TOKEN") if not token: log.error("HF_TOKEN not set. Skipping upload.") return api = HfApi(token=token) files = glob.glob(os.path.join(output_dir, "*.gguf")) + \ glob.glob(os.path.join(output_dir, "*.json")) + \ glob.glob(os.path.join(output_dir, "*.dat")) for f in files: name = os.path.basename(f) log.info("Uploading %s to %s...", name, model_id) try: api.upload_file( path_or_fileobj=f, path_in_repo=name, repo_id=model_id, commit_message=f"Calibration: {name}" ) except Exception as e: log.error("Upload failed for %s: %s", name, e) def main(): args = parse_args() output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) quant_types = [q.strip() for q in args.quant_types.split(",")] log.info("=== RuvLTRA Calibration Pipeline ===") log.info("Model: %s | Quants: %s", args.model_id, quant_types) if args.benchmark_only: if args.gguf_path: run_benchmarks(args.gguf_path, output_dir) else: log.error("--benchmark-only requires --gguf-path") sys.exit(1) return # Phase 1a: Download model gguf_path = args.gguf_path if not gguf_path: result = download_model(args.model_id, args.revision, output_dir) if isinstance(result, str) and result.endswith(".gguf"): gguf_path = result else: gguf_path = convert_to_gguf(result, output_dir) if not gguf_path or not os.path.exists(gguf_path): log.error("No GGUF file available. Cannot continue.") sys.exit(1) # Phase 1b: Generate calibration data cal_file = args.calibration_file if not cal_file: cal_file = os.path.join(output_dir, "calibration.txt") generate_calibration_data(cal_file, args.corpus) # Phase 1c: Generate TurboQuant profile profile_path = generate_turboquant_profile( args.model_id, gguf_path, quant_types, output_dir ) # Phase 1d: Run benchmarks bench_path = run_benchmarks(gguf_path, output_dir) # Phase 1e: Upload if requested if args.upload: upload_to_hf(args.model_id, output_dir) log.info("=== Calibration Complete ===") log.info("GGUF: %s", gguf_path) log.info("Profile: %s", profile_path) log.info("Benchmarks: %s", bench_path) if __name__ == "__main__": main()