#!/usr/bin/env python3
"""
Export a trained deobfuscation model to GGUF Q4 format and package it
into an RVF container with an OVERLAY segment.

Pipeline:
  1. Load PyTorch checkpoint
  2. Export to ONNX (if not already done)
  3. Quantize weights to INT8 / Q4
  4. Write GGUF Q4 file for RuvLLM inference
  5. Create RVF container with OVERLAY segment containing the weights

Usage:
    python export-to-rvf.py --checkpoint model/best_model.pt --output model/deobfuscator
    python export-to-rvf.py --checkpoint model/best_model.pt --output model/deobfuscator --quantize q4
"""

import argparse
import hashlib
import json
import os
import struct
import time
from pathlib import Path

import torch
import numpy as np

# ---------------------------------------------------------------------------
# Constants (must match train-deobfuscator.py)
# ---------------------------------------------------------------------------

VOCAB_SIZE = 256
EMBED_DIM = 128
NUM_HEADS = 4
NUM_LAYERS = 3
FFN_DIM = 512
MAX_CONTEXT = 64
MAX_NAME = 32

# GGUF magic and version.
# 0x46554747 packs to the bytes b"GGUF" as a little-endian uint32.  The
# previous value (0x46475547) packed to b"GUGF", contradicting its own comment.
GGUF_MAGIC = 0x46554747  # "GGUF" in little-endian
GGUF_VERSION = 3

# GGUF value types.
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_STRING = 8
GGUF_TYPE_FLOAT32 = 6

# RVF magic bytes.
RVF_MAGIC = b"RVF\x01"
RVF_OVERLAY_TYPE = 0x10  # OVERLAY segment type

# Quantization types.
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
GGML_TYPE_Q4_0 = 2
GGML_TYPE_Q8_0 = 8

# Number of values that share one scale factor in Q4_0 / Q8_0 blocks.
_BLOCK_SIZE = 32


# ---------------------------------------------------------------------------
# Load Model
# ---------------------------------------------------------------------------


def load_checkpoint(path: str) -> dict:
    """Load a PyTorch checkpoint and normalise it to a common layout.

    Args:
        path: Filesystem path of the checkpoint file.

    Returns:
        The loaded object unchanged when it contains a
        ``"model_state_dict"`` key; otherwise the loaded object is treated
        as a bare state dict and wrapped as
        ``{"model_state_dict": <loaded>, "config": {}}``.
    """
    # SECURITY: weights_only=False makes torch.load unpickle arbitrary
    # objects, which can execute code embedded in the file.  Only load
    # checkpoints from a trusted source.
    checkpoint = torch.load(path, map_location="cpu", weights_only=False)
    if "model_state_dict" in checkpoint:
        return checkpoint
    # Bare state dict.
    return {"model_state_dict": checkpoint, "config": {}}


# ---------------------------------------------------------------------------
# GGUF Writer
# ---------------------------------------------------------------------------


def _split_into_blocks(tensor: np.ndarray) -> np.ndarray:
    """Flatten *tensor* to float32 and reshape to ``(num_blocks, 32)``, zero-padding the tail."""
    flat = np.asarray(tensor).flatten().astype(np.float32)

    # Pad to multiple of 32.
    remainder = len(flat) % _BLOCK_SIZE
    if remainder != 0:
        padding = np.zeros(_BLOCK_SIZE - remainder, dtype=np.float32)
        flat = np.concatenate([flat, padding])

    return flat.reshape(len(flat) // _BLOCK_SIZE, _BLOCK_SIZE)


def _block_scales(blocks: np.ndarray, max_level: float) -> np.ndarray:
    """Return one float32 scale per block (shape ``(num_blocks, 1)``); all-zero blocks get 1.0."""
    abs_max = np.max(np.abs(blocks), axis=1, keepdims=True)
    scales = np.where(abs_max > 0, abs_max / np.float32(max_level), np.float32(1.0))
    return scales.astype(np.float32)


def _serialize_blocks(scales: np.ndarray, payload: np.ndarray) -> bytes:
    """Interleave little-endian float16 scales with each block's uint8 payload row."""
    num_blocks = payload.shape[0]

    with np.errstate(over="ignore"):
        half_scales = scales.astype("<f2")
    # struct.pack("<e", ...) raises when a finite value does not fit in
    # float16; keep that behaviour instead of silently storing +/-inf.
    if np.any(np.isinf(half_scales) & np.isfinite(scales)):
        raise OverflowError("block scale too large to store as float16")

    out = np.empty((num_blocks, 2 + payload.shape[1]), dtype=np.uint8)
    out[:, :2] = half_scales.view(np.uint8).reshape(num_blocks, 2)
    out[:, 2:] = payload
    return out.tobytes()


def quantize_q4(tensor: np.ndarray) -> bytes:
    """Quantize a float32 tensor to Q4_0 format (4-bit quantization).

    Q4_0 format: blocks of 32 values, each block has:
      - 1 x float16 scale factor (2 bytes)
      - 16 x uint8 packed nibbles (16 bytes)
    Total: 18 bytes per 32 values.

    The input is flattened and zero-padded to a multiple of 32 values.

    Args:
        tensor: Array of weights; converted to float32 before quantizing.

    Returns:
        The concatenated 18-byte blocks (empty bytes for an empty tensor).

    Raises:
        OverflowError: If a block scale cannot be represented as float16.
    """
    blocks = _split_into_blocks(tensor)
    if blocks.shape[0] == 0:
        return b""

    scales = _block_scales(blocks, 7.0)

    # Quantize to 4-bit signed integers [-8, 7].
    quantized = np.clip(np.round(blocks / scales), -8, 7).astype(np.int8)

    # NOTE(review): the original packing statements were lost from the
    # recovered source.  This follows ggml's block_q4_0 layout: each value is
    # stored as an unsigned nibble (q + 8); byte j holds element j in its low
    # nibble and element j + 16 in its high nibble.  Confirm that the RuvLLM
    # reader expects this layout.
    nibbles = (quantized + 8).astype(np.uint8)
    half = _BLOCK_SIZE // 2
    packed = nibbles[:, :half] | (nibbles[:, half:] << 4)

    return _serialize_blocks(scales, packed)


def quantize_q8(tensor: np.ndarray) -> bytes:
    """Quantize a float32 tensor to Q8_0 format (8-bit quantization).

    Q8_0 format: blocks of 32 values, each block has:
      - 1 x float16 scale factor (2 bytes)
      - 32 x int8 quantized values (32 bytes)
    Total: 34 bytes per 32 values.

    The input is flattened and zero-padded to a multiple of 32 values.

    Args:
        tensor: Array of weights; converted to float32 before quantizing.

    Returns:
        The concatenated 34-byte blocks (empty bytes for an empty tensor).

    Raises:
        OverflowError: If a block scale cannot be represented as float16.
    """
    blocks = _split_into_blocks(tensor)
    if blocks.shape[0] == 0:
        return b""

    scales = _block_scales(blocks, 127.0)
    quantized = np.clip(np.round(blocks / scales), -128, 127).astype(np.int8)

    # NOTE(review): the tail of this function was truncated in the recovered
    # source; writing the scale followed by the 32 raw int8 values is
    # reconstructed from the block layout described in the docstring.
    return _serialize_blocks(scales, quantized.view(np.uint8))