kvcache-ai-ktransformers/kt-kernel/scripts/merge_cpu_weights.py
Doctor Shotgun 24cd4fc055
feat(kt-kernel): Add utility script to merge loose layer weights to safetensors (#1886)
* Add utility script to merge loose layer weights to safetensors

* Send warnings and errors to stderr

* Fix expert index parsing for MOE_INT4 and MOE_INT8
2026-03-31 10:41:07 +08:00

278 lines
10 KiB
Python

#!/usr/bin/env python3
import argparse
import os
import glob
import numpy as np
import torch
from safetensors.torch import save_file
import gc
import json
import shutil
import sys
def discover_layers(input_path: str):
    """Locate every ``_layer_*`` folder inside *input_path*.

    Returns a list of ``(layer_index, folder_name)`` pairs sorted by
    layer index; entries whose trailing segment is not an integer are
    silently ignored.
    """
    discovered = []
    for name in os.listdir(input_path):
        if not name.startswith("_layer_"):
            continue
        # The layer index is the text after the final underscore.
        try:
            index = int(name.split("_")[-1])
        except ValueError:
            continue  # non-numeric suffix: not a real layer folder
        discovered.append((index, name))
    discovered.sort(key=lambda pair: pair[0])
    return discovered
def discover_numa_folders(layer_path: str):
    """Enumerate the ``_numa_*`` subfolders of one layer folder.

    Returns ``(numa_index, folder_name)`` tuples in ascending index
    order; names without an integer suffix are dropped.
    """
    indexed = []
    for entry in os.listdir(layer_path):
        if not entry.startswith("_numa_"):
            continue
        try:
            indexed.append((int(entry.split("_")[-1]), entry))
        except ValueError:
            pass  # skip folders whose suffix is not a number
    return sorted(indexed, key=lambda item: item[0])
def detect_quant_method(layer_path: str):
    """Infer the quantization flavor from weight file-name prefixes.

    Recursively scans *layer_path* and returns ``(method, prefix)`` for
    the first file matching a known prefix. The MOE_* prefixes are
    checked before the plain INT* ones, mirroring the original
    precedence.

    Raises:
        ValueError: when no recognizable file is found.
    """
    prefix_table = (
        ("MOE_INT4_", ("moe_int4", "MOE_INT4")),
        ("MOE_INT8_", ("moe_int8", "MOE_INT8")),
        ("INT4_", ("int4", "INT4")),
        ("INT8_", ("int8", "INT8")),
    )
    for _root, _dirs, files in os.walk(layer_path):
        for name in files:
            for prefix, result in prefix_table:
                if name.startswith(prefix):
                    return result
    raise ValueError(f"Could not detect quant method in {layer_path}")
def load_binary_tensor(file_path: str) -> torch.Tensor:
    """Read a raw ``.kt`` blob into a flat torch tensor.

    Paths containing ``"scale"`` are interpreted as float32 scale data;
    every other file is treated as int8 quantized payload.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    # Choose the dtype from the exporter's file-name convention.
    dtype = np.float32 if "scale" in file_path else np.int8
    with open(file_path, "rb") as fh:
        raw = fh.read()
    # .copy() detaches from the read-only frombuffer view so torch
    # receives writable memory it owns.
    return torch.from_numpy(np.frombuffer(raw, dtype=dtype).copy())
def process_layer(layer_path: str, amx_prefix: str, layer_idx: int) -> dict:
    """Collect all expert weight and scale tensors for one layer.

    Walks every ``_numa_*`` folder under *layer_path* and loads the
    quantized weights and float scales for the down/gate/up expert
    projections.

    Args:
        layer_path: Path to a ``_layer_<idx>`` folder.
        amx_prefix: File-name prefix (e.g. ``"MOE_INT4"``) as returned
            by ``detect_quant_method``.
        layer_idx: Layer index used when building tensor key names.

    Returns:
        Dict mapping
        ``blk.<layer>.<proj>.<expert>.numa.<numa>.{weight|scale}`` keys
        to loaded tensors. Empty when no NUMA folders exist.
    """
    tensors = {}
    numa_folders = discover_numa_folders(layer_path)
    if not numa_folders:
        print(f" Warning: No NUMA folders found in {layer_path}", file=sys.stderr)
        return tensors
    proj_mappings = [
        ("down", "ffn_down_exps"),
        ("gate", "ffn_gate_exps"),
        ("up", "ffn_up_exps"),
    ]
    for numa_idx, numa_folder in numa_folders:
        numa_path = os.path.join(layer_path, numa_folder)
        for proj_name, proj_key in proj_mappings:
            name_prefix = f"{amx_prefix}_{proj_name}_"
            # Quant weights and their scales follow the same naming
            # scheme; handle both with one loop instead of duplicated code.
            for file_kind, key_suffix in (("quant", "weight"), ("scale", "scale")):
                pattern = os.path.join(numa_path, f"{name_prefix}*Byte_{file_kind}_.kt")
                for file_path in sorted(glob.glob(pattern)):
                    filename = os.path.basename(file_path)
                    # Expert index is the first "_"-separated token after
                    # the "<prefix>_<proj>_" part of the file name.
                    remainder = filename[len(name_prefix):]
                    try:
                        expert_idx = int(remainder.split("_")[0])
                    except (ValueError, IndexError):
                        # Bug fix: the warning previously printed the
                        # literal "(unknown)" instead of the file name.
                        print(f" Warning: Could not parse expert index from {filename}", file=sys.stderr)
                        continue
                    key = f"blk.{layer_idx}.{proj_key}.{expert_idx}.numa.{numa_idx}.{key_suffix}"
                    tensors[key] = load_binary_tensor(file_path)
    return tensors
def write_shards(accumulated_tensors: dict, output_path: str, shard_counter: dict, keep_remainder: bool = True):
    """Write accumulated tensors to one or more shard files.

    Args:
        accumulated_tensors: Dict of tensors to write; mutated in place
            (entries that get written are removed).
        output_path: Output directory for ``model-XXXXX.safetensors`` files.
        shard_counter: Mutable dict with 'shard' (next shard number) and
            'max_tensors' (shard capacity) keys; 'shard' is advanced here.
        keep_remainder: If True, keep leftover tensors in the accumulator
            for the next batch instead of flushing a partial shard.
    """
    if not accumulated_tensors:
        return
    max_tensors = shard_counter["max_tensors"]
    current_shard = shard_counter["shard"]
    total_tensors = len(accumulated_tensors)
    if total_tensors <= max_tensors:
        if keep_remainder:
            # Not enough for a full shard yet — keep accumulating.
            return
        # Final flush: write everything as one shard.
        output_file = os.path.join(output_path, f"model-{current_shard:05d}.safetensors")
        save_file(accumulated_tensors, output_file)
        print(f" Saved {total_tensors} tensors to {output_file}")
        shard_counter["shard"] = current_shard + 1
        accumulated_tensors.clear()
        return
    full_shards = total_tensors // max_tensors
    remainder = total_tensors % max_tensors
    items = list(accumulated_tensors.items())
    # Write full shards
    for i in range(full_shards):
        batch = dict(items[i * max_tensors : (i + 1) * max_tensors])
        output_file = os.path.join(output_path, f"model-{current_shard:05d}.safetensors")
        save_file(batch, output_file)
        print(f" Saved {len(batch)} tensors to {output_file}")
        current_shard += 1
    if keep_remainder and remainder > 0:
        # Roll the leftover tensors into the accumulator for the next batch.
        remainder_items = dict(items[full_shards * max_tensors:])
        accumulated_tensors.clear()
        accumulated_tensors.update(remainder_items)
        print(f" Rolled over {remainder} tensors to next batch")
    elif remainder > 0:
        # Write remainder as final shard
        batch = dict(items[full_shards * max_tensors:])
        output_file = os.path.join(output_path, f"model-{current_shard:05d}.safetensors")
        save_file(batch, output_file)
        print(f" Saved {len(batch)} tensors to {output_file}")
        current_shard += 1
        accumulated_tensors.clear()
    else:
        # Bug fix: everything fit exactly into full shards (remainder == 0).
        # The accumulator was previously left populated here, so the same
        # tensors were written again on the next flush, duplicating them
        # across shards. Clear it now that all entries are on disk.
        accumulated_tensors.clear()
    shard_counter["shard"] = current_shard
def copy_config_files(original_path: str, output_path: str):
    """Copy the model's config and tokenizer JSON files into *output_path*.

    A missing file produces a warning on stderr rather than an error,
    since not every model ships every file.
    """
    wanted = (
        "config.json",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
    )
    for name in wanted:
        source = os.path.join(original_path, name)
        if not os.path.exists(source):
            print(f"Warning: {name} not found in {original_path}, skipping", file=sys.stderr)
            continue
        # copy2 preserves file metadata (timestamps) alongside contents.
        shutil.copy2(source, os.path.join(output_path, name))
        print(f"Copied: {name}")
def main():
    """CLI entry point: merge loose layer weights into sharded safetensors.

    Returns 0 on success, 1 on a usage error (missing input path or no
    layer folders found).
    """
    parser = argparse.ArgumentParser(
        description="Merge CPU-optimized weights from nested folder structure to sharded safetensors"
    )
    parser.add_argument(
        "--input-path", "-i", required=True, help="Input directory with nested _layer_* folders"
    )
    parser.add_argument("--output", "-o", required=True, help="Output directory for merged safetensors")
    parser.add_argument(
        "--original-path",
        "-r",
        default=None,
        help="Original model folder with config.json and tokenizer files to copy",
    )
    parser.add_argument(
        "--max-tensors",
        type=int,
        default=3000,
        help="Maximum tensors per safetensors shard (default: 3000)",
    )
    args = parser.parse_args()

    if not os.path.exists(args.input_path):
        print(f"Error: Input path does not exist: {args.input_path}", file=sys.stderr)
        return 1
    os.makedirs(args.output, exist_ok=True)

    print("Discovering layer folders...")
    layer_folders = discover_layers(args.input_path)
    if not layer_folders:
        print(f"Error: No _layer_* folders found in {args.input_path}", file=sys.stderr)
        return 1
    print(f"Found {len(layer_folders)} layer folders")

    # The quant method is uniform across layers, so probe the first one.
    print("Detecting quantization method...")
    first_layer = os.path.join(args.input_path, layer_folders[0][1])
    quant_method, amx_prefix = detect_quant_method(first_layer)
    print(f"Detected quant method: {quant_method} (prefix: {amx_prefix})")

    print(f"\nProcessing layers (max {args.max_tensors} tensors per shard)...")
    pending = {}
    shard_state = {"shard": 1, "max_tensors": args.max_tensors}
    for layer_idx, layer_folder in layer_folders:
        print(f"Processing layer {layer_idx} ({layer_folder})...")
        layer_tensors = process_layer(
            os.path.join(args.input_path, layer_folder), amx_prefix, layer_idx
        )
        print(f" Loaded {len(layer_tensors)} tensors from this layer")
        pending.update(layer_tensors)
        if len(pending) >= args.max_tensors:
            print(f" Accumulator has {len(pending)} tensors, flushing to shard(s)...")
            write_shards(pending, args.output, shard_state, keep_remainder=True)
        # Free tensor memory between layers to keep peak RSS down.
        gc.collect()

    if pending:
        print(f"Flushing remaining {len(pending)} tensors to final shard(s)...")
        write_shards(pending, args.output, shard_state, keep_remainder=False)

    if args.original_path:
        print(f"\nCopying config files from {args.original_path}...")
        copy_config_files(args.original_path, args.output)

    total_shards = shard_state["shard"] - 1
    print(f"\nConversion completed! Created {total_shards} shard(s) in {args.output}")
    return 0
if __name__ == "__main__":
    # Use sys.exit rather than the site-provided exit() builtin: exit()
    # is only installed by the site module and may be absent (python -S),
    # while sys.exit reliably propagates main()'s return code as the
    # process exit status.
    sys.exit(main())