From e72a4fb880f45d8aa535662c6e8a99a7bd3b11e8 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Fri, 21 Nov 2025 20:00:15 -0800 Subject: [PATCH] [feat](kt-kernel): Add resume arg to CPU weight conversion (#1630) * [feat]: kt-kernel: Add resume arg to CPU weight conversion * [docs]: kt-kernel: Document resume arg for CPU weight conversion * [fix]: kt-kernel: Only print resume layer if in use * [fix]: kt-kernel: Don't log skipped layers when using resume_layer --- kt-kernel/scripts/README.md | 14 ++++++++++++++ kt-kernel/scripts/convert_cpu_weights.py | 21 ++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/kt-kernel/scripts/README.md b/kt-kernel/scripts/README.md index b8d4c015..42bb1651 100644 --- a/kt-kernel/scripts/README.md +++ b/kt-kernel/scripts/README.md @@ -107,6 +107,20 @@ output_dir/ - Need to process very large models on memory-constrained systems - Want to preserve intermediate layer-wise quantized weights +### Resume Layer + +For memory-constrained systems that are unable to complete quantization despite enabling low memory mode with `--no-merge-safetensor`, restart the script with the `--resume-layer` arg to specify the layer from which to continue the conversion process. In the example below, we skip layers 0-11 and resume conversion starting with layer 12. 
+
+```bash
+python scripts/convert_cpu_weights.py \
+    --input-path /path/to/model \
+    --input-type bf16 \
+    --output /path/to/output \
+    --quant-method int4 \
+    --no-merge-safetensor \
+    --resume-layer 12
+```
+
 ## Examples
 
 ### Example 1: Quantize DeepSeek-V3.1 (FP8 → INT4)
diff --git a/kt-kernel/scripts/convert_cpu_weights.py b/kt-kernel/scripts/convert_cpu_weights.py
index 92f3a442..14d11c37 100644
--- a/kt-kernel/scripts/convert_cpu_weights.py
+++ b/kt-kernel/scripts/convert_cpu_weights.py
@@ -330,11 +330,18 @@ class ConverterBase:
         """
         raise NotImplementedError("Subclasses must implement _convert_layer_experts")
 
-    def convert(self):
-        """Convert all expert layers using subclass-specific logic."""
+    def convert(self, resume_layer: int = 0):
+        """Convert all expert layers using subclass-specific logic.
+
+        Args:
+            resume_layer (int, optional): The layer index to resume conversion from.
+                Layers with an index lower than this will be skipped. Defaults to 0.
+        """
         print("Starting conversion...")
         print(f"Input: {self.input_path}")
         print(f"Output: {self.output_path}")
+        if resume_layer > 0:
+            print(f"Resuming from layer: {resume_layer}")
 
         # Create output directory
         os.makedirs(self.output_path, exist_ok=True)
@@ -355,6 +362,8 @@ class ConverterBase:
 
         # Process layers with memory cleanup
         for i, (layer_idx, expert_ids) in enumerate(sorted(expert_layers.items())):
+            if layer_idx < resume_layer:
+                continue
             print(f"Processing layer {layer_idx} ({i+1}/{len(expert_layers)})...")
 
             layer_tensors = self._convert_layer_experts(layer_idx, expert_ids)
@@ -840,6 +849,12 @@ def main():
         default=False,
         help="Keep layer folders without merging to safetensor files (default: False)",
     )
+    parser.add_argument(
+        "--resume-layer",
+        type=int,
+        default=0,
+        help="Resume conversion starting at this layer index (default: 0)",
+    )
 
     args = parser.parse_args()
 
@@ -893,7 +908,7 @@ def main():
     )
 
     # Run conversion
-    converter.convert()
+    converter.convert(resume_layer=args.resume_layer)
 
     # Cleanup
converter.close()