kvcache-ai-ktransformers/archive/ktransformers/optimize/optimize.py
Jiaqi Liao 57d14d22bc
Refactor: restructure repository to focus on kt-kernel and KT-SFT modulesq recon (#1581)
* refactor: move legacy code to archive/ directory

  - Moved ktransformers, csrc, third_party, merge_tensors to archive/
  - Moved build scripts and configurations to archive/
  - Kept kt-kernel, KT-SFT, doc, and README files in root
  - Preserved complete git history for all moved files

* refactor: restructure repository to focus on kt-kernel and KT-SFT modules

* fix README

* fix README

* fix README

* fix README

* docs: add performance benchmarks to kt-kernel section

Add comprehensive performance data for kt-kernel to match KT-SFT's presentation:
- AMX kernel optimization: 21.3 TFLOPS (3.9× faster than PyTorch)
- Prefill phase: up to 20× speedup vs baseline
- Decode phase: up to 4× speedup
- NUMA optimization: up to 63% throughput improvement
- Multi-GPU (8×L20): 227.85 tokens/s total throughput with DeepSeek-R1 FP8

Source: https://lmsys.org/blog/2025-10-22-KTransformers/

This provides users with concrete performance metrics for both core modules,
making it easier to understand the capabilities of each component.

* refactor: improve kt-kernel performance data with specific hardware and models

Replace generic performance descriptions with concrete benchmarks:
- Specify exact hardware: 8×L20 GPU + Xeon Gold 6454S, Single/Dual-socket Xeon + AMX
- Include specific models: DeepSeek-R1-0528 (FP8), DeepSeek-V3 (671B)
- Show detailed metrics: total throughput, output throughput, concurrency details
- Match KT-SFT presentation style for consistency

This provides users with actionable performance data they can use to evaluate
hardware requirements and expected performance for their use cases.

* fix README

* docs: clean up performance table and improve formatting

* add pic for README

* refactor: simplify .gitmodules and backup legacy submodules

- Remove 7 legacy submodules from root .gitmodules (archive/third_party/*)
- Keep only 2 active submodules for kt-kernel (llama.cpp, pybind11)
- Backup complete .gitmodules to archive/.gitmodules
- Add documentation in archive/README.md for researchers who need legacy submodules

This reduces initial clone size by ~500MB and avoids downloading unused dependencies.

* refactor: move doc/ back to root directory

Keep documentation in root for easier access and maintenance.

* refactor: consolidate all images to doc/assets/

- Move kt-kernel/assets/heterogeneous_computing.png to doc/assets/
- Remove KT-SFT/assets/ (images already in doc/assets/)
- Update KT-SFT/README.md image references to ../doc/assets/
- Eliminates ~7.9MB image duplication
- Centralizes all documentation assets in one location

* fix pic path for README
2025-11-10 17:42:26 +08:00

163 lines
7.9 KiB
Python

'''
Description :
Author : Boxin Zhang, Azure-Tang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from typing import Mapping, List
import torch
import yaml
import re
from torch import nn
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
# from operators import BaseInjectedModule
from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory
from ktransformers.util.custom_gguf import translate_name_to_gguf
from ktransformers.util import utils
from ktransformers.util.utils import set_module, load_weights
import itertools
import copy
try:
import torch_npu
use_torch_npu = torch_npu.npu.is_available()
except:
use_torch_npu = False
def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''):
for name, child in module._modules.items():
if child is not None:
child_prefix = prefix + name
if child_prefix in local_optimization_dict:
inject_module_meta=local_optimization_dict[child_prefix]
if inject_module_meta["class"] != "default":
import_path = inject_module_meta["class"].split(".")
import_module_name = ".".join(import_path[:-1])
gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
import_class_name = import_path[-1]
module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
if use_torch_npu:
print(f"Injecting {child_prefix} as", import_module_name, ".",
import_class_name) if torch.distributed.get_rank() == 0 else None #TODO 分布式
else:
print(f"Injecting {child_prefix} as", import_module_name, ".", import_class_name)
inject_module=module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module=child, **inject_module_meta["kwargs"])
set_module(module, name, inject_module)
elif inject_module_meta["class"] == "default":
print(f"Injecting {child_prefix} as default")
gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
else:
raise Exception("inject_module_meta[\"class\"] must be \"default\" or a class path")
child_prefix += "."
child_optimization_dict = {k: v for k, v in local_optimization_dict.items() if k.startswith(child_prefix)}
inject(child, child_optimization_dict, model_config, gguf_loader, child_prefix)
def del_meta(module:nn.Module):
#print("default loading weights", prefix)
persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}
local_name_params = itertools.chain(module._parameters.items(), persistent_buffers.items())
local_state = {k: v for k, v in local_name_params if v is not None}
for name, param in local_state.items():
if param.device == "meta" or param.device == torch.device("meta"):
module.__delattr__(name)
for name, child in module._modules.items():
del_meta(child)
def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
module_name = prefix[:-1]
if use_torch_npu:
translated_name = translate_name_to_gguf(prefix)[:-1]
recursive = True
for rule in rule_list:
match_meta = rule["match"]
if "class" not in match_meta and "name" not in match_meta:
raise Exception("match must have at least one of \"class\" and \"name\"")
if "class" in match_meta:
import_path = match_meta["class"].split(".")
import_module_name = ".".join(import_path[:-1])
import_class_name = import_path[-1]
module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
if not isinstance(module, module_cls):
continue
if "name" in match_meta:
if re.search(match_meta["name"], module_name) is None:
continue
if "replace" not in rule:
raise Exception("replace must be in rule")
if "replace" in rule:
replace_meta = rule["replace"]
if module_name not in out_data:
out_data[module_name]={"key": module_name if not use_torch_npu else translated_name,
"class": replace_meta["class"] if "class" in replace_meta else "default",
# "device": replace_meta["device"] if "device" in replace_meta else default_device,
"kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
else:
if out_data[module_name]["class"] == "default":
out_data[module_name]["class"] = replace_meta["class"] if "class" in replace_meta else "default"
out_data[module_name]["kwargs"].update(copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict())
if "recursive" in rule:
recursive = bool(rule["recursive"])
break
if module_name not in out_data:
out_data[module_name]= {
"class": "default",
"key": module_name if not use_torch_npu else translated_name,
"kwargs": {"generate_device": default_device,
"prefill_device": default_device}
}
#print(out_data[module_name])
#input()
if recursive:
for name, child in module._modules.items():
if child is not None:
child_prefix = prefix + name + "."
gen_optimize_config(child, out_data, rule_list, child_prefix, default_device = default_device)
def translate_model_config(model_config: PretrainedConfig):
# for supporting some special model
if model_config.model_type == "mixtral":
model_config.moe_intermediate_size = model_config.intermediate_size
return model_config
def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0", q4_gguf_path=""):
with open(rule_file, 'r', encoding='utf-8') as f:
rule_list = yaml.load(f.read(), Loader=yaml.FullLoader)
optimize_config = dict()
gen_optimize_config(module, optimize_config, rule_list, default_device = default_device)
model_config = translate_model_config(model_config)
if use_torch_npu:
if q4_gguf_path:
q4_gguf_loader = GGUFLoader(q4_gguf_path)
utils.Q4_GGUF_LODER = q4_gguf_loader
gguf_loader = GGUFLoader(gguf_path, getattr(model_config, "quantize", None))
with torch.device("meta"):
inject(module, optimize_config, model_config, gguf_loader)
# pre load lm_head because its big inter result
load_weights(module.lm_head, gguf_loader, "lm_head.")
load_weights(module, gguf_loader)
module.gguf_loader = gguf_loader
else:
weights_loader = ModelLoaderFactory.create_loader(gguf_path)
with torch.device("meta"):
inject(module, optimize_config, model_config, weights_loader)
# pre load lm_head because its big inter result
load_weights(module.lm_head, weights_loader, "lm_head.", device=default_device)
load_weights(module, weights_loader, device=default_device)
module.gguf_loader = weights_loader
del_meta(module)
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif torch.xpu.is_available():
torch.xpu.empty_cache()
else:
torch.cuda.empty_cache()