[build] prepare v0.6.1 SFT wheel packaging on main (#1945)
Some checks failed
Book-CI / test (push) Waiting to run
Book-CI / test-1 (push) Waiting to run
Book-CI / test-2 (push) Waiting to run
Deploy / deploy (macos-latest) (push) Waiting to run
Deploy / deploy (ubuntu-latest) (push) Waiting to run
Deploy / deploy (windows-latest) (push) Waiting to run
Release Fake Tag / publish (push) Has been cancelled
Release to PyPI / Build & publish sglang-kt (push) Has been cancelled
Release to PyPI / Build kt-kernel (Python 3.11) (push) Has been cancelled
Release to PyPI / Build kt-kernel (Python 3.12) (push) Has been cancelled
Release sglang-kt to PyPI / Build sglang-kt wheel (push) Has been cancelled
Release to PyPI / Publish kt-kernel to PyPI (push) Has been cancelled
Release sglang-kt to PyPI / Publish sglang-kt to PyPI (push) Has been cancelled

* [build]: prepare 0.6.1 SFT wheel packaging on main

* [build]: finalize py311+ wheel packaging defaults
This commit is contained in:
Peilin Li 2026-04-24 12:08:38 +08:00 committed by GitHub
parent 9544a8960d
commit 85308615b9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 305 additions and 35 deletions

View file

@ -107,6 +107,7 @@ jobs:
working-directory: kt-kernel
env:
CPUINFER_BUILD_ALL_VARIANTS: '1'
CPUINFER_ENABLE_CPPTRACE: '0'
CPUINFER_USE_CUDA: '1'
CPUINFER_CUDA_ARCHS: '80;86;89;90'
CPUINFER_CUDA_STATIC_RUNTIME: '1'

View file

@ -24,10 +24,11 @@ option(KTRANSFORMERS_CPU_DEBUG "ktransformers: DEBUG CPU use AMX" OFF)
option(KTRANSFORMERS_CPU_MLA "ktransformers: CPU use MLA" OFF)
option(KTRANSFORMERS_CPU_MOE_KERNEL "ktransformers: CPU use moe kernel" OFF)
option(KTRANSFORMERS_CPU_MOE_AMD "ktransformers: CPU use moe kernel for amd" OFF)
option(KTRANSFORMERS_ENABLE_CPPTRACE "Enable native crash tracing in kt-kernel" OFF)
# LTO control
option(CPUINFER_ENABLE_LTO "Enable link time optimization (IPO)" OFF)
project(kt_kernel_ext VERSION 0.5.3)
project(kt_kernel_ext VERSION 0.6.1)
# Auto-detect CPU features early (unless building with LLAMA_NATIVE)
if(NOT LLAMA_NATIVE AND NOT MSVC)
@ -588,6 +589,19 @@ else()
message(STATUS "LTO: disabled")
endif()
# Optional native crash tracing: fetch cpptrace at configure time and link it
# into the extension module. Guarded by KTRANSFORMERS_ENABLE_CPPTRACE
# (default OFF above) since it adds a FetchContent git download to the build.
if(KTRANSFORMERS_ENABLE_CPPTRACE)
include(FetchContent)
# Pin to a release tag so configure is reproducible.
FetchContent_Declare(
cpptrace
GIT_REPOSITORY https://github.com/jeremy-rifkin/cpptrace.git
GIT_TAG v1.0.4
)
FetchContent_MakeAvailable(cpptrace)
# PRIVATE: cpptrace is an implementation detail of the module, not a usage
# requirement of consumers.
target_link_libraries(${PROJECT_NAME} PRIVATE cpptrace::cpptrace)
# Compile-time switch checked by the C++ sources (#if defined(...)) to
# compile in the crash/signal handlers.
target_compile_definitions(${PROJECT_NAME} PRIVATE KTRANSFORMERS_ENABLE_CPPTRACE=1)
message(STATUS "cpptrace: enabled")
endif()
# If BLIS was detected earlier, apply its include directory and library to the
# created Python extension target. We only do this after the module target
# (${PROJECT_NAME}) has been created by pybind11_add_module().

197
kt-kernel/autosetup.sh Executable file
View file

@ -0,0 +1,197 @@
#!/usr/bin/env bash
# autosetup.sh — build kt-kernel wheels for every (Python, Torch) combination
# in PY_LIST × TORCH_LIST, verifying the torch CUDA runtime stack and the
# wheel contents before copying results into WHEELS_DIR.
set -euo pipefail
# nullglob: globs with no match expand to nothing (relied on for dist/*.whl).
shopt -s nullglob

# Build-matrix and path knobs; each can be overridden from the environment.
PY_LIST=${PY_LIST:-"3.11 3.12 3.13"}
TORCH_LIST=${TORCH_LIST:-"2.11.0"}
# Per-combination virtualenvs are created under WORK_ROOT.
WORK_ROOT=${WORK_ROOT:-/mnt/data3/lpl/kt-kernel-autosetup}
# Final wheels (and build-info.txt) are written below WHEELS_DIR.
WHEELS_DIR=${WHEELS_DIR:-"$PWD/wheels"}
PIP_CACHE_DIR=${PIP_CACHE_DIR:-/mnt/data3/lpl/pip-cache}
TMP_ROOT=${TMP_ROOT:-/mnt/data3/lpl/tmp}
# FORCE=1: rebuild even if a wheel for the combination already exists.
FORCE=${FORCE:-0}
# REPAIR=1: run auditwheel repair against AUDITWHEEL_PLAT after the build.
REPAIR=${REPAIR:-0}
AUDITWHEEL_PLAT=${AUDITWHEEL_PLAT:-manylinux_2_28_x86_64}
# Forwarded to setup.py (maps to -DKTRANSFORMERS_ENABLE_CPPTRACE).
CPUINFER_ENABLE_CPPTRACE=${CPUINFER_ENABLE_CPPTRACE:-OFF}
mkdir -p "$WORK_ROOT" "$WHEELS_DIR" "$PIP_CACHE_DIR" "$TMP_ROOT"
# Map a torch version to the PyTorch wheel index URL that carries its CUDA
# build. Recent versions (>= 2.10) are served from PyPI, so no index is
# printed for them; unknown versions fall back to the cu124 index.
index_for_torch_version() {
  local torch_version="$1"
  local index_url
  case "$torch_version" in
    2.3.*|2.4.*) index_url="https://download.pytorch.org/whl/cu121" ;;
    2.5.*|2.6.*) index_url="https://download.pytorch.org/whl/cu124" ;;
    2.7.*)       index_url="https://download.pytorch.org/whl/cu126" ;;
    2.8.*|2.9.*) index_url="https://download.pytorch.org/whl/cu128" ;;
    2.10.*|2.11.*) index_url="" ;;
    *)           index_url="https://download.pytorch.org/whl/cu124" ;;
  esac
  echo "$index_url"
}
# Verify the installed torch wheel's CUDA runtime stack is self-consistent:
# every nvidia-* dependency that torch's METADATA pins with '==' must be
# installed at exactly that version, and the cusparselt shared library must
# exist at the layout torch expects. Exits with status 2 and a per-package
# report on stderr when anything is missing or mismatched, which (under
# set -e) aborts the build early. Prints TORCH_OK / CUSPARSELT_PATH on success.
verify_torch_stack() {
python - <<'PY'
import email
import importlib.metadata as md
import pathlib
import site
import sys
from packaging.requirements import Requirement
import torch
sp = pathlib.Path(site.getsitepackages()[0])
meta = next(sp.glob('torch-*.dist-info/METADATA'))
msg = email.message_from_string(meta.read_text())
def norm(name: str) -> str:
return name.lower().replace('_', '-').replace('.', '-')
expected = {}
for line in msg.get_all('Requires-Dist', []):
req = Requirement(line)
if not req.name.startswith('nvidia-'):
continue
pinned = [spec.version for spec in req.specifier if spec.operator == '==']
if len(pinned) != 1:
continue
expected[norm(req.name)] = (req.name, pinned[0])
installed_versions = {}
for dist in md.distributions():
name = dist.metadata.get('Name')
if not name:
continue
installed_versions[norm(name)] = dist.version
mismatch = []
for key, (pkg, ver) in sorted(expected.items()):
installed = installed_versions.get(key)
if installed is None:
mismatch.append(f'{pkg}: missing, expected {ver}')
continue
if installed != ver:
mismatch.append(f'{pkg}: installed {installed}, expected {ver}')
cusparselt = sp / 'cusparselt' / 'lib' / 'libcusparseLt.so.0'
if not cusparselt.exists():
mismatch.append(f'cusparselt layout missing: expected {cusparselt}')
if mismatch:
print('Torch CUDA runtime stack is inconsistent:', file=sys.stderr)
for item in mismatch:
print(f' - {item}', file=sys.stderr)
raise SystemExit(2)
print('TORCH_OK', torch.__version__, torch.version.cuda, torch.cuda.is_available())
print('CUSPARSELT_PATH', cusparselt)
PY
}
# Sanity-check a freshly built wheel ($1): the compiled kt_kernel_ext
# extension and the key SFT/CLI package files must all be present in the
# archive. Exits non-zero with a message on failure; prints WHEEL_OK on
# success.
verify_wheel_contents() {
  python - "$1" <<'PY'
import pathlib
import sys
import zipfile

wheel_path = pathlib.Path(sys.argv[1])
with zipfile.ZipFile(wheel_path) as archive:
    entries = set(archive.namelist())

has_extension = any(
    entry.startswith('kt_kernel/kt_kernel_ext') and entry.endswith('.so')
    for entry in entries
)
if not has_extension:
    raise SystemExit('missing kt_kernel_ext shared object in wheel')

required_entries = [
    'kt_kernel/sft/__init__.py',
    'kt_kernel/sft/wrapper.py',
    'kt_kernel/cli/completions/_kt',
]
absent = [entry for entry in required_entries if entry not in entries]
if absent:
    raise SystemExit(f'missing required wheel entries: {absent}')
print(f'WHEEL_OK {wheel_path.name}')
PY
}
# Main build matrix: one fresh virtualenv per (python, torch) combination.
for py in $PY_LIST; do
# Resolve the interpreter for this version; skip the row if it is absent.
PYBIN="$(command -v python${py} || true)"
if [[ ! -x "$PYBIN" ]]; then
echo ">> Skip python ${py}: not found"
continue
fi
for tv in $TORCH_LIST; do
echo "======== Build: Python ${py} × Torch ${tv} ========"
# Dots are stripped from version strings for directory names (3.11 -> 311).
ENV_DIR="$WORK_ROOT/.venv-py${py//./}-torch${tv//./}"
OUT_DIR="$WHEELS_DIR/py${py//./}-torch${tv//./}"
IDX="$(index_for_torch_version "$tv")"
# Skip combinations that already produced a wheel, unless FORCE=1.
if [[ "$FORCE" = "1" ]]; then
rm -rf "$OUT_DIR"
elif compgen -G "$OUT_DIR/*.whl" > /dev/null; then
echo ">> Found existing wheel for py${py//./}-torch${tv//./}, skip"
continue
fi
# Always build in a brand-new venv so nothing leaks between combinations.
rm -rf "$ENV_DIR"
mkdir -p "$OUT_DIR"
"$PYBIN" -m venv "$ENV_DIR"
# shellcheck disable=SC1090
source "$ENV_DIR/bin/activate"
# Keep user site-packages out of the build environment.
export PYTHONNOUSERSITE=1
export PIP_CACHE_DIR
# Forwarded to setup.py's CMake invocation.
export CPUINFER_ENABLE_CPPTRACE
# Point every temp-dir convention at TMP_ROOT (large build artifacts).
export TMPDIR="$TMP_ROOT"
export TEMP="$TMP_ROOT"
export TMP="$TMP_ROOT"
python -m pip install -U pip setuptools wheel build cmake pybind11 packaging numpy
# Install torch from the matching CUDA index when one is required
# (see index_for_torch_version); otherwise from the default index.
if [[ -n "$IDX" ]]; then
python -m pip install --index-url "$IDX" "torch==$tv"
else
python -m pip install "torch==$tv"
fi
verify_torch_stack
# Clean previous build output, then build exactly one wheel in-tree
# (--no-isolation so the venv's torch/pybind11 are used).
rm -rf build dist kt_kernel.egg-info
python -m build --no-isolation --wheel -v
wheels=(dist/*.whl)
if (( ${#wheels[@]} != 1 )); then
echo "!! expected exactly one wheel in dist/, got ${#wheels[@]}" >&2
exit 2
fi
verify_wheel_contents "${wheels[0]}"
# Record build provenance next to the wheel (argv: out py tv idx cpptrace;
# the torch version written is the installed torch.__version__, not $tv).
python - "$OUT_DIR/build-info.txt" "$py" "$tv" "$IDX" "$CPUINFER_ENABLE_CPPTRACE" <<'PY'
from pathlib import Path
import platform
import sys
import torch
out = Path(sys.argv[1])
out.write_text(
f"python={sys.argv[2]}\n"
f"torch={torch.__version__}\n"
f"torch_cuda={torch.version.cuda}\n"
f"cuda_available={torch.cuda.is_available()}\n"
f"index_url={sys.argv[4]}\n"
f"platform={platform.platform()}\n"
f"cpptrace={sys.argv[5]}\n"
)
print(f"BUILD_INFO {out}")
PY
# REPAIR=1: bundle external shared libs and retag via auditwheel; the
# repaired wheel(s) are copied up next to the wheelhouse output.
if [[ "$REPAIR" = "1" ]]; then
python -m pip install -U auditwheel patchelf
rm -rf "$OUT_DIR/wheelhouse"
mkdir -p "$OUT_DIR/wheelhouse"
auditwheel repair "${wheels[0]}" --plat "$AUDITWHEEL_PLAT" -w "$OUT_DIR/wheelhouse"
cp "$OUT_DIR/wheelhouse"/*.whl "$OUT_DIR/"
else
cp "${wheels[0]}" "$OUT_DIR/"
fi
deactivate
done
done
echo "== Wheels saved in ${WHEELS_DIR} =="

View file

@ -12,7 +12,9 @@
#include <sys/wait.h>
#include <unistd.h>
#if defined(KTRANSFORMERS_ENABLE_CPPTRACE)
#include <cpptrace/cpptrace.hpp>
#endif
#include <csignal>
#include <cstddef>
#include <cstring>
@ -54,8 +56,8 @@ static const bool _is_plain_ = false;
#if defined(__x86_64__)
#include "operators/avx2/bf16-moe.hpp"
#include "operators/avx2/fp8-moe.hpp"
#include "operators/avx2/gptq_int4_avxvnni-moe.hpp"
#include "operators/avx2/gptq_int4-moe.hpp"
#include "operators/avx2/gptq_int4_avxvnni-moe.hpp"
#endif
#include <pybind11/stl.h> // std::vector/std::pair/std::string conversions
@ -74,7 +76,6 @@ static const bool _is_plain_ = false;
namespace py = pybind11;
using namespace pybind11::literals;
py::object to_float_ptr(uintptr_t input_ptr, int size, ggml_type type) {
if (type < 0 || type >= GGML_TYPE_COUNT) {
PyErr_SetString(PyExc_ValueError, "Invalid ggml_type");
@ -473,7 +474,6 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
}
PYBIND11_MODULE(kt_kernel_ext, m) {
py::class_<WorkerPool>(m, "WorkerPool").def(py::init<int>());
py::class_<WorkerPoolConfig>(m, "WorkerPoolConfig")
.def(py::init<>())
@ -813,7 +813,7 @@ PYBIND11_MODULE(kt_kernel_ext, m) {
bind_moe_module<AVX2_FP8_MOE_TP<avx2::GemmKernelAVX2FP8>>(moe_module, "AVX2FP8_MOE");
bind_moe_module<AVX2_GPTQ_INT4_MOE_TP<avx2::GemmKernelAVX2GPTQInt4>>(moe_module, "AVX2GPTQInt4_MOE");
bind_moe_module<AVXVNNI256_GPTQ_INT4_MOE_TP<avxvnni::GemmKernelAVXVNNI256GPTQInt4>>(moe_module,
"AVXVNNI256GPTQInt4_MOE");
"AVXVNNI256GPTQInt4_MOE");
#endif
#if defined(USE_MOE_KERNEL)
@ -976,6 +976,7 @@ PYBIND11_MODULE(kt_kernel_ext, m) {
py::arg("size"), py::arg("type"));
}
#if defined(KTRANSFORMERS_ENABLE_CPPTRACE)
static void warmup_cpptrace() {
// Warm up once so later calls from a signal handler avoid first-use lazy
// loading (which may allocate / dlopen — unsafe inside a handler).
cpptrace::frame_ptr buffer[10];
@ -1002,3 +1003,4 @@ __attribute__((constructor)) static void install_handlers() {
sigaction(SIGABRT, &sa, nullptr);
}
#endif

View file

@ -75,6 +75,7 @@ Optional variables (with defaults):
CPUINFER_ENABLE_AVX512_VNNI=ON/OFF Override VNNI detection (auto if unset)
CPUINFER_ENABLE_AVX512_BF16=ON/OFF Override BF16 detection (auto if unset)
CPUINFER_ENABLE_AVX512_VBMI=ON/OFF Override VBMI detection (auto if unset)
CPUINFER_ENABLE_CPPTRACE=ON/OFF Enable native crash tracing (default OFF)
Software Fallback Support:
✓ If VNNI not available: Uses AVX512BW fallback (2-3x slower but works)
@ -392,6 +393,7 @@ echo " CPUINFER_ENABLE_AMX = $CPUINFER_ENABLE_AMX"
echo " CPUINFER_ENABLE_AVX512_VNNI = ${CPUINFER_ENABLE_AVX512_VNNI:-AUTO}"
echo " CPUINFER_ENABLE_AVX512_BF16 = ${CPUINFER_ENABLE_AVX512_BF16:-AUTO}"
echo " CPUINFER_ENABLE_AVX512_VBMI = ${CPUINFER_ENABLE_AVX512_VBMI:-AUTO}"
echo " CPUINFER_ENABLE_CPPTRACE = ${CPUINFER_ENABLE_CPPTRACE:-OFF}"
echo " CPUINFER_BUILD_TYPE = ${CPUINFER_BUILD_TYPE:-Release}"
echo " CPUINFER_PARALLEL = ${CPUINFER_PARALLEL:-AUTO}"
echo " CPUINFER_VERBOSE = ${CPUINFER_VERBOSE:-1}"

View file

@ -251,7 +251,7 @@ class AMX_MOE_TP : public AMX_MOE_BASE<T, AMX_MOE_TP<T>> {
if (config_.load) {
std::cout << "Loading from \"" << prefix << "\"" << std::endl;
pool->do_work_stealing_job(
config_.expert_num * mat_type_all * mat_split, nullptr,
config_.expert_num * mat_type_all * mat_split,
[this, physical_to_logical_map, prefix, mat_type_all, mat_split](int task_id) {
int64_t expert_idx = task_id / (mat_type_all * mat_split);
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
@ -273,8 +273,7 @@ class AMX_MOE_TP : public AMX_MOE_BASE<T, AMX_MOE_TP<T>> {
read_weights(prefix, "_down_", (char*)down_bb_[expert_idx]->b, logical_expert_id, size, scale_size,
mat_split, mat_split_idex);
}
},
nullptr, "load_fwd_kt");
});
}
// check process, store down matrix to check
#ifdef CHECK

View file

@ -18,28 +18,29 @@ classifiers = [
"Operating System :: POSIX :: Linux",
"Operating System :: MacOS",
]
requires-python = ">=3.8"
requires-python = ">=3.11"
dependencies = [
# Core dependencies
"torch>=2.0.0",
"safetensors>=0.4.0",
"compressed-tensors>=0.7.0",
"numpy>=1.24.0",
"triton>=2.0.0",
"gguf>=0.17.0",
# CLI dependencies
"typer[all]>=0.9.0",
"typer>=0.9.0",
"rich>=13.0.0",
"pyyaml>=6.0",
"httpx>=0.25.0",
"packaging>=23.0",
# SGLang (kvcache-ai fork)
"sglang-kt",
# Development dependencies
"black>=25.9.0",
]
[project.optional-dependencies]
sglang = [
"sglang-kt",
]
convert = [
"compressed-tensors>=0.7.0",
]
test = [
"pytest>=7.0.0",
"psutil>=5.9.0",

View file

@ -77,9 +77,9 @@ try:
_version_ns = {}
with open(_root_version_file, "r", encoding="utf-8") as f:
exec(f.read(), _version_ns)
__version__ = _version_ns.get("__version__", "0.5.3")
__version__ = _version_ns.get("__version__", "0.6.1")
else:
__version__ = "0.5.3"
__version__ = "0.6.1"
except ImportError:
# Python < 3.8, fallback to pkg_resources or hardcoded version
try:
@ -88,8 +88,8 @@ except ImportError:
try:
__version__ = get_distribution("kt-kernel").version
except DistributionNotFound:
__version__ = "0.5.3"
__version__ = "0.6.1"
except ImportError:
__version__ = "0.5.3"
__version__ = "0.6.1"
__all__ = ["KTMoEWrapper", "AMXSFTMoEWrapper", "generate_gpu_experts_masks", "kt_kernel_ext", "__cpu_variant__", "__version__"]

View file

@ -16,6 +16,6 @@ except PackageNotFoundError:
_root_version_file = Path(__file__).resolve().parents[3] / "version.py"
if _root_version_file.exists():
exec(_root_version_file.read_text(encoding="utf-8"), _version_ns)
__version__ = _version_ns.get("__version__", "0.5.3")
__version__ = _version_ns.get("__version__", "0.6.1")
else:
__version__ = "0.5.3"
__version__ = "0.6.1"

View file

@ -5,9 +5,6 @@
# Core dependencies (minimum versions)
torch>=2.0.0
safetensors>=0.4.0
compressed-tensors>=0.7.0
numpy>=1.24.0
triton>=2.0.0
gguf>=0.17.0
# Development dependencies
black>=25.9.0

View file

@ -24,6 +24,7 @@ Environment knobs (export before running pip install .):
CPUINFER_ENABLE_AVX512_VNNI=OFF ON/OFF -> -DLLAMA_AVX512_VNNI
CPUINFER_ENABLE_AVX512_BF16=OFF ON/OFF -> -DLLAMA_AVX512_BF16
CPUINFER_ENABLE_AVX512_VBMI=OFF ON/OFF -> -DLLAMA_AVX512_VBMI (required for FP8 MoE)
CPUINFER_ENABLE_CPPTRACE=ON/OFF ON/OFF -> -DKTRANSFORMERS_ENABLE_CPPTRACE (debug-only)
CPUINFER_BLIS_ROOT=/path/to/blis Forward to -DBLIS_ROOT
@ -610,6 +611,7 @@ class CMakeBuild(build_ext):
_forward_bool_env(cmake_args, "CPUINFER_ENABLE_LTO", "CPUINFER_ENABLE_LTO")
_forward_str_env(cmake_args, "CPUINFER_LTO_JOBS", "CPUINFER_LTO_JOBS")
_forward_str_env(cmake_args, "CPUINFER_LTO_MODE", "CPUINFER_LTO_MODE")
_forward_bool_env(cmake_args, "CPUINFER_ENABLE_CPPTRACE", "KTRANSFORMERS_ENABLE_CPPTRACE")
# CUDA static runtime toggle
_forward_bool_env(cmake_args, "CPUINFER_CUDA_STATIC_RUNTIME", "KTRANSFORMERS_CUDA_STATIC_RUNTIME")
@ -695,9 +697,9 @@ if _version_file.exists():
_version_ns = {}
with open(_version_file, "r", encoding="utf-8") as f:
exec(f.read(), _version_ns)
_base_version = _version_ns.get("__version__", "0.5.3")
_base_version = _version_ns.get("__version__", "0.6.1")
else:
_base_version = "0.5.3"
_base_version = "0.6.1"
# Determine version
if "CPUINFER_VERSION" in os.environ:
@ -727,23 +729,31 @@ setup(
description="KT-Kernel: High-performance kernel operations for KTransformers (AMX/AVX/KML optimizations)",
author="kvcache-ai",
license="Apache-2.0",
python_requires=">=3.8",
python_requires=">=3.10",
packages=[
"kt_kernel",
"kt_kernel.utils",
"kt_kernel.sft",
"kt_kernel.cli",
"kt_kernel.cli.commands",
"kt_kernel.cli.completions",
"kt_kernel.cli.config",
"kt_kernel.cli.utils",
],
package_dir={
"kt_kernel": "python",
"kt_kernel.utils": "python/utils",
"kt_kernel.sft": "python/sft",
"kt_kernel.cli": "python/cli",
"kt_kernel.cli.commands": "python/cli/commands",
"kt_kernel.cli.completions": "python/cli/completions",
"kt_kernel.cli.config": "python/cli/config",
"kt_kernel.cli.utils": "python/cli/utils",
},
package_data={
"kt_kernel.cli.completions": ["*.bash", "*.fish", "_kt"],
},
include_package_data=True,
entry_points={
"console_scripts": [
"kt=kt_kernel.cli.main:main",

34
ktransformers/__init__.py Normal file
View file

@ -0,0 +1,34 @@
"""Top-level Python package for KTransformers.
The runtime kernels live in kt-kernel. Optional SFT support is activated
via pip install "ktransformers[sft]" which adds transformers-kt and
accelerate-kt to the environment.
"""
from __future__ import annotations
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path
def _read_repo_version() -> str:
ns: dict[str, str] = {}
exec((Path(__file__).resolve().parents[1] / 'version.py').read_text(), ns)
return ns['__version__']
# Prefer the version of the installed 'ktransformers' distribution; when
# running from a plain source checkout (package not installed), fall back to
# the repo's version.py via _read_repo_version().
try:
    __version__ = version('ktransformers')
except PackageNotFoundError:
    __version__ = _read_repo_version()
def has_sft_support() -> bool:
try:
import kt_kernel.sft # noqa: F401
except Exception:
return False
return True
__all__ = ['__version__', 'has_sft_support']

View file

@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"
[project]
name = "ktransformers"
dynamic = ["version", "dependencies"]
dynamic = ["version", "dependencies", "optional-dependencies"]
description = "KTransformers: CPU-GPU heterogeneous inference framework for LLMs"
readme = "README.md"
authors = [{ name = "kvcache-ai" }]
license = "Apache-2.0"
requires-python = ">=3.8"
requires-python = ">=3.11"
classifiers = [
"Programming Language :: Python :: 3",
"Operating System :: POSIX :: Linux",
@ -19,5 +19,5 @@ classifiers = [
Homepage = "https://github.com/kvcache-ai/ktransformers"
[tool.setuptools]
# No actual Python packages — this is a meta-package
packages = []
# Ship a minimal top-level Python package so the distribution is importable.
packages = ["ktransformers"]

View file

@ -1,4 +1,9 @@
"""Meta-package: pip install ktransformers → installs kt-kernel + sglang-kt."""
"""Lightweight top-level package: pip install ktransformers -> installs kt-kernel.
Extras:
- ktransformers[sft] installs transformers-kt + accelerate-kt
- ktransformers[sglang] installs sglang-kt
"""
from pathlib import Path
from setuptools import setup
@ -11,6 +16,14 @@ setup(
version=_v,
install_requires=[
f"kt-kernel=={_v}",
f"sglang-kt=={_v}",
],
extras_require={
"sft": [
"transformers-kt==5.6.0",
"accelerate-kt==1.14.0",
],
"sglang": [
"sglang-kt>=0.5.3",
],
},
)

View file

@ -1,6 +1,6 @@
"""
KTransformers version information.
Shared across kt-kernel and kt-sft modules.
Shared across the top-level package and kt-kernel.
"""
__version__ = "0.5.3"
__version__ = "0.6.1"