mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2025-09-09 22:05:30 +00:00
add balance-serve, support concurrence
This commit is contained in:
parent
8d0292aa44
commit
25cee5810e
196 changed files with 22077 additions and 565 deletions
74
setup.py
74
setup.py
|
@ -35,6 +35,8 @@ try:
|
|||
from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
|
||||
except ImportError:
|
||||
MUSA_HOME=None
|
||||
|
||||
with_balance = os.environ.get("USE_BALANCE_SERVE", "0") == "1"
|
||||
|
||||
class CpuInstructInfo:
|
||||
CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
|
||||
|
@ -212,7 +214,7 @@ class VersionInfo:
|
|||
cpu_instruct = self.get_cpu_instruct()
|
||||
backend_version = ""
|
||||
if CUDA_HOME is not None:
|
||||
backend_version = f""
|
||||
backend_version = f"cu{self.get_cuda_bare_metal_version(CUDA_HOME)}"
|
||||
elif MUSA_HOME is not None:
|
||||
backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
|
||||
elif ROCM_HOME is not None:
|
||||
|
@ -274,11 +276,10 @@ PLAT_TO_CMAKE = {
|
|||
|
||||
|
||||
class CMakeExtension(Extension):
|
||||
def __init__(self, name: str, sourcedir: str = "") -> None:
|
||||
def __init__(self, name: str, sourcedir: str) -> None:
|
||||
super().__init__(name, sources=[])
|
||||
self.sourcedir = os.fspath(
|
||||
Path(sourcedir).resolve() / "ktransformers" / "ktransformers_ext")
|
||||
|
||||
print(name, sourcedir)
|
||||
self.sourcedir = sourcedir
|
||||
|
||||
class CMakeBuild(BuildExtension):
|
||||
|
||||
|
@ -342,16 +343,17 @@ class CMakeBuild(BuildExtension):
|
|||
f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
|
||||
if self.compiler.compiler_type != "msvc":
|
||||
if not cmake_generator or cmake_generator == "Ninja":
|
||||
try:
|
||||
import ninja
|
||||
pass
|
||||
# try:
|
||||
# import ninja
|
||||
|
||||
ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
|
||||
cmake_args += [
|
||||
"-GNinja",
|
||||
f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
|
||||
]
|
||||
except ImportError:
|
||||
pass
|
||||
# ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
|
||||
# cmake_args += [
|
||||
# "-GNinja",
|
||||
# f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
|
||||
# ]
|
||||
# except ImportError:
|
||||
# pass
|
||||
|
||||
else:
|
||||
# Single config generators are handled "normally"
|
||||
|
@ -387,10 +389,12 @@ class CMakeBuild(BuildExtension):
|
|||
build_args += [f"--parallel={cpu_count}"]
|
||||
print("CMake args:", cmake_args)
|
||||
build_temp = Path(ext.sourcedir) / "build"
|
||||
print("build_temp:", build_temp)
|
||||
|
||||
if not build_temp.exists():
|
||||
build_temp.mkdir(parents=True)
|
||||
result = subprocess.run(
|
||||
["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True , capture_output=True
|
||||
["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True , capture_output=True, text=True
|
||||
)
|
||||
print("Standard output:", result.stdout)
|
||||
print("Standard error:", result.stderr)
|
||||
|
@ -400,9 +404,9 @@ class CMakeBuild(BuildExtension):
|
|||
|
||||
if CUDA_HOME is not None or ROCM_HOME is not None:
|
||||
ops_module = CUDAExtension('KTransformersOps', [
|
||||
'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
|
||||
'ktransformers/ktransformers_ext/cuda/binding.cpp',
|
||||
'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
|
||||
'csrc/ktransformers_ext/cuda/custom_gguf/dequant.cu',
|
||||
'csrc/ktransformers_ext/cuda/binding.cpp',
|
||||
'csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
|
||||
],
|
||||
extra_compile_args={
|
||||
'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
|
||||
|
@ -415,7 +419,7 @@ if CUDA_HOME is not None or ROCM_HOME is not None:
|
|||
}
|
||||
)
|
||||
elif MUSA_HOME is not None:
|
||||
SimplePorting(cuda_dir_path="ktransformers/ktransformers_ext/cuda", mapping_rule={
|
||||
SimplePorting(cuda_dir_path="csrc/ktransformers_ext/cuda", mapping_rule={
|
||||
# Common rules
|
||||
"at::cuda": "at::musa",
|
||||
"#include <ATen/cuda/CUDAContext.h>": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"",
|
||||
|
@ -423,10 +427,10 @@ elif MUSA_HOME is not None:
|
|||
"nv_bfloat16": "mt_bfloat16",
|
||||
}).run()
|
||||
ops_module = MUSAExtension('KTransformersOps', [
|
||||
'ktransformers/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu',
|
||||
'ktransformers/ktransformers_ext/cuda_musa/binding.cpp',
|
||||
'csrc/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu',
|
||||
'csrc/ktransformers_ext/cuda_musa/binding.cpp',
|
||||
# TODO: Add Marlin support for MUSA.
|
||||
# 'ktransformers/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu'
|
||||
# 'csrc/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu'
|
||||
],
|
||||
extra_compile_args={
|
||||
'cxx': ['force_mcc'],
|
||||
|
@ -440,12 +444,30 @@ elif MUSA_HOME is not None:
|
|||
else:
|
||||
raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
|
||||
|
||||
ext_modules = [
|
||||
CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
|
||||
ops_module,
|
||||
CUDAExtension(
|
||||
'vLLMMarlin', [
|
||||
'csrc/custom_marlin/binding.cpp',
|
||||
'csrc/custom_marlin/gptq_marlin/gptq_marlin.cu',
|
||||
'csrc/custom_marlin/gptq_marlin/gptq_marlin_repack.cu',
|
||||
],
|
||||
extra_compile_args={
|
||||
'cxx': ['-O3'],
|
||||
'nvcc': ['-O3', '-Xcompiler', '-fPIC'],
|
||||
},
|
||||
)
|
||||
]
|
||||
if with_balance:
|
||||
print("using balance_serve")
|
||||
ext_modules.append(
|
||||
CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
|
||||
)
|
||||
|
||||
setup(
|
||||
name=VersionInfo.PACKAGE_NAME,
|
||||
version=VersionInfo().get_package_version(),
|
||||
cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
|
||||
ext_modules=[
|
||||
CMakeExtension("cpuinfer_ext"),
|
||||
ops_module,
|
||||
]
|
||||
ext_modules=ext_modules
|
||||
)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue