add balance-serve, support concurrence

This commit is contained in:
Atream 2025-03-31 22:55:32 +08:00
parent 8d0292aa44
commit 25cee5810e
196 changed files with 22077 additions and 565 deletions

View file

@ -35,6 +35,8 @@ try:
from torch_musa.utils.musa_extension import BuildExtension, MUSAExtension, MUSA_HOME
except ImportError:
MUSA_HOME=None
with_balance = os.environ.get("USE_BALANCE_SERVE", "0") == "1"
class CpuInstructInfo:
CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
@ -212,7 +214,7 @@ class VersionInfo:
cpu_instruct = self.get_cpu_instruct()
backend_version = ""
if CUDA_HOME is not None:
backend_version = f""
backend_version = f"cu{self.get_cuda_bare_metal_version(CUDA_HOME)}"
elif MUSA_HOME is not None:
backend_version = f"mu{self.get_musa_bare_metal_version(MUSA_HOME)}"
elif ROCM_HOME is not None:
@ -274,11 +276,10 @@ PLAT_TO_CMAKE = {
class CMakeExtension(Extension):
def __init__(self, name: str, sourcedir: str = "") -> None:
def __init__(self, name: str, sourcedir: str) -> None:
super().__init__(name, sources=[])
self.sourcedir = os.fspath(
Path(sourcedir).resolve() / "ktransformers" / "ktransformers_ext")
print(name, sourcedir)
self.sourcedir = sourcedir
class CMakeBuild(BuildExtension):
@ -342,16 +343,17 @@ class CMakeBuild(BuildExtension):
f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
if self.compiler.compiler_type != "msvc":
if not cmake_generator or cmake_generator == "Ninja":
try:
import ninja
pass
# try:
# import ninja
ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
cmake_args += [
"-GNinja",
f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
]
except ImportError:
pass
# ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
# cmake_args += [
# "-GNinja",
# f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
# ]
# except ImportError:
# pass
else:
# Single config generators are handled "normally"
@ -387,10 +389,12 @@ class CMakeBuild(BuildExtension):
build_args += [f"--parallel={cpu_count}"]
print("CMake args:", cmake_args)
build_temp = Path(ext.sourcedir) / "build"
print("build_temp:", build_temp)
if not build_temp.exists():
build_temp.mkdir(parents=True)
result = subprocess.run(
["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True , capture_output=True
["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True , capture_output=True, text=True
)
print("Standard output:", result.stdout)
print("Standard error:", result.stderr)
@ -400,9 +404,9 @@ class CMakeBuild(BuildExtension):
if CUDA_HOME is not None or ROCM_HOME is not None:
ops_module = CUDAExtension('KTransformersOps', [
'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
'ktransformers/ktransformers_ext/cuda/binding.cpp',
'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
'csrc/ktransformers_ext/cuda/custom_gguf/dequant.cu',
'csrc/ktransformers_ext/cuda/binding.cpp',
'csrc/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
],
extra_compile_args={
'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'],
@ -415,7 +419,7 @@ if CUDA_HOME is not None or ROCM_HOME is not None:
}
)
elif MUSA_HOME is not None:
SimplePorting(cuda_dir_path="ktransformers/ktransformers_ext/cuda", mapping_rule={
SimplePorting(cuda_dir_path="csrc/ktransformers_ext/cuda", mapping_rule={
# Common rules
"at::cuda": "at::musa",
"#include <ATen/cuda/CUDAContext.h>": "#include \"torch_musa/csrc/aten/musa/MUSAContext.h\"",
@ -423,10 +427,10 @@ elif MUSA_HOME is not None:
"nv_bfloat16": "mt_bfloat16",
}).run()
ops_module = MUSAExtension('KTransformersOps', [
'ktransformers/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu',
'ktransformers/ktransformers_ext/cuda_musa/binding.cpp',
'csrc/ktransformers_ext/cuda_musa/custom_gguf/dequant.mu',
'csrc/ktransformers_ext/cuda_musa/binding.cpp',
# TODO: Add Marlin support for MUSA.
# 'ktransformers/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu'
# 'csrc/ktransformers_ext/cuda_musa/gptq_marlin/gptq_marlin.mu'
],
extra_compile_args={
'cxx': ['force_mcc'],
@ -440,12 +444,30 @@ elif MUSA_HOME is not None:
else:
raise ValueError("Unsupported backend: CUDA_HOME and MUSA_HOME are not set.")
ext_modules = [
CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
ops_module,
CUDAExtension(
'vLLMMarlin', [
'csrc/custom_marlin/binding.cpp',
'csrc/custom_marlin/gptq_marlin/gptq_marlin.cu',
'csrc/custom_marlin/gptq_marlin/gptq_marlin_repack.cu',
],
extra_compile_args={
'cxx': ['-O3'],
'nvcc': ['-O3', '-Xcompiler', '-fPIC'],
},
)
]
if with_balance:
print("using balance_serve")
ext_modules.append(
CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
)
setup(
name=VersionInfo.PACKAGE_NAME,
version=VersionInfo().get_package_version(),
cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
ext_modules=[
CMakeExtension("cpuinfer_ext"),
ops_module,
]
ext_modules=ext_modules
)