diff --git a/README.md b/README.md
index 87e0a4e..d34dffe 100644
--- a/README.md
+++ b/README.md
@@ -74,24 +74,37 @@ Some preparation:
conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first
```
- Download source code:
+- Make sure that PyTorch, packaging, and ninja are installed (a quick sanity check is sketched below)
+ ```
+ pip install torch packaging ninja
+ ```
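Because the compile step later runs with `--no-build-isolation` (see the install steps below), the build uses whatever PyTorch is already present in this environment. A quick, optional sanity check, assuming nothing beyond a CUDA-enabled torch install:

```python
# Optional sanity check: the source build compiles CUDA kernels against the
# torch already installed in this environment, so it should report CUDA support.
import torch

print("torch:", torch.__version__)
print("torch built with CUDA:", torch.version.cuda)        # None means CPU-only build
print("CUDA available at runtime:", torch.cuda.is_available())
```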
+
+
Installation
+You can install from PyPI:
+
+```
+pip install ktransformers --no-build-isolation
+```
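After a PyPI install you can confirm the package is importable. This is only a minimal check; it relies on `__version__` being defined in `ktransformers/__init__.py`, which is where the `setup.py` changes below read it from:

```python
# Minimal post-install check; setup.py reads __version__ from
# ktransformers/__init__.py, so the attribute should be available here.
import ktransformers

print(ktransformers.__version__)
```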
+
+Or download the source code and compile it:
+ - Clone the repository and initialize its submodules
```sh
git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init
git submodule update
```
+ - [Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing ```bash install.sh```
+ - Compile and install
+ ```
+ bash install.sh
+ ```
Local Chat
We provide a simple command-line local chat Python script that you can run for testing.
> Note that this is a very simple test tool that only supports single-round chat with no memory of previous input; if you want to try the full ability of the model, go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you can replace it with any other model that you want to test.
-Install
-
-```sh
-bash install.sh
-```
Run Example
@@ -109,11 +122,11 @@ wget https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/resolve/main/DeepS
cd .. # Move to repo's root dir
# Start local chat
-python ktransformers/local_chat.py --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
+python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
-# python ktransformers/local_chat.py --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
+# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
```
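The invocation changes from `python ktransformers/local_chat.py` to `python -m ktransformers.local_chat`, so the installed package is executed rather than a file in the working tree. A small, optional way to see which copy Python will pick up:

```python
# Shows which installed copy `python -m ktransformers.local_chat` will execute.
import importlib.util

spec = importlib.util.find_spec("ktransformers.local_chat")
print(spec.origin if spec else "ktransformers.local_chat is not installed")
```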
@@ -154,7 +167,7 @@ wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2
cd ..
-python ktransformers/local_chat.py --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
+python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
@@ -172,11 +185,11 @@ wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/De
cd ..
-python ktransformers/local_chat.py --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
# If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try:
# GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
-# python ktransformers/local_chat.py --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
```
| model name | weights download link |
@@ -193,15 +206,6 @@ python ktransformers/local_chat.py --model_name deepseek-ai/DeepSeek-V2-Chat-062
RESTful API and Web UI
-Install
-
-[Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```pip install .```
-
-Install ktransformers with source.
-```
-pip install -r requirements-local_chat.txt
-pip install . --no-build-isolation
-```
Start without website:
diff --git a/install.sh b/install.sh
index d8cceef..fa5ba18 100644
--- a/install.sh
+++ b/install.sh
@@ -10,16 +10,6 @@ rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
-echo "Installing ktransformers cpuinfer"
-mkdir -p ktransformers/ktransformers_ext/build
-cd ktransformers/ktransformers_ext/build
-cmake ..
-cmake --build . --config Release
-
-echo "Installing ktransformers gpu kernel, this may take for a while, please wait"
-sleep 3
-
-cd ../cuda
-python setup.py install
-cd ../../..
+echo "Installing ktransformers"
+pip install . --no-build-isolation
echo "Installation completed successfully"
\ No newline at end of file
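The install script now only installs the Python requirements and delegates the whole build to `pip install . --no-build-isolation`; with build isolation disabled, the PyTorch, packaging, and ninja already present in the active environment are what the compile uses (hence the new prerequisite step above). Per the setup.py changes further down, the wheel build first tries to fetch a prebuilt wheel from GitHub releases; a from-source build can be forced, for example like this sketch (equivalent to exporting `KTRANSFORMERS_FORCE_BUILD=TRUE` in the shell before running pip):

```python
# Illustrative only: force a from-source build instead of fetching a prebuilt
# wheel. KTRANSFORMERS_FORCE_BUILD is read by setup.py (see the changes below).
import os
import subprocess
import sys

env = dict(os.environ, KTRANSFORMERS_FORCE_BUILD="TRUE")
subprocess.run(
    [sys.executable, "-m", "pip", "install", ".", "--no-build-isolation"],
    env=env,
    check=True,
)
```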
diff --git a/pyproject.toml b/pyproject.toml
index 3378ef0..0bbef99 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,3 +6,63 @@ requires = [
"packaging"
]
build-backend = "setuptools.build_meta"
+
+[project]
+
+name = "ktransformers"
+
+dynamic = ["version"]
+
+dependencies = [
+ "torch >= 2.3.0",
+ "transformers == 4.43.2",
+ "fastapi >= 0.111.0",
+ "langchain >= 0.2.0",
+ "blessed >= 1.20.0",
+ "accelerate >= 0.31.0",
+ "sentencepiece >= 0.1.97",
+ "setuptools",
+ "ninja",
+ "wheel",
+ "colorlog",
+ "build",
+ "fire"
+]
+
+requires-python = ">=3.11"
+
+authors = [
+ {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"}
+]
+
+maintainers = [
+ {name = "james0zan", email = "zhang.mingxing@outlook.com"},
+ {name = "awake", email = "awake@approaching.ai"},
+ {name = "unicorn chan", email = "nl@approaching.ai"}
+]
+
+description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies."
+
+readme = "README.md"
+license = {file = "LICENSE"}
+
+keywords = ["ktransformers", "llm"]
+
+classifiers = [
+ "Development Status :: 4 - Beta",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12"
+]
+
+[project.urls]
+Homepage = "https://kvcache.ai"
+Repository = "https://github.com/kvcache-ai/ktransformers.git"
+Issues = "https://github.com/kvcache-ai/ktransformers/issues"
+
+
+[project.scripts]
+ktransformers = "ktransformers.server.main:main"
+
+[tool.setuptools.packages.find]
+where = ["./", ]
+include = ["ktransformers*"]
\ No newline at end of file
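The new `[project.scripts]` table makes pip generate a `ktransformers` console command that calls `ktransformers.server.main:main`. A minimal, illustrative post-install check of that entry point:

```python
# Lists the console-script entry point that pip generates from [project.scripts];
# the expected target per pyproject.toml is ktransformers.server.main:main.
from importlib.metadata import entry_points

for ep in entry_points(group="console_scripts"):
    if ep.name == "ktransformers":
        print(ep.name, "->", ep.value)
```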
diff --git a/setup.py b/setup.py
index 5219f7b..38ee098 100644
--- a/setup.py
+++ b/setup.py
@@ -3,44 +3,54 @@
'''
Description :
Author : chenxl
-Date : 2024-07-12 07:25:42
+Date : 2024-07-27 16:15:27
Version : 1.0.0
LastEditors : chenxl
-LastEditTime : 2024-07-27 04:31:03
+LastEditTime : 2024-07-29 09:40:24
+Adapted from:
+https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
+Copyright (c) 2023, Tri Dao.
+Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
+
import os
-import shutil
import sys
import re
import ast
import subprocess
import platform
-import io
+import urllib.request
+import urllib.error
from pathlib import Path
from packaging.version import parse
import torch.version
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
from setuptools import setup, Extension
-import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
-ROOT_DIR = os.path.dirname(__file__)
+
class VersionInfo:
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
PACKAGE_NAME = "ktransformers"
+ BASE_WHEEL_URL: str = (
+ "https://github.com/kvcache-ai/ktransformers/releases/download/{tag_name}/{wheel_filename}"
+ )
+ FORCE_BUILD = os.getenv("KTRANSFORMERS_FORCE_BUILD", "FALSE") == "TRUE"
+
def get_cuda_bare_metal_version(self, cuda_dir):
- raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
+ raw_output = subprocess.check_output(
+ [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
bare_metal_version = parse(output[release_idx].split(",")[0])
cuda_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
return cuda_version
-
+
def get_cuda_version_of_torch(self,):
torch_cuda_version = parse(torch.version.cuda)
cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
return cuda_version
-
+
def get_platform(self,):
"""
Returns the platform name as used in wheel filenames.
@@ -49,13 +59,13 @@ class VersionInfo:
return f'linux_{platform.uname().machine}'
else:
raise ValueError("Unsupported platform: {}".format(sys.platform))
-
+
def get_cpu_instruct(self,):
if sys.platform.startswith("linux"):
- with open('/proc/cpuinfo', 'r') as cpu_f:
+ with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
cpuinfo = cpu_f.read()
-
- flags_line = [line for line in cpuinfo.split('\n') if line.startswith('flags')][0]
+ flags_line = [line for line in cpuinfo.split(
+ '\n') if line.startswith('flags')][0]
flags = flags_line.split(':')[1].strip().split(' ')
for flag in flags:
if 'avx512' in flag:
@@ -63,38 +73,70 @@ class VersionInfo:
for flag in flags:
if 'avx2' in flag:
return 'avx2'
- raise ValueError("Unsupported cpu Instructions: {}".format(flags_line))
-
+ raise ValueError(
+ "Unsupported cpu Instructions: {}".format(flags_line))
+ else:
+ raise ValueError("Unsupported platform: {}".format(sys.platform))
+
def get_torch_version(self,):
torch_version_raw = parse(torch.__version__)
torch_version = f"{torch_version_raw.major}{torch_version_raw.minor}"
return torch_version
- def get_package_version(self,):
- version_file = os.path.join(Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
+ def get_flash_version(self,):
+ version_file = os.path.join(
+ Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
with open(version_file, "r", encoding="utf-8") as f:
- version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
- public_version = ast.literal_eval(version_match.group(1))
- package_version = f"{str(public_version)}+cu{self.get_cuda_bare_metal_version(CUDA_HOME)}torch{self.get_torch_version()}{self.get_cpu_instruct()}"
+ version_match = re.search(
+ r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
+ flash_version = ast.literal_eval(version_match.group(1))
+ return flash_version
+
+ def get_package_version(self, full_version=False):
+ flash_version = self.get_flash_version()
+ package_version = f"{str(flash_version)}+cu{self.get_cuda_bare_metal_version(CUDA_HOME)}torch{self.get_torch_version()}{self.get_cpu_instruct()}"
+ if full_version:
+ return package_version
+ if not VersionInfo.FORCE_BUILD:
+ return str(flash_version)
return package_version
-
+
class BuildWheelsCommand(_bdist_wheel):
def get_wheel_name(self,):
version_info = VersionInfo()
+ package_version = version_info.get_package_version(full_version=True)
+ flash_version = version_info.get_flash_version()
python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
- wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{version_info.get_package_version()}-{python_version}-{python_version}-{version_info.get_platform()}.whl"
- return wheel_filename
-
-
+ wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{package_version}-{python_version}-{python_version}-{version_info.get_platform()}.whl"
+ wheel_url = VersionInfo.BASE_WHEEL_URL.format(tag_name=f"v{flash_version}", wheel_filename=wheel_filename)
+ return wheel_filename, wheel_url
+
+
def run(self):
- super().run()
- impl_tag, abi_tag, plat_tag = self.get_tag()
- archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}"
- wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
- wheel_name_with_platform = os.path.join(self.dist_dir, self.get_wheel_name())
- os.rename(wheel_path, wheel_name_with_platform)
-
+ if VersionInfo.FORCE_BUILD:
+ return super().run()
+ wheel_filename, wheel_url = self.get_wheel_name()
+ print("Guessing wheel URL: ", wheel_url)
+ try:
+ urllib.request.urlretrieve(wheel_url, wheel_filename)
+ # Make the archive
+ # Lifted from the root wheel processing command
+ # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85
+ if not os.path.exists(self.dist_dir):
+ os.makedirs(self.dist_dir)
+
+ impl_tag, abi_tag, plat_tag = self.get_tag()
+ archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}"
+
+ wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
+ print("Raw wheel path", wheel_path)
+ os.rename(wheel_filename, wheel_path)
+ except (urllib.error.HTTPError, urllib.error.URLError):
+ print("Precompiled wheel not found. Building from source...")
+ # If the wheel could not be downloaded, build from source
+ super().run()
+
# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
@@ -104,22 +146,17 @@ PLAT_TO_CMAKE = {
"win-arm64": "ARM64",
}
-class CopyExtension(Extension):
- def __init__(self, name: str, sourcedir: str = "", copy_file_source="") -> None:
- super().__init__(name, sources=[])
- self.sourcedir = os.fspath(Path(sourcedir).resolve())
- self.source_file = copy_file_source
+
class CMakeExtension(Extension):
def __init__(self, name: str, sourcedir: str = "") -> None:
super().__init__(name, sources=[])
- self.sourcedir = os.fspath(Path(sourcedir).resolve() / "ktransformers/ktransformers_ext")
+ self.sourcedir = os.fspath(
+ Path(sourcedir).resolve() / "ktransformers" / "ktransformers_ext")
+
+
class CMakeBuild(BuildExtension):
+
def build_extension(self, ext) -> None:
- if isinstance(ext, CopyExtension):
- ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
- extdir = ext_fullpath.parent.resolve()
- shutil.copy(ext.source_file, extdir)
- return
if not isinstance(ext, CMakeExtension):
super().build_extension(ext)
return
@@ -129,7 +166,8 @@ class CMakeBuild(BuildExtension):
# Using this requires trailing slash for auto-detection & inclusion of
# auxiliary "native" libs
- debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
+ debug = int(os.environ.get("DEBUG", 0)
+ ) if self.debug is None else self.debug
cfg = "Debug" if debug else "Release"
# CMake lets you override the generator - we need to check this.
@@ -146,10 +184,12 @@ class CMakeBuild(BuildExtension):
]
build_args = []
if "CMAKE_ARGS" in os.environ:
- cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
+ cmake_args += [
+ item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
# In this example, we pass in the version to C++. You might not need to.
- cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
+ cmake_args += [
+ f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
if self.compiler.compiler_type != "msvc":
if not cmake_generator or cmake_generator == "Ninja":
try:
@@ -165,7 +205,8 @@ class CMakeBuild(BuildExtension):
else:
# Single config generators are handled "normally"
- single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})
+ single_config = any(
+ x in cmake_generator for x in {"NMake", "Ninja"})
# CMake allows an arch-in-generator style for backward compatibility
contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
@@ -183,7 +224,8 @@ class CMakeBuild(BuildExtension):
# Cross-compile support for macOS - respect ARCHFLAGS if set
archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
if archs:
- cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
+ cmake_args += [
+ "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
if hasattr(self, "parallel") and self.parallel:
@@ -199,51 +241,16 @@ class CMakeBuild(BuildExtension):
["cmake", "--build", ".", *build_args], cwd=build_temp, check=True
)
-def read_readme() -> str:
- p = os.path.join(ROOT_DIR, "README.md")
- if os.path.isfile(p):
- return io.open(p, "r", encoding="utf-8").read()
- else:
- return ""
setup(
- name="ktransformers",
version=VersionInfo().get_package_version(),
- author="KVCache.ai",
- license="Apache 2.0",
- description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies.",
- long_description=read_readme(),
- long_description_content_type="text/markdown",
- cmdclass={"build_ext": CMakeBuild},
- install_requires = [
- "torch >= 2.3.0",
- "transformers == 4.43.2",
- "fastapi >= 0.111.0",
- "langchain >= 0.2.0",
- "blessed >= 1.20.0",
- "accelerate >= 0.31.0",
- "sentencepiece >= 0.1.97",
- "setuptools",
- "ninja",
- "wheel",
- "colorlog",
- "build",
- "packaging",
- "fire"
- ],
- python_requires=">=3.10",
- entry_points={
- "console_scripts": [
- "ktransformers=ktransformers.server.main:main",
- ],
- },
- packages=["ktransformers"],
- include_package_data=True,
+ cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
ext_modules=[
- CUDAExtension('KTransformersOps', [
- 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
- 'ktransformers/ktransformers_ext/cuda/binding.cpp',
- 'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu',
- ]),
- CMakeExtension("cpuinfer_ext")]
-)
\ No newline at end of file
+ CMakeExtension("cpuinfer_ext"),
+ CUDAExtension('KTransformersOps', [
+ 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
+ 'ktransformers/ktransformers_ext/cuda/binding.cpp',
+ 'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
+ ])
+ ]
+)
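For reference, the wheel filename and release URL guessed by `BuildWheelsCommand` are assembled from the base version plus CUDA, torch, and CPU-instruction tags. A rough reconstruction with placeholder values (none of these are real release numbers):

```python
# Illustrative reconstruction of the wheel name / release URL that
# BuildWheelsCommand guesses; all values below are made-up placeholders.
flash_version = "0.1.1"                      # __version__ in ktransformers/__init__.py
cuda_tag, torch_tag, cpu_isa = "121", "23", "avx2"
package_version = f"{flash_version}+cu{cuda_tag}torch{torch_tag}{cpu_isa}"
py_tag = "cp311"
wheel_filename = f"ktransformers-{package_version}-{py_tag}-{py_tag}-linux_x86_64.whl"
wheel_url = (
    "https://github.com/kvcache-ai/ktransformers/releases/download/"
    f"v{flash_version}/{wheel_filename}"
)
print(wheel_filename)
print(wheel_url)
```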