[feature] support for pypi install

2025-09-05 20:19:51 +00:00 · 2024-07-29 11:51:28 +00:00 · 2024-07-29 11:51:28 +00:00 · dd18a11cab
commit dd18a11cab
parent a25320b703
4 changed files with 185 additions and 124 deletions
--- a/README.md
+++ b/README.md
@ -74,24 +74,37 @@ Some preparation:
  conda activate ktransformers # you may need to run ‘conda init’ and reopen shell first
  ```
-  Download source code:
+- Make sure that PyTorch, packaging, and ninja are installed
  ```
  pip install torch packaging ninja
  ```
 <h3>Installation</h3>
 You can install from PyPI:
 ```
 pip install ktransformers --no-build-isolation
 ```
 Or download source code and compile:
 - init source code 
  ```sh
  git clone https://github.com/kvcache-ai/ktransformers.git
  cd ktransformers
  git submodule init
  git submodule update
  ```
 - [Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing ```bash install.sh```
 - Compile and install
   ```
   bash install.sh
   ```
 <h3>Local Chat</h3>
 We provide a simple command-line local chat Python script that you can run for testing. 
  > Note that this is a very simple test tool that only supports single-round chat, with no memory of previous inputs. If you want to try the full capability of the model, go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you can replace it with any other model that you want to test. 
 <h4>Install</h4>
 ```sh
 bash install.sh
 ```
 <h4>Run Example</h4>
@ -109,11 +122,11 @@ wget https://huggingface.co/mzwing/DeepSeek-V2-Lite-Chat-GGUF/resolve/main/DeepS
 cd .. # Move to repo's root dir
 # Start local chat
-python  ktransformers/local_chat.py --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
+python -m ktransformers.local_chat --model_path deepseek-ai/DeepSeek-V2-Lite-Chat --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
 # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：
 # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
-# python  ktransformers/local_chat.py --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
+# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Lite --gguf_path ./DeepSeek-V2-Lite-Chat-GGUF
 ```
@ -154,7 +167,7 @@ wget https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GGUF/resolve/main/qwen2
 cd ..
-python ktransformers/local_chat.py --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
+python -m ktransformers.local_chat --model_name Qwen/Qwen2-57B-A14B-Instruct --gguf_path ./Qwen2-57B-GGUF
 # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：
 # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct
@ -172,11 +185,11 @@ wget https://huggingface.co/bartowski/DeepSeek-V2-Chat-0628-GGUF/resolve/main/De
 cd ..
-python ktransformers/local_chat.py --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+python -m ktransformers.local_chat --model_name deepseek-ai/DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
 # If you see “OSError: We couldn't connect to 'https://huggingface.co' to load this file”, try：
 # GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/deepseek-ai/DeepSeek-V2-Chat-0628
-# python  ktransformers/local_chat.py --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
+# python -m ktransformers.local_chat --model_path ./DeepSeek-V2-Chat-0628 --gguf_path ./DeepSeek-V2-Chat-0628-GGUF
 ```
 | model name | weights download link |
@ -193,15 +206,6 @@ python ktransformers/local_chat.py --model_name deepseek-ai/DeepSeek-V2-Chat-062
 <h3>RESTful API and Web UI</h3>
 <h4>Install</h4>
 [Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing ```pip install .```
 Install ktransformers from source.
 ```
 pip install -r requirements-local_chat.txt
 pip install . --no-build-isolation
 ```
 Start without website:
--- a/install.sh
+++ b/install.sh
@ -10,16 +10,6 @@ rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
 echo "Installing python dependencies from requirements.txt"
 pip install -r requirements-local_chat.txt
-echo "Installing ktransformers cpuinfer"
+echo "Installing ktransformers"
-mkdir -p ktransformers/ktransformers_ext/build
+pip install . --no-build-isolation
 cd ktransformers/ktransformers_ext/build
 cmake ..
 cmake --build . --config Release
 echo "Installing ktransformers gpu kernel, this may take for a while, please wait"
 sleep 3
 cd ../cuda
 python setup.py install
 cd ../../..
 echo "Installation completed successfully"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,3 +6,63 @@ requires = [
  "packaging"
  ]
 build-backend = "setuptools.build_meta"
 [project]
 name = "ktransformers"
 dynamic = ["version"]
 dependencies = [
  "torch >= 2.3.0",
  "transformers == 4.43.2",
  "fastapi >= 0.111.0",
  "langchain >= 0.2.0",
  "blessed >= 1.20.0",
  "accelerate >= 0.31.0",
  "sentencepiece >= 0.1.97",
  "setuptools",
  "ninja",
  "wheel",
  "colorlog",
  "build",
  "fire"
 ]
 requires-python = ">=3.11"
 authors = [
  {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"}
 ]
 maintainers = [
  {name = "james0zan", email = "zhang.mingxing@outlook.com"},
  {name = "awake", email = "awake@approaching.ai"},
  {name = "unicorn chan", email = "nl@approaching.ai"}
 ]
 description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies."
 readme = "README.md"
 license = {file = "LICENSE"}
 keywords = ["ktransformers", "llm"]
 classifiers = [
  "Development Status :: 4 - Beta",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12"
 ]
 [project.urls]
 Homepage = "https://kvcache.ai"
 Repository = "https://github.com/kvcache-ai/ktransformers.git"
 Issues = "https://github.com/kvcache-ai/ktransformers/issues"
 [project.scripts]
 ktransformers = "ktransformers.server.main:main"
 [tool.setuptools.packages.find]
 where = ["./", ]
 include = ["ktransformers"]
--- a/setup.py
+++ b/setup.py
@ -3,33 +3,43 @@
 '''
 Description  :  
 Author       : chenxl
-Date         : 2024-07-12 07:25:42
+Date         : 2024-07-27 16:15:27
 Version      : 1.0.0
 LastEditors  : chenxl 
-LastEditTime : 2024-07-27 04:31:03
+LastEditTime : 2024-07-29 09:40:24
 Adapted from:
 https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
 Copyright (c) 2023, Tri Dao.
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
 '''
 import os
 import shutil
 import sys
 import re
 import ast
 import subprocess
 import platform
-import io
+import urllib.request
 import urllib.error
 from pathlib import Path
 from packaging.version import parse
 import torch.version
 from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 from setuptools import setup, Extension
 import torch
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
-ROOT_DIR = os.path.dirname(__file__)
+
 class VersionInfo:
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    PACKAGE_NAME = "ktransformers"
    BASE_WHEEL_URL:str = (
        "https://github.com/kvcache-ai/ktransformers/releases/download/{tag_name}/{wheel_filename}"
    )
    FORCE_BUILD = os.getenv("KTRANSFORMERS_FORCE_BUILD", "FALSE") == "TRUE"
    def get_cuda_bare_metal_version(self, cuda_dir):
-        raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
+        raw_output = subprocess.check_output(
            [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
        output = raw_output.split()
        release_idx = output.index("release") + 1
        bare_metal_version = parse(output[release_idx].split(",")[0])
@ -52,10 +62,10 @@ class VersionInfo:
    def get_cpu_instruct(self,):
        if sys.platform.startswith("linux"):
-            with open('/proc/cpuinfo', 'r') as cpu_f:
+            with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
                cpuinfo = cpu_f.read()
-            
+            flags_line = [line for line in cpuinfo.split(
-            flags_line = [line for line in cpuinfo.split('\n') if line.startswith('flags')][0]
+                '\n') if line.startswith('flags')][0]
            flags = flags_line.split(':')[1].strip().split(' ')
            for flag in flags:
                if 'avx512' in flag:
@ -63,37 +73,69 @@ class VersionInfo:
            for flag in flags:
                if 'avx2' in flag:
                    return 'avx2'
-            raise ValueError("Unsupported cpu Instructions: {}".format(flags_line))
+            raise ValueError(
                "Unsupported cpu Instructions: {}".format(flags_line))
        else:
            raise ValueError("Unsupported platform: {}".format(sys.platform))
    def get_torch_version(self,):
        """Return the imported torch's major and minor numbers, concatenated.

        ``torch.__version__`` is parsed with ``packaging.version.parse`` and
        the ``major`` and ``minor`` fields are joined with no separator.
        """
        parsed = parse(torch.__version__)
        return "".join(str(part) for part in (parsed.major, parsed.minor))
-    def get_package_version(self,):
+    def get_flash_version(self,):
-        version_file = os.path.join(Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
+        version_file = os.path.join(
            Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
        with open(version_file, "r", encoding="utf-8") as f:
-            version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
+            version_match = re.search(
-        public_version = ast.literal_eval(version_match.group(1))
+                r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
-        package_version = f"{str(public_version)}+cu{self.get_cuda_bare_metal_version(CUDA_HOME)}torch{self.get_torch_version()}{self.get_cpu_instruct()}"
+        flash_version = ast.literal_eval(version_match.group(1))
        return flash_version
    def get_package_version(self, full_version=False):
        """Build the version string used for this package.

        The long form is ``<base>+cu<cuda>torch<torch><cpu>``, assembled from
        ``get_flash_version()``, ``get_cuda_bare_metal_version(CUDA_HOME)``,
        ``get_torch_version()`` and ``get_cpu_instruct()``. It is always
        computed, even when only the short form ends up being returned, so any
        error raised by those helpers surfaces here.

        Args:
            full_version: when true, return the long form regardless of
                ``VersionInfo.FORCE_BUILD``.

        Returns:
            The long form if ``full_version`` or ``VersionInfo.FORCE_BUILD`` is
            set; otherwise ``str`` of the base version alone.
        """
        base_version = self.get_flash_version()
        cuda_tag = self.get_cuda_bare_metal_version(CUDA_HOME)
        torch_tag = self.get_torch_version()
        cpu_tag = self.get_cpu_instruct()
        long_version = f"{str(base_version)}+cu{cuda_tag}torch{torch_tag}{cpu_tag}"
        if full_version or VersionInfo.FORCE_BUILD:
            return long_version
        return str(base_version)
 class BuildWheelsCommand(_bdist_wheel):
    def get_wheel_name(self,):
        version_info = VersionInfo()
        package_version = version_info.get_package_version(full_version=True)
        flash_version = version_info.get_flash_version()
        python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
-        wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{version_info.get_package_version()}-{python_version}-{python_version}-{version_info.get_platform()}.whl"
+        wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{package_version}-{python_version}-{python_version}-{version_info.get_platform()}.whl"
-        return wheel_filename
+        wheel_url = VersionInfo.BASE_WHEEL_URL.format(tag_name=f"v{flash_version}", wheel_filename=wheel_filename)
        return wheel_filename, wheel_url
    def run(self):
        if VersionInfo.FORCE_BUILD:
            super().run()
        wheel_filename, wheel_url = self.get_wheel_name()
        print("Guessing wheel URL: ", wheel_url)
        try:
            urllib.request.urlretrieve(wheel_url, wheel_filename)
            # Make the archive
            # Lifted from the root wheel processing command
            # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85
            if not os.path.exists(self.dist_dir):
                os.makedirs(self.dist_dir)
            impl_tag, abi_tag, plat_tag = self.get_tag()
            archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}"
            wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
-        wheel_name_with_platform = os.path.join(self.dist_dir, self.get_wheel_name())
+            print("Raw wheel path", wheel_path)
-        os.rename(wheel_path, wheel_name_with_platform)        
+            os.rename(wheel_filename, wheel_path)
        except (urllib.error.HTTPError, urllib.error.URLError):
            print("Precompiled wheel not found. Building from source...")
            # If the wheel could not be downloaded, build from source
            super().run()
 # Convert distutils Windows platform specifiers to CMake -A arguments
@ -104,22 +146,17 @@ PLAT_TO_CMAKE = {
    "win-arm64": "ARM64",
 }
-class CopyExtension(Extension):
+
    def __init__(self, name: str, sourcedir: str = "", copy_file_source="") -> None:
        super().__init__(name, sources=[])
        self.sourcedir = os.fspath(Path(sourcedir).resolve())
        self.source_file = copy_file_source
 class CMakeExtension(Extension):
    """Source-less ``Extension`` that only records where the native sources live.

    ``sources`` is passed as an empty list. ``CMakeBuild.build_extension``
    special-cases instances of this class instead of delegating to the
    parent build.

    Args:
        name: extension name forwarded to ``Extension``.
        sourcedir: base directory; it is resolved to an absolute path and
            ``ktransformers/ktransformers_ext`` is appended before storing
            the result as ``self.sourcedir``.
    """
    def __init__(self, name: str, sourcedir: str = "") -> None:
        super().__init__(name, sources=[])
-        self.sourcedir = os.fspath(Path(sourcedir).resolve() / "ktransformers/ktransformers_ext")
+        self.sourcedir = os.fspath(
            Path(sourcedir).resolve() / "ktransformers" / "ktransformers_ext")
 class CMakeBuild(BuildExtension):
    def build_extension(self, ext) -> None:
        if  isinstance(ext, CopyExtension):
            ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
            extdir = ext_fullpath.parent.resolve()
            shutil.copy(ext.source_file, extdir)
            return
        if not isinstance(ext, CMakeExtension):
            super().build_extension(ext)
            return
@ -129,7 +166,8 @@ class CMakeBuild(BuildExtension):
        # Using this requires trailing slash for auto-detection & inclusion of
        # auxiliary "native" libs
-        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
+        debug = int(os.environ.get("DEBUG", 0)
                    ) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"
        # CMake lets you override the generator - we need to check this.
@ -146,10 +184,12 @@ class CMakeBuild(BuildExtension):
        ]
        build_args = []
        if "CMAKE_ARGS" in os.environ:
-            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
+            cmake_args += [
                item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
        # In this example, we pass in the version to C++. You might not need to.
-        cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
+        cmake_args += [
            f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
        if self.compiler.compiler_type != "msvc":
            if not cmake_generator or cmake_generator == "Ninja":
                try:
@ -165,7 +205,8 @@ class CMakeBuild(BuildExtension):
        else:
            # Single config generators are handled "normally"
-            single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})
+            single_config = any(
                x in cmake_generator for x in {"NMake", "Ninja"})
            # CMake allows an arch-in-generator style for backward compatibility
            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
@ -183,7 +224,8 @@ class CMakeBuild(BuildExtension):
            # Cross-compile support for macOS - respect ARCHFLAGS if set
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
-                cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
+                cmake_args += [
                    "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            if hasattr(self, "parallel") and self.parallel:
@ -199,51 +241,16 @@ class CMakeBuild(BuildExtension):
            ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True
        )
 def read_readme() -> str:
    """Return the contents of ``README.md`` in ``ROOT_DIR``.

    Returns:
        The file's text decoded as UTF-8, or an empty string when
        ``README.md`` is not a regular file in ``ROOT_DIR``.
    """
    readme = Path(ROOT_DIR) / "README.md"
    if not readme.is_file():
        return ""
    # read_text opens, reads and closes the file in one call, so no handle
    # is left open for the garbage collector to clean up.
    return readme.read_text(encoding="utf-8")
 setup(
    name="ktransformers",
    version=VersionInfo().get_package_version(),
-    author="KVCache.ai",
+    cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
    license="Apache 2.0",
    description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies.",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    cmdclass={"build_ext": CMakeBuild},
    install_requires = [
        "torch >= 2.3.0",
        "transformers == 4.43.2",
        "fastapi >= 0.111.0",
        "langchain >= 0.2.0",
        "blessed >= 1.20.0",
        "accelerate >= 0.31.0",
        "sentencepiece >= 0.1.97",
        "setuptools",
        "ninja",
        "wheel",
        "colorlog",
        "build",
        "packaging",
        "fire"
    ],
    python_requires=">=3.10",
    entry_points={
        "console_scripts": [
            "ktransformers=ktransformers.server.main:main",
        ],
    },
    packages=["ktransformers"],
    include_package_data=True,
    ext_modules=[
        CMakeExtension("cpuinfer_ext"),
        CUDAExtension('KTransformersOps', [
            'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
            'ktransformers/ktransformers_ext/cuda/binding.cpp',
-                'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu',
+            'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
-      ]),
+        ])
-            CMakeExtension("cpuinfer_ext")]
+    ]
 )