From c74453d8ca1a9725e7cab7ba32be264be5d6f365 Mon Sep 17 00:00:00 2001 From: liam Date: Thu, 13 Feb 2025 16:26:31 +0800 Subject: [PATCH] :memo: add doc support and fix bug in qwen2 --- .github/workflows/book-ci.yml | 32 ++++++++++++++++++++ .github/workflows/deploy.yml | 48 ++++++++++++++++++++++++++++++ .gitignore | 1 + book.toml | 18 +++++++++++ doc/README.md | 31 +++++++++++++++++++ doc/SUMMARY.md | 14 +++++++++ doc/basic/note1.md | 1 + doc/basic/note2.md | 1 + doc/zh/api/server/README.md | 2 ++ ktransformers/operators/experts.py | 2 ++ ktransformers/operators/linear.py | 32 ++++++++++++-------- 11 files changed, 170 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/book-ci.yml create mode 100644 .github/workflows/deploy.yml create mode 100644 book.toml create mode 100644 doc/README.md create mode 100644 doc/SUMMARY.md create mode 100644 doc/basic/note1.md create mode 100644 doc/basic/note2.md create mode 100644 doc/zh/api/server/README.md diff --git a/.github/workflows/book-ci.yml b/.github/workflows/book-ci.yml new file mode 100644 index 0000000..f09f18a --- /dev/null +++ b/.github/workflows/book-ci.yml @@ -0,0 +1,32 @@ +name: Book-CI + +on: + push: + branches: + - main + - server_support + + pull_request: + branches: + - main + - server_support +jobs: + test: + name: test + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: | + rustup set profile minimal + rustup toolchain install stable + rustup default stable + - name: Setup mdBook + uses: peaceiris/actions-mdbook@v2 + with: + mdbook-version: "latest" + # - name: Run tests + # run: mdbook test \ No newline at end of file diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..f9f8341 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,48 @@ +name: Deploy + +on: + push: + branches: + - main + - server_support + + 
pull_request: + branches: + - main + - server_support + +defaults: + run: + shell: bash + +permissions: + contents: write + +jobs: + deploy: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: | + rustup set profile minimal + rustup toolchain install stable + rustup default stable + - name: Setup mdBook + uses: peaceiris/actions-mdbook@v2 + with: + mdbook-version: "latest" + - run: mdbook build + # - name: Copy Assets + # run: | + # chmod +x ci/copy-assets.sh + # ci/copy-assets.sh ${{ matrix.os }} + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/server_support' }} + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./book \ No newline at end of file diff --git a/.gitignore b/.gitignore index d45e956..1631d01 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ img/ tmp1.txt test_65_300_1536.txt test.txt +book diff --git a/book.toml b/book.toml new file mode 100644 index 0000000..c88d9b7 --- /dev/null +++ b/book.toml @@ -0,0 +1,18 @@ +[book] +authors = ["kvcache-ai"] +language = "zh-CN" +title = "Ktransformers" +src = "doc" + +[output.html] +git-repository-url = "https://github.com/kvcache-ai/ktransformers" +edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}" + +[output.html.playground] +editable = true +copy-js = true +# line-numbers = true + +[output.html.fold] +enable = true +level = 0 \ No newline at end of file diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000..0183497 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,31 @@ +
+ +

+ + + KTransformers + + + +

+ +
+ +

🎉 Introduction

+KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 Transformers experience with advanced kernel optimizations and placement/parallelism strategies. +

+KTransformers is a flexible, Python-centric framework designed with extensibility at its core. +By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible +interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI. +

+Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features. + +

🔥 Updates

 +

* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./doc/en/DeepseekR1_V3_tutorial.md). +* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md). +* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21G to 11G. +* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU. +* **Aug 14, 2024**: Support llamafile as linear backend. +* **Aug 12, 2024**: Support multiple GPU; Support new model: mixtral 8\*7B and 8\*22B; Support q2k, q3k, q5k dequant on gpu. +* **Aug 9, 2024**: Support windows native. \ No newline at end of file diff --git a/doc/SUMMARY.md b/doc/SUMMARY.md new file mode 100644 index 0000000..449e0f6 --- /dev/null +++ b/doc/SUMMARY.md @@ -0,0 +1,14 @@ +# Ktransformers + +[Introduction](./README.md) +# DeepSeek +- [DeepseekR1_V3_tutorial](en/DeepseekR1_V3_tutorial.md) +- [deepseek-v2-injection](en/deepseek-v2-injection.md) +- [Makefile_usage](en/makefile_usage.md) +# Server +- [Server](zh/api/server/README.md) + - [Server](zh/api/server/server.md) + - [Website](zh/api/server/website.md) + - [Tabby](zh/api/server/tabby.md) +# FAQ +- [FAQ](en/FAQ.md) \ No newline at end of file diff --git a/doc/basic/note1.md b/doc/basic/note1.md new file mode 100644 index 0000000..daa3dba --- /dev/null +++ b/doc/basic/note1.md @@ -0,0 +1 @@ +# basic-first20 diff --git a/doc/basic/note2.md b/doc/basic/note2.md new file mode 100644 index 0000000..b73e982 --- /dev/null +++ b/doc/basic/note2.md @@ -0,0 +1 @@ +# basic-data_structure diff --git a/doc/zh/api/server/README.md b/doc/zh/api/server/README.md new file mode 100644 index 0000000..a0f47f4 --- /dev/null +++ b/doc/zh/api/server/README.md @@ -0,0 +1,2 @@ +# Server +Still Under Construction... 
(May have bugs and lack of documentation) \ No newline at end of file diff --git a/ktransformers/operators/experts.py b/ktransformers/operators/experts.py index 274a3ca..ecfbca0 100644 --- a/ktransformers/operators/experts.py +++ b/ktransformers/operators/experts.py @@ -576,6 +576,8 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock): routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu() shared_expert_output = self.shared_expert(hidden_states) + tmp = self.shared_expert_gate(hidden_states) + print("shared_expert_gate shape ", tmp.shape) shared_expert_output = ( F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output ) diff --git a/ktransformers/operators/linear.py b/ktransformers/operators/linear.py index 9e35e8d..305f266 100644 --- a/ktransformers/operators/linear.py +++ b/ktransformers/operators/linear.py @@ -54,15 +54,15 @@ class KLinearBase(ABC): self.has_bias = False self.dtype = torch.get_default_dtype() - # if orig_module is not None: - # self.in_features = orig_module.in_features - # self.out_features = orig_module.out_features - # else: - shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"] - if len(shape) == 1: - print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF") - self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0] - self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1] + if orig_module is not None: + self.in_features = orig_module.in_features + self.out_features = orig_module.out_features + else: + shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"] + if len(shape) == 1: + print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF") + self.in_features = self.gguf_loader.tensor_info[key + 
".weight"]["shape"][0] + self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1] @abstractmethod def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -136,12 +136,19 @@ class KLinearTorch(KLinearBase): def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None): if device is None: device = self.device if w is None: w = self.load_weight(device=device) + # else: self.out_features = w.shape[0], self.in_features = w.shape[1] if isinstance(w, nn.Parameter): - self.w = w.to(dtype=self.dtype).T + try: + self.w = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T + except: + self.w = w.to(dtype=self.dtype).T self.has_bias = False elif isinstance(w, tuple): - self.w = w[0].to(dtype=self.dtype).T + try: + self.w = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T + except: + self.w = w[0].to(dtype=self.dtype).T self.bias = w[1].to(dtype=self.dtype) self.has_bias = True else: @@ -187,7 +194,8 @@ class KLinearMarlin(KLinearBase): def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None): if device is None: device = self.device assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device" - if w is None: w = self.load_weight(device=device) + if w is None: + w = self.load_weight(device=device) if isinstance(w, nn.Parameter): # pad weight