Mirror of https://github.com/kvcache-ai/ktransformers.git
Synced 2025-09-10 15:29:39 +00:00
📝 add doc support and fix bug in qwen2
This commit is contained in:
parent 8bad019ef2
commit c74453d8ca

11 changed files with 170 additions and 12 deletions
.github/workflows/book-ci.yml (vendored, new file, 32 additions)

@@ -0,0 +1,32 @@
+name: Book-CI
+
+on:
+  push:
+    branches:
+      - main
+      - server_support
+
+  pull_request:
+    branches:
+      - main
+      - server_support
+jobs:
+  test:
+    name: test
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        run: |
+          rustup set profile minimal
+          rustup toolchain install stable
+          rustup default stable
+      - name: Setup mdBook
+        uses: peaceiris/actions-mdbook@v2
+        with:
+          mdbook-version: "latest"
+      # - name: Run tests
+      #   run: mdbook test
.github/workflows/deploy.yml (vendored, new file, 48 additions)

@@ -0,0 +1,48 @@
+name: Deploy
+
+on:
+  push:
+    branches:
+      - main
+      - server_support
+
+  pull_request:
+    branches:
+      - main
+      - server_support
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: write
+
+jobs:
+  deploy:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        run: |
+          rustup set profile minimal
+          rustup toolchain install stable
+          rustup default stable
+      - name: Setup mdBook
+        uses: peaceiris/actions-mdbook@v2
+        with:
+          mdbook-version: "latest"
+      - run: mdbook build
+      # - name: Copy Assets
+      #   run: |
+      #     chmod +x ci/copy-assets.sh
+      #     ci/copy-assets.sh ${{ matrix.os }}
+      - name: Deploy
+        uses: peaceiris/actions-gh-pages@v3
+        if: ${{ github.ref == 'refs/heads/main' || github.ref == 'refs/heads/server_support' }}
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./book
.gitignore (vendored, 1 addition)

@@ -22,3 +22,4 @@ img/
 tmp1.txt
 test_65_300_1536.txt
 test.txt
+book
book.toml (new file, 18 additions)

@@ -0,0 +1,18 @@
+[book]
+authors = ["kvcache-ai"]
+language = "zh-CN"
+title = "Ktransformers"
+src = "doc"
+
+[output.html]
+git-repository-url = "https://github.com/kvcache-ai/ktransformers"
+edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"
+
+[output.html.playground]
+editable = true
+copy-js = true
+# line-numbers = true
+
+[output.html.fold]
+enable = true
+level = 0
doc/README.md (new file, 31 additions)

@@ -0,0 +1,31 @@
+<div align="center">
+  <!-- <h1>KTransformers</h1> -->
+  <p align="center">
+    <picture>
+      <img alt="KTransformers" src="https://github.com/user-attachments/assets/d5a2492f-a415-4456-af99-4ab102f13f8b" width=50%>
+    </picture>
+  </p>
+</div>
+
+<h2 id="intro">🎉 Introduction</h2>
+KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 <a href="https://github.com/huggingface/transformers">Transformers</a> experience with advanced kernel optimizations and placement/parallelism strategies.
+<br/><br/>
+KTransformers is a flexible, Python-centric framework designed with extensibility at its core.
+By implementing and injecting an optimized module with a single line of code, users gain access to a Transformers-compatible
+interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified ChatGPT-like web UI.
+<br/><br/>
+Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
+
+<h2 id="Updates">🔥 Updates</h2>
+
+* **Feb 10, 2025**: Support DeepSeek-R1 and V3 on a single GPU (24GB VRAM) or multiple GPUs, with 382GB of DRAM, for up to a 3~28x speedup. The detailed tutorial is [here](./doc/en/DeepseekR1_V3_tutorial.md).
+* **Aug 28, 2024**: Support 1M context with the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./doc/en/long_context_tutorial.md).
+* **Aug 28, 2024**: Decrease DeepseekV2's required VRAM from 21GB to 11GB.
+* **Aug 15, 2024**: Update the detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU use.
+* **Aug 14, 2024**: Support llamafile as a linear backend.
+* **Aug 12, 2024**: Support multiple GPUs; support new models: Mixtral 8\*7B and 8\*22B; support q2k, q3k, q5k dequantization on GPU.
+* **Aug 9, 2024**: Support native Windows.
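The "single line of code" injection this README describes maps to the repository's optimize-and-load entry point. Below is a minimal sketch of that workflow; the `optimize_and_load_gguf` import path, the rule-file path, the GGUF directory, and the model name are assumptions drawn from the project's examples rather than part of this commit:

    # Hedged sketch of the injection workflow; paths and model name are illustrative.
    import torch
    from transformers import AutoConfig, AutoModelForCausalLM
    from ktransformers.optimize.optimize import optimize_and_load_gguf  # assumed import path

    config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-V2-Lite-Chat", trust_remote_code=True)
    with torch.device("meta"):
        # Build the model skeleton without materializing real weights.
        model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

    # The single injection line: swap in optimized modules per a YAML rule file
    # and stream quantized weights in from a GGUF directory.
    optimize_and_load_gguf(model, "optimize_rules/DeepSeek-V2-Lite-Chat.yaml", "/path/to/gguf_dir", config)

After this call, the model exposes the usual Transformers generation interface, which is what the RESTful API and web UI mentioned above are built on.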
doc/SUMMARY.md (new file, 14 additions)

@@ -0,0 +1,14 @@
+# Ktransformer
+
+[Introduction](./README.md)
+# DeepSeek
+- [DeepseekR1_V3_tutorial](en/DeepseekR1_V3_tutorial.md)
+- [deepseek-v2-injection](en/deepseek-v2-injection.md)
+- [Makefile_usage](en/makefile_usage.md)
+# Server
+- [Server](zh/api/server/README.md)
+  - [Server](zh/api/server/server.md)
+  - [Website](zh/api/server/website.md)
+  - [Tabby](zh/api/server/tabby.md)
+# FAQ
+- [FAQ](en/FAQ.md)
doc/basic/note1.md (new file, 1 addition)

@@ -0,0 +1 @@
+# basic-first20
doc/basic/note2.md (new file, 1 addition)

@@ -0,0 +1 @@
+# basic-data_structure
doc/zh/api/server/README.md (new file, 2 additions)

@@ -0,0 +1,2 @@
+# Server
+Still under construction... (may have bugs and lack documentation)
@@ -576,6 +576,8 @@ class KQwen2MoeSparseMoeBlock(BaseInjectedModule, Qwen2MoeSparseMoeBlock):
         routing_weights_expert = routing_weights.to(self.experts.device) if isinstance(self.experts, KExpertsBase) else routing_weights_expert.cpu()
 
         shared_expert_output = self.shared_expert(hidden_states)
+        tmp = self.shared_expert_gate(hidden_states)
+        print("shared_expert_gate shape ", tmp.shape)
         shared_expert_output = (
             F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output
         )
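For context, this block implements Qwen2-MoE's shared-expert path: the shared expert runs on every token, and its output is scaled by a per-token sigmoid gate before being combined with the routed experts' output. A self-contained sketch of that gating, with illustrative stand-in layers rather than the repository's modules:

    import torch
    import torch.nn as nn

    # Illustrative stand-ins for self.shared_expert and self.shared_expert_gate.
    hidden_dim = 8
    shared_expert = nn.Linear(hidden_dim, hidden_dim)
    shared_expert_gate = nn.Linear(hidden_dim, 1, bias=False)

    hidden_states = torch.randn(4, hidden_dim)          # (num_tokens, hidden_dim)
    shared_expert_output = shared_expert(hidden_states)

    # One gate scalar per token, squashed to (0, 1), scales the shared expert's
    # contribution, mirroring the F.sigmoid(...) * shared_expert_output line above.
    gate = torch.sigmoid(shared_expert_gate(hidden_states))
    print(gate.shape)                                   # torch.Size([4, 1]), the shape the added print inspects
    shared_expert_output = gate * shared_expert_output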
@@ -54,10 +54,10 @@ class KLinearBase(ABC):
 
         self.has_bias = False
         self.dtype = torch.get_default_dtype()
-        # if orig_module is not None:
-        #     self.in_features = orig_module.in_features
-        #     self.out_features = orig_module.out_features
-        # else:
+        if orig_module is not None:
+            self.in_features = orig_module.in_features
+            self.out_features = orig_module.out_features
+        else:
             shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
             if len(shape) == 1:
                 print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
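Un-commenting this logic restores the intended precedence: take the feature dimensions from the injected nn.Linear when one is provided, and only fall back to the GGUF tensor metadata otherwise. A standalone sketch of that fallback; the tensor_info layout is read off the context lines above, and which shape index maps to in versus out features is an assumption here:

    from typing import Optional
    import torch.nn as nn

    def resolve_linear_shape(orig_module: Optional[nn.Linear], tensor_info: dict, key: str):
        # Prefer the original module's dimensions, as the fixed code now does.
        if orig_module is not None:
            return orig_module.in_features, orig_module.out_features
        # Fallback: read the weight shape recorded by the GGUF loader.
        shape = tensor_info[key + ".weight"]["shape"]
        if len(shape) == 1:
            # Matches the warning branch above: a 1-D tensor gives no (in, out) split.
            raise ValueError(f"cannot infer in/out features for {key} from a 1-D GGUF tensor")
        return shape[0], shape[1]  # index order is an assumption

    # With a module present, GGUF metadata is never consulted:
    print(resolve_linear_shape(nn.Linear(16, 32), {}, "blk.0.ffn_up"))  # (16, 32)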
@@ -136,11 +136,18 @@ class KLinearTorch(KLinearBase):
     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
         if device is None: device = self.device
         if w is None: w = self.load_weight(device=device)
+        # else: self.out_features = w.shape[0], self.in_features = w.shape[1]
 
         if isinstance(w, nn.Parameter):
-            self.w = w.to(dtype=self.dtype).T
+            try:
+                self.w = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
+            except:
+                self.w = w.to(dtype=self.dtype).T
             self.has_bias = False
         elif isinstance(w, tuple):
-            self.w = w[0].to(dtype=self.dtype).T
+            try:
+                self.w = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
+            except:
+                self.w = w[0].to(dtype=self.dtype).T
             self.bias = w[1].to(dtype=self.dtype)
             self.has_bias = True
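The new try/except first attempts to view the incoming weight as (out_features, in_features) before transposing, and falls back to a bare transpose when the element count or layout makes that view impossible. A minimal sketch of the two paths with plain tensors:

    import torch

    out_features, in_features = 4, 3
    dtype = torch.float16

    def coerce_weight(w: torch.Tensor) -> torch.Tensor:
        # Mirrors the pattern in the hunk: try the expected 2-D layout first,
        # fall back to a plain transpose if .view cannot produce it.
        try:
            return w.to(dtype=dtype).view(out_features, in_features).T
        except RuntimeError:
            return w.to(dtype=dtype).T

    flat = torch.randn(12)           # 12 == 4 * 3 elements, so the view succeeds
    print(coerce_weight(flat).shape)         # torch.Size([3, 4])

    mismatched = torch.randn(5, 4)   # 20 elements: the view raises, fallback transposes
    print(coerce_weight(mismatched).shape)   # torch.Size([4, 5])

Catching RuntimeError, as the sketch does, is a narrower choice than the hunk's bare except:, which would also swallow unrelated errors.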
@@ -187,7 +194,8 @@ class KLinearMarlin(KLinearBase):
     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
         if device is None: device = self.device
         assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
-        if w is None: w = self.load_weight(device=device)
+        if w is None:
+            w = self.load_weight(device=device)
 
         if isinstance(w, nn.Parameter):
             # pad weight