Date: Wed, 28 Aug 2024 16:11:43 +0000
Subject: [PATCH] [feature] release 0.1.3
---
.github/workflows/package_wheel_release.yml | 34 +-
.gitignore | 3 +-
README.md | 98 +-
doc/assets/Framework_effect.png | Bin 0 -> 111136 bytes
doc/assets/InfLLM_equation.jpg | Bin 0 -> 17725 bytes
doc/assets/InfLLM_framework.png | Bin 0 -> 120894 bytes
doc/assets/KTransformers_long_context_v1.png | Bin 0 -> 134536 bytes
doc/assets/KTransformers_long_context_v2.png | Bin 0 -> 161040 bytes
doc/assets/Quest_framework.png | Bin 0 -> 290850 bytes
doc/assets/SnapKV_framework.png | Bin 0 -> 214311 bytes
doc/assets/SparQ_attention.png | Bin 0 -> 140228 bytes
doc/assets/internlm_memory.png | Bin 0 -> 127690 bytes
doc/assets/long_context_generate.png | Bin 0 -> 185748 bytes
doc/assets/long_context_prefill.png | Bin 0 -> 150325 bytes
doc/assets/needle_128K.png | Bin 0 -> 138555 bytes
doc/assets/needle_1M.png | Bin 0 -> 103188 bytes
doc/en/long_context_tutorial.md | 316 ++
ktransformers/__init__.py | 12 +-
ktransformers/configs/config.yaml | 18 +-
.../ktransformers_ext/CMakeLists.txt | 9 +-
.../bench/bench_attention.py | 178 ++
.../bench/bench_attention_torch.py | 94 +
.../ktransformers_ext/cpu_backend/backend.cpp | 88 +-
.../ktransformers_ext/cpu_backend/backend.h | 19 +-
.../cpu_backend/task_queue.cpp | 2 +-
.../cpu_backend/task_queue.h | 2 +-
.../examples/test_attention.py | 142 +
.../ktransformers_ext/ext_bindings.cpp | 644 ++++-
.../operators/kvcache/kvcache.h | 727 +++++
.../operators/kvcache/kvcache_attn.cpp | 2533 +++++++++++++++++
.../operators/kvcache/kvcache_load_dump.cpp | 123 +
.../operators/kvcache/kvcache_read_write.cpp | 1019 +++++++
.../operators/kvcache/kvcache_utils.cpp | 1157 ++++++++
.../operators/llamafile/linear.cpp | 18 +-
.../operators/llamafile/mlp.cpp | 26 +-
.../operators/llamafile/moe.cpp | 28 +-
ktransformers/local_chat.py | 99 +-
ktransformers/models/configuration_llama.py | 203 ++
ktransformers/models/modeling_llama.py | 1744 ++++++++++++
ktransformers/operators/RoPE.py | 176 +-
ktransformers/operators/attention.py | 173 +-
ktransformers/operators/cpuinfer.py | 746 ++++-
ktransformers/operators/dynamic_attention.py | 775 +++++
ktransformers/operators/experts.py | 4 +-
ktransformers/operators/models.py | 756 ++++-
.../DeepSeek-V2-Chat-multi-gpu-4.yaml | 2 +-
.../DeepSeek-V2-Chat-multi-gpu.yaml | 2 +-
.../optimize_rules/DeepSeek-V2-Chat.yaml | 8 +-
.../DeepSeek-V2-Lite-Chat-multi-gpu.yaml | 2 +-
.../Internlm2_5-7b-Chat-1m.yaml | 28 +
.../Qwen2-57B-A14B-Instruct-multi-gpu.yaml | 2 +-
.../Qwen2-57B-A14B-Instruct.yaml | 9 +-
ktransformers/server/config/config.py | 28 +-
ktransformers/util/cuda_graph_runner.py | 3 +-
ktransformers/util/custom_gguf.py | 3 +-
ktransformers/util/utils.py | 24 +-
pyproject.toml | 3 +-
requirements-local_chat.txt | 3 +-
58 files changed, 11709 insertions(+), 374 deletions(-)
create mode 100644 doc/assets/Framework_effect.png
create mode 100644 doc/assets/InfLLM_equation.jpg
create mode 100644 doc/assets/InfLLM_framework.png
create mode 100644 doc/assets/KTransformers_long_context_v1.png
create mode 100644 doc/assets/KTransformers_long_context_v2.png
create mode 100644 doc/assets/Quest_framework.png
create mode 100644 doc/assets/SnapKV_framework.png
create mode 100644 doc/assets/SparQ_attention.png
create mode 100644 doc/assets/internlm_memory.png
create mode 100644 doc/assets/long_context_generate.png
create mode 100644 doc/assets/long_context_prefill.png
create mode 100644 doc/assets/needle_128K.png
create mode 100644 doc/assets/needle_1M.png
create mode 100644 doc/en/long_context_tutorial.md
create mode 100644 ktransformers/ktransformers_ext/bench/bench_attention.py
create mode 100644 ktransformers/ktransformers_ext/bench/bench_attention_torch.py
create mode 100644 ktransformers/ktransformers_ext/examples/test_attention.py
create mode 100644 ktransformers/ktransformers_ext/operators/kvcache/kvcache.h
create mode 100644 ktransformers/ktransformers_ext/operators/kvcache/kvcache_attn.cpp
create mode 100644 ktransformers/ktransformers_ext/operators/kvcache/kvcache_load_dump.cpp
create mode 100644 ktransformers/ktransformers_ext/operators/kvcache/kvcache_read_write.cpp
create mode 100644 ktransformers/ktransformers_ext/operators/kvcache/kvcache_utils.cpp
mode change 100755 => 100644 ktransformers/local_chat.py
create mode 100644 ktransformers/models/configuration_llama.py
create mode 100644 ktransformers/models/modeling_llama.py
create mode 100644 ktransformers/operators/dynamic_attention.py
create mode 100644 ktransformers/optimize/optimize_rules/Internlm2_5-7b-Chat-1m.yaml
diff --git a/.github/workflows/package_wheel_release.yml b/.github/workflows/package_wheel_release.yml
index 93e5f38..f04ee07 100644
--- a/.github/workflows/package_wheel_release.yml
+++ b/.github/workflows/package_wheel_release.yml
@@ -29,11 +29,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@@ -52,12 +47,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@@ -76,12 +65,6 @@ jobs:
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
@@ -98,10 +81,6 @@ jobs:
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- - { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@@ -114,10 +93,6 @@ jobs:
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- - { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@@ -130,10 +105,6 @@ jobs:
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- - { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- - { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
@@ -219,6 +190,11 @@ jobs:
$env:CUDA_PATH = "$env:CUDA_PATH/Library"
$env:CUDA_HOME = $env:CUDA_PATH
$env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH
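+ # Conda-packaged CUDA on Windows may ship import libraries in lib/ without
+ # an x64 subfolder; create lib/x64 first so the copy below has a destination.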
+ $directory = "$env:CUDA_PATH/lib/x64/"
+ if (-not (Test-Path -Path $directory)) {
+ New-Item -ItemType Directory -Path $directory
+ Write-Output "Directory '$directory' created."
+ }
cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/
$env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE
diff --git a/.gitignore b/.gitignore
index 1bb8666..5d72e80 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,4 +17,5 @@ compile_commands.json
*dist/
ktransformers/server/local_store/
ktransformers/server_test1.db
-*.patch
\ No newline at end of file
+*.patch
+local_chat_djw.py
\ No newline at end of file
diff --git a/README.md b/README.md
index f04a159..a3a6792 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,17 @@
-
🎉 Introduction
KTransformers, pronounced as Quick Transformers, is designed to enhance your 🤗 Transformers experience with advanced kernel optimizations and placement/parallelism strategies.
@@ -22,17 +21,43 @@ interface, RESTful APIs compliant with OpenAI and Ollama, and even a simplified
Our vision for KTransformers is to serve as a flexible platform for experimenting with innovative LLM inference optimizations. Please let us know if you need any other features.
-✨ Updates
+🔥 Updates
+* **Aug 28, 2024**: Support 1M context with the InternLM2.5-7B-Chat-1M model, using 24GB of VRAM and 150GB of DRAM.
+* **Aug 28, 2024**: Reduce DeepSeek-V2's required DRAM from 20GB to 10GB.
* **Aug 15, 2024**: Update detailed [TUTORIAL](doc/en/injection_tutorial.md) for injection and multi-GPU.
-* **Aug 14, 2024**: Support llamfile as linear backend,
+* **Aug 14, 2024**: Support llamafile as linear backend.
* **Aug 12, 2024**: Support multiple GPUs; support new models: Mixtral 8\*7B and 8\*22B; support q2k, q3k, q5k dequant on GPU.
* **Aug 9, 2024**: Support Windows native.
🔥 Show Cases
-GPT-4-level Local VSCode Copilot on a Desktop with only 24GB VRAM
+1M Context Local Inference on a Desktop with Only 24GB VRAM
+https://github.com/user-attachments/assets/a865e5e4-bca3-401e-94b8-af3c080e6c12
+
+* **1M Context InternLM 2.5 7B**: Operates at full bf16 precision, utilizing 24GB VRAM and 150GB DRAM, which is feasible on a local desktop setup. It achieves a 92.88% success rate on the 1M "Needle In a Haystack" test and 100% on the 128K NIAH test.
+
+<!-- figures: needle-in-a-haystack results (doc/assets/needle_128K.png, doc/assets/needle_1M.png) -->
+
+* **Enhanced Speed**: Reaches 16.91 tokens/s for generation with a 1M context using sparse attention, powered by llamafile kernels. This method is over 10 times faster than llama.cpp's full-attention approach.
+
+* **Flexible Sparse Attention Framework**: Provides a block sparse attention framework for CPU-offloaded decoding, compatible with SnapKV, Quest, and InfLLM (a minimal sketch of the idea follows below). Further information is available [here](./doc/en/long_context_tutorial.md).
+
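The gist of this block-sparse decoding can be sketched in a few lines of PyTorch. The sketch below is an illustrative toy, not KTransformers' actual kernels or API: it scores each KV block by the query's dot product with the block's mean key (SnapKV, Quest, and InfLLM each refine this scoring differently) and attends only over the top-scoring blocks.

```python
import torch

def block_sparse_decode(q, K, V, block_size=128, topk_blocks=8):
    """Toy block-sparse attention for one decoding step.

    q: (d,) query of the current token
    K, V: (T, d) cached keys/values, with T a multiple of block_size
    Only the top-k blocks take part in attention, so the bulk of the
    KV cache can stay in CPU DRAM and be fetched on demand.
    """
    T, d = K.shape
    Kb = K.view(T // block_size, block_size, d)
    Vb = V.view(T // block_size, block_size, d)

    # Cheap per-block representative: the mean key of the block.
    reps = Kb.mean(dim=1)                        # (n_blocks, d)
    block_scores = reps @ q                      # (n_blocks,)
    top = block_scores.topk(min(topk_blocks, len(reps))).indices

    # Dense attention restricted to the selected blocks only.
    Ks = Kb[top].reshape(-1, d)
    Vs = Vb[top].reshape(-1, d)
    attn = torch.softmax((Ks @ q) / d**0.5, dim=0)
    return attn @ Vs

# Example: a 4096-token cache reduced to 8 blocks of 128 keys per step.
q = torch.randn(64)
K, V = torch.randn(4096, 64), torch.randn(4096, 64)
print(block_sparse_decode(q, K, V).shape)  # torch.Size([64])
```

With a 1M-token cache and, say, 8 blocks of 128 keys per step, each decoding step touches roughly 0.1% of the cached keys, which is what makes CPU-offloaded decoding at this scale practical.
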
GPT-4-level Local VSCode Copilot on a Desktop with only 24GB VRAM
+
https://github.com/user-attachments/assets/0b9fa2da-66f0-48eb-b4b9-f0e1f06f8927