From 0a540f9abd98915edb99fed47d80078ed8d2f343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 7 Dec 2025 14:02:04 +0100 Subject: [PATCH 01/14] ci : add windows-cuda 13.1 release (#17839) --- .github/actions/windows-setup-cuda/action.yml | 31 +++++++++++++++++++ .github/workflows/release.yml | 6 ++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml index 5575caeca..6ad61582a 100644 --- a/.github/actions/windows-setup-cuda/action.yml +++ b/.github/actions/windows-setup-cuda/action.yml @@ -65,3 +65,34 @@ runs: echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Install Cuda Toolkit 13.1 + if: ${{ inputs.cuda_version == '13.1' }} + shell: pwsh + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU 
Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9e064ae1d..3668e4e2c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -421,7 +421,7 @@ jobs: strategy: matrix: - cuda: ['12.4'] + cuda: ['12.4', '13.1'] steps: - name: Clone @@ -476,6 +476,7 @@ jobs: $dst='.\build\bin\cudart\' robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll + robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll 7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\* - name: Upload Cuda runtime @@ -835,7 +836,8 @@ jobs: **Windows:** - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip) - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip) - - [Windows x64 (CUDA)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) + - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) + - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip) - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip) - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ 
steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip) From 08f9d3cc1d40169082ccf2416ebadb8a5afca9d9 Mon Sep 17 00:00:00 2001 From: lovedheart <6277001+lovedheart@users.noreply.github.com> Date: Sun, 7 Dec 2025 18:40:42 +0100 Subject: [PATCH 02/14] Vulkan: improve mul_mat_vec_iq1_m (#16907) * Optimize Vulkan shader for matrix-vector multiplication * Revert changes on compute_outputs and main Refactor compute_outputs to handle remaining rows correctly. * Fix trailing whitespace --- .../vulkan-shaders/mul_mat_vec_iq1_m.comp | 86 +++++++++++++++---- 1 file changed, 68 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp index 4cb292380..e5cc7ff86 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp @@ -7,35 +7,85 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; -void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) { +void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, + const uint num_blocks_per_row, const uint first_row, const uint num_rows) { + // Compute starting index in matrix B for this superblock const uint y_idx = i * QUANT_K + 32 * ib32; - uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i; + + // Precompute indices for quantization lookup tables + const uint qh_base = 2 * ib32; + const uint qs_base = 4 * ib32; + const uint sc_index = ib32 / 2; + const uint sc_shift = 6 * (ib32 & 1); + + // Loop over rows in the superblock [[unroll]] for (uint n = 0; n < num_rows; ++n) { + // Load per-block scales and shift for quantization const uint16_t[4] scales = data_a[ibi].scales; const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12; const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x); + const uint sc = data_a[ibi].scales[sc_index] >> sc_shift; - const uint sc = data_a[ibi].scales[ib32 / 2] >> (6 * (ib32 & 1)); + // Temporary caches for decoding + FLOAT_TYPE dl_cache[4]; + uint16_t gvf_cache[4]; + float delta_cache[4]; + + // Precompute the multiplier and lookup values for 4 sub-blocks [[unroll]] for (uint l = 0; l < 4; ++l) { - const uint qh = data_a[ibi].qh[2 * ib32 + l / 2] >> (4 * (l&1)); - const uint qs = data_a[ibi].qs[4 * ib32 + l]; - const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA; - const float dl = d * (2 * bitfieldExtract(sc, 3 * int(l / 2), 3) + 1); + dl_cache[l] = FLOAT_TYPE(d * (2 * bitfieldExtract(sc, 3 * int(l / 2), 3) + 1)); + const uint qh = data_a[ibi].qh[qh_base + l / 2] >> (4 * (l & 1)); + const uint qs = data_a[ibi].qs[qs_base + l]; + gvf_cache[l] = iq1s_grid[qs | ((qh & 7) << 8)]; + delta_cache[l] = ((qh & 8) != 0) ? 
-IQ1M_DELTA : IQ1M_DELTA; + } - const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]); + // Loop over columns of the output + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + // Compute base index for matrix B + const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx) / 4; + vec4 b_vals[8]; - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]); - vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]); - - FLOAT_TYPE sum = FLOAT_TYPE(0.0); - [[unroll]] for (int k = 0; k < 4; ++k) { - sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta, - fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum)); - } - temp[j][n] = fma(dl, sum, temp[j][n]); + // Load 8 vec4 values from matrix B + [[unroll]] for (int idx = 0; idx < 8; ++idx) { + b_vals[idx] = vec4(data_b_v4[base_b_idx + idx]); } + + FLOAT_TYPE col_sum = FLOAT_TYPE(0.0); + + // Loop over sub-blocks + [[unroll]] for (uint l = 0; l < 4; ++l) { + const uint16_t grid = gvf_cache[l]; + const float dl = dl_cache[l]; + + // Decode 8 2-bit fbits from gvf_cache + float f0 = float(bitfieldExtract(grid, 0, 2)); + float f1 = float(bitfieldExtract(grid, 2, 2)); + float f2 = float(bitfieldExtract(grid, 4, 2)); + float f3 = float(bitfieldExtract(grid, 6, 2)); + float f4 = float(bitfieldExtract(grid, 8, 2)); + float f5 = float(bitfieldExtract(grid, 10, 2)); + float f6 = float(bitfieldExtract(grid, 12, 2)); + float f7 = float(bitfieldExtract(grid, 14, 2)); + + // Pack into vec4 for vectorized FMA + const vec4 fbits_v0 = vec4(f0, f1, f2, f3); + const vec4 fbits_v1 = vec4(f4, f5, f6, f7); + const vec4 delta_v = vec4(delta_cache[l]); + + // Vectorized fused multiply-add + vec4 sum_v = fma(b_vals[2*l + 0], fbits_v0 + delta_v, vec4(0.0)); + sum_v = fma(b_vals[2*l + 1], fbits_v1 + delta_v, sum_v); + + // Horizontal add to get scalar sum + FLOAT_TYPE sum = sum_v.x + sum_v.y + sum_v.z + sum_v.w; + + // Accumulate to column sum + col_sum = fma(dl, sum, col_sum); + } + // Write result to temporary buffer + temp[j][n] += col_sum; } ibi += num_blocks_per_row; } From 4d3726278bfd3f42ad9a5b5db2ae056a8eef2ee2 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sun, 7 Dec 2025 22:29:54 +0100 Subject: [PATCH 03/14] model: add llama 4 scaling for mistral-large (deepseek arch) (#17744) --- src/llama-model.cpp | 4 ++++ src/models/deepseek2.cpp | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c3675dbdc..7d09d7abd 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1628,6 +1628,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { } ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false); + // (optional) temperature tuning - used by mistral-large + ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); + ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); + switch (hparams.n_layer) { case 27: type = LLM_TYPE_16B; break; case 60: type = LLM_TYPE_236B; break; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 0b41f7ba8..dbaa8297b 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); + // (optional) temperature tuning - used by mistral-large + ggml_tensor * 
inp_attn_scale = nullptr; + if (hparams.f_attn_temp_scale != 0.0f) { + inp_attn_scale = build_inp_attn_scale(); + } + // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); @@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr ggml_tensor * Vcur = kv_cmpr; cb(Vcur, "Vcur", il); + if (inp_attn_scale) { + // apply llama 4 temperature scaling + Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale); + cb(Qcur, "Qcur_attn_temp_scaled", il); + } + // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) cur = build_attn(inp_attn, model.layers[il].wo, NULL, @@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0); cb(Kcur, "Kcur", il); + if (inp_attn_scale) { + // apply llama 4 temperature scaling + Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale); + cb(Qcur, "Qcur_attn_temp_scaled", il); + } + // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) cur = build_attn(inp_attn, model.layers[il].wo, NULL, From 79d61896d35f37b79f432ae935698c5459ba8a41 Mon Sep 17 00:00:00 2001 From: ixgbe <1113177880@qq.com> Date: Mon, 8 Dec 2025 16:41:34 +0800 Subject: [PATCH 04/14] ggml-cpu: add ggml_thread_cpu_relax with Zihintpause support (#17784) * ggml-cpu: add ggml_thread_cpu_relax with Zihintpause support Signed-off-by: Wang Yang * cmake: enable RISC-V zihintpause extension for Spacemit builds * readme : add ZIHINTPAUSE support for RISC-V --------- Signed-off-by: Wang Yang --- .github/workflows/build-linux-cross.yml | 1 + README.md | 2 +- docs/build-riscv64-spacemit.md | 1 + ggml/CMakeLists.txt | 1 + ggml/src/ggml-cpu/CMakeLists.txt | 3 +++ ggml/src/ggml-cpu/ggml-cpu.c | 9 +++++++++ 6 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml index 36201281f..c2c6ea12a 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-linux-cross.yml @@ -291,6 +291,7 @@ jobs: -DGGML_RVV=ON \ -DGGML_RV_ZFH=ON \ -DGGML_RV_ZICBOP=ON \ + -DGGML_RV_ZIHINTPAUSE=ON \ -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \ -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake diff --git a/README.md b/README.md index eac8d66cc..7dd2bfd8a 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ range of hardware - locally and in the cloud. 
- Plain C/C++ implementation without any dependencies - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - AVX, AVX2, AVX512 and AMX support for x86 architectures -- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures +- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA) - Vulkan and SYCL backend support diff --git a/docs/build-riscv64-spacemit.md b/docs/build-riscv64-spacemit.md index eaa653254..79bd4de63 100644 --- a/docs/build-riscv64-spacemit.md +++ b/docs/build-riscv64-spacemit.md @@ -19,6 +19,7 @@ cmake -B build \ -DGGML_RVV=ON \ -DGGML_RV_ZFH=ON \ -DGGML_RV_ZICBOP=ON \ + -DGGML_RV_ZIHINTPAUSE=ON \ -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \ -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \ -DCMAKE_INSTALL_PREFIX=build/installed diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 6b69ad828..ab5b4760e 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -168,6 +168,7 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" ON) option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON) option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON) +option(GGML_RV_ZIHINTPAUSE "ggml: enable riscv zihintpause " ON) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE}) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 7e53a57b7..fc31089f3 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -469,6 +469,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_RV_ZICBOP) string(APPEND MARCH_STR "_zicbop") endif() + if (GGML_RV_ZIHINTPAUSE) + string(APPEND MARCH_STR "_zihintpause") + endif() list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d) else() # Begin with the lowest baseline diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 850755726..b468b115a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -490,6 +490,15 @@ static inline void ggml_thread_cpu_relax(void) { static inline void ggml_thread_cpu_relax(void) { _mm_pause(); } +#elif defined(__riscv) +static inline void ggml_thread_cpu_relax(void) { + #ifdef __riscv_zihintpause + __asm__ __volatile__ ("pause"); + #else + /* Encoding of the pause instruction */ + __asm__ __volatile__ (".4byte 0x100000F"); + #endif +} #else static inline void ggml_thread_cpu_relax(void) {;} #endif From 5814b4dce18f9c5cbebef175e381a7b0ff147d72 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Mon, 8 Dec 2025 10:41:08 +0100 Subject: [PATCH 05/14] cuda: optimize SOLVE_TRI using registers and FMAF (#17703) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ggml-cuda: optimize solve_tri_f32_fast and fix stride handling - Switch from using shared memory for the RHS/solution matrix to a register-based approach (x_low, x_high), reducing shared memory pressure and bank conflicts. - Implement explicit `fmaf` instructions for the reduction loop. - Update kernel arguments to pass strides in bytes rather than elements to align with standard ggml tensor arithmetic (casting to `char *` before addition). - Remove unused `MAX_K_FAST` definition. 
* Small cleanup * Remove comments in solve_tri.cu * Update ggml/src/ggml-cuda/solve_tri.cu Co-authored-by: Johannes Gäßler * Update ggml/src/ggml-cuda/solve_tri.cu Co-authored-by: Johannes Gäßler * Update ggml/src/ggml-cuda/solve_tri.cu Co-authored-by: Johannes Gäßler * Use const for variables in solve_tri.cu * Replace fmaf with more readable code * remove last fmaf --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/solve_tri.cu | 68 +++++++++++++++------------------ 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-cuda/solve_tri.cu b/ggml/src/ggml-cuda/solve_tri.cu index 2e2b39720..e161d4dc4 100644 --- a/ggml/src/ggml-cuda/solve_tri.cu +++ b/ggml/src/ggml-cuda/solve_tri.cu @@ -3,7 +3,6 @@ #include "solve_tri.cuh" #define MAX_N_FAST 64 -#define MAX_K_FAST 32 // ====================== // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction @@ -48,65 +47,58 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, float * X_batch = (float *) (X + i02 * nb2 + i03 * nb3); __shared__ float sA[MAX_N_FAST * MAX_N_FAST]; - __shared__ float sXt[MAX_N_FAST * (MAX_K_FAST + 1)]; const int offset = threadIdx.x + threadIdx.y * blockDim.x; #pragma unroll for (int i = 0; i < n * n; i += k * WARP_SIZE) { - int i0 = i + offset; + const int i0 = i + offset; if (i0 < n * n) { sA[i0] = A_batch[i0]; } } - const int rows_per_warp = (n + WARP_SIZE - 1) / WARP_SIZE; - -#pragma unroll - for (int i = 0; i < rows_per_warp; i++) { - const int i0 = lane + i * WARP_SIZE; - if (i0 < n) { - sXt[col_idx * n + i0] = B_batch[i0 * k + col_idx]; - } - } - __syncthreads(); + float x_low = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f; + float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f; + + const int half = WARP_SIZE; + const int nrows_low = (n < half) ? n : half; + #pragma unroll - for (int row = 0; row < n; ++row) { + for (int row = 0; row < nrows_low; ++row) { float sum = 0.0f; - - { - int j = lane; - if (j < row) { - sum += sA[row * n + j] * sXt[col_idx * n + j]; - } + if (lane < row) { + sum += sA[row * n + lane] * x_low; } - if (row >= WARP_SIZE) { - int j = WARP_SIZE + lane; - if (j < row) { - sum += sA[row * n + j] * sXt[col_idx * n + j]; - } - } - sum = warp_reduce_sum(sum); - if (lane == 0) { - const float b_val = sXt[col_idx * n + row]; - const float a_diag = sA[row * n + row]; - // no safeguards for division by zero because that indicates corrupt - // data anyway - sXt[col_idx * n + row] = (b_val - sum) / a_diag; + if (lane == row) { + x_low = (x_low - sum) / sA[row * n + row]; } } - __syncthreads(); +#pragma unroll + for (int row = half; row < n; ++row) { + float sum = sA[row * n + lane] * x_low; + const int j = half + lane; + if (j < row) { + sum += sA[row * n + j] * x_high; + } + sum = warp_reduce_sum(sum); + + if (lane == row - half) { + x_high = (x_high - sum) / sA[row * n + row]; + } + } #pragma unroll - for (int i = 0; i < rows_per_warp; i++) { - const int i0 = lane + i * WARP_SIZE; - if (i0 < n) { - X_batch[i0 * k + col_idx] = sXt[col_idx * n + i0]; + for (int rr = 0; rr < 2; ++rr) { + const int row = rr * WARP_SIZE + lane; + if (row < n) { + const float val = (row < half) ? 
x_low : x_high; + X_batch[row * k + col_idx] = val; } } } From 2bc96931d2583cad9e85b9a967b78f42ef76b8a7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 Dec 2025 12:43:12 +0200 Subject: [PATCH 06/14] server : make cache_reuse configurable per request (#17858) --- tools/server/README.md | 2 ++ tools/server/server-context.cpp | 17 +++++++++++++---- tools/server/server-task.cpp | 12 +++++++----- tools/server/server-task.h | 15 +++++++++------ 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index bf274db79..9deb241b0 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -495,6 +495,8 @@ By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to re `n_cmpl`: Number of completions to generate from the current prompt. If input has multiple prompts, the output will have N prompts times `n_cmpl` entries. +`n_cache_reuse`: Min chunk size to attempt reusing from the cache via KV shifting. For more info, see `--cache-reuse` arg. Default: `0`, which is disabled. + `stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`. `stop`: Specify a JSON array of stopping strings. diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 12a4e94e5..d0039631d 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1880,8 +1880,18 @@ struct server_context_impl { n_past = std::min(n_past, slot.alora_invocation_start - 1); } + const auto n_cache_reuse = slot.task->params.n_cache_reuse; + + const bool can_cache_reuse = + llama_memory_can_shift(llama_get_memory(ctx)) && + !slot.prompt.tokens.has_mtmd; + + if (!can_cache_reuse && n_cache_reuse > 0) { + SLT_WRN(slot, "cache reuse is not supported - ignoring n_cache_reuse = %d\n", n_cache_reuse); + } + // reuse chunks from the cached prompt by shifting their KV cache in the new position - if (params_base.n_cache_reuse > 0) { + if (can_cache_reuse && n_cache_reuse > 0) { GGML_ASSERT(!slot.prompt.tokens.has_mtmd); size_t head_c = n_past; // cache @@ -1892,7 +1902,7 @@ struct server_context_impl { GGML_ABORT("not supported by multimodal"); } - SLT_DBG(slot, "trying to reuse chunks with size > %d, n_past = %d\n", params_base.n_cache_reuse, n_past); + SLT_DBG(slot, "trying to reuse chunks with size > %d, n_past = %d\n", n_cache_reuse, n_past); while (head_c < slot.prompt.tokens.size() && head_p < input_tokens.size()) { @@ -1901,11 +1911,10 @@ struct server_context_impl { while (head_c + n_match < slot.prompt.tokens.size() && head_p + n_match < input_tokens.size() && slot.prompt.tokens[head_c + n_match] == input_tokens[head_p + n_match]) { - n_match++; } - if (n_match >= (size_t) params_base.n_cache_reuse) { + if (n_match >= (size_t) n_cache_reuse) { SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); //for (size_t i = head_p; i < head_p + n_match; i++) { // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index c401f47a7..360826062 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -155,11 +155,12 @@ task_params server_task::params_from_json_cmpl( // Sampling parameter defaults are loaded from the global server context (but 
individual requests can still override them)
 task_params defaults;
- defaults.sampling = params_base.sampling;
- defaults.speculative = params_base.speculative;
- defaults.n_keep = params_base.n_keep;
- defaults.n_predict = params_base.n_predict;
- defaults.antiprompt = params_base.antiprompt;
+ defaults.sampling = params_base.sampling;
+ defaults.speculative = params_base.speculative;
+ defaults.n_keep = params_base.n_keep;
+ defaults.n_predict = params_base.n_predict;
+ defaults.n_cache_reuse = params_base.n_cache_reuse;
+ defaults.antiprompt = params_base.antiprompt;
 // enabling this will output extra debug information in the HTTP responses from the server
 params.verbose = params_base.verbosity > 9;
@@ -176,6 +177,7 @@
 params.n_keep = json_value(data, "n_keep", defaults.n_keep);
 params.n_discard = json_value(data, "n_discard", defaults.n_discard);
 params.n_cmpl = json_value(data, "n_cmpl", json_value(data, "n", 1));
+ params.n_cache_reuse = json_value(data, "n_cache_reuse", defaults.n_cache_reuse);
 //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
 params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
 params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 4e4840fc8..da4e22a7c 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -55,6 +55,8 @@ struct task_params {
 int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
 int32_t n_cmpl = 1; // number of completions to generate from this prompt
+ int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled)
+
 int64_t t_max_prompt_ms = -1; // TODO: implement
 int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
@@ -62,18 +64,19 @@ struct task_params {
 std::vector<std::string> antiprompt;
 std::vector<std::string> response_fields;
- bool timings_per_token = false;
+
+ bool timings_per_token = false;
 bool post_sampling_probs = false;
 struct common_params_sampling sampling;
 struct common_params_speculative speculative;
 // response formatting
- bool verbose = false;
- task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
- std::string oaicompat_model;
- std::string oaicompat_cmpl_id;
- common_chat_syntax oaicompat_chat_syntax;
+ bool verbose = false;
+ task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
+ std::string oaicompat_model;
+ std::string oaicompat_cmpl_id;
+ common_chat_syntax oaicompat_chat_syntax;
 // Embeddings
 int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
From 37a4f632442cb0e6d046b714d36cecb619833f4e Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Mon, 8 Dec 2025 13:54:58 +0100
Subject: [PATCH 07/14] server : add development documentation (#17760)

* first draft
* rewrite
* update & remove duplicated sections
---
 tools/server/README-dev.md | 151 +++++++++++++++++++++++++++++++++++++
 tools/server/README.md | 145 +++++------------------------------
 2 files changed, 170 insertions(+), 126 deletions(-)
 create mode 100644 tools/server/README-dev.md

diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md
new file mode 100644
index 000000000..67ebe1aaf
--- /dev/null
+++ b/tools/server/README-dev.md
@@ -0,0 +1,151 @@
+# llama-server Development Documentation
+
+This document provides an in-depth technical overview of `llama-server`, intended for maintainers and contributors.
+
+If you are an end user consuming `llama-server` as a product, please refer to the main [README](./README.md) instead.
+
+## Backend
+
+### Overview
+
+The server supports two primary operating modes:
+
+- **Inference mode**: The default mode for performing inference with a single loaded GGUF model.
+- **Router mode**: Enables management of multiple inference server instances behind a single API endpoint. Requests are automatically routed to the appropriate backend instance based on the requested model.
+
+The core architecture consists of the following components:
+
+- `server_context`: Holds the primary inference state, including the main `llama_context` and all active slots.
+- `server_slot`: An abstraction over a single “sequence” in llama.cpp, responsible for managing individual parallel inference requests.
+- `server_routes`: Middleware layer between `server_context` and the HTTP interface; handles JSON parsing/formatting and request routing logic.
+- `server_http_context`: Implements the HTTP server using `cpp-httplib`.
+- `server_queue`: Thread-safe queue used by HTTP workers to submit new tasks to `server_context`.
+- `server_response`: Thread-safe queue used by `server_context` to return results to HTTP workers.
+- `server_response_reader`: Higher-level wrapper around the two queues above for cleaner code.
+- `server_task`: Unit of work pushed into `server_queue`.
+- `server_task_result`: Unit of result pushed into `server_response`.
+- `server_tokens`: Unified representation of token sequences (supports both text and multimodal tokens); used by `server_task` and `server_slot`.
+- `server_prompt_checkpoint`: For recurrent (e.g., RWKV) and SWA models, stores snapshots of KV cache state. Enables reuse when subsequent requests share the same prompt prefix, saving redundant computation.
+- `server_models`: Standalone component for managing multiple backend instances (used in router mode). It is completely independent of `server_context`.
+
+```mermaid
+graph TD
+    API_User <--> server_http_context
+    server_http_context <-- router mode --> server_models
+    server_http_context <-- inference mode --> server_routes
+    server_routes -- server_task --> server_queue
+    subgraph server_context
+        server_queue --> server_slot
+        server_slot -- server_task_result --> server_response
+        server_slot[multiple server_slot]
+    end
+    server_response --> server_routes
+```
+
+TODO: mention how batching is handled by `server_slot`
+
+### Thread Management
+
+`server_context` runs on a dedicated single thread. Because it is single-threaded, heavy post-processing (especially after token generation) should be avoided, as it directly impacts multi-sequence throughput.
+
+Each incoming HTTP request is handled by its own thread managed by the HTTP library. The following operations are performed in HTTP worker threads:
+
+- JSON request parsing
+- Chat template application
+- Tokenization
+- Conversion of `server_task_result` into final JSON response
+- Error formatting into JSON
+- Tracking of partial/incremental responses (e.g., streaming tool calls or reasoning steps)
+
+**Best practices to follow:**
+
+- All JSON formatting and chat template logic must stay in the HTTP layer.
+- Avoid passing raw JSON between the HTTP layer and `server_slot`. Instead, parse everything into native C++ types as early as possible.
+
+### Testing
+
+`llama-server` includes an automated test suite based on `pytest`.
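+
+For intuition, the sketch below is a stripped-down version of what each test boils down to - start a server, send a request, check the response. It is illustrative only: the model path, port and inline `subprocess`/`requests` usage are assumptions, not the suite's actual fixture API:
+
+```python
+# Hypothetical standalone smoke test (the real suite uses shared fixtures instead).
+import subprocess
+import time
+
+import requests
+
+def test_completion_smoke():
+    # assumes a llama-server binary and a small GGUF model in the working directory
+    proc = subprocess.Popen(["./llama-server", "-m", "model.gguf", "--port", "8081"])
+    try:
+        time.sleep(5)  # crude startup wait; the real framework polls /health
+        res = requests.post("http://127.0.0.1:8081/completion",
+                            json={"prompt": "Hello", "n_predict": 4})
+        assert res.status_code == 200
+        assert "content" in res.json()
+    finally:
+        proc.terminate()
+```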
+
+The framework automatically starts a `llama-server` instance, sends requests, and validates responses.
+
+For detailed instructions, see the [test documentation](./tests/README.md).
+
+### Notable Related PRs
+
+- Initial server implementation: https://github.com/ggml-org/llama.cpp/pull/1443
+- Parallel decoding support: https://github.com/ggml-org/llama.cpp/pull/3228
+- Refactor introducing `server_queue` and `server_response`: https://github.com/ggml-org/llama.cpp/pull/5065
+- Reranking endpoint: https://github.com/ggml-org/llama.cpp/pull/9510
+- Multimodal model support (`libmtmd`): https://github.com/ggml-org/llama.cpp/pull/12898
+- Unified KV cache handling: https://github.com/ggml-org/llama.cpp/pull/16736
+- Separation of HTTP logic into dedicated files: https://github.com/ggml-org/llama.cpp/pull/17216
+- Large-scale code base split into smaller files: https://github.com/ggml-org/llama.cpp/pull/17362
+- Introduction of router mode: https://github.com/ggml-org/llama.cpp/pull/17470
+
+
+
+## Web UI
+
+The project includes a web-based user interface for interacting with `llama-server`. It supports both single-model (`MODEL` mode) and multi-model (`ROUTER` mode) operation.
+
+The SvelteKit-based Web UI is introduced in this PR: https://github.com/ggml-org/llama.cpp/pull/14839
+
+### Features
+
+- **Chat interface** with streaming responses
+- **Multi-model support** (ROUTER mode) - switch between models, auto-load on selection
+- **Modality validation** - ensures selected model supports conversation's attachments (images, audio)
+- **Conversation management** - branching, regeneration, editing with history preservation
+- **Attachment support** - images, audio, PDFs (with vision/text fallback)
+- **Configurable parameters** - temperature, top_p, etc. synced with server defaults
+- **Dark/light theme**
+
+### Tech Stack
+
+- **SvelteKit** - frontend framework with Svelte 5 runes for reactive state
+- **TailwindCSS** + **shadcn-svelte** - styling and UI components
+- **Vite** - build tooling
+- **IndexedDB** (Dexie) - local storage for conversations
+- **LocalStorage** - user settings persistence
+
+### Architecture
+
+The WebUI follows a layered architecture:
+
+```
+Routes → Components → Hooks → Stores → Services → Storage/API
+```
+
+- **Stores** - reactive state management (`chatStore`, `conversationsStore`, `modelsStore`, `serverStore`, `settingsStore`)
+- **Services** - stateless API/database communication (`ChatService`, `ModelsService`, `PropsService`, `DatabaseService`)
+- **Hooks** - reusable logic (`useModelChangeValidation`, `useProcessingState`)
+
+For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/):
+
+- `high-level-architecture.mmd` - full architecture with all modules
+- `high-level-architecture-simplified.mmd` - simplified overview
+- `data-flow-simplified-model-mode.mmd` - data flow for single-model mode
+- `data-flow-simplified-router-mode.mmd` - data flow for multi-model mode
+- `flows/*.mmd` - detailed per-domain flows (chat, conversations, models, etc.)
+
+### Development
+
+```sh
+# make sure you have Node.js installed
+cd tools/server/webui
+npm i
+
+# run dev server (with hot reload)
+npm run dev
+
+# run tests
+npm run test
+
+# build production bundle
+npm run build
+```
+
+After `public/index.html.gz` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.
+
+**Note:** The Vite dev server automatically proxies API requests to `http://localhost:8080`. Make sure `llama-server` is running on that port during development.
diff --git a/tools/server/README.md b/tools/server/README.md
index 9deb241b0..f98fb44c7 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -2,7 +2,7 @@
 Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.
-Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
+Set of LLM REST APIs and a web UI to interact with llama.cpp.
 **Features:**
 * LLM inference of F16 and quantized models on GPU and CPU
@@ -19,7 +19,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 * Speculative decoding
 * Easy-to-use web UI
-The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216).
+For the full list of features, please refer to the [server's changelog](https://github.com/ggml-org/llama.cpp/issues/9291).
 ## Usage
@@ -289,69 +289,6 @@ For more details, please refer to [multimodal documentation](../../docs/multimod
 cmake --build build --config Release -t llama-server
 ```
-## Web UI
-
-The project includes a web-based user interface for interacting with `llama-server`. It supports both single-model (`MODEL` mode) and multi-model (`ROUTER` mode) operation.
-
-### Features
-
-- **Chat interface** with streaming responses
-- **Multi-model support** (ROUTER mode) - switch between models, auto-load on selection
-- **Modality validation** - ensures selected model supports conversation's attachments (images, audio)
-- **Conversation management** - branching, regeneration, editing with history preservation
-- **Attachment support** - images, audio, PDFs (with vision/text fallback)
-- **Configurable parameters** - temperature, top_p, etc. synced with server defaults
-- **Dark/light theme**
-
-### Tech Stack
-
-- **SvelteKit** - frontend framework with Svelte 5 runes for reactive state
-- **TailwindCSS** + **shadcn-svelte** - styling and UI components
-- **Vite** - build tooling
-- **IndexedDB** (Dexie) - local storage for conversations
-- **LocalStorage** - user settings persistence
-
-### Architecture
-
-The WebUI follows a layered architecture:
-
-```
-Routes → Components → Hooks → Stores → Services → Storage/API
-```
-
-- **Stores** - reactive state management (`chatStore`, `conversationsStore`, `modelsStore`, `serverStore`, `settingsStore`)
-- **Services** - stateless API/database communication (`ChatService`, `ModelsService`, `PropsService`, `DatabaseService`)
-- **Hooks** - reusable logic (`useModelChangeValidation`, `useProcessingState`)
-
-For detailed architecture diagrams, see [`tools/server/webui/docs/`](webui/docs/):
-
-- `high-level-architecture.mmd` - full architecture with all modules
-- `high-level-architecture-simplified.mmd` - simplified overview
-- `data-flow-simplified-model-mode.mmd` - data flow for single-model mode
-- `data-flow-simplified-router-mode.mmd` - data flow for multi-model mode
-- `flows/*.mmd` - detailed per-domain flows (chat, conversations, models, etc.)
-
-### Development
-
-```sh
-# make sure you have Node.js installed
-cd tools/server/webui
-npm i
-
-# run dev server (with hot reload)
-npm run dev
-
-# run tests
-npm run test
-
-# build production bundle
-npm run build
-```
-
-After `public/index.html.gz` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.
- -**Note:** The Vite dev server automatically proxies API requests to `http://localhost:8080`. Make sure `llama-server` is running on that port during development. - ## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: @@ -380,7 +317,7 @@ docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:se docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggml-org/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99 ``` -## Testing with CURL +## Using with CURL Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS. @@ -391,46 +328,6 @@ curl --request POST \ --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' ``` -## Advanced testing - -We implemented a [server test framework](./tests/README.md) using human-readable scenario. - -*Before submitting an issue, please try to reproduce it with this format.* - -## Node JS Test - -You need to have [Node.js](https://nodejs.org/en) installed. - -```bash -mkdir llama-client -cd llama-client -``` - -Create an index.js file and put this inside: - -```javascript -const prompt = "Building a website can be done in 10 simple steps:" - -async function test() { - let response = await fetch("http://127.0.0.1:8080/completion", { - method: "POST", - body: JSON.stringify({ - prompt, - n_predict: 64, - }) - }) - console.log((await response.json()).content) -} - -test() -``` - -And run it: - -```bash -node index.js -``` - ## API Endpoints ### GET `/health`: Returns health check result @@ -1638,6 +1535,22 @@ Response: } ``` +## API errors + +`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi + +Example of an error: + +```json +{ + "error": { + "code": 401, + "message": "Invalid API Key", + "type": "authentication_error" + } +} +``` + ## More examples ### Interactive mode @@ -1657,26 +1570,6 @@ Run with bash: bash chat.sh ``` -### OAI-like API - -The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi - -### API errors - -`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi - -Example of an error: - -```json -{ - "error": { - "code": 401, - "message": "Invalid API Key", - "type": "authentication_error" - } -} -``` - Apart from error types supported by OAI, we also have custom types that are specific to functionalities of llama.cpp: **When /metrics or /slots endpoint is disabled** From 51e0c2d917c21826585e84be1c27f75147325de0 Mon Sep 17 00:00:00 2001 From: Jay Zenith <162098309+JayZenith@users.noreply.github.com> Date: Mon, 8 Dec 2025 05:10:12 -0800 Subject: [PATCH 08/14] cuda : add FILL op support (#17851) * cuda : add FILL op support * cuda : add missing FILL op files --- ggml/src/ggml-cuda/fill.cu | 37 +++++++++++++++++++++++++++++++++ ggml/src/ggml-cuda/fill.cuh | 3 +++ ggml/src/ggml-cuda/ggml-cuda.cu | 5 +++++ 3 files changed, 45 insertions(+) create mode 100644 ggml/src/ggml-cuda/fill.cu create mode 100644 ggml/src/ggml-cuda/fill.cuh diff --git a/ggml/src/ggml-cuda/fill.cu b/ggml/src/ggml-cuda/fill.cu new file mode 100644 index 000000000..eb8ccb780 --- /dev/null +++ b/ggml/src/ggml-cuda/fill.cu @@ -0,0 +1,37 @@ +#include "fill.cuh" +#include "convert.cuh" + +#define CUDA_FILL_BLOCK_SIZE 256 + +template +static __global__ void fill_kernel(T * __restrict__ dst, const int64_t k, const T value) { + const 
int64_t i = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
+ if (i >= k) {
+ return;
+ }
+ dst[i] = value;
+}
+
+void ggml_cuda_op_fill(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ void * dst_d = dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(ggml_is_contiguous(dst));
+
+ float value;
+ memcpy(&value, dst->op_params, sizeof(float));
+
+ const int64_t k = ggml_nelements(dst);
+ const int64_t num_blocks = (k + CUDA_FILL_BLOCK_SIZE - 1) / CUDA_FILL_BLOCK_SIZE;
+
+ switch (dst->type) {
+ case GGML_TYPE_F32:
+ fill_kernel<<<num_blocks, CUDA_FILL_BLOCK_SIZE, 0, stream>>>((float *)dst_d, k, value);
+ break;
+ case GGML_TYPE_F16:
+ fill_kernel<<<num_blocks, CUDA_FILL_BLOCK_SIZE, 0, stream>>>((half *)dst_d, k, ggml_cuda_cast<half>(value));
+ break;
+ default:
+ GGML_ABORT("unsupported type");
+ }
+}
diff --git a/ggml/src/ggml-cuda/fill.cuh b/ggml/src/ggml-cuda/fill.cuh
new file mode 100644
index 000000000..8443c8362
--- /dev/null
+++ b/ggml/src/ggml-cuda/fill.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_fill(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 235d94d50..d0463388c 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -56,6 +56,7 @@
 #include "ggml-cuda/solve_tri.cuh"
 #include "ggml-cuda/tri.cuh"
 #include "ggml-cuda/cumsum.cuh"
+#include "ggml-cuda/fill.cuh"
 #include "ggml.h"
 #include
@@ -2730,6 +2731,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 case GGML_OP_SOLVE_TRI:
 ggml_cuda_op_solve_tri(ctx, dst);
 break;
+ case GGML_OP_FILL:
+ ggml_cuda_op_fill(ctx, dst);
+ break;
 default:
 return false;
 }
@@ -4617,6 +4621,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
 case GGML_OP_OPT_STEP_ADAMW:
 case GGML_OP_OPT_STEP_SGD:
+ case GGML_OP_FILL:
 case GGML_OP_CUMSUM:
 case GGML_OP_TRI:
 return true;
From 636fc17a376dacc01da20d508e6986a299b1f819 Mon Sep 17 00:00:00 2001
From: hksdpc255 <43977088+hksdpc255@users.noreply.github.com>
Date: Tue, 9 Dec 2025 00:32:04 +1100
Subject: [PATCH 09/14] Fix Kimi-K2 tool-call parsing issues (#17376)

* Fix kimi-k2 parsing
* fix template & add more tests for kimi-k2
* Another fix for Kimi-K2 chat template.
* enable allow_toolcall_in_think for Kimi-K2
* Refine key-value separator and value end format
* Enable tool call in think for kimi-k2
* allow_toolcall_in_think is now tested with Kimi-K2
* Remove outdated TODO comment in XML tool call parser

Removed TODO comment about untested tool call feature.
* Rename function from "utf8_truncate_safe" to "utf8_truncate_safe_len" --- common/chat-parser-xml-toolcall.cpp | 54 +++++--- common/chat-parser-xml-toolcall.h | 2 +- common/chat-parser.cpp | 5 +- models/templates/Kimi-K2-Instruct.jinja | 10 +- models/templates/Kimi-K2-Thinking.jinja | 10 +- tests/test-chat.cpp | 163 +++++++++++++++++++++--- 6 files changed, 194 insertions(+), 50 deletions(-) diff --git a/common/chat-parser-xml-toolcall.cpp b/common/chat-parser-xml-toolcall.cpp index 734989555..a80900ff8 100644 --- a/common/chat-parser-xml-toolcall.cpp +++ b/common/chat-parser-xml-toolcall.cpp @@ -724,16 +724,10 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons if (reasoning_unclosed) { if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) { unclosed_reasoning_content += content; - if (form.allow_toolcall_in_think) { - builder.move_to(tc->groups[0].begin); - if (!builder.try_consume_xml_tool_calls(form)) { - unclosed_reasoning_content += tool_call_start; - builder.move_to(tc->groups[0].end); - } - } else { + if (!(form.allow_toolcall_in_think && tc)) { unclosed_reasoning_content += tool_call_start; + continue; } - continue; } else { reasoning_unclosed = false; std::string reasoning_content; @@ -781,8 +775,12 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons } } else { // This start is in thinking block, skip this tool call - auto pos = think_start + start_think.size(); - unclosed_reasoning_content = content.substr(pos) + tool_call_start; + // This start is in thinking block + if (form.allow_toolcall_in_think) { + unclosed_reasoning_content = content.substr(think_start + start_think.size()); + } else { + unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start; + } reasoning_unclosed = true; content.resize(think_start); toolcall_in_think = true; @@ -805,14 +803,35 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons } // remove potential partial suffix - if (content.size() > 0 && builder.pos() == builder.input().size() && unclosed_reasoning_content.empty()) { - rstrip(content); - trim_potential_partial_word(content); - rstrip(content); + if (builder.pos() == builder.input().size()) { + if (unclosed_reasoning_content.empty()) { + rstrip(content); + trim_potential_partial_word(content); + rstrip(content); + } else { + rstrip(unclosed_reasoning_content); + trim_potential_partial_word(unclosed_reasoning_content); + rstrip(unclosed_reasoning_content); + } + } + + // consume unclosed_reasoning_content if allow_toolcall_in_think is set + if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) { + if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) { + builder.add_reasoning_content(unclosed_reasoning_content); + } else { + if (content.empty()) { + content = start_think + unclosed_reasoning_content; + } else { + content += "\n\n" + start_think; + content += unclosed_reasoning_content; + } + } + unclosed_reasoning_content.clear(); } // Add content - if (content.size() != 0) { + if (!content.empty()) { // If there are multiple content blocks if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) { builder.add_content("\n\n"); @@ -820,7 +839,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons 
builder.add_content(content);
 }
- // This start is in thinking block, skip this tool call
+ // This start is in thinking block and toolcall_in_think not set, skip this tool call
 if (toolcall_in_think && !form.allow_toolcall_in_think) {
 continue;
 }
@@ -829,7 +848,7 @@
 if (!tc) {
 GGML_ASSERT(builder.pos() == builder.input().size());
 GGML_ASSERT(unclosed_reasoning_content.empty());
- GGML_ASSERT(!reasoning_unclosed);
+ if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
 break;
 }
@@ -854,7 +873,6 @@
 /**
 * Parse content uses reasoning and XML-Style tool call
- * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
 */
 void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
 parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
diff --git a/common/chat-parser-xml-toolcall.h b/common/chat-parser-xml-toolcall.h
index 67face2b9..b309fb667 100644
--- a/common/chat-parser-xml-toolcall.h
+++ b/common/chat-parser-xml-toolcall.h
@@ -31,7 +31,7 @@ struct xml_tool_call_format {
 std::optional<std::string> last_val_end = std::nullopt;
 std::optional<std::string> last_tool_end = std::nullopt;
 bool trim_raw_argval = false;
- bool allow_toolcall_in_think = false; // TODO: UNTESTED!!!
+ bool allow_toolcall_in_think = false;
 };
 // make a GBNF that accept any strings except those containing any of the forbidden strings.
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index fe3e80037..d740dac06 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -917,12 +917,13 @@ static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
 form.tool_start = "<|tool_call_begin|>";
 form.tool_sep = "<|tool_call_argument_begin|>{";
 form.key_start = "\"";
- form.key_val_sep = "\": ";
- form.val_end = ", ";
+ form.key_val_sep = "\":";
+ form.val_end = ",";
 form.tool_end = "}<|tool_call_end|>";
 form.scope_end = "<|tool_calls_section_end|>";
 form.raw_argval = false;
 form.last_val_end = "";
+ form.allow_toolcall_in_think = true;
 return form;
 })();
 builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
diff --git a/models/templates/Kimi-K2-Instruct.jinja b/models/templates/Kimi-K2-Instruct.jinja
index a9439135b..6204fb396 100644
--- a/models/templates/Kimi-K2-Instruct.jinja
+++ b/models/templates/Kimi-K2-Instruct.jinja
@@ -14,7 +14,7 @@
 {%- endmacro %}
 {%- set tool_response_queue = namespace(ids=[]) -%}
-{%- set tool_call_counter = namespace(value=1) -%}
+{%- set tool_call_counter = namespace(value=0) -%}
 {%- if tools -%}
<|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>
@@ -36,12 +36,8 @@
 {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
 {{render_content(message)}}<|tool_calls_section_begin|>
 {%- for tool_call in message['tool_calls'] -%}
- {%- if tool_call['id'] is defined -%}
- {%- set formatted_id = tool_call['id'] -%}
- {%- else -%}
- {%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
- {%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
- {%- endif -%}
+ {%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
+ {%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
 {%- set _ = tool_response_queue.ids.append(formatted_id) -%}
<|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
 {%- endfor -%}
diff --git a/models/templates/Kimi-K2-Thinking.jinja b/models/templates/Kimi-K2-Thinking.jinja
index 4c2af6a78..5641429f5 100644
--- a/models/templates/Kimi-K2-Thinking.jinja
+++ b/models/templates/Kimi-K2-Thinking.jinja
@@ -25,17 +25,13 @@
 {%- endmacro -%}
 {%- set tool_response_queue = namespace(ids=[]) -%}
-{%- set tool_call_counter = namespace(value=1) -%}
+{%- set tool_call_counter = namespace(value=0) -%}
 {%- macro render_toolcalls(message) -%}
<|tool_calls_section_begin|>
 {%- for tool_call in message['tool_calls'] -%}
- {%- if tool_call['id'] is defined -%}
- {%- set formatted_id = tool_call['id'] -%}
- {%- else -%}
- {%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
- {%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
- {%- endif -%}
+ {%- set formatted_id = 'functions.' + tool_call['function']['name'] + ':' + (tool_call_counter.value | string) -%}
+ {%- set tool_call_counter.value = tool_call_counter.value + 1 -%}
 {%- set _ = tool_response_queue.ids.append(formatted_id) -%}
<|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
 {%- endfor -%}
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index 62dd1583f..f765bda62 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -428,10 +428,38 @@
 */
template <class T>
static void test_parser_with_streaming(const common_chat_msg & expected, const std::string & raw_message, T parse_msg) {
+ constexpr auto utf8_truncate_safe_len = [](const std::string_view s) -> size_t {
+ auto len = s.size();
+ if (len == 0) return 0;
+ auto i = len;
+ for (size_t back = 0; back < 4 && i > 0; ++back) {
+ --i;
+ unsigned char c = s[i];
+ if ((c & 0x80) == 0) {
+ return len;
+ } else if ((c & 0xC0) == 0xC0) {
+ size_t expected_len = 0;
+ if ((c & 0xE0) == 0xC0) expected_len = 2;
+ else if ((c & 0xF0) == 0xE0) expected_len = 3;
+ else if ((c & 0xF8) == 0xF0) expected_len = 4;
+ else return i;
+ if (len - i >= expected_len) {
+ return len;
+ } else {
+ return i;
+ }
+ }
+ }
+ return len - std::min(len, size_t(3));
+ };
+ constexpr auto utf8_truncate_safe_view = [utf8_truncate_safe_len](const std::string_view s) {
+ return s.substr(0, utf8_truncate_safe_len(s));
+ };
+
 auto merged = simple_assist_msg("");
 auto last_msg = parse_msg("");
 for (size_t i = 1; i <= raw_message.size(); ++i) {
- auto curr_msg = parse_msg(raw_message.substr(0, i));
+ auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i))));
 if (curr_msg == simple_assist_msg("")) continue;
 LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat<json>({curr_msg}).dump().c_str());
 for (auto diff: common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) {
@@ -2659,14 +2687,14 @@ Hey there!<|im_end|>
 // Test parsing tool calls
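+// For reference, the Kimi-K2 wire format exercised by the tests below (as visible in
+// the raw strings) is:
+//   <|tool_calls_section_begin|>
+//     <|tool_call_begin|>functions.{name}:{index}<|tool_call_argument_begin|>{json args}<|tool_call_end|>
+//   <|tool_calls_section_end|>
+// The expected ids change from functions.special_function:1 to functions.special_function:0
+// because the updated chat templates start tool_call_counter at 0 and no longer honor
+// caller-supplied tool_call ids.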
assert_msg_equals(message_assist_call, common_chat_parse( - "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_KIMI_K2})); // Test parsing tool calls with thinking assert_msg_equals(message_assist_call_thoughts, common_chat_parse( - "I'm\nthinking<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + "I'm\nthinking<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", /* is_partial= */ false, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, @@ -2676,7 +2704,7 @@ Hey there!<|im_end|> // Test tool calls with extra content assert_msg_equals(message_assist_call_content, common_chat_parse( - "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?", + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_KIMI_K2} )); @@ -2684,7 +2712,7 @@ Hey there!<|im_end|> // Test tool calls with extra content AND thinking assert_msg_equals(message_assist_call_thoughts_content, common_chat_parse( - "I'm\nthinking<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?", + "I'm\nthinking<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?", /* is_partial= */ false, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, @@ -2693,47 +2721,152 @@ Hey there!<|im_end|> // Test streaming test_parser_with_streaming(message_assist_call_thoughts_content, - "I'm\nthinking\nHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + "I'm\nthinking\nHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(message_assist_call_thoughts_unparsed, - "I'm\nthinking\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + "I'm\nthinking\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ 
COMMON_REASONING_FORMAT_NONE }); }); test_parser_with_streaming(message_assist_call_thoughts_content, - "I'm\nthinking\n\n\nHello, world!\nWhat's up?\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>\n", + "I'm\nthinking\n\n\nHello, world!\nWhat's up?\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>\n", [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(message_assist_call_withopt, - "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:1<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>", + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>", [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE }); }); test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": \"123456\"}"), - "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": \"123456\"}<|tool_call_end|><|tool_calls_section_end|>", + "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": \"123456\"}<|tool_call_end|><|tool_calls_section_end|>", [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": [1, 2, \"345\", 6]}"), - "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": [1, 2, \"345\", 6]}<|tool_call_end|><|tool_calls_section_end|>", + "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": [1, 2, \"345\", 6]}<|tool_call_end|><|tool_calls_section_end|>", [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}"), - "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}<|tool_call_end|><|tool_calls_section_end|>", + "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": 
\"10\"}}<|tool_call_end|><|tool_calls_section_end|>", [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); + test_parser_with_streaming( + simple_assist_msg("", "", "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"), + "<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function:0<|tool_call_argument_begin|>" + "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + test_parser_with_streaming( + simple_assist_msg("", "", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"), + "<|tool_calls_section_begin|><|tool_call_begin|>functions.web_search:0<|tool_call_argument_begin|>" + "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + test_parser_with_streaming( + simple_assist_msg("", "", "read_file", "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}"), + "<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>" + "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + test_parser_with_streaming( + simple_assist_msg( + "Let me start by examining the relevant files to understand the current implementation.", "", + "read_file", + "{\"files\": [{\"path\": \"src/app/Partners.tsx\", \"line_ranges\": [\"1-100\"]}]}"), + "Let me start by examining the relevant files to understand the current implementation." + "<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>" + "{\"files\":[{\"path\":\"src/app/Partners.tsx\",\"line_ranges\":[\"1-100\"]}]}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + auto multi_tool_msg = simple_assist_msg("Let me call multiple tools.", "I'm thinking."); + multi_tool_msg.tool_calls.push_back({ "read_file", "{\"files\": [{\"path\": \"src/app/Partners.tsx\", \"line_ranges\": [\"1-100\"]}]}", "" }); + multi_tool_msg.tool_calls.push_back({ "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}", "" }); + multi_tool_msg.tool_calls.push_back({ "complex_function", "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}", "" }); + multi_tool_msg.tool_calls.push_back({ "emoji_function", "{\"message\":\"Hello! 
👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}", "" }); + test_parser_with_streaming(multi_tool_msg, + "I'm thinking.Let me call multiple tools." + "<|tool_calls_section_begin|>" + "<|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>" + "{\"files\":[{\"path\":\"src/app/Partners.tsx\",\"line_ranges\":[\"1-100\"]}]}" + "<|tool_call_end|>" + "<|tool_call_begin|>functions.web_search:1<|tool_call_argument_begin|>" + "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}" + "<|tool_call_end|>" + "<|tool_call_begin|>functions.complex_function:2<|tool_call_argument_begin|>" + "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" + "<|tool_call_end|>" + "<|tool_call_begin|>functions.emoji_function:3<|tool_call_argument_begin|>" + "{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}" + "<|tool_call_end|>" + "<|tool_calls_section_end|>", + [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + COMMON_CHAT_FORMAT_KIMI_K2, + COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming( + simple_assist_msg("", "I'm thinking", "complex_function_in_think", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"), + "I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>" + "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" + "<|tool_call_end|><|tool_calls_section_end|>", + [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + COMMON_CHAT_FORMAT_KIMI_K2, + COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + test_parser_with_streaming( + simple_assist_msg("Hello", "I'm thinkingI'm still thinking", "complex_function_in_think", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}"), + "I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>" + "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" + "<|tool_call_end|><|tool_calls_section_end|>I'm still thinkingHello", + [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + COMMON_CHAT_FORMAT_KIMI_K2, + COMMON_REASONING_FORMAT_DEEPSEEK + }); }); + + // Test template rendering + common_chat_templates_inputs conversation_with_tools = inputs_tools; + conversation_with_tools.messages.push_back(simple_assist_msg("Let's do it", "Think first", "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}")); + conversation_with_tools.messages.push_back({ + "tool", + "Tool response 1", + /* .content_parts = */ {}, + /* .tool_calls = */ {}, + /* .reasoning_content = */ "", + /* .tool_name = */ "complex_function", + /* .tool_call_id = */ "", + }); + conversation_with_tools.messages.push_back(simple_assist_msg("Continue", "Think next", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}")); + conversation_with_tools.messages.push_back({ + "tool", + "Tool response 2", + /* .content_parts = */ {}, + /* .tool_calls = */ {}, + /* .reasoning_content = */ "", + /* .tool_name = */ "web_search", + /* .tool_call_id = */ "", + }); + conversation_with_tools.messages.push_back(simple_assist_msg("CC", "Think last", "read_file", "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": 
\"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}")); + conversation_with_tools.messages.push_back({ + "tool", + "Tool response 3", + /* .content_parts = */ {}, + /* .tool_calls = */ {}, + /* .reasoning_content = */ "", + /* .tool_name = */ "read_file", + /* .tool_call_id = */ "", + }); + assert_equals(common_chat_templates_apply(tmpls.get(), conversation_with_tools).prompt, std::string("<|im_system|>tool_declare<|im_middle|>[{\"type\": \"function\", \"function\": {\"name\": \"special_function\", \"description\": \"I'm special\", \"parameters\": {\"type\": \"object\", \"properties\": {\"arg1\": {\"type\": \"integer\", \"description\": \"The arg.\"}}, \"required\": [\"arg1\"]}}}]<|im_end|><|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|><|im_user|>user<|im_middle|>Hey there!<|im_end|><|im_assistant|>assistant<|im_middle|>Think firstLet's do it<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function:0<|tool_call_argument_begin|>{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>complex_function<|im_middle|>## Return of functions.complex_function:0\nTool response 1<|im_end|><|im_assistant|>assistant<|im_middle|>Think nextContinue<|tool_calls_section_begin|><|tool_call_begin|>functions.web_search:1<|tool_call_argument_begin|>{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>web_search<|im_middle|>## Return of functions.web_search:1\nTool response 2<|im_end|><|im_assistant|>assistant<|im_middle|>Think lastCC<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:2<|tool_call_argument_begin|>{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}<|tool_call_end|><|tool_calls_section_end|><|im_end|><|im_system|>read_file<|im_middle|>## Return of functions.read_file:2\nTool response 3<|im_end|><|im_assistant|>assistant<|im_middle|>")); // Test template generation for regular content test_templates(tmpls.get(), end_tokens, message_assist, tools, @@ -2742,7 +2875,7 @@ Hey there!<|im_end|> // Test template generation for tool calls test_templates(tmpls.get(), end_tokens, message_assist_call, tools, - "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", /* expect_grammar_triggered= */ true, /* test_grammar_if_triggered= */ true, /* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK, @@ -2751,14 +2884,14 @@ Hey there!<|im_end|> // Test template generation for tools with optional parameters test_templates(tmpls.get(), end_tokens, message_assist_call_noopt, tools, - "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:1<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", /* 
expect_grammar_triggered= */ true, /* test_grammar_if_triggered= */ true, /* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK, /* ignore_whitespace_differences= */ true ); test_templates(tmpls.get(), end_tokens, message_assist_call_withopt, tools, - "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:1<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>", + "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>", /* expect_grammar_triggered= */ true, /* test_grammar_if_triggered= */ true, /* common_reasoning_format= */ COMMON_REASONING_FORMAT_DEEPSEEK, From e4e9c4329c088d3aa97b8c242e18ff79bfe66248 Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Mon, 8 Dec 2025 14:32:41 +0100 Subject: [PATCH 10/14] Make graph_max_nodes vary by ubatch size (#17794) * Make graph_max_nodes vary by ubatch size for models where chunking might explode the graph * Update src/llama-context.h Co-authored-by: Georgi Gerganov * Add missing const --------- Co-authored-by: Georgi Gerganov --- src/llama-context.cpp | 12 ++++++------ src/llama-context.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e04f0fc4f..417140071 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -248,7 +248,10 @@ llama_context::llama_context( LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size()); - const size_t max_nodes = this->graph_max_nodes(); + const uint32_t n_seqs = cparams.n_seq_max; + const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + const size_t max_nodes = this->graph_max_nodes(n_tokens); LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes); @@ -300,9 +303,6 @@ llama_context::llama_context( cross.v_embd.clear(); - const uint32_t n_seqs = cparams.n_seq_max; - const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - // avoid reserving graphs with zero outputs - assume one output per sequence n_outputs = n_seqs; @@ -1386,9 +1386,9 @@ void llama_context::output_reorder() { // graph // -uint32_t llama_context::graph_max_nodes() const { +uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { if (model.arch == LLM_ARCH_QWEN3NEXT) { - return std::max(8192u, 32u*model.n_tensors()); + return std::max(n_tokens * 40, 32u * model.n_tensors()); } return std::max(1024u, 8u*model.n_tensors()); } diff --git a/src/llama-context.h b/src/llama-context.h index 20cbd7895..cd26eafe1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -197,7 +197,7 @@ private: // public: - uint32_t graph_max_nodes() const; + uint32_t graph_max_nodes(uint32_t n_tokens) const; // can reuse the llm_graph_result instance of the context (for example to update a memory module) llm_graph_result * get_gf_res_reserve() const; From f896d2c34f7bb502c13986830b3ed7d85aac67d9 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 8 Dec 2025 14:35:28 +0100 Subject: [PATCH 11/14] server: improve speed of speculative decoding (#17808) * server: improve speed of speculative decoding * fix small draft case * add link to the PR * server : fix generation time measurement * server : fix draft acceptance logs (add SRV_CNT, SLT_CNT macros) * server : add comment * add PR to docs --------- Co-authored-by: Georgi Gerganov --- tools/server/README-dev.md | 1 + tools/server/server-common.h | 2 
+ tools/server/server-context.cpp | 181 ++++++++++++++++++-------------- 3 files changed, 108 insertions(+), 76 deletions(-) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index 67ebe1aaf..df165c34a 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -81,6 +81,7 @@ For detailed instructions, see the [test documentation](./tests/README.md). - Separation of HTTP logic into dedicated files: https://github.com/ggml-org/llama.cpp/pull/17216 - Large-scale code base split into smaller files: https://github.com/ggml-org/llama.cpp/pull/17362 - Introduction of router mode: https://github.com/ggml-org/llama.cpp/pull/17470 +- Speculative decoding rework: https://github.com/ggml-org/llama.cpp/pull/17808 diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 0c4d84ffa..0629bb5ed 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -18,11 +18,13 @@ const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + " using json = nlohmann::ordered_json; #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) +#define SLT_CNT(slot, fmt, ...) LOG_CNT("" fmt, __VA_ARGS__) #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__) #define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define SRV_DBG(fmt, ...)
LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index d0039631d..3bf905102 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -102,6 +102,11 @@ struct server_slot { std::string generated_text; llama_tokens generated_tokens; + // idx of draft tokens in the main batch + // non-empty if we went to evaluate draft tokens + // ref: https://github.com/ggml-org/llama.cpp/pull/17808 + std::vector i_batch_dft; + std::vector generated_token_probs; bool has_next_token = true; @@ -150,7 +155,8 @@ struct server_slot { struct common_sampler * smpl = nullptr; - llama_token sampled; + llama_token sampled; // in speculative mode, this is the last accepted token + llama_tokens drafted; // stats size_t n_sent_text = 0; // number of sent text character @@ -180,6 +186,8 @@ struct server_slot { stopping_word = ""; n_sent_text = 0; + drafted.clear(); + i_batch_dft.clear(); generated_tokens.clear(); generated_token_probs.clear(); json_schema = json(); @@ -255,6 +263,31 @@ struct server_slot { generated_token_probs.push_back(token); } + int get_n_draft_max() const { + if (!can_speculate()) { + return 0; + } + + // determine the max draft that fits the current slot state + int n_draft_max = task->params.speculative.n_max; + + // note: slot.prompt is not yet expanded with the `id` token sampled above + // also, need to leave space for 1 extra token to allow context shifts + n_draft_max = std::min(n_draft_max, n_ctx - prompt.n_tokens() - 2); + + if (n_remaining > 0) { + n_draft_max = std::min(n_draft_max, n_remaining - 1); + } + + SLT_DBG(*this, "max possible draft: %d\n", n_draft_max); + + if (n_draft_max < task->params.speculative.n_min) { + SLT_DBG(*this, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, task->params.speculative.n_min); + n_draft_max = 0; + } + return n_draft_max; + } + // note: a slot can also be either a parent or a child bool is_parent() const { return is_processing() && task->n_children > 0; @@ -353,8 +386,7 @@ struct server_slot { if (n_draft_total > 0) { const float draft_ratio = (float) n_draft_accepted / n_draft_total; - SLT_INF(*this, - "\n" + SLT_CNT(*this, "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", draft_ratio, n_draft_accepted, n_draft_total ); @@ -1774,14 +1806,57 @@ struct server_context_impl { continue; } - slot.i_batch = batch.n_tokens; + // generate draft tokens in speculative decoding mode + // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK] + // perform the speculative drafting for all sequences at the same time in a single batch + int n_draft_max = slot.get_n_draft_max(); + if (n_draft_max > 0) { + if (mctx) { + // we should never reach this, as speculative is automatically disabled if mmproj is loaded + GGML_ABORT("not supported by multimodal"); + } - common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); + struct common_speculative_params params_spec; + params_spec.n_draft = n_draft_max; + params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max; + params_spec.p_min = slot.task->params.speculative.p_min; + const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens(); + llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled); - slot.prompt.tokens.push_back(slot.sampled); + // add the sampled token to the batch + 
slot.i_batch_dft.push_back(batch.n_tokens); + common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); + slot.prompt.tokens.push_back(slot.sampled); - SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.prompt.n_tokens(), slot.truncated); + if (slot.task->params.speculative.n_min > (int) draft.size()) { + SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min); + // fallback to normal decoding + slot.i_batch = slot.i_batch_dft[0]; + slot.drafted.clear(); + slot.i_batch_dft.clear(); + } else { + // keep track of total number of drafted tokens tested + slot.n_draft_total += draft.size(); + + // add all drafted tokens to the batch + for (size_t i = 0; i < draft.size(); i++) { + slot.i_batch_dft.push_back(batch.n_tokens); + common_batch_add(batch, draft[i], slot.prompt.tokens.pos_next(), { slot.id }, true); + slot.prompt.tokens.push_back(draft[i]); + } + slot.drafted = std::move(draft); + } + } else { + // no speculative decoding + slot.i_batch = batch.n_tokens; + + common_batch_add(batch, slot.sampled, slot.prompt.tokens.pos_next(), { slot.id }, true); + + slot.prompt.tokens.push_back(slot.sampled); + + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.prompt.n_tokens(), slot.truncated); + } } // process in chunks of params.n_batch @@ -2345,6 +2420,10 @@ struct server_context_impl { // on successful decode, restore the original batch size n_batch = llama_n_batch(ctx); + // technically, measuring the time here excludes the sampling time for the last batch + // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok + const int64_t t_current = ggml_time_us(); + for (auto & slot : slots) { // may need to copy state to other slots if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) { @@ -2399,6 +2478,10 @@ struct server_context_impl { continue; // continue loop of slots } + if (slot.i_batch_dft.size() > 0) { + continue; // sample using speculative decoding + } + const int tok_idx = slot.i_batch - i; llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); @@ -2409,8 +2492,6 @@ struct server_context_impl { slot.n_decoded += 1; - const int64_t t_current = ggml_time_us(); - if (slot.n_decoded == 1) { slot.t_start_generation = t_current; slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; @@ -2439,84 +2520,32 @@ struct server_context_impl { } } - // do speculative decoding - // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK] - // perform the speculative drafting for all sequences at the same time in a single batch + // speculative decoding - main model sample and accept for (auto & slot : slots) { - if (!slot.is_processing() || !slot.can_speculate()) { + if (slot.state != SLOT_STATE_GENERATING || slot.i_batch_dft.empty()) { continue; } - if (slot.state != SLOT_STATE_GENERATING) { - continue; - } - - if (mctx) { - // we should never reach this, as speculative is automatically disabled if mmproj is loaded - GGML_ABORT("not supported by multimodal"); - } - - // determine the max draft that fits the current slot state - int n_draft_max = slot.task->params.speculative.n_max; - - // note: slot.prompt is not yet expanded with the `id` token sampled above - // also, need to leave space for 1 extra token to allow context shifts - n_draft_max = std::min(n_draft_max, slot.n_ctx - 
slot.prompt.n_tokens() - 2); - - if (slot.n_remaining > 0) { - n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); - } - - SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); - - if (n_draft_max < slot.task->params.speculative.n_min) { - SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.task->params.speculative.n_min); - - continue; - } - - llama_token id = slot.sampled; - - struct common_speculative_params params_spec; - params_spec.n_draft = n_draft_max; - params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max; - params_spec.p_min = slot.task->params.speculative.p_min; - - const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens(); - llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id); - - // ignore small drafts - if (slot.task->params.speculative.n_min > (int) draft.size()) { - SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.task->params.speculative.n_min); - - continue; - } - - // keep track of total number of drafted tokens tested - slot.n_draft_total += draft.size(); - - // construct the speculation batch - common_batch_clear(slot.batch_spec); - common_batch_add (slot.batch_spec, id, slot.prompt.tokens.pos_next(), { slot.id }, true); - - for (size_t i = 0; i < draft.size(); ++i) { - common_batch_add(slot.batch_spec, draft[i], slot.prompt.tokens.pos_next() + 1 + i, { slot.id }, true); - } - - SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); - - llama_decode(ctx, slot.batch_spec); + size_t n_draft = slot.drafted.size(); // the accepted tokens from the speculation - const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft); + const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, slot.i_batch_dft, slot.drafted); + slot.i_batch_dft.clear(); + slot.drafted.clear(); slot.n_decoded += ids.size(); + slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; + // update how many tokens out of those tested were accepted slot.n_draft_accepted += ids.size() - 1; - slot.prompt.tokens.push_back(id); + // rollback to the state before sampling the draft tokens + slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft); + + // add accepted tokens to the prompt slot.prompt.tokens.insert({ids.begin(), ids.end() - 1}); + slot.sampled = ids.back(); // last accepted token llama_memory_seq_rm(llama_get_memory(ctx), slot.id, slot.prompt.n_tokens(), -1); @@ -2539,7 +2568,7 @@ struct server_context_impl { } } - SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.prompt.n_tokens()); + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens()); // n_draft was captured before slot.drafted was cleared } } From 68522c678daa7b65718f8a3de89bb2fbb139e26f Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Mon, 8 Dec 2025 22:09:39 +0800 Subject: [PATCH 12/14] ci : support bfloat16 SYCL release package (#17855) * support bfloat16 release package * add fallback file --- .github/workflows/release.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3668e4e2c..77aec20c1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -546,6 +546,8 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin cp "${{ env.ONEAPI_ROOT
}}/compiler/latest/bin/libiomp5md.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin From 951520ddb05402bb8844509a7683d1a9a517dfc6 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 8 Dec 2025 17:04:38 +0100 Subject: [PATCH 13/14] server: delegate result_state creation to server_task (#17835) * server: delegate result_state creation to server_task * remove unued states * add more docs --- tools/server/README-dev.md | 27 +++++++++++++++++++- tools/server/server-context.cpp | 21 +++++++--------- tools/server/server-context.h | 5 ++-- tools/server/server-queue.cpp | 13 ++++++++-- tools/server/server-queue.h | 6 ++--- tools/server/server-task.h | 44 +++++++++++++++++++-------------- 6 files changed, 76 insertions(+), 40 deletions(-) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index df165c34a..fbcd6bc1f 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -42,7 +42,15 @@ graph TD server_response --> server_routes ``` -TODO: mention about how batching is handled by `server_slot` +### Batching + +The server context maintains a single batch shared across all slots. When `update_slots()` is invoked, the system iterates through all active slots to populate this batch. For each slot, either a generated token from the previous decoding step or available prompt tokens are added to the batch. + +Batching constraints apply: slots can only be batched together if they share compatible configurations. For instance, slots using a specific LoRA adapter can be batched with each other, but not with slots using a different LoRA adapter or no adapter at all. + +Once the batch reaches capacity or all slots have been processed, `llama_decode` is called to execute the inference. This operation represents the primary computational bottleneck in `update_slots()`. + +Following decoding, the system either retrieves embeddings or samples the next token using `common_sampler_sample`. If a slot has remaining prompt tokens to process, it yields until the next `update_slots()` iteration. ### Thread Management @@ -62,6 +70,23 @@ Each incoming HTTP request is handled by its own thread managed by the HTTP libr - All JSON formatting and chat template logic must stay in the HTTP layer. - Avoid passing raw JSON between the HTTP layer and `server_slot`. Instead, parse everything into native C++ types as early as possible. +### Example trace of a request + +Here is an example trace of an API request for text completion: + +- A request arrives at the HTTP layer. +- The request is routed to the corresponding handler inside `server_routes`. In this case, `handle_completions_impl` is invoked. +- The handler parses the input request, constructs a new `server_task`, and passes it to `server_res_generator`. +- `server_res_generator` creates a new `task_result_state` for each task: + - `task_result_state` stays in the HTTP layer, responsible for keeping track of the current state of the response (e.g., parsing tool calls or thinking messages). + - `server_task` is moved into `server_queue` inside `server_context`. +- `server_context` launches the task by moving it into an available slot (see `launch_slot_with_task()`). 
+- `update_slots()` processes the task as described in the "Batching" section above. +- Results may be sent using `send_partial_response` or `send_final_response`, either of which creates a new `server_task_result` and pushes it to the response queue. +- At the same time, `server_res_generator` listens to the response queue and retrieves this response. +- As the response is stateless, `server_res_generator` calls `response->update()` to update the response with the current state. +- `server_res_generator` then calls `response->to_json()` and passes the response to the HTTP layer. ### Testing `llama-server` includes an automated test suite based on `pytest`. diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 3bf905102..4578f8d7a 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2589,6 +2589,10 @@ struct server_context_impl { int get_slot_n_ctx() { return slots.back().n_ctx; } + + server_response_reader get_response_reader() { + return server_response_reader(queue_tasks, queue_results, HTTP_POLLING_SECONDS); + } }; // @@ -2618,8 +2622,8 @@ llama_context * server_context::get_llama_context() const { return impl->ctx; } -std::pair server_context::get_queues() { - return { impl->queue_tasks, impl->queue_results }; +server_response_reader server_context::get_response_reader() { + return impl->get_response_reader(); } @@ -2628,7 +2632,7 @@ std::pair server_context::get_queues() { struct server_res_generator : server_http_res { server_response_reader rd; server_res_generator(server_context_impl & ctx_server) - : rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS) {} + : rd(ctx_server.queue_tasks, ctx_server.queue_results, HTTP_POLLING_SECONDS) {} void ok(const json & response_data) { status = 200; data = safe_json_to_str(response_data); @@ -2661,9 +2665,6 @@ static std::unique_ptr handle_completions_impl( try { std::vector tasks; - // tracking generation state and partial tool calls - std::vector states; - const auto & prompt = data.at("prompt"); // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ?
prompt.get().c_str() : prompt.dump(2).c_str()); @@ -2679,7 +2680,6 @@ static std::unique_ptr handle_completions_impl( inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); } tasks.reserve(inputs.size()); - states.reserve(inputs.size()); int idx = 0; for (size_t i = 0; i < inputs.size(); i++) { server_task task = server_task(type); @@ -2698,7 +2698,6 @@ static std::unique_ptr handle_completions_impl( task.params.res_type = res_type; task.params.oaicompat_cmpl_id = completion_id; task.params.oaicompat_model = ctx_server.model_name; - states.push_back(task.params.oaicompat_chat_syntax); if (task.params.n_cmpl > 1) { task.n_children = task.params.n_cmpl - 1; @@ -2707,7 +2706,6 @@ static std::unique_ptr handle_completions_impl( task.id, ctx_server.queue_tasks.get_new_id(), idx++); - states.push_back(child.params.oaicompat_chat_syntax); tasks.push_back(std::move(child)); } } @@ -2715,7 +2713,6 @@ static std::unique_ptr handle_completions_impl( tasks.push_back(std::move(task)); } - rd.set_states(std::move(states)); rd.post_tasks(std::move(tasks)); } catch (const std::exception & e) { res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); @@ -3445,7 +3442,7 @@ void server_routes::init_routes() { // create and queue the task json responses = json::array(); - server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS); + server_response_reader rd = ctx_server.get_response_reader(); { std::vector tasks; tasks.reserve(documents.size()); @@ -3705,7 +3702,7 @@ std::unique_ptr server_routes::handle_embeddings_impl(cons // create and queue the task json responses = json::array(); - server_response_reader rd({ctx_server.queue_tasks, ctx_server.queue_results}, HTTP_POLLING_SECONDS); + server_response_reader rd = ctx_server.get_response_reader(); { std::vector tasks; for (size_t i = 0; i < tokenized_prompts.size(); i++) { diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 05b4afaee..eaa138087 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -31,9 +31,8 @@ struct server_context { // get the underlaying llama_context llama_context * get_llama_context() const; - // get the underlaying queue_tasks and queue_results - // used by CLI application - std::pair get_queues(); + // get a new response reader, used by CLI application + server_response_reader get_response_reader(); }; diff --git a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp index 10196128d..3cceb2bbe 100644 --- a/tools/server/server-queue.cpp +++ b/tools/server/server-queue.cpp @@ -271,12 +271,21 @@ void server_response::terminate() { // server_response_reader // -void server_response_reader::set_states(std::vector && states) { - this->states = std::move(states); +void server_response_reader::post_task(server_task && task) { + GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader"); + id_tasks.insert(task.id); + states.push_back(task.create_state()); + queue_results.add_waiting_task_id(task.id); + queue_tasks.post(std::move(task)); } void server_response_reader::post_tasks(std::vector && tasks) { + GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader"); id_tasks = server_task::get_list_id(tasks); + states.reserve(tasks.size()); + for (size_t i = 0; i < tasks.size(); i++) { + states.push_back(tasks[i].create_state()); + } queue_results.add_waiting_tasks(tasks); queue_tasks.post(std::move(tasks)); } diff --git 
a/tools/server/server-queue.h b/tools/server/server-queue.h index a5c3179d8..726eadf4e 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -129,13 +129,13 @@ struct server_response_reader { std::vector states; // should_stop function will be called each polling_interval_seconds - server_response_reader(std::pair server_queues, int polling_interval_seconds) - : queue_tasks(server_queues.first), queue_results(server_queues.second), polling_interval_seconds(polling_interval_seconds) {} + server_response_reader(server_queue & queue_tasks, server_response & queue_results, int polling_interval_seconds) + : queue_tasks(queue_tasks), queue_results(queue_results), polling_interval_seconds(polling_interval_seconds) {} ~server_response_reader() { stop(); } - void set_states(std::vector && states); + void post_task(server_task && task); void post_tasks(std::vector && tasks); bool has_next() const; diff --git a/tools/server/server-task.h b/tools/server/server-task.h index da4e22a7c..9011ff944 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -85,6 +85,25 @@ struct task_params { json to_json(bool only_metrics = false) const; }; +// struct for tracking the state of a task (e.g., for streaming) +struct task_result_state { + // tracking diffs for partial tool calls + std::vector diffs; + common_chat_syntax oaicompat_chat_syntax; + common_chat_msg chat_msg; + std::string generated_text; // append new chunks of generated text here + std::vector generated_tool_call_ids; + + task_result_state(const common_chat_syntax & oaicompat_chat_syntax) + : oaicompat_chat_syntax(oaicompat_chat_syntax) {} + + // parse partial tool calls and update the internal state + common_chat_msg update_chat_msg( + const std::string & text_added, + bool is_partial, + std::vector & diffs); +}; + struct server_task { int id = -1; // to be filled by server_queue int index = -1; // used when there are multiple prompts (batch request) @@ -149,6 +168,12 @@ struct server_task { copy.tokens = tokens.clone(); return copy; } + + // the task will be moved into queue, then onto slots + // however, the state must be kept by caller (e.g., HTTP thread) + task_result_state create_state() const { + return task_result_state(params.oaicompat_chat_syntax); + } }; struct result_timings { @@ -180,25 +205,6 @@ struct result_prompt_progress { json to_json() const; }; -// struct for tracking the state of a task (e.g., for streaming) -struct task_result_state { - // tracking diffs for partial tool calls - std::vector diffs; - common_chat_syntax oaicompat_chat_syntax; - common_chat_msg chat_msg; - std::string generated_text; // append new chunks of generated text here - std::vector generated_tool_call_ids; - - task_result_state(const common_chat_syntax & oaicompat_chat_syntax) - : oaicompat_chat_syntax(oaicompat_chat_syntax) {} - - // parse partial tool calls and update the internal state - common_chat_msg update_chat_msg( - const std::string & text_added, - bool is_partial, - std::vector & diffs); -}; - struct server_task_result { int id = -1; int id_slot = -1; From 2fa51c19b028180b35d316e9ed06f5f0f7ada2c1 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 8 Dec 2025 17:13:08 +0100 Subject: [PATCH 14/14] model-conversion : add token ids to prompt token output [no ci] (#17863) This commit adds the token ids to the printed prompt outputs. The motivation for this is that it can be useful to see the actual token ids alongside the token strings for debugging.
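For example, a prompt that previously printed as "Hello world" now prints as "Hello (9707) world (1879)" (the ids here are illustrative, not taken from a real vocabulary).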
--- examples/model-conversion/logits.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model-conversion/logits.cpp b/examples/model-conversion/logits.cpp index bbd095e60..5bcf06326 100644 --- a/examples/model-conversion/logits.cpp +++ b/examples/model-conversion/logits.cpp @@ -144,7 +144,7 @@ int main(int argc, char ** argv) { return 1; } std::string s(buf, n); - printf("%s", s.c_str()); + printf("%s (%d)", s.c_str(), id); } printf("\n");
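For context, here is a minimal sketch of the surrounding print loop in logits.cpp after this change; the names `prompt_tokens`, `vocab`, and the buffer size are assumptions for illustration and may not match the file exactly:

    // render each prompt token back to its text piece and print it with its id
    for (const llama_token id : prompt_tokens) {
        char buf[128];
        const int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
        if (n < 0) {
            fprintf(stderr, "failed to convert token %d to a piece\n", id);
            return 1;
        }
        std::string s(buf, n);
        printf("%s (%d)", s.c_str(), id);
    }
    printf("\n");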