Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build-apple.yml
#	.github/workflows/build-cmake-pkg.yml
#	.github/workflows/release.yml
#	.pi/gg/SYSTEM.md
#	CMakeLists.txt
#	CODEOWNERS
#	README.md
#	build-xcframework.sh
#	ci/run.sh
#	docs/build.md
#	examples/CMakeLists.txt
#	examples/llama.android/lib/build.gradle.kts
#	ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl
#	tests/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-save-load-state.cpp
#	tools/batched-bench/CMakeLists.txt
#	tools/cli/CMakeLists.txt
#	tools/completion/CMakeLists.txt
#	tools/llama-bench/CMakeLists.txt
#	tools/perplexity/CMakeLists.txt
#	tools/quantize/CMakeLists.txt
#	tools/server/CMakeLists.txt
This commit is contained in:
Concedo 2026-05-22 20:42:51 +08:00
commit 632c41a72f
14 changed files with 265 additions and 262 deletions

View file

@ -1,20 +0,0 @@
set(TARGET llama-app)
add_executable(${TARGET} llama.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
target_link_libraries(${TARGET} PRIVATE
llama-server-impl
llama-cli-impl
llama-completion-impl
llama-bench-impl
llama-batched-bench-impl
llama-fit-params-impl
llama-quantize-impl
llama-perplexity-impl
)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()

View file

@ -1,86 +0,0 @@
#include "build-info.h"
#include <cstdio>
#include <string>
#include <vector>
// visible
int llama_server(int argc, char ** argv);
int llama_cli(int argc, char ** argv);
// hidden
int llama_completion(int argc, char ** argv);
int llama_bench(int argc, char ** argv);
int llama_batched_bench(int argc, char ** argv);
int llama_fit_params(int argc, char ** argv);
int llama_quantize(int argc, char ** argv);
int llama_perplexity(int argc, char ** argv);
static int help(int argc, char ** argv);
static int version(int argc, char ** argv);
struct command {
const char * name;
const char * desc;
std::vector<std::string> aliases;
bool hidden;
int (*func)(int, char **);
};
static const command cmds[] = {
{"serve", "HTTP API server", {"server"}, false, llama_server },
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
{"completion", "Text completion", {"complete"}, true, llama_completion },
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
{"quantize", "Quantize a model", {}, true, llama_quantize },
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
{"version", "Show version", {}, true, version },
{"help", "Show available commands", {}, true, help },
};
static int version(int argc, char ** argv) {
printf("%s\n", llama_build_info());
return 0;
}
static int help(int argc, char ** argv) {
const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
printf("Usage: llama <command> [options]\n\nAvailable commands:\n");
for (const auto & cmd : cmds) {
if (show_all || !cmd.hidden) {
printf(" %-15s %s\n", cmd.name, cmd.desc);
}
}
printf("\nRun 'llama <command> --help' for command-specific usage.\n");
return 0;
}
static bool matches(const std::string & arg, const command & cmd) {
if (arg == cmd.name) {
return true;
}
for (const auto & alias : cmd.aliases) {
if (arg == alias) {
return true;
}
}
return false;
}
int main(int argc, char ** argv) {
const std::string arg = argc >= 2 ? argv[1] : "help";
for (const auto & cmd : cmds) {
if (matches(arg, cmd)) {
return cmd.func(argc - 1, argv + 1);
}
}
fprintf(stderr, "error: unknown command '%s'\n", arg.c_str());
return 1;
}

View file

@ -1617,6 +1617,11 @@ class TextModel(ModelBase):
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
# k-mer's own id (llama.cpp strips it on detokenization)
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]

View file

@ -1,127 +0,0 @@
# Using multiple GPUs with llama.cpp
This guide explains how to run [llama.cpp](https://github.com/ggml-org/llama.cpp) across more than one GPU. It covers the split modes, the command-line flags that control them, the limitations you need to know about, and ready-to-use recipes for `llama-cli` and `llama-server`.
The CLI arguments listed here are the same for both tools - or most llama.cpp binaries for that matter.
---
## When you need multi-GPU
Reach for multi-GPU when one of these is true:
- **The model doesn't fit in a single GPU's VRAM.** By spreading the weights across two or more GPUs the whole model can stay on accelerators. Otherwise part of the model will need to be run off of the comparatively slower system RAM.
- **You want more throughput.** By distributing the computation across multiple GPUs, each individual GPU has to do less work. This can result in better prefill and/or token generation performance, depending on the split mode and interconnect speed vs. the speed of an individual GPU.
---
## The split modes
Set with `--split-mode` / `-sm`.
| Mode | What it does | When to use |
|---|---|---|
| `none` | Use a single GPU only. Pick which one with `--main-gpu`. | You explicitly want to confine the model to one GPU even though more are visible. |
| `layer` (**default**) | Pipeline parallelism. Each GPU holds a contiguous slice of layers. The KV cache for layer *l* lives on the GPU that owns layer *l*. | Default and most compatible multi-GPU choice. You want more memory than a single GPU provides and your priority is a fast prefill. Can tolerate slow interconnect speeds between GPUs. |
| `row` | **Deprecated.** Older row-split tensor-parallel path with comparatively poor performance. Splits only dense weights across GPUs. Superseded by `tensor` which should be universally superior if it can be used. | Avoid in new deployments. |
| `tensor` | **EXPERIMENTAL.** Tensor parallelism that splits both weights *and* KV across the participating GPUs via a "meta device" abstraction. | You want more memory than a single GPU provides and your priority is fast token generation. Prefill speeds approach pipeline parallel speeds for large, dense models and fast GPU interconnect speeds. Treat as experimental as the code is less mature than pipeline parallelism. Performance should be good for multiple NVIDIA GPUs using the CUDA backend, no guarantees otherwise. |
> Pipeline parallel (`layer`) vs. tensor parallel (`tensor`): pipeline-parallel runs different layers on different GPUs and processes tokens sequentially through the pipeline. This minimizes data transfers between GPUs but requires many tokens to scale well. Tensor-parallel splits each layer across GPUs and does multiple cross-GPU reductions per layer. This enables parallelizing any workload but is much more bottlenecked by the GPU interconnect speed. Pipeline-parallel maximizes batch throughput; tensor-parallel minimizes latency.
---
## Command-line arguments reference
| Short | Long | Value | Default | Notes |
|---|---|---|---|---|
| `-sm` | `--split-mode` | `none` \| `layer` \| `tensor` | `layer` | See modes above. |
| `-ts` | `--tensor-split` | comma-separated proportions, e.g. `3,1` | mode-dependent | How much of the model goes to each GPU. If omitted, `layer`/`row` use automatic splitting proportional to memory, while `tensor` splits tensor segments evenly. With `3,1` on two GPUs, GPU 0 gets 75 %, GPU 1 gets 25 %. The values follow the order in `--device`. |
| `-mg` | `--main-gpu` | integer device index | `0` | The single GPU used in `--split-mode none`. |
| `-ngl` | `--n-gpu-layers` / `--gpu-layers` | integer \| `auto` \| `all` | `auto` | Maximum number of layers to keep in VRAM. Use `999` or `all` to push everything possible to the GPUs. |
| `-dev` | `--device` | comma-separated device names, or `none` | auto | Restrict which devices llama.cpp may use. See `--list-devices` for names. |
| | `--list-devices` | - | - | Print the available devices and their memory. Run this first to learn the names you'd pass to `--device`. |
| `-fa` | `--flash-attn` | `on` \| `off` \| `auto` | `auto` | Required when using `--split-mode tensor` and/or quantized V cache. Supported (and therefore enabled by default) for most combinations of models and backends. |
| `-ctk` | `--cache-type-k` | `f32` \| `f16` \| `bf16` \| `q8_0` \| `q4_0` \| ... | `f16` | KV cache type for K. |
| `-ctv` | `--cache-type-v` | same as `-ctk` | `f16` | KV cache type for V. |
| `-fit` | `--fit` | `on` \| `off` | `on` | Auto-fit unset args to device memory. **Not supported with `tensor`. You may need to manually set the `--ctx-size` to make the model fit.** |
As for any CUDA program, the environment variable `CUDA_VISIBLE_DEVICES` can be used to control which GPUs to use for the CUDA backend: if you set it, llama.cpp only sees the specified GPUs. Use `--device` for selecting GPUs from among those visible to llama.cpp, this works for any backend.
---
## Recipes
### 1. Default - pipeline parallel across all visible GPUs
```bash
llama-cli -m model.gguf
llama-server -m model.gguf
```
Easiest configuration. KV cache spreads across the GPUs along with the layers. `--fit` (on by default) sizes things automatically.
### 2. Pipeline parallel with a custom split ratio
```bash
llama-cli -m model.gguf -ts 3,1
```
Useful when GPUs have different memory: GPU 0 (3 parts) and GPU 1 (1 part). Proportions are normalized so `-ts 3,1` is the same as e.g. `-ts 75,25`.
### 3. Single-GPU mode, picking a specific GPU
```bash
llama-cli --list-devices
llama-cli -m model.gguf -dev CUDA1
```
Use only the device listed as `CUDA1` when calling with `--list-devices`.
### 4. Tensor parallelism (experimental)
```bash
llama-cli -m model.gguf -sm tensor -ctk f16 -ctv f16
```
- `--flash-attn off` or (`--flash-attn auto` resolving to `off` when it isn't supported) is a hard error.
- KV cache types must be non-quantized: `f32`, `f16`, or `bf16`. Support for quantized KV cache is not implemented and trying to use it will result in an error.
- Mark this configuration as experimental in your tooling: validate output quality before deploying.
- `--split-mode tensor`is not implemented for all architectures. The following will fail with *"LLAMA_SPLIT_MODE_TENSOR not implemented for architecture '...'"*:
- **MoE / hybrid:** Grok, MPT, OLMoE, DeepSeek2, GLM-DSA, Nemotron-H, Nemotron-H-MoE, Granite-Hybrid, LFM2-MoE, Minimax-M2, Mistral4, Kimi-Linear, Jamba, Falcon-H1
- **State-space / RWKV-style:** Mamba, Mamba2 (and the hybrid Mamba-attention models above)
- **Other:** PLAMO2, MiniCPM3, Gemma-3n, OLMo2, BitNet, T5
### 5. With NCCL
There's no runtime flag for NCCL - it's selected at build time (`-DGGML_CUDA_NCCL=ON`, this is the default). Note that NCCL is **not** automatically distributed with CUDA and you may need to install it manually - when in doubt check the CMake log to see whether or not it can find the package. When llama.cpp is compiled with NCCL support it uses it automatically for cross-GPU reductions in `tensor` mode. When NCCL is missing on a multi-GPU build, you'll see this one-time warning and performance will be lower:
```
NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal
```
When using the "ROCm" backend (which is the ggml CUDA code translated for AMD via HIP), the AMD equivalent RCCL can be used by compiling with `-DGGML_HIP_RCCL=ON`. Note that RCCL is by default *disabled* because (unlike NCCL) it was not universally beneficial during testing.
### 6. With CUDA peer-to-peer access (`GGML_CUDA_P2P`)
CUDA peer-to-peer (P2P) lets GPUs transfer data directly between each other instead of going through system memory, which generally improves multi-GPU performance. It is **opt-in** at runtime - set the environment variable `GGML_CUDA_P2P` to any value to enable it:
```bash
GGML_CUDA_P2P=1 llama-cli -m model.gguf -sm tensor
```
P2P requires driver support (usually restricted to workstation/datacenter GPUs) and **may cause crashes or corrupted outputs on some motherboards or BIOS configurations** (e.g. when IOMMU is enabled). If you see instability after enabling it, unset the variable.
---
## Troubleshooting
| Symptom | How to fix |
|---|---|
| Startup error *"SPLIT_MODE_TENSOR requires flash_attn to be enabled"* | Add `-fa on` or remove `-fa off`. |
| Startup error *"simultaneous use of SPLIT_MODE_TENSOR and KV cache quantization not implemented"* | Use `-ctk f16 -ctv f16` (or `bf16`/`f32`) with `--split-mode tensor`. |
| Startup error *"LLAMA_SPLIT_MODE_TENSOR not implemented for architecture 'X'"* | Architecture not on the TENSOR allow-list. Use `--split-mode layer`. |
| Warning *"NCCL is unavailable, multi GPU performance will be suboptimal"* | llama.cpp wasn't built with NCCL. Either accept the lower performance or install NCCL and rebuild. |
| CUDA OOM at startup or during prefill in `--split-mode tensor` | Auto-fit is disabled in this mode, so reduce memory pressure yourself. In order from least to most disruptive: lower `--ctx-size` (`-c`) (KV cache is roughly proportional to `n_ctx`); for `llama-server`, lower `--parallel` (`-np`) (a slot KV cache is allocated per concurrent sequence); as a last resort, reduce `--n-gpu-layers` (`-ngl`) (the remaining layers run on CPU and inference will be much slower). |
| Performance is worse with multi-GPU than single-GPU | The performance is bottlenecked by GPU interconnect speed. For `--split-mode tensor`, verify that NCCL is being used. Try `--split-mode layer` (less communication than `tensor`). Increase GPU interconnect speed via more PCIe lanes or e.g. NVLink (if available). |
| GPU not used at all | `--n-gpu-layers` is `0` or too low - try explicitly setting `-ngl all`. Or you are accidentally hiding the GPUs via an environment variable like `CUDA_VISIBLE_DEVICES=-1`. Or your build doesn't include support for the relevant backend. |
| Crashes or corrupted outputs after setting `GGML_CUDA_P2P=1` | Some motherboards and BIOS settings (e.g. with IOMMU enabled) don't support CUDA peer-to-peer reliably. Unset `GGML_CUDA_P2P`. |

View file

@ -1568,7 +1568,8 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
return env == nullptr || std::atoi(env) != 0;
}();
if (env_pdl_enabled && ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_HOPPER) {
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));

View file

@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
const int nth = std::min(1024, ne0);
int nth = std::min(256, ne0);
ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
// when rows are small, we can batch them together in a single threadgroup
int nrptg = 1;
if (nth < 256) {
nrptg = std::min((256 + nth - 1) / nth, ne1);
if (nrptg * nth > 256) {
nrptg = 256 / nth;
}
}
const int nw0 = (ne1 + nrptg - 1) / nrptg;
ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1);
return 1;
}
@ -1786,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
nk0 = ne10/ggml_blck_size(op->type);
}
int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
int nth = std::min<int>(nk0*ne11, 256);
// when rows are small, we can batch them together in a single threadgroup
int nrptg = 1;
@ -1797,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
nrptg = (nth + nk0 - 1)/nk0;
nth = nk0;
if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
if (nrptg*nth > 256) {
nrptg--;
}
}

View file

@ -7486,7 +7486,11 @@ kernel void kernel_concat(
const int i3 = tgpig.z;
const int i2 = tgpig.y;
const int i1 = tgpig.x;
const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y;
if (i1 >= args.ne1) {
return;
}
int o[4] = {0, 0, 0, 0};
o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));

View file

@ -504,6 +504,12 @@ static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax { GGM
GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
// Snake activation: y = x + sin(a*x)^2 * inv_b. Used by the optimize_graph reorder
// pass so it keeps the chain contiguous and by the dispatcher to detect the fusion.
static constexpr std::initializer_list<ggml_op> snake_pattern { GGML_OP_MUL, GGML_OP_SIN,
GGML_OP_SQR, GGML_OP_MUL,
GGML_OP_ADD };
//node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ]
//node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
//node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
@ -851,6 +857,9 @@ struct vk_device_struct {
vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
vk_pipeline pipeline_timestep_embedding_f32;
vk_pipeline pipeline_conv_transpose_1d_f32;
vk_pipeline pipeline_snake_f32;
vk_pipeline pipeline_snake_f16;
vk_pipeline pipeline_snake_bf16;
vk_pipeline pipeline_pool2d_f32;
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
@ -1480,6 +1489,11 @@ struct vk_op_conv_transpose_1d_push_constants {
int32_t s0;
};
struct vk_op_snake_push_constants {
uint32_t ne0;
uint32_t ne1;
};
struct vk_op_pool2d_push_constants {
uint32_t IW; uint32_t IH;
uint32_t OW; uint32_t OH;
@ -4850,6 +4864,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
@ -12137,6 +12155,45 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
}
// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
// Match the naive mul -> sin -> sqr -> mul -> add chain and run the
// dedicated kernel directly. The pattern is validated by
// ggml_vk_can_fuse_snake before this call.
static void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
const ggml_tensor * sqr = cgraph->nodes[node_idx + 2];
const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
ggml_tensor * add = cgraph->nodes[node_idx + 4];
// x carries the full activation shape, a is the broadcast operand
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
// mul1 reads sqr and inv_b in either operand order
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
vk_pipeline pipeline = nullptr;
switch (x->type) {
case GGML_TYPE_F32: pipeline = ctx->device->pipeline_snake_f32; break;
case GGML_TYPE_F16: pipeline = ctx->device->pipeline_snake_f16; break;
case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break;
default: GGML_ABORT("unsupported type");
}
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x);
vk_subbuffer a_buf = ggml_vk_tensor_subbuffer(ctx, a);
vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b);
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, add);
vk_op_snake_push_constants pc{};
pc.ne0 = static_cast<uint32_t>(x->ne[0]);
pc.ne1 = static_cast<uint32_t>(x->ne[1]);
std::array<uint32_t, 3> elements = { pc.ne0, pc.ne1, 1 };
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements);
}
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
const int32_t k1 = dst->op_params[1];
@ -13345,7 +13402,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
break;
case GGML_OP_MUL:
ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
if (ctx->num_additional_fused_ops) {
ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx);
} else {
ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
}
break;
case GGML_OP_DIV:
@ -14718,6 +14779,65 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
return true;
}
// Pattern check for the 5-op Snake fusion: mul -> sin -> sqr -> mul -> add.
// Verifies the chain shape, the closure x_in_add == x_in_mul0, and that
// the broadcast operands a and inv_b share a [1, C] layout.
static bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) {
GGML_UNUSED(ctx);
if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) {
return false;
}
const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1];
const ggml_tensor * sqr = cgraph->nodes[node_idx + 2];
const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
const ggml_tensor * add = cgraph->nodes[node_idx + 4];
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
if (x_in_add != x) {
return false;
}
if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) {
return false;
}
// Shader bindings: data_a is A_TYPE so it follows x's precision, while
// data_b and data_c are hardcoded float, so the broadcast operands must
// be F32 regardless of x's type.
if (a->type != GGML_TYPE_F32) return false;
if (inv_b->type != GGML_TYPE_F32) return false;
// Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline).
if (mul0->type != x->type) return false;
if (sin_node->type != x->type) return false;
if (sqr->type != x->type) return false;
if (mul1->type != x->type) return false;
if (add->type != x->type) return false;
if (!ggml_are_same_shape(a, inv_b)) {
return false;
}
if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) {
return false;
}
// Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b
// must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader.
if (x->ne[2] != 1 || x->ne[3] != 1) return false;
if (add->ne[2] != 1 || add->ne[3] != 1) return false;
if (a->ne[2] != 1 || a->ne[3] != 1) return false;
if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false;
// Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1],
// so every operand must be contiguous.
if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) ||
!ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) {
return false;
}
return true;
}
// Check whether the tensors overlap in memory.
// Fusions can potentially overwrite src tensors in ways that are not prevented
// by ggml-alloc. If the fusion src is being applied in a way that's elementwise
@ -15025,6 +15145,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = false;
op_srcs_fused_elementwise[2] = false;
} else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) {
ctx->num_additional_fused_ops = 4;
fusion_string = "SNAKE";
// elementwise=true: snake.comp is safe under exact aliasing because each
// thread reads data_x[idx] into a register before writing data_d[idx]
// with a data dependency on that register. The overlap check still
// rejects partial overlaps (different base or size).
std::fill_n(op_srcs_fused_elementwise, 5, true);
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
@ -15315,6 +15443,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
if (keep_pattern(topk_moe_late_softmax)) {
continue;
}
if (keep_pattern(snake_pattern)) {
continue;
}
// First, grab the next unused node.
current_set.push_back(first_unused);
@ -15337,7 +15468,8 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
if (match_pattern(topk_moe_early_softmax_norm, j) ||
match_pattern(topk_moe_sigmoid_norm_bias, j) ||
match_pattern(topk_moe_early_softmax, j) ||
match_pattern(topk_moe_late_softmax, j)) {
match_pattern(topk_moe_late_softmax, j) ||
match_pattern(snake_pattern, j)) {
continue;
}
bool ok = true;

View file

@ -0,0 +1,49 @@
#version 450
#include "types.glsl"
// Fused snake activation: y = x + sin(b * x)^2 * c
// data_a [ne0, ne1] per element activation x (A_TYPE)
// data_b [1, ne1] per channel multiplier (float)
// data_c [1, ne1] per channel inverse scale (float, precomputed as 1 / freq)
// data_d [ne0, ne1] output y (D_TYPE)
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {float data_b[];};
layout (binding = 2) readonly buffer C {float data_c[];};
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
layout (push_constant) uniform parameter {
uint32_t ne0;
uint32_t ne1;
} p;
// Load A_TYPE to float
float load_val(uint32_t idx) {
#if defined(DATA_A_BF16)
return bf16_to_fp32(uint32_t(data_a[idx]));
#else
return float(data_a[idx]);
#endif
}
// Store float as D_TYPE
void store_val(uint32_t idx, float v) {
#if defined(DATA_D_BF16)
data_d[idx] = D_TYPE(fp32_to_bf16(v));
#else
data_d[idx] = D_TYPE(v);
#endif
}
void main() {
const uint32_t i0 = gl_GlobalInvocationID.x;
const uint32_t i1 = gl_GlobalInvocationID.y;
if (i0 >= p.ne0 || i1 >= p.ne1) return;
const uint32_t idx = i0 + i1 * p.ne0;
const float xi = load_val(idx);
const float s = sin(data_b[i1] * xi);
store_val(idx, xi + s * s * data_c[i1]);
}

View file

@ -969,6 +969,10 @@ void process_shaders() {
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("snake_f32", "snake.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("snake_f16", "snake.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("snake_bf16", "snake.comp", {{"DATA_A_BF16", "1"}, {"DATA_D_BF16", "1"}, {"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));

View file

@ -1806,6 +1806,11 @@ private:
const llm_tokenizer_plamo2 & tokenizer;
};
// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical
// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token
// text at load
static const std::string dna_kmer_marker = "\xee\x80\x80";
struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
@ -1861,34 +1866,22 @@ private:
c = char(c - 32);
}
}
auto is_valid_kmer = [](const std::string & s) {
for (char c : s) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
return false;
}
}
return true;
// k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
// isn't in the vocab and falls back to <oov>
auto kmer_token = [&](const std::string & kmer) {
const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
};
size_t i = 0;
for (; i + k <= seq.size(); i += k) {
const std::string kmer = seq.substr(i, k);
if (is_valid_kmer(kmer)) {
const auto tok = vocab.text_to_token(kmer);
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
} else {
output.push_back(oov_id);
}
output.push_back(kmer_token(seq.substr(i, k)));
}
if (i < seq.size()) {
std::string kmer = seq.substr(i);
kmer.append(k - kmer.size(), 'A');
if (is_valid_kmer(kmer)) {
const auto tok = vocab.text_to_token(kmer);
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
} else {
output.push_back(oov_id);
}
output.push_back(kmer_token(kmer));
}
}
@ -2596,6 +2589,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
}
GGML_ASSERT_CONTINUE(id_to_token.size() == token_to_id.size());
// hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
// it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
// k-mers are the block right after <oov>, so only scan from there.
if (tokenizer_model == "hybriddna") {
const auto idx = token_to_id.find("<oov>");
if (idx != token_to_id.end()) {
auto it = id_to_token.begin() + idx->second + 1;
for (; it != id_to_token.end(); ++it) {
std::string & text = it->text;
if (text.size() > dna_kmer_marker.size()
&& text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
text.erase(text.size() - dna_kmer_marker.size());
}
}
}
}
init_tokenizer(type);
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'

View file

@ -2,11 +2,16 @@
set(TARGET llama-fit-params-impl)
add_library(${TARGET} STATIC fit-params.cpp)
add_library(${TARGET} fit-params.cpp)
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} LIBRARY)
endif()
# llama-fit-params executable
set(TARGET llama-fit-params)

View file

@ -506,6 +506,9 @@ struct server_slot {
if (ptask) {
res["id_task"] = ptask->id;
res["n_prompt_tokens"] = (int32_t) prompt.tokens.size();
res["n_prompt_tokens_processed"] = n_prompt_tokens_processed;
res["n_prompt_tokens_cache"] = n_prompt_tokens_cache;
res["params"] = ptask->params.to_json(only_metrics);
res["next_token"] = {
{
@ -701,6 +704,10 @@ private:
bool sleeping = false;
void destroy() {
spec.reset();
ctx_dft.reset();
model_dft.reset();
llama_init.reset();
ctx_tgt = nullptr;

View file

@ -14,6 +14,7 @@
#include <mutex>
#include <condition_variable>
#include <cstring>
#include <cstdlib>
#include <atomic>
#include <chrono>
#include <queue>
@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
// TODO: maybe validate preset before rendering ?
// render args
args = preset.to_args(bin_path);
// unified binary dispatches by subcommand, re-inject it right after the
// binary path so the child starts as 'llama serve ...' not 'llama ...'
const char * app_cmd = std::getenv("LLAMA_APP_CMD");
if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) {
args.insert(args.begin() + 1, app_cmd);
}
}
void server_model_meta::update_caps() {