Merge commit 'd6f3030047' into concedo_experimental

# Conflicts: # examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py # examples/model-conversion/scripts/utils/semantic_check.py # ggml/CMakeLists.txt # ggml/src/CMakeLists.txt # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-cpu/amx/amx.cpp # ggml/src/ggml-cuda/CMakeLists.txt # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hip/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-openvino/ggml-openvino.cpp # ggml/src/ggml-rpc/ggml-rpc.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp # ggml/src/ggml-virtgpu/ggml-backend.cpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-zdnn/ggml-zdnn.cpp # ggml/src/ggml-zendnn/ggml-zendnn.cpp # pyproject.toml # requirements/requirements-convert_legacy_llama.txt # requirements/requirements-tool_bench.txt # src/llama-model.cpp # src/llama.cpp # tests/test-llama-archs.cpp # tests/test-tokenizer-0.py # tests/test-tokenizer-random.py # tools/llama-bench/llama-bench.cpp # tools/perplexity/perplexity.cpp
2026-04-28 03:30:20 +00:00 · 2026-04-11 11:10:55 +08:00 · 2026-04-11 11:10:55 +08:00 · a165a73120
commit a165a73120
parent 8b90bfe094 d6f3030047
37 changed files with 3075 additions and 376 deletions
--- a/include/llama.h
+++ b/include/llama.h
@ -195,9 +195,10 @@ extern "C" {
    LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);

    enum llama_split_mode {
-        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
-        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
+        LLAMA_SPLIT_MODE_NONE   = 0, // single GPU
+        LLAMA_SPLIT_MODE_LAYER  = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_ROW    = 2, // split layers and KV across GPUs, use tensor parallelism if supported
+        LLAMA_SPLIT_MODE_TENSOR = 3,
    };

    // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)