Merge branch 'upstream' into concedo_experimental

# Conflicts:
# docs/build.md
# examples/batched/batched.cpp
# examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
# examples/deprecation-warning/deprecation-warning.cpp
# examples/eval-callback/eval-callback.cpp
# examples/gen-docs/gen-docs.cpp
# examples/gguf-hash/gguf-hash.cpp
# examples/gguf/gguf.cpp
# examples/lookahead/lookahead.cpp
# examples/lookup/lookup-create.cpp
# examples/lookup/lookup-merge.cpp
# examples/lookup/lookup-stats.cpp
# examples/lookup/lookup.cpp
# examples/parallel/parallel.cpp
# examples/passkey/passkey.cpp
# examples/retrieval/retrieval.cpp
# examples/save-load-state/save-load-state.cpp
# examples/simple-chat/simple-chat.cpp
# examples/simple/simple.cpp
# examples/speculative-simple/speculative-simple.cpp
# examples/speculative/speculative.cpp
# examples/sycl/ls-sycl-device.cpp
# examples/training/finetune.cpp
# ggml/src/ggml-cpu/CMakeLists.txt
# ggml/src/ggml-cpu/amx/common.h
# ggml/src/ggml-cpu/kleidiai/kernels.cpp
# ggml/src/ggml-opencl/CMakeLists.txt
# ggml/src/ggml-opencl/ggml-opencl.cpp
# ggml/src/ggml-opencl/kernels/cvt.cl
# ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl
# ggml/src/ggml-opencl/kernels/transpose.cl
# ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
# ggml/src/ggml-webgpu/ggml-webgpu.cpp
# ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
# ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl
# ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl
# scripts/get-wikitext-2.sh
# tests/test-backend-ops.cpp
# tools/batched-bench/batched-bench.cpp
# tools/cvector-generator/cvector-generator.cpp
# tools/export-lora/export-lora.cpp
# tools/imatrix/imatrix.cpp
# tools/llama-bench/llama-bench.cpp
# tools/perplexity/perplexity.cpp
# tools/rpc/rpc-server.cpp
# tools/tokenize/tokenize.cpp
2026-05-19 08:00:25 +00:00 · 2026-03-06 21:19:49 +08:00 · 2026-03-06 21:19:49 +08:00 · d20e60ddd5
commit d20e60ddd5
parent 2c38638b3d d969e933e1
13 changed files with 96 additions and 22 deletions
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -7,6 +7,7 @@
 #include <limits.h>

 #include <algorithm>
+#include <clocale>
 #include <cmath>
 #include <cstring>
 #include <limits>
@@ -538,6 +539,8 @@ static std::string format_input_text(const std::string & prompt, const std::stri
 }

 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    ggml_time_init();

    common_params params;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"

+#include <clocale>
 #include <ctime>
 #include <algorithm>

@@ -94,6 +95,8 @@ static void print_raw_embeddings(const float * emb,
 }

 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -7612,6 +7612,18 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
            return false;
        }

+        if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) {
+            // Intel Windows proprietary driver tuning
+            switch (src0_type) {
+            case GGML_TYPE_MXFP4:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+                return false;
+            default:
+                return true;
+            }
+        }
+
        switch (src0_type) {
        // From tests on A770 Linux, may need more tuning
        case GGML_TYPE_Q4_0:
--- a/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl
@@ -7,6 +7,13 @@ struct Params {
    offset_src0: u32,
    offset_src1: u32,
    offset_dst: u32,
+    offset_merged_src0: u32,
+    offset_merged_src1: u32,
+
+    stride_src0_0: u32,
+    stride_src0_1: u32,
+    stride_src0_2: u32,
+    stride_src0_3: u32,

    stride_src1_0: u32,
    stride_src1_1: u32,
@@ -23,6 +30,21 @@ struct Params {
    b_ne3: u32,
 };

+fn src0_index(_i: u32) -> u32 {
+    var i = _i;
+    let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
+    i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
+    let a_i2 = i / (params.a_ne1 * params.a_ne0);
+    i = i % (params.a_ne1 * params.a_ne0);
+    let a_i1 = i / params.a_ne0;
+    let a_i0 = i % params.a_ne0;
+
+    return a_i0 * params.stride_src0_0 +
+           a_i1 * params.stride_src0_1 +
+           a_i2 * params.stride_src0_2 +
+           a_i3 * params.stride_src0_3;
+}
+
 fn src1_index(_i: u32) -> u32 {
    var i = _i;
    let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
@@ -53,17 +75,22 @@ fn src1_index(_i: u32) -> u32 {
 #define DataType f16
 #endif

+#ifdef SRC_OVERLAP
+@group(0) @binding(0)
+var<storage, read_write> merged_src: array<DataType>;
+
+@group(0) @binding(1)
+var<storage, read_write> dst: array<DataType>;
+
+@group(0) @binding(2)
+var<uniform> params: Params;
+#else
@group(0) @binding(0)
 var<storage, read_write> src0: array<DataType>;

@group(0) @binding(1)
 var<storage, read_write> src1 : array<DataType>;
-
-#ifdef INPLACE
-@group(0) @binding(2)
-var<uniform> params: Params;
-
-#elif defined(OVERLAP)
+#if defined(INPLACE) || defined(OVERLAP)
@group(0) @binding(2)
 var<uniform> params: Params;

@@ -74,6 +101,7 @@ var<storage, read_write> dst: array<DataType>;
@group(0) @binding(3)
 var<uniform> params: Params;
 #endif
+#endif

 fn op(a: DataType, b: DataType) -> DataType {
 #ifdef OP_ADD
@@ -87,13 +115,17 @@ fn op(a: DataType, b: DataType) -> DataType {
 #endif
 }

-fn update(dst_i: u32, src0_i: u32, src1_i: u32){
+fn update(dst_i: u32, src0_i: u32, src1_i: u32) {
+#ifdef SRC_OVERLAP
+    let result = op(merged_src[src0_i], merged_src[src1_i]);
+#else
    let result = op(src0[src0_i], src1[src1_i]);
+#endif

 #ifdef INPLACE
-    src0[dst_i] = result;
+    src0[src0_i] = result;
 #elif defined(OVERLAP)
-    src1[dst_i] = result;
+    src1[src1_i] = result;
 #else
    dst[dst_i] = result;
 #endif
@@ -102,6 +134,8 @@ fn update(dst_i: u32, src0_i: u32, src1_i: u32){
@compute @workgroup_size(WG_SIZE)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    if (gid.x < params.ne) {
-        update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x));
+        let src0_i = params.offset_src0 + params.offset_merged_src0 + src0_index(gid.x);
+        let src1_i = params.offset_src1 + params.offset_merged_src1 + src1_index(gid.x);
+        update(params.offset_dst + gid.x, src0_i, src1_i);
    }
 }
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1426,16 +1426,14 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
    }
    next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        if (tensor->ne[i] != 1) {
-            if (i > n) {
-                if (tensor->nb[i] != next_nb) {
-                    return false;
-                }
-                next_nb *= tensor->ne[i];
-            } else {
-                // this dimension does not need to be contiguous
-                next_nb = tensor->ne[i]*tensor->nb[i];
+        if (i > n) {
+            if (tensor->ne[i] != 1 && tensor->nb[i] != next_nb) {
+                return false;
            }
+            next_nb *= tensor->ne[i];
+        } else {
+            // this dimension does not need to be contiguous
+            next_nb = tensor->ne[i]*tensor->nb[i];
        }
    }
    return true;
--- a/src/llama-impl.cpp
+++ b/src/llama-impl.cpp
@@ -100,9 +100,9 @@ std::string format(const char * fmt, ...) {

 std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
    char buf[256];
-    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
+    snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
    for (size_t i = 1; i < ne.size(); i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
    }
    return buf;
 }
--- a/tools/completion/completion.cpp
+++ b/tools/completion/completion.cpp
@@ -7,6 +7,7 @@
 #include "chat.h"
 #include "build-info.h"

+#include <clocale>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -85,6 +86,8 @@ static void sigint_handler(int signo) {
 #endif

 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    common_params params;
    g_params = &params;

@@ -377,7 +380,7 @@ int main(int argc, char ** argv) {
            // remove any "future" tokens that we might have inherited from the previous session
            if (session_tokens.size() > n_match) {
                if (!llama_memory_seq_rm(mem, -1, n_match, -1)) {
-                    LOG_WRN("%s: unable to resuse common prefix (for example, when the memory is recurrent)\n", __func__);
+                    LOG_WRN("%s: unable to reuse common prefix (for example, when the memory is recurrent)\n", __func__);
                    llama_memory_clear(mem, true);
                    session_tokens.clear();
                    n_match = 0;
--- a/tools/gguf-split/gguf-split.cpp
+++ b/tools/gguf-split/gguf-split.cpp
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <cinttypes>
 #include <climits>
+#include <clocale>
 #include <cstdio>
 #include <cstdlib>
 #include <stdexcept>
@@ -568,6 +569,8 @@ static void gguf_merge(const split_params & split_params) {
 }

 int main(int argc, const char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    split_params params;
    split_params_parse(argc, argv, params);

--- a/tools/mtmd/deprecation-warning.cpp
+++ b/tools/mtmd/deprecation-warning.cpp
@@ -1,7 +1,10 @@
+#include <clocale>
 #include <cstdio>
 #include <string>

 int main(int argc, char** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    std::string filename = "main";
    if (argc >= 1) {
        filename = argv[0];
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -13,6 +13,7 @@
 #include <vector>
 #include <limits.h>
 #include <cinttypes>
+#include <clocale>

 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -274,6 +275,8 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
 }

 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    ggml_time_init();

    common_params params;
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -3,6 +3,10 @@
 #include "llama.h"
 #include "gguf.h"

+#include <algorithm>
+#include <cctype>
+#include <clocale>
+#include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <vector>
@@ -486,6 +490,8 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
 }

 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    if (argc < 3) {
        usage(argv[0]);
    }
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -8,6 +8,7 @@
 #include "log.h"

 #include <atomic>
+#include <clocale>
 #include <exception>
 #include <signal.h>
 #include <thread> // for std::thread::hardware_concurrency
@@ -67,6 +68,8 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
 }

 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    // own arguments required by this example
    common_params params;

--- a/tools/tts/tts.cpp
+++ b/tools/tts/tts.cpp
@@ -10,6 +10,7 @@
 #include <nlohmann/json.hpp>

 #include <algorithm>
+#include <clocale>
 #include <cmath>
 #include <cstdio>
 #include <fstream>
@@ -536,6 +537,8 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
 }

 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
    common_params params;

    params.out_file = "output.wav";