diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp
index d50f75409..d38bfe7f8 100644
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -7,6 +7,7 @@
 #include <limits.h>
 
 #include <algorithm>
+#include <clocale>
 #include <cmath>
 #include <cstring>
 #include <limits>
@@ -538,6 +539,8 @@ static std::string format_input_text(const std::string & prompt, const std::stri
 }
 
 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     ggml_time_init();
 
     common_params params;
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index d8eaaa269..33ef2a752 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"
 
+#include <clocale>
 #include <ctime>
 #include <algorithm>
 
@@ -94,6 +95,8 @@ static void print_raw_embeddings(const float * emb,
 }
 
 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     common_params params;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 9d241230f..104de7abf 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -7612,6 +7612,18 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
             return false;
         }
 
+        if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) {
+            // Intel Windows proprietary driver tuning
+            switch (src0_type) {
+            case GGML_TYPE_MXFP4:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+                return false;
+            default:
+                return true;
+            }
+        }
+
         switch (src0_type) {
         // From tests on A770 Linux, may need more tuning
         case GGML_TYPE_Q4_0:
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl
index 55dd66408..a748dc1b8 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl
@@ -7,6 +7,13 @@ struct Params {
     offset_src0: u32,
     offset_src1: u32,
     offset_dst: u32,
+    offset_merged_src0: u32,
+    offset_merged_src1: u32,
+
+    stride_src0_0: u32,
+    stride_src0_1: u32,
+    stride_src0_2: u32,
+    stride_src0_3: u32,
 
     stride_src1_0: u32,
     stride_src1_1: u32,
@@ -23,6 +30,21 @@ struct Params {
     b_ne3: u32,
 };
 
+fn src0_index(_i: u32) -> u32 {
+    // Peel the flat index into (i3, i2, i1, i0) using the a_ne* extents, outermost first.
+    let plane = params.a_ne1 * params.a_ne0;
+    let volume = params.a_ne2 * plane;
+    let i3 = _i / volume;
+    let in_volume = _i % volume;
+    let i2 = in_volume / plane;
+    let in_plane = in_volume % plane;
+    let i1 = in_plane / params.a_ne0;
+    let i0 = in_plane % params.a_ne0;
+    // Weight each coordinate by the matching src0 stride to get the element offset.
+    return i3 * params.stride_src0_3 + i2 * params.stride_src0_2 +
+           i1 * params.stride_src0_1 + i0 * params.stride_src0_0;
+}
+
 fn src1_index(_i: u32) -> u32 {
     var i = _i;
     let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
@@ -53,17 +75,22 @@ fn src1_index(_i: u32) -> u32 {
 #define DataType f16
 #endif
 
+#ifdef SRC_OVERLAP
+@group(0) @binding(0)
+var<storage, read_write> merged_src: array<DataType>;
+
+@group(0) @binding(1)
+var<storage, read_write> dst: array<DataType>;
+
+@group(0) @binding(2)
+var<uniform> params: Params;
+#else
 @group(0) @binding(0)
 var<storage, read_write> src0: array<DataType>;
 
 @group(0) @binding(1)
 var<storage, read_write> src1 : array<DataType>;
-
-#ifdef INPLACE
-@group(0) @binding(2)
-var<uniform> params: Params;
-
-#elif defined(OVERLAP)
+#if defined(INPLACE) || defined(OVERLAP)
 @group(0) @binding(2)
 var<uniform> params: Params;
 
@@ -74,6 +101,7 @@ var<storage, read_write> dst: array<DataType>;
 @group(0) @binding(3)
 var<uniform> params: Params;
 #endif
+#endif
 
 fn op(a: DataType, b: DataType) -> DataType {
 #ifdef OP_ADD
@@ -87,13 +115,17 @@ fn op(a: DataType, b: DataType) -> DataType {
 #endif
 }
 
-fn update(dst_i: u32, src0_i: u32, src1_i: u32){
+fn update(dst_i: u32, src0_i: u32, src1_i: u32) {
+#ifdef SRC_OVERLAP
+    let result = op(merged_src[src0_i], merged_src[src1_i]);
+#else
     let result = op(src0[src0_i], src1[src1_i]);
+#endif
 
 #ifdef INPLACE
-    src0[dst_i] = result;
+    src0[src0_i] = result;
 #elif defined(OVERLAP)
-    src1[dst_i] = result;
+    src1[src1_i] = result;
 #else
     dst[dst_i] = result;
 #endif
@@ -102,6 +134,8 @@ fn update(dst_i: u32, src0_i: u32, src1_i: u32){
 @compute @workgroup_size(WG_SIZE)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     if (gid.x < params.ne) {
-        update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x));
+        let src0_i = params.offset_src0 + params.offset_merged_src0 + src0_index(gid.x); // src0 is now looked up through its strides instead of linearly by gid.x
+        let src1_i = params.offset_src1 + params.offset_merged_src1 + src1_index(gid.x); // NOTE(review): offset_merged_* presumably locate each source inside merged_src when SRC_OVERLAP is set - confirm against the host code
+        update(params.offset_dst + gid.x, src0_i, src1_i); // the dst index is still offset_dst + gid.x
     }
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 70a8d66fa..e7eab5129 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1426,16 +1426,14 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
     }
     next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        if (tensor->ne[i] != 1) {
-            if (i > n) {
-                if (tensor->nb[i] != next_nb) {
-                    return false;
-                }
-                next_nb *= tensor->ne[i];
-            } else {
-                // this dimension does not need to be contiguous
-                next_nb = tensor->ne[i]*tensor->nb[i];
+        if (i > n) {
+            if (tensor->ne[i] != 1 && tensor->nb[i] != next_nb) {
+                return false;
             }
+            next_nb *= tensor->ne[i];
+        } else {
+            // this dimension does not need to be contiguous
+            next_nb = tensor->ne[i]*tensor->nb[i];
         }
     }
     return true;
diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp
index 2d6f9e7c5..aee258a66 100644
--- a/src/llama-impl.cpp
+++ b/src/llama-impl.cpp
@@ -100,9 +100,9 @@ std::string format(const char * fmt, ...) {
 
 std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
     char buf[256];
-    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
+    snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0)); // first dimension, right-aligned in a field at least 6 characters wide
     for (size_t i = 1; i < ne.size(); i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i)); // append ", <dim>" after the text already in buf, same minimum width
     }
     return buf;
 }
diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp
index e588fb8ff..b545079c4 100644
--- a/tools/completion/completion.cpp
+++ b/tools/completion/completion.cpp
@@ -7,6 +7,7 @@
 #include "chat.h"
 #include "build-info.h"
 
+#include <clocale>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -85,6 +86,8 @@ static void sigint_handler(int signo) {
 #endif
 
 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     common_params params;
     g_params = &params;
 
@@ -377,7 +380,7 @@ int main(int argc, char ** argv) {
             // remove any "future" tokens that we might have inherited from the previous session
             if (session_tokens.size() > n_match) {
                 if (!llama_memory_seq_rm(mem, -1, n_match, -1)) {
-                    LOG_WRN("%s: unable to resuse common prefix (for example, when the memory is recurrent)\n", __func__);
+                    LOG_WRN("%s: unable to reuse common prefix (for example, when the memory is recurrent)\n", __func__);
                     llama_memory_clear(mem, true);
                     session_tokens.clear();
                     n_match = 0;
diff --git a/tools/gguf-split/gguf-split.cpp b/tools/gguf-split/gguf-split.cpp
index 1c1fb77e7..e935810aa 100644
--- a/tools/gguf-split/gguf-split.cpp
+++ b/tools/gguf-split/gguf-split.cpp
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <cinttypes>
 #include <climits>
+#include <clocale>
 #include <cstdio>
 #include <cstdlib>
 #include <stdexcept>
@@ -568,6 +569,8 @@ static void gguf_merge(const split_params & split_params) {
 }
 
 int main(int argc, const char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     split_params params;
     split_params_parse(argc, argv, params);
 
diff --git a/tools/mtmd/deprecation-warning.cpp b/tools/mtmd/deprecation-warning.cpp
index dded0a56a..2b31a9d8b 100644
--- a/tools/mtmd/deprecation-warning.cpp
+++ b/tools/mtmd/deprecation-warning.cpp
@@ -1,7 +1,10 @@
+#include <clocale>
 #include <cstdio>
 #include <string>
 
 int main(int argc, char** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     std::string filename = "main";
     if (argc >= 1) {
         filename = argv[0];
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 054c7faa6..ba00e0853 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -13,6 +13,7 @@
 #include <vector>
 #include <limits.h>
 #include <cinttypes>
+#include <clocale>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
@@ -274,6 +275,8 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
 }
 
 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     ggml_time_init();
 
     common_params params;
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 9609ea32e..472542a54 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -3,6 +3,10 @@
 #include "llama.h"
 #include "gguf.h"
 
+#include <algorithm>
+#include <cctype>
+#include <clocale>
+#include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <vector>
@@ -486,6 +490,8 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
 }
 
 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     if (argc < 3) {
         usage(argv[0]);
     }
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index f353dcdde..fab0bb587 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -8,6 +8,7 @@
 #include "log.h"
 
 #include <atomic>
+#include <clocale>
 #include <exception>
 #include <signal.h>
 #include <thread> // for std::thread::hardware_concurrency
@@ -67,6 +68,8 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
 }
 
 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     // own arguments required by this example
     common_params params;
 
diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp
index ac55a8b1c..dc2fa494b 100644
--- a/tools/tts/tts.cpp
+++ b/tools/tts/tts.cpp
@@ -10,6 +10,7 @@
 #include <nlohmann/json.hpp>
 
 #include <algorithm>
+#include <clocale>
 #include <cmath>
 #include <cstdio>
 #include <fstream>
@@ -536,6 +537,8 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
 }
 
 int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
     common_params params;
 
     params.out_file = "output.wav";