diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index d50f75409..d38bfe7f8 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -538,6 +539,8 @@ static std::string format_input_text(const std::string & prompt, const std::stri } int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + ggml_time_init(); common_params params; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index d8eaaa269..33ef2a752 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -3,6 +3,7 @@ #include "log.h" #include "llama.h" +#include #include #include @@ -94,6 +95,8 @@ static void print_raw_embeddings(const float * emb, } int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + common_params params; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 9d241230f..104de7abf 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -7612,6 +7612,18 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ return false; } + if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) { + // Intel Windows proprietary driver tuning + switch (src0_type) { + case GGML_TYPE_MXFP4: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return false; + default: + return true; + } + } + switch (src0_type) { // From tests on A770 Linux, may need more tuning case GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl index 55dd66408..a748dc1b8 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl @@ -7,6 +7,13 @@ struct Params { offset_src0: u32, offset_src1: u32, offset_dst: u32, + offset_merged_src0: u32, + offset_merged_src1: u32, + + stride_src0_0: u32, + stride_src0_1: u32, + stride_src0_2: u32, + stride_src0_3: u32, stride_src1_0: u32, stride_src1_1: u32, @@ -23,6 +30,21 @@ struct Params { b_ne3: u32, }; +fn src0_index(_i: u32) -> u32 { + var i = _i; + let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); + i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); + let a_i2 = i / (params.a_ne1 * params.a_ne0); + i = i % (params.a_ne1 * params.a_ne0); + let a_i1 = i / params.a_ne0; + let a_i0 = i % params.a_ne0; + + return a_i0 * params.stride_src0_0 + + a_i1 * params.stride_src0_1 + + a_i2 * params.stride_src0_2 + + a_i3 * params.stride_src0_3; +} + fn src1_index(_i: u32) -> u32 { var i = _i; let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); @@ -53,17 +75,22 @@ fn src1_index(_i: u32) -> u32 { #define DataType f16 #endif +#ifdef SRC_OVERLAP +@group(0) @binding(0) +var merged_src: array; + +@group(0) @binding(1) +var dst: array; + +@group(0) @binding(2) +var params: Params; +#else @group(0) @binding(0) var src0: array; @group(0) @binding(1) var src1 : array; - -#ifdef INPLACE -@group(0) @binding(2) -var params: Params; - -#elif defined(OVERLAP) +#if defined(INPLACE) || defined(OVERLAP) @group(0) @binding(2) var params: Params; @@ -74,6 +101,7 @@ var dst: array; @group(0) @binding(3) var params: Params; #endif +#endif fn op(a: DataType, b: DataType) -> DataType { #ifdef OP_ADD @@ -87,13 +115,17 @@ fn op(a: DataType, b: DataType) -> DataType { #endif } -fn update(dst_i: u32, src0_i: u32, src1_i: u32){ +fn update(dst_i: u32, src0_i: u32, src1_i: u32) { +#ifdef SRC_OVERLAP + let result = op(merged_src[src0_i], merged_src[src1_i]); +#else let result = op(src0[src0_i], src1[src1_i]); +#endif #ifdef INPLACE - src0[dst_i] = result; + src0[src0_i] = result; #elif defined(OVERLAP) - src1[dst_i] = result; + src1[src1_i] = result; #else dst[dst_i] = result; #endif @@ -102,6 +134,8 @@ fn update(dst_i: u32, src0_i: u32, src1_i: u32){ @compute @workgroup_size(WG_SIZE) fn main(@builtin(global_invocation_id) gid: vec3) { if (gid.x < params.ne) { - update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x)); + let src0_i = params.offset_src0 + params.offset_merged_src0 + src0_index(gid.x); + let src1_i = params.offset_src1 + params.offset_merged_src1 + src1_index(gid.x); + update(params.offset_dst + gid.x, src0_i, src1_i); } } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 70a8d66fa..e7eab5129 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1426,16 +1426,14 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { } next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { - if (tensor->ne[i] != 1) { - if (i > n) { - if (tensor->nb[i] != next_nb) { - return false; - } - next_nb *= tensor->ne[i]; - } else { - // this dimension does not need to be contiguous - next_nb = tensor->ne[i]*tensor->nb[i]; + if (i > n) { + if (tensor->ne[i] != 1 && tensor->nb[i] != next_nb) { + return false; } + next_nb *= tensor->ne[i]; + } else { + // this dimension does not need to be contiguous + next_nb = tensor->ne[i]*tensor->nb[i]; } } return true; diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 2d6f9e7c5..aee258a66 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -100,9 +100,9 @@ std::string format(const char * fmt, ...) { std::string llama_format_tensor_shape(const std::vector & ne) { char buf[256]; - snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0)); + snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0)); for (size_t i = 1; i < ne.size(); i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i)); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i)); } return buf; } diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index e588fb8ff..b545079c4 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -7,6 +7,7 @@ #include "chat.h" #include "build-info.h" +#include #include #include #include @@ -85,6 +86,8 @@ static void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + common_params params; g_params = ¶ms; @@ -377,7 +380,7 @@ int main(int argc, char ** argv) { // remove any "future" tokens that we might have inherited from the previous session if (session_tokens.size() > n_match) { if (!llama_memory_seq_rm(mem, -1, n_match, -1)) { - LOG_WRN("%s: unable to resuse common prefix (for example, when the memory is recurrent)\n", __func__); + LOG_WRN("%s: unable to reuse common prefix (for example, when the memory is recurrent)\n", __func__); llama_memory_clear(mem, true); session_tokens.clear(); n_match = 0; diff --git a/tools/gguf-split/gguf-split.cpp b/tools/gguf-split/gguf-split.cpp index 1c1fb77e7..e935810aa 100644 --- a/tools/gguf-split/gguf-split.cpp +++ b/tools/gguf-split/gguf-split.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -568,6 +569,8 @@ static void gguf_merge(const split_params & split_params) { } int main(int argc, const char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + split_params params; split_params_parse(argc, argv, params); diff --git a/tools/mtmd/deprecation-warning.cpp b/tools/mtmd/deprecation-warning.cpp index dded0a56a..2b31a9d8b 100644 --- a/tools/mtmd/deprecation-warning.cpp +++ b/tools/mtmd/deprecation-warning.cpp @@ -1,7 +1,10 @@ +#include #include #include int main(int argc, char** argv) { + std::setlocale(LC_NUMERIC, "C"); + std::string filename = "main"; if (argc >= 1) { filename = argv[0]; diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 054c7faa6..ba00e0853 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #include @@ -274,6 +275,8 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { } int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + ggml_time_init(); common_params params; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 9609ea32e..472542a54 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -3,6 +3,10 @@ #include "llama.h" #include "gguf.h" +#include +#include +#include +#include #include #include #include @@ -486,6 +490,8 @@ static bool parse_layer_prune(const char * data, std::vector & prune_layers } int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + if (argc < 3) { usage(argv[0]); } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index f353dcdde..fab0bb587 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -8,6 +8,7 @@ #include "log.h" #include +#include #include #include #include // for std::thread::hardware_concurrency @@ -67,6 +68,8 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t } int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + // own arguments required by this example common_params params; diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp index ac55a8b1c..dc2fa494b 100644 --- a/tools/tts/tts.cpp +++ b/tools/tts/tts.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -536,6 +537,8 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t } int main(int argc, char ** argv) { + std::setlocale(LC_NUMERIC, "C"); + common_params params; params.out_file = "output.wav";