Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	docs/build.md
#	examples/batched/batched.cpp
#	examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
#	examples/deprecation-warning/deprecation-warning.cpp
#	examples/eval-callback/eval-callback.cpp
#	examples/gen-docs/gen-docs.cpp
#	examples/gguf-hash/gguf-hash.cpp
#	examples/gguf/gguf.cpp
#	examples/lookahead/lookahead.cpp
#	examples/lookup/lookup-create.cpp
#	examples/lookup/lookup-merge.cpp
#	examples/lookup/lookup-stats.cpp
#	examples/lookup/lookup.cpp
#	examples/parallel/parallel.cpp
#	examples/passkey/passkey.cpp
#	examples/retrieval/retrieval.cpp
#	examples/save-load-state/save-load-state.cpp
#	examples/simple-chat/simple-chat.cpp
#	examples/simple/simple.cpp
#	examples/speculative-simple/speculative-simple.cpp
#	examples/speculative/speculative.cpp
#	examples/sycl/ls-sycl-device.cpp
#	examples/training/finetune.cpp
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/amx/common.h
#	ggml/src/ggml-cpu/kleidiai/kernels.cpp
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/cvt.cl
#	ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl
#	ggml/src/ggml-opencl/kernels/transpose.cl
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl
#	scripts/get-wikitext-2.sh
#	tests/test-backend-ops.cpp
#	tools/batched-bench/batched-bench.cpp
#	tools/cvector-generator/cvector-generator.cpp
#	tools/export-lora/export-lora.cpp
#	tools/imatrix/imatrix.cpp
#	tools/llama-bench/llama-bench.cpp
#	tools/perplexity/perplexity.cpp
#	tools/rpc/rpc-server.cpp
#	tools/tokenize/tokenize.cpp
This commit is contained in:
Concedo 2026-03-06 21:19:49 +08:00
commit d20e60ddd5
13 changed files with 96 additions and 22 deletions

View file

@ -7,6 +7,7 @@
#include <limits.h>
#include <algorithm>
#include <clocale>
#include <cmath>
#include <cstring>
#include <limits>
@ -538,6 +539,8 @@ static std::string format_input_text(const std::string & prompt, const std::stri
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
ggml_time_init();
common_params params;

View file

@ -3,6 +3,7 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <ctime>
#include <algorithm>
@ -94,6 +95,8 @@ static void print_raw_embeddings(const float * emb,
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {

View file

@ -7612,6 +7612,18 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
return false;
}
if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) {
// Intel Windows proprietary driver tuning
switch (src0_type) {
case GGML_TYPE_MXFP4:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
return false;
default:
return true;
}
}
switch (src0_type) {
// From tests on A770 Linux, may need more tuning
case GGML_TYPE_Q4_0:

View file

@ -7,6 +7,13 @@ struct Params {
offset_src0: u32,
offset_src1: u32,
offset_dst: u32,
offset_merged_src0: u32,
offset_merged_src1: u32,
stride_src0_0: u32,
stride_src0_1: u32,
stride_src0_2: u32,
stride_src0_3: u32,
stride_src1_0: u32,
stride_src1_1: u32,
@ -23,6 +30,21 @@ struct Params {
b_ne3: u32,
};
// Converts a flat element index into an offset into src0.
// The index is split into four coordinates using the a_ne0..a_ne2 extents,
// and each coordinate is then weighted by the matching stride_src0_* value.
fn src0_index(_i: u32) -> u32 {
    // Number of flat indices covered by one step along dims 2 and 3.
    let plane = params.a_ne1 * params.a_ne0;
    let volume = params.a_ne2 * params.a_ne1 * params.a_ne0;

    // Peel off the outermost coordinate first, carrying the remainder inward.
    let i3 = _i / volume;
    let rest3 = _i % volume;
    let i2 = rest3 / plane;
    let rest2 = rest3 % plane;
    let i1 = rest2 / params.a_ne0;
    let i0 = rest2 % params.a_ne0;

    return i0 * params.stride_src0_0 +
           i1 * params.stride_src0_1 +
           i2 * params.stride_src0_2 +
           i3 * params.stride_src0_3;
}
fn src1_index(_i: u32) -> u32 {
var i = _i;
let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
@ -53,17 +75,22 @@ fn src1_index(_i: u32) -> u32 {
#define DataType f16
#endif
#ifdef SRC_OVERLAP
@group(0) @binding(0)
var<storage, read_write> merged_src: array<DataType>;
@group(0) @binding(1)
var<storage, read_write> dst: array<DataType>;
@group(0) @binding(2)
var<uniform> params: Params;
#else
@group(0) @binding(0)
var<storage, read_write> src0: array<DataType>;
@group(0) @binding(1)
var<storage, read_write> src1 : array<DataType>;
#ifdef INPLACE
@group(0) @binding(2)
var<uniform> params: Params;
#elif defined(OVERLAP)
#if defined(INPLACE) || defined(OVERLAP)
@group(0) @binding(2)
var<uniform> params: Params;
@ -74,6 +101,7 @@ var<storage, read_write> dst: array<DataType>;
@group(0) @binding(3)
var<uniform> params: Params;
#endif
#endif
fn op(a: DataType, b: DataType) -> DataType {
#ifdef OP_ADD
@ -87,13 +115,17 @@ fn op(a: DataType, b: DataType) -> DataType {
#endif
}
fn update(dst_i: u32, src0_i: u32, src1_i: u32){
fn update(dst_i: u32, src0_i: u32, src1_i: u32) {
#ifdef SRC_OVERLAP
let result = op(merged_src[src0_i], merged_src[src1_i]);
#else
let result = op(src0[src0_i], src1[src1_i]);
#endif
#ifdef INPLACE
src0[dst_i] = result;
src0[src0_i] = result;
#elif defined(OVERLAP)
src1[dst_i] = result;
src1[src1_i] = result;
#else
dst[dst_i] = result;
#endif
@ -102,6 +134,8 @@ fn update(dst_i: u32, src0_i: u32, src1_i: u32){
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x < params.ne) {
update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x));
let src0_i = params.offset_src0 + params.offset_merged_src0 + src0_index(gid.x);
let src1_i = params.offset_src1 + params.offset_merged_src1 + src1_index(gid.x);
update(params.offset_dst + gid.x, src0_i, src1_i);
}
}

View file

@ -1426,16 +1426,14 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
}
next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
if (tensor->ne[i] != 1) {
if (i > n) {
if (tensor->nb[i] != next_nb) {
return false;
}
next_nb *= tensor->ne[i];
} else {
// this dimension does not need to be contiguous
next_nb = tensor->ne[i]*tensor->nb[i];
if (i > n) {
if (tensor->ne[i] != 1 && tensor->nb[i] != next_nb) {
return false;
}
next_nb *= tensor->ne[i];
} else {
// this dimension does not need to be contiguous
next_nb = tensor->ne[i]*tensor->nb[i];
}
}
return true;

View file

@ -100,9 +100,9 @@ std::string format(const char * fmt, ...) {
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
char buf[256];
snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
for (size_t i = 1; i < ne.size(); i++) {
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
}
return buf;
}

View file

@ -7,6 +7,7 @@
#include "chat.h"
#include "build-info.h"
#include <clocale>
#include <cstdio>
#include <cstring>
#include <ctime>
@ -85,6 +86,8 @@ static void sigint_handler(int signo) {
#endif
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
g_params = &params;
@ -377,7 +380,7 @@ int main(int argc, char ** argv) {
// remove any "future" tokens that we might have inherited from the previous session
if (session_tokens.size() > n_match) {
if (!llama_memory_seq_rm(mem, -1, n_match, -1)) {
LOG_WRN("%s: unable to resuse common prefix (for example, when the memory is recurrent)\n", __func__);
LOG_WRN("%s: unable to reuse common prefix (for example, when the memory is recurrent)\n", __func__);
llama_memory_clear(mem, true);
session_tokens.clear();
n_match = 0;

View file

@ -7,6 +7,7 @@
#include <algorithm>
#include <cinttypes>
#include <climits>
#include <clocale>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
@ -568,6 +569,8 @@ static void gguf_merge(const split_params & split_params) {
}
int main(int argc, const char ** argv) {
std::setlocale(LC_NUMERIC, "C");
split_params params;
split_params_parse(argc, argv, params);

View file

@ -1,7 +1,10 @@
#include <clocale>
#include <cstdio>
#include <string>
int main(int argc, char** argv) {
std::setlocale(LC_NUMERIC, "C");
std::string filename = "main";
if (argc >= 1) {
filename = argv[0];

View file

@ -13,6 +13,7 @@
#include <vector>
#include <limits.h>
#include <cinttypes>
#include <clocale>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
@ -274,6 +275,8 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
ggml_time_init();
common_params params;

View file

@ -3,6 +3,10 @@
#include "llama.h"
#include "gguf.h"
#include <algorithm>
#include <cctype>
#include <clocale>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <vector>
@ -486,6 +490,8 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
if (argc < 3) {
usage(argv[0]);
}

View file

@ -8,6 +8,7 @@
#include "log.h"
#include <atomic>
#include <clocale>
#include <exception>
#include <signal.h>
#include <thread> // for std::thread::hardware_concurrency
@ -67,6 +68,8 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
// own arguments required by this example
common_params params;

View file

@ -10,6 +10,7 @@
#include <nlohmann/json.hpp>
#include <algorithm>
#include <clocale>
#include <cmath>
#include <cstdio>
#include <fstream>
@ -536,6 +537,8 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.out_file = "output.wav";