From a7417f55945eb7c7a5ea6807e66564b8066a4e50 Mon Sep 17 00:00:00 2001 From: Romain Biessy Date: Mon, 30 Jun 2025 14:52:02 +0200 Subject: [PATCH 01/20] ggml-cpu: sycl: Re-enable exp f16 (#14462) --- ggml/src/ggml-cpu/CMakeLists.txt | 5 +++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 46f678b85..66a5ad8d2 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -589,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (EMSCRIPTEN) set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128") endif() + + if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") + # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math" + target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math") + endif() endfunction() diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 4ecca4165..ae5e06257 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4215,6 +4215,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_SGN: case GGML_UNARY_OP_ABS: case GGML_UNARY_OP_ELU: @@ -4223,9 +4224,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g #else return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type); #endif - case GGML_UNARY_OP_EXP: - // Disable FP16 until we find out the root cause of failing fp16 sycl::exp - return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type) && op->src[0]->type == GGML_TYPE_F32; default: return false; } From 5dd942de5922a22ec8446a4ad2203738dbcb9389 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Jun 2025 17:04:05 +0300 Subject: [PATCH 02/20] metal : disable fast-math for some cpy kernels (#14460) * metal : disable fast-math for some cpy kernels ggml-ci * cont : disable for q4_1 ggml-ci * cont : disable for iq4_nl ggml-ci --- ggml/src/ggml-metal/ggml-metal.metal | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index fc3cfe35a..dac45c7a9 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -138,6 +138,7 @@ void quantize_q4_0(device const float * src, device block_q4_0 & dst) { } void quantize_q4_1(device const float * src, device block_q4_1 & dst) { +#pragma METAL fp math_mode(safe) float min = FLT_MAX; float max = -FLT_MAX; @@ -203,6 +204,7 @@ void quantize_q5_0(device const float * src, device block_q5_0 & dst) { } void quantize_q5_1(device const float * src, device block_q5_1 & dst) { +#pragma METAL fp math_mode(safe) float max = src[0]; float min = src[0]; @@ -239,6 +241,7 @@ void quantize_q5_1(device const float * src, device block_q5_1 & dst) { } void quantize_iq4_nl(device const float * src, device block_iq4_nl & dst) { +#pragma METAL fp math_mode(safe) float amax = 0.0f; // absolute max float max = 0.0f; From 745f11fed09ba2303f3f494efb12286d382608e5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 30 Jun 2025 18:03:03 +0300 Subject: [PATCH 03/20] memory : correctly handle failure in apply() (#14438) ggml-ci --- src/llama-kv-cache-unified-iswa.cpp | 2 +- src/llama-kv-cache-unified.cpp | 2 +- src/llama-memory-hybrid.cpp | 2 +- src/llama-memory-recurrent.cpp | 10 +++++++++- src/llama-memory.cpp | 17 +++++++++++++++++ src/llama-memory.h | 3 +++ 6 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp index b9169299c..d1f839b63 100644 --- a/src/llama-kv-cache-unified-iswa.cpp +++ b/src/llama-kv-cache-unified-iswa.cpp @@ -246,7 +246,7 @@ bool llama_kv_cache_unified_iswa_context::next() { } bool llama_kv_cache_unified_iswa_context::apply() { - assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + assert(!llama_memory_status_is_fail(status)); bool res = true; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index 8517b722a..7f7b162ff 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -1776,7 +1776,7 @@ bool llama_kv_cache_unified_context::next() { } bool llama_kv_cache_unified_context::apply() { - assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + assert(!llama_memory_status_is_fail(status)); // no ubatches -> this is a KV cache update if (ubatches.empty()) { diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 15cde98d1..67cbf9554 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -218,7 +218,7 @@ bool llama_memory_hybrid_context::next() { } bool llama_memory_hybrid_context::apply() { - assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + assert(!llama_memory_status_is_fail(status)); bool res = true; diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index e52156bf3..6ed84057c 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -1071,7 +1071,15 @@ bool llama_memory_recurrent_context::next() { } bool llama_memory_recurrent_context::apply() { - assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + assert(!llama_memory_status_is_fail(status)); + + // no ubatches -> this is an update + if (ubatches.empty()) { + // recurrent cache never performs updates + assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE); + + return true; + } mem->find_slot(ubatches[i_next]); diff --git a/src/llama-memory.cpp b/src/llama-memory.cpp index f1107672c..ca6844c32 100644 --- a/src/llama-memory.cpp +++ b/src/llama-memory.cpp @@ -40,3 +40,20 @@ llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_me // if either status has an update, then the combined status has an update return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE; } + +bool llama_memory_status_is_fail(llama_memory_status status) { + switch (status) { + case LLAMA_MEMORY_STATUS_SUCCESS: + case LLAMA_MEMORY_STATUS_NO_UPDATE: + { + return false; + } + case LLAMA_MEMORY_STATUS_FAILED_PREPARE: + case LLAMA_MEMORY_STATUS_FAILED_COMPUTE: + { + return true; + } + } + + return false; +} diff --git a/src/llama-memory.h b/src/llama-memory.h index 16b7e5ee2..e8ba336e8 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -31,6 +31,9 @@ enum llama_memory_status { // useful for implementing hybrid memory types (e.g. iSWA) llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1); +// helper function for checking if a memory status indicates a failure +bool llama_memory_status_is_fail(llama_memory_status status); + // the interface for managing the memory context during batch processing // this interface is implemented per memory type. see: // - llama_kv_cache_unified_context From 0a5a3b5cdfd887cf0f8e09d9ff89dee130cfcdde Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 30 Jun 2025 23:57:04 +0800 Subject: [PATCH 04/20] Add Conv2d for CPU (#14388) * Conv2D: Add CPU version * Half decent * Tiled approach for F32 * remove file * Fix tests * Support F16 operations * add assert about size * Review: further formatting fixes, add assert and use CPU version of fp32->fp16 --- ggml/include/ggml.h | 12 +++ ggml/src/ggml-cpu/ggml-cpu.c | 11 ++- ggml/src/ggml-cpu/ops.cpp | 181 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 5 + ggml/src/ggml.c | 44 ++++++++- 5 files changed, 250 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e5dda969a..8bd3ca929 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -482,6 +482,7 @@ extern "C" { GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, + GGML_OP_CONV_2D, GGML_OP_CONV_2D_DW, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, @@ -1813,6 +1814,17 @@ extern "C" { struct ggml_tensor * b, int stride); + GGML_API struct ggml_tensor * ggml_conv_2d_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + enum ggml_op_pool { GGML_OP_POOL_MAX, GGML_OP_POOL_AVG, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1d68cde71..11ff228f0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1193,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( } } -static void ggml_compute_forward_mul_mat( +void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -1866,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_back_f32(params, tensor); } break; + case GGML_OP_CONV_2D: + { + ggml_compute_forward_conv_2d(params, tensor); + } break; case GGML_OP_CONV_2D_DW: { ggml_compute_forward_conv_2d_dw(params, tensor); @@ -2228,6 +2232,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_IM2COL: case GGML_OP_IM2COL_BACK: + case GGML_OP_CONV_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: @@ -2746,6 +2751,10 @@ struct ggml_cplan ggml_graph_plan( GGML_ABORT("fatal error"); } } break; + case GGML_OP_CONV_2D: + { + cur = GGML_IM2COL_WORK_SIZE; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { const int64_t ne00 = node->src[0]->ne[0]; // W diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 27586ed1f..6948c00b4 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -3,6 +3,7 @@ #include "ggml-cpu.h" #include "ggml-impl.h" #include "binary-ops.h" +#include "ggml.h" #include "unary-ops.h" #include "vec.h" @@ -6545,6 +6546,186 @@ void ggml_compute_forward_im2col_back_f32( } } +static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k, + void * a, void * b, float * c) { + const ggml_type_traits * traits = ggml_get_type_traits(type); + struct ggml_tensor src1 = {}; + src1.type = type; + src1.ne[0] = k; + src1.ne[1] = m; + src1.ne[2] = 1; + src1.ne[3] = 1; + src1.nb[0] = traits->type_size; + src1.nb[1] = k * traits->type_size; + src1.nb[2] = src1.nb[1]; + src1.nb[3] = src1.nb[2]; + src1.data = a; + + struct ggml_tensor src0 = {}; + src0.type = type; + src0.ne[0] = k; + src0.ne[1] = n; + src0.ne[2] = 1; + src0.ne[3] = 1; + src0.nb[0] = traits->type_size; + src0.nb[1] = k * traits->type_size; + src0.nb[2] = src0.nb[1]; + src0.nb[3] = src0.nb[2]; + src0.data = b; + + struct ggml_tensor dst = {}; + dst.ne[0] = n; + dst.ne[1] = m; + dst.ne[2] = 1; + dst.ne[3] = 1; + dst.nb[0] = sizeof(float); + dst.nb[1] = n * sizeof(float); + dst.nb[2] = dst.nb[1]; + dst.nb[3] = dst.nb[2]; + dst.data = c; + dst.src[0] = &src0; + dst.src[1] = &src1; + + ggml_compute_forward_mul_mat(params, &dst); +} + +// ggml_compute_forward_conv_2d + +static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params, + const ggml_tensor * kernel, // [KW, KH, IC, OC] + const ggml_tensor * src, // [W, H, C, N] + ggml_tensor * dst, // [OW, OH, OC, N] + ggml_type kernel_type) { + + GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32); + GGML_ASSERT(kernel->type == kernel_type); + + const ggml_type_traits * traits = ggml_get_type_traits(kernel_type); + + const int32_t stride_x = dst->op_params[0]; + const int32_t stride_y = dst->op_params[1]; + const int32_t pad_x = dst->op_params[2]; + const int32_t pad_y = dst->op_params[3]; + const int32_t dilation_x = dst->op_params[4]; + const int32_t dilation_y = dst->op_params[5]; + + const int64_t c_in = src->ne[2]; + const int64_t c_out = kernel->ne[3]; + GGML_ASSERT(c_in == kernel->ne[2]); + + const int64_t src_w = src->ne[0]; + const int64_t src_h = src->ne[1]; + const int64_t knl_w = kernel->ne[0]; + const int64_t knl_h = kernel->ne[1]; + const int64_t dst_w = dst->ne[0]; + const int64_t dst_h = dst->ne[1]; + + const float * src_data = (float *) src->data; + void * knl_data = kernel->data; + float * dst_data = (float *) dst->data; + + const int64_t knl_n = knl_w * knl_h * c_in; + const int64_t patch_total = dst->ne[3] * dst_w * dst_h; + + const int64_t space_per_patch = knl_n * traits->type_size + c_out * sizeof(float); + const int64_t batch_size = params->wsize / space_per_patch; + const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size; + const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch; + + GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1); + + void * tmp = params->wdata; + + for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) { + + const int64_t patch_start_batch = batch_i * patches_per_batch; + const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch, + patch_total); + const int64_t patch_n = patch_end_batch - patch_start_batch; + + const int64_t patch_per_thread = (patch_n + params->nth - 1) / params->nth; + const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread; + const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch); + + //im2col for a patch + for (int64_t p = patch_start; p < patch_end; ++p) { + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t src_x = (p / dst_w) % dst_h; + const int64_t src_y = p % dst_w; + + const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]); + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size; + + for (int64_t ic = 0; ic < c_in; ++ic) { + for (int64_t ky = 0; ky < knl_h; ++ky) { + for (int64_t kx = 0; kx < knl_w; ++kx) { + const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y; + const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x; + + int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx; + + float src_val; + if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { + src_val = 0.0f; + } else { + const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]); + src_val = *src_ptr; + } + + char * element_ptr = dst_row + dst_idx * traits->type_size; + if (kernel_type == GGML_TYPE_F32) { + *(float *) element_ptr = src_val; + } else if (kernel_type == GGML_TYPE_F16) { + *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + } + } + } + } + } // patches handled by this thread + + ggml_barrier(params->threadpool); + + float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size); + + GGML_ASSERT(gemm_output + patch_n * c_out <= (float*)tmp + params->wsize); + + // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out] + ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output); + + ggml_barrier(params->threadpool); + + + //permute back [OC, N, OH, OW] to [N, OC, OH, OW] + const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth; + const int64_t permute_start = params->ith * permute_per_thread; + const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n); + + for (int64_t i = permute_start; i < permute_end; ++i) { + const int64_t p = patch_start_batch + i; + const int64_t batch_n = p / (dst_w * dst_h); + const int64_t dst_y = (p / dst_w) % dst_h; + const int64_t dst_x = p % dst_w; + + for (int64_t oc = 0; oc < c_out; ++oc) { + const float value = gemm_output[i * c_out + oc]; + float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]); + *dst_ptr = value; + } + } + } +} + +void ggml_compute_forward_conv_2d( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type); +} + // ggml_compute_forward_conv_transpose_2d void ggml_compute_forward_conv_transpose_2d( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 5b384e4ba..3a32ec20d 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -20,6 +20,9 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); +// Work buffer size for im2col operations in CONV2D +#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024) + #ifdef __cplusplus extern "C" { #endif @@ -65,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); @@ -107,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); #ifdef __cplusplus } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 14000b55a..47fe37b13 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -945,6 +945,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CONV_TRANSPOSE_1D", "IM2COL", "IM2COL_BACK", + "CONV_2D", "CONV_2D_DW", "CONV_TRANSPOSE_2D", "POOL_1D", @@ -986,7 +987,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); +static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1044,6 +1045,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "im2col_back(x)", + "conv_2d(x)", "conv_2d_dw(x)", "conv_transpose_2d(x)", "pool_1d(x)", @@ -1085,7 +1087,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); +static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4291,6 +4293,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( return result; } +// ggml_conv_2d_direct + +struct ggml_tensor * ggml_conv_2d_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1) {// dilation dimension 1 + + GGML_ASSERT(a->ne[2] == b->ne[2]); + //GGML_ASSERT(a->type == b->type); + + int64_t ne[4]; + ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + ne[2] = a->ne[3]; + ne[3] = b->ne[3]; + + struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne); + + ggml_set_op_params_i32(result, 0, s0); + ggml_set_op_params_i32(result, 1, s1); + ggml_set_op_params_i32(result, 2, p0); + ggml_set_op_params_i32(result, 3, p1); + ggml_set_op_params_i32(result, 4, d0); + ggml_set_op_params_i32(result, 5, d1); + + result->op = GGML_OP_CONV_2D; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { From 79b33b231774d5c39c8df018e9a276becae6d41a Mon Sep 17 00:00:00 2001 From: lhez Date: Tue, 1 Jul 2025 00:19:16 -0700 Subject: [PATCH 05/20] opencl : add GEGLU, REGLU, SWIGLU (#14456) --- ggml/src/ggml-opencl/CMakeLists.txt | 1 + ggml/src/ggml-opencl/ggml-opencl.cpp | 124 +++++++++++++++++ ggml/src/ggml-opencl/kernels/glu.cl | 201 +++++++++++++++++++++++++++ 3 files changed, 326 insertions(+) create mode 100644 ggml/src/ggml-opencl/kernels/glu.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 0e2a41964..45a488334 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -65,6 +65,7 @@ set(GGML_OPENCL_KERNELS gemv_noshuffle_general gemv_noshuffle get_rows + glu group_norm im2col_f32 im2col_f16 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 96e8a8588..496e47575 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -351,6 +351,7 @@ struct ggml_backend_opencl_context { cl_program program_gemv_noshuffle_general; cl_program program_gemv_noshuffle; cl_program program_get_rows; + cl_program program_glu; cl_program program_im2col_f16; cl_program program_im2col_f32; cl_program program_mul_mat_Ab_Bi_8x4; @@ -401,6 +402,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_relu; cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16; cl_kernel kernel_clamp; + cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, + kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16; cl_kernel kernel_norm; cl_kernel kernel_rms_norm; cl_kernel kernel_group_norm; @@ -738,6 +741,27 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // glu + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "glu.cl.h" + }; +#else + const std::string kernel_src = read_file("glu.cl"); +#endif + backend_ctx->program_glu = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err)); + CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err)); + CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err)); + CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err)); + CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err)); + GGML_LOG_CONT("."); + } + // get_rows { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -2242,6 +2266,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te default: return false; } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16); + default: + return false; + } case GGML_OP_CLAMP: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_SOFT_MAX: @@ -6143,6 +6176,91 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } +static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + if (src1) { + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + } + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + cl_kernel kernel; + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_GEGLU: + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_geglu; + } else { + kernel = backend_ctx->kernel_geglu_f16; + } + break; + case GGML_GLU_OP_REGLU: + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_reglu; + } else { + kernel = backend_ctx->kernel_reglu_f16; + } + break; + case GGML_GLU_OP_SWIGLU: + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_swiglu; + } else { + kernel = backend_ctx->kernel_swiglu_f16; + } + break; + default: + GGML_ABORT("Unsupported glu op"); + } + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0; + + const int ne0 = dst->ne[0]; + + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb11 = src1 ? src1->nb[1] : nb01; + + const cl_ulong nb1 = dst->nb[1]; + + const int swp = ((const int32_t *) dst->op_params)[1]; + const int ne00_off = src1 ? 0 : (swp ? ne0 : 0); + const int ne10_off = src1 ? 0 : (swp ? 0 : ne0); + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off)); + + const size_t nrows = ggml_nrows(src0); + size_t nth = 512; + size_t global_work_size[] = {nrows*nth, 1, 1}; + size_t local_work_size[] = {nth, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} + //------------------------------------------------------------------------------ // Op offloading //------------------------------------------------------------------------------ @@ -6244,6 +6362,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor default: return false; } break; + case GGML_OP_GLU: + if (!any_on_device) { + return false; + } + func = ggml_cl_glu; + break; case GGML_OP_CLAMP: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/glu.cl b/ggml/src/ggml-opencl/kernels/glu.cl new file mode 100644 index 000000000..ba861d8b1 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/glu.cl @@ -0,0 +1,201 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define GELU_COEF_A 0.044715f +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f + +//------------------------------------------------------------------------------ +// geglu +//------------------------------------------------------------------------------ +kernel void kernel_geglu( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu = 0.5f*x0*(1.0f + tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); + + dst_row[i0] = gelu*x1; + } +} + +kernel void kernel_geglu_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const half x0 = src0_row[i0]; + const half x1 = src1_row[i0]; + + const half gelu = 0.5f*x0*(1.0f + tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); + + dst_row[i0] = gelu*x1; + } +} + +//------------------------------------------------------------------------------ +// reglu +//------------------------------------------------------------------------------ +kernel void kernel_reglu( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + dst_row[i0] = x0*x1*(x0 > 0.0f); + } +} + +kernel void kernel_reglu_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const half x0 = src0_row[i0]; + const half x1 = src1_row[i0]; + + dst_row[i0] = x0*x1*(x0 > 0.0f); + } +} + +//------------------------------------------------------------------------------ +// swiglu +//------------------------------------------------------------------------------ +kernel void kernel_swiglu( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float silu = x0 / (1.0f + exp(-x0)); + + dst_row[i0] = silu*x1; + } +} + +kernel void kernel_swiglu_f16( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb01, + ulong nb11, + int ne0, + ulong nb1, + int ne00_off, + int ne10_off +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global char*)((global char*)dst + offsetd); + + global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off; + global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off; + global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1); + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const half x0 = src0_row[i0]; + const half x1 = src1_row[i0]; + + const half silu = x0 / (1.0f + exp(-x0)); + + dst_row[i0] = silu*x1; + } +} From 497be7c01dab243100f9c42e0a763a746e42d038 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 24 Jun 2025 06:10:16 +0200 Subject: [PATCH 06/20] ggml-quants : rename best_mad to best_error (ggml/1283) This commit renames the variable `best_mad` to `best_error` in the `make_qkx2_quants` function. The motivation for this is that the name `best_mad` can be somewhat confusing if mean absolute deviation (MAD) is not in use. --- ggml/src/ggml-quants.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index e389a46db..9a7d1b22d 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -568,14 +568,14 @@ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, co } float iscale = nmax/(max - min); float scale = 1/iscale; - float best_mad = 0; + float best_error = 0; for (int i = 0; i < n; ++i) { int l = nearest_int(iscale*(x[i] - min)); L[i] = MAX(0, MIN(nmax, l)); float diff = scale * L[i] + min - x[i]; diff = use_mad ? fabsf(diff) : diff * diff; float w = weights[i]; - best_mad += w * diff; + best_error += w * diff; } if (nstep < 1) { *the_min = -min; @@ -601,18 +601,18 @@ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, co this_min = 0; this_scale = sum_xl / sum_l2; } - float mad = 0; + float cur_error = 0; for (int i = 0; i < n; ++i) { float diff = this_scale * Laux[i] + this_min - x[i]; diff = use_mad ? fabsf(diff) : diff * diff; float w = weights[i]; - mad += w * diff; + cur_error += w * diff; } - if (mad < best_mad) { + if (cur_error < best_error) { for (int i = 0; i < n; ++i) { L[i] = Laux[i]; } - best_mad = mad; + best_error = cur_error; scale = this_scale; min = this_min; } From 431b2c24f31bf5411786c378b409d0202df8d53c Mon Sep 17 00:00:00 2001 From: Acly Date: Tue, 1 Jul 2025 09:11:00 +0200 Subject: [PATCH 07/20] ggml-cpu : "align corners" for bilinear upscale/downscale (ggml/1285) * add "align corners" mode for bilinear upscale, and allow downscaling * add ggml_interpolate, deprecate ggml_upscale_ext, pass in align-corners as bit-flag * test-backend-ops: replace ggml_upscale_ext with ggml_interpolate, add test cases for downscale and align-corners --- ggml/include/ggml.h | 22 +++++++++++++++++++-- ggml/src/ggml-cpu/ops.cpp | 19 ++++++++++++------- ggml/src/ggml.c | 39 +++++++++++++++++++++++--------------- tests/test-backend-ops.cpp | 16 +++++++++------- 4 files changed, 65 insertions(+), 31 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8bd3ca929..f8238f315 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1867,6 +1867,12 @@ extern "C" { enum ggml_scale_mode { GGML_SCALE_MODE_NEAREST = 0, GGML_SCALE_MODE_BILINEAR = 1, + + GGML_SCALE_MODE_COUNT + }; + + enum ggml_scale_flag { + GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8) }; // interpolate @@ -1879,14 +1885,26 @@ extern "C" { // interpolate // interpolate scale to specified dimensions - GGML_API struct ggml_tensor * ggml_upscale_ext( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext( struct ggml_context * ctx, struct ggml_tensor * a, int ne0, int ne1, int ne2, int ne3, - enum ggml_scale_mode mode); + enum ggml_scale_mode mode), + "use ggml_interpolate instead"); + + // Up- or downsamples the input to the specified size. + // 2D scale modes (eg. bilinear) are applied to the first two dimensions. + GGML_API struct ggml_tensor * ggml_interpolate( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...] // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] GGML_API struct ggml_tensor * ggml_pad( diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 6948c00b4..dd83efde7 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7276,12 +7276,13 @@ static void ggml_compute_forward_upscale_f32( GGML_TENSOR_UNARY_OP_LOCALS - const float sf0 = (float)ne0/src0->ne[0]; - const float sf1 = (float)ne1/src0->ne[1]; - const float sf2 = (float)ne2/src0->ne[2]; - const float sf3 = (float)ne3/src0->ne[3]; + float sf0 = (float)ne0/src0->ne[0]; + float sf1 = (float)ne1/src0->ne[1]; + float sf2 = (float)ne2/src0->ne[2]; + float sf3 = (float)ne3/src0->ne[3]; - const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); + const int32_t mode_flags = ggml_get_op_params_i32(dst, 0); + const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF); if (mode == GGML_SCALE_MODE_NEAREST) { for (int64_t i3 = 0; i3 < ne3; i3++) { @@ -7302,8 +7303,12 @@ static void ggml_compute_forward_upscale_f32( } } } else if (mode == GGML_SCALE_MODE_BILINEAR) { - // setting a pixel offset of 0 would replicate the behavior of pytorch interpolate with align_corners=True - const float pixel_offset = 0.5f; + float pixel_offset = 0.5f; + if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) { + pixel_offset = 0.0f; + sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1); + sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1); + } for (int64_t i3 = 0; i3 < ne3; i3++) { const int64_t i03 = i3 / sf3; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 47fe37b13..b499f3785 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4447,24 +4447,21 @@ struct ggml_tensor * ggml_pool_2d_back( return result; } -// ggml_upscale +// ggml_upscale / ggml_interpolate -static struct ggml_tensor * ggml_upscale_impl( +static struct ggml_tensor * ggml_interpolate_impl( struct ggml_context * ctx, struct ggml_tensor * a, - int ne0, - int ne1, - int ne2, - int ne3, - enum ggml_scale_mode mode) { - GGML_ASSERT(a->ne[0] <= ne0); - GGML_ASSERT(a->ne[1] <= ne1); - GGML_ASSERT(a->ne[2] <= ne2); - GGML_ASSERT(a->ne[3] <= ne3); - + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + uint32_t mode) { + GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT); + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); - ggml_set_op_params_i32(result, 0, mode); + ggml_set_op_params_i32(result, 0, (int32_t)mode); result->op = GGML_OP_UPSCALE; result->src[0] = a; @@ -4477,7 +4474,8 @@ struct ggml_tensor * ggml_upscale( struct ggml_tensor * a, int scale_factor, enum ggml_scale_mode mode) { - return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode); + GGML_ASSERT(scale_factor > 1); + return ggml_interpolate_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode); } struct ggml_tensor * ggml_upscale_ext( @@ -4488,7 +4486,18 @@ struct ggml_tensor * ggml_upscale_ext( int ne2, int ne3, enum ggml_scale_mode mode) { - return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3, mode); + return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode); +} + +struct ggml_tensor * ggml_interpolate( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + uint32_t mode) { + return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode); } // ggml_pad diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index dd2dfcc0a..43be1e928 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3296,28 +3296,28 @@ struct test_upscale : public test_case { } }; -// GGML_OP_UPSCALE (ext) -struct test_upscale_ext : public test_case { +// GGML_OP_UPSCALE (via ggml_interpolate) +struct test_interpolate : public test_case { const ggml_type type; const std::array ne; const std::array ne_tgt; - const ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST; + const uint32_t mode = GGML_SCALE_MODE_NEAREST; std::string vars() override { return VARS_TO_STR4(type, ne, ne_tgt, mode); } - test_upscale_ext(ggml_type type = GGML_TYPE_F32, + test_interpolate(ggml_type type = GGML_TYPE_F32, std::array ne = {2, 5, 7, 11}, std::array ne_tgt = {5, 7, 11, 13}, - ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST) + uint32_t mode = GGML_SCALE_MODE_NEAREST) : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_set_name(a, "a"); - ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3], mode); + ggml_tensor * out = ggml_interpolate(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3], mode); ggml_set_name(out, "out"); return out; @@ -4799,8 +4799,10 @@ static std::vector> make_test_cases_eval() { for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) { test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode)); test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true)); - test_cases.emplace_back(new test_upscale_ext(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode)); + test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode)); + test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {5, 7, 11, 13}, {2, 5, 7, 11}, mode)); } + test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS)); test_cases.emplace_back(new test_sum()); test_cases.emplace_back(new test_sum_rows()); From f61c05d4b1f8ad498a5cf7f0f57a40383aad563c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 10:27:52 +0300 Subject: [PATCH 08/20] sync : ggml ggml-ci --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index bd1e04ed0..06704ca97 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -9e4bee1c5afc2d677a5b32ecb90cbdb483e81fff +67ad436cb653ac1ef0986f9fb0c6191ec828d1ed From a6a47958a1d9edc628cedc2f9320c84b59fe1f4f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 11:05:48 +0300 Subject: [PATCH 09/20] ggml : remove trailing whitespace (#0) --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b499f3785..c51cb57cc 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4458,7 +4458,7 @@ static struct ggml_tensor * ggml_interpolate_impl( int64_t ne3, uint32_t mode) { GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT); - + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); ggml_set_op_params_i32(result, 0, (int32_t)mode); From eff5e45443a248f89cc61e64786f38b68b4da489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 1 Jul 2025 10:14:21 +0200 Subject: [PATCH 10/20] add GELU_ERF (#14455) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 11 ++++++ .../ggml-vulkan/vulkan-shaders/gelu_erf.comp | 39 +++++++++++++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 + 3 files changed, 52 insertions(+) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4696f1fe4..1ff38d8c7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -431,6 +431,7 @@ struct vk_device_struct { // [src/dst 0=fp32,1=fp16] vk_pipeline pipeline_gelu[2]; + vk_pipeline pipeline_gelu_erf[2]; vk_pipeline pipeline_gelu_quick[2]; vk_pipeline pipeline_silu[2]; vk_pipeline pipeline_relu[2]; @@ -2761,6 +2762,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); CREATE_UNARY(gelu) + CREATE_UNARY(gelu_erf) CREATE_UNARY(gelu_quick) CREATE_UNARY(silu) CREATE_UNARY(relu) @@ -6481,6 +6483,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_GELU: return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_GELU_ERF: + return ctx->device->pipeline_gelu_erf[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_GELU_QUICK: return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_RELU: @@ -8827,6 +8831,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: @@ -9072,6 +9077,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: @@ -9289,6 +9295,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_TANH: @@ -10095,6 +10102,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: @@ -10835,6 +10843,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { case GGML_UNARY_OP_GELU: tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]); break; + case GGML_UNARY_OP_GELU_ERF: + tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]); + break; case GGML_UNARY_OP_GELU_QUICK: tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]); break; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp new file mode 100644 index 000000000..5fd5a5e70 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp @@ -0,0 +1,39 @@ +#version 450 + +#include "generic_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; + +void main() { + // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation + // ref: https://www.johndcook.com/blog/python_erf/ + const float p_erf = 0.3275911f; + const float a1_erf = 0.254829592f; + const float a2_erf = -0.284496736f; + const float a3_erf = 1.421413741f; + const float a4_erf = -1.453152027f; + const float a5_erf = 1.061405429f; + + const float SQRT_2_INV = 0.70710678118654752440084436210484f; + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.KX) { + return; + } + + const float a = float(data_a[i]); + const float a_div_sqr2 = a * SQRT_2_INV; + const float sign_x = sign(a_div_sqr2); + const float x = abs(a_div_sqr2); + const float t = 1.0f / (1.0f + p_erf * x); + const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + const float erf_approx = sign_x * y; + + data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx)); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 23fc50bf2..297a2a771 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -574,6 +574,8 @@ void process_shaders() { string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("gelu_erf_f16", "gelu_erf.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("gelu_erf_f32", "gelu_erf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("gelu_quick_f16", "gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("silu_f16", "silu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); From 6a746cf9c40f8d93d13eb8a6c2b989a15e7fa61a Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 1 Jul 2025 03:43:08 -0500 Subject: [PATCH 11/20] vulkan: Split large mul_mat_id to fit in shared memory (#14451) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 37 +++++++++++++++++++++++++--- tests/test-backend-ops.cpp | 5 ++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 1ff38d8c7..7c11890d9 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5966,7 +5966,30 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); } else { - ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); + // Split based on number of ids, to fit in shared memory + const uint32_t nei0 = (uint32_t)src2->ne[0]; + const uint32_t nei1 = (uint32_t)src2->ne[1]; + + GGML_ASSERT(nei0 <= 4096); + const uint32_t split_size = std::min(nei1, 4096u / nei0); + + ggml_tensor src1_copy = *src1; + ggml_tensor src2_copy = *src2; + ggml_tensor dst_copy = *dst; + + for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) { + const uint32_t n_tokens = std::min(split_size, nei1 - token_start); + + src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2]; + src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1]; + dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2]; + + src1_copy.ne[2] = n_tokens; + src2_copy.ne[1] = n_tokens; + dst_copy.ne[2] = n_tokens; + + ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun); + } } } @@ -10135,9 +10158,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm ggml_type src0_type = op->src[0]->type; ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; const vk_device& device = ggml_vk_get_device(ctx->device); - if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) { - // If there's not enough shared memory for row_ids and the result tile, fallback to CPU - return false; + if (op->op == GGML_OP_MUL_MAT_ID) { + if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) { + // If there's not enough shared memory for row_ids and the result tile, fallback to CPU + return false; + } + // Check against size of shared memory variable + if (op->src[2]->ne[0] > 4096) { + return false; + } } switch (src0_type) { case GGML_TYPE_F32: diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 43be1e928..2b502e8b1 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4623,6 +4623,11 @@ static std::vector> make_test_cases_eval() { // this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend) // test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1})); + // test large experts*tokens + for (bool b : {false, true}) { + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, b, 32, 1024, 16)); + } + for (ggml_type type_a : base_types) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { for (int n_mats : {4, 8}) { From 343b6e94b61674ac94e64ede7b7ea3365793f7ed Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Tue, 1 Jul 2025 16:47:30 +0800 Subject: [PATCH 12/20] CANN: update aclnnGroupedMatmulV2 to aclnnGroupedMatmulV3 (#14411) * [CANN]update to aclnnGroupedMatmulV2 Signed-off-by: noemotiovon <757486878@qq.com> * Support MUL_MAT_ID on 310p Signed-off-by: noemotiovon <757486878@qq.com> * fix editorconfig Signed-off-by: noemotiovon <757486878@qq.com> --------- Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 69 ++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 437ece2d4..69483de8f 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include #include @@ -2654,6 +2654,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb)); } +#ifdef ASCEND_310P + ggml_tensor src0_row = *src0; + ggml_tensor src1_row = *src1; + ggml_tensor dst_row = *dst; + + if (src0->type == GGML_TYPE_F16) { + src0_row.type = GGML_TYPE_F32; + } + + // src0_row [D, M, 1, 1] weight without permute + src0_row.ne[2] = 1; + src0_row.ne[3] = 1; + src0_row.nb[0] = ori_src0_nb[0]; + src0_row.nb[1] = ori_src0_nb[1]; + src0_row.nb[2] = ori_src0_nb[1]; + src0_row.nb[3] = ori_src0_nb[1]; + + // src1_row [D, 1, 1, 1] -> input + src1_row.ne[1] = 1; + src1_row.ne[2] = 1; + src1_row.ne[3] = 1; + src1_row.nb[2] = nb11; + src1_row.nb[3] = nb11; + + // dst_row [M, 1, 1, 1] -> out + dst_row.ne[1] = 1; + dst_row.ne[2] = 1; + dst_row.ne[3] = 1; + dst_row.nb[2] = nb1; + dst_row.nb[3] = nb1; + + //create weight for one row + for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { + for (int64_t id = 0; id < n_ids; id++) { + // expert index + int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); + GGML_ASSERT(i02 >= 0 && i02 < n_as); + + // If B = 1 (broadcast), always use 0; otherwise, use id. + int64_t i11 = (ne11 == 1 ? 0 : id); + int64_t i12 = iid1; + + int64_t i1 = id; + int64_t i2 = i12; + + void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2]; + void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12; + void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2; + + src0_row.data = src0_tmp_ptr; + src1_row.data = src1_tmp_ptr; + dst_row.data = dst_tmp_ptr; + dst_row.src[0] = &src0_row; + dst_row.src[1] = &src1_row; + + ggml_cann_mul_mat(ctx, &dst_row); + } + } + return; +#endif + std::vector src0_tensor_vec; std::vector src1_tensor_vec; std::vector dst_tensor_vec; @@ -2701,9 +2762,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* } size_t GROUP_SIZE = 128; - // GroupedMatmulV2 required tensor_list.size < 128 + // GroupedMatmulV3 required tensor_list.size < 128 for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) { - // split and call GroupedMatmulV2 + // split and call GroupedMatmulV3 size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size()); std::vector src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end); std::vector src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end); @@ -2713,7 +2774,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size()); aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size()); - GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list, + GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list); ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list); From 1b2aaf28acd632b79bc3b07acb5dad877dc7dbb1 Mon Sep 17 00:00:00 2001 From: Grzegorz Grasza Date: Tue, 1 Jul 2025 15:44:11 +0200 Subject: [PATCH 13/20] Add Vulkan images to docker.md (#14472) Right now it's not easy to find those. --- docs/docker.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/docker.md b/docs/docker.md index f8f0573c1..cbb333ee3 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -25,6 +25,9 @@ Additionally, there the following images, similar to the above: - `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`) - `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`) The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). From de569441470332ff922c23fb0413cc957be75b25 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 1 Jul 2025 18:04:08 +0300 Subject: [PATCH 14/20] ci : disable fast-math for Metal GHA CI (#14478) * ci : disable fast-math for Metal GHA CI ggml-ci * cont : remove -g flag ggml-ci --- .github/workflows/build.yml | 3 ++- ggml/src/ggml-metal/CMakeLists.txt | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4ea8ea3c0..5d4fb5272 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -84,7 +84,8 @@ jobs: -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ + -DGGML_METAL_EMBED_LIBRARY=OFF \ + -DGGML_METAL_SHADER_DEBUG=ON \ -DGGML_RPC=ON cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index 77187efc1..0ca8a3c55 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -71,7 +71,9 @@ else() # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 # note: unfortunately, we have to call it default.metallib instead of ggml.metallib # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) + # note: adding -g causes segmentation fault during compile + #set(XC_FLAGS -fno-fast-math -fno-inline -g) + set(XC_FLAGS -fno-fast-math -fno-inline) else() set(XC_FLAGS -O3) endif() @@ -90,7 +92,7 @@ else() add_custom_command( OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - | - xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal DEPENDS ggml-metal.metal ${METALLIB_COMMON} From 68b3cd65141586ef13a698baa9029627f038f102 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ganster?= Date: Wed, 2 Jul 2025 07:19:31 +0200 Subject: [PATCH 15/20] ggml : Callback before abort (#14481) * Add a callback that will be called just before abort. This allows apps without a console to display a message to the user and save data if needed. * Return previous callback to allow callback chaining * style fixes --------- Co-authored-by: Diego Devesa --- ggml/include/ggml.h | 7 +++++++ ggml/src/ggml.c | 23 +++++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f8238f315..ec5478db8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -314,6 +314,13 @@ extern "C" { #endif + // Function type used in fatal error callbacks + typedef void (*ggml_abort_callback_t)(const char * error_message); + + // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout) + // Returns the old callback for chaining + GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback); + GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c51cb57cc..4227fb101 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -202,19 +202,34 @@ void ggml_print_backtrace(void) { } #endif +static ggml_abort_callback_t g_abort_callback = NULL; + +// Set the abort callback (passing null will restore original abort functionality: printing a message to stdout) +GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback) { + ggml_abort_callback_t ret_val = g_abort_callback; + g_abort_callback = callback; + return ret_val; +} + void ggml_abort(const char * file, int line, const char * fmt, ...) { fflush(stdout); - fprintf(stderr, "%s:%d: ", file, line); + char message[2048]; + int offset = snprintf(message, sizeof(message), "%s:%d: ", file, line); va_list args; va_start(args, fmt); - vfprintf(stderr, fmt, args); + vsnprintf(message + offset, sizeof(message) - offset, fmt, args); va_end(args); - fprintf(stderr, "\n"); + if (g_abort_callback) { + g_abort_callback(message); + } else { + // default: print error and backtrace to stderr + fprintf(stderr, "%s\n", message); + ggml_print_backtrace(); + } - ggml_print_backtrace(); abort(); } From 85841e121db5b96ae4d277a3cd3995eef66b98ec Mon Sep 17 00:00:00 2001 From: Eric Zhang <34133756+EZForever@users.noreply.github.com> Date: Wed, 2 Jul 2025 13:41:35 +0800 Subject: [PATCH 16/20] github : add OpenCL backend to issue templates (#14492) --- .github/ISSUE_TEMPLATE/010-bug-compilation.yml | 2 +- .github/ISSUE_TEMPLATE/011-bug-results.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml index b85bf5741..d538fdff1 100644 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -40,7 +40,7 @@ body: attributes: label: GGML backends description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] + options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL] multiple: true validations: required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml index 1ccef0793..2acec7114 100644 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -42,7 +42,7 @@ body: attributes: label: GGML backends description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] + options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL] multiple: true validations: required: true From 611ba4b264c8fbb9d2d978bd6a5c17aeab1700fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 2 Jul 2025 09:02:51 +0200 Subject: [PATCH 17/20] ci : add OpenCL to labeler workflow (#14496) --- .github/labeler.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index 3c2f67707..a4c6e3d33 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -93,3 +93,8 @@ Ascend NPU: - ggml/include/ggml-cann.h - ggml/src/ggml-cann/** - docs/backend/CANN.md +OpenCL: + - changed-files: + - any-glob-to-any-file: + - ggml/include/ggml-opencl.h + - ggml/src/ggml-opencl/** From 603e43dc913f90d9c3291624728b731687eddc35 Mon Sep 17 00:00:00 2001 From: lhez Date: Wed, 2 Jul 2025 00:07:42 -0700 Subject: [PATCH 18/20] opencl : update upscale to support align corners (#14488) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 56 +++++++++++++++---------- ggml/src/ggml-opencl/kernels/upscale.cl | 5 +-- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 496e47575..518a18ef3 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -4453,7 +4453,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); + const int mode_flags = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); + const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF); cl_kernel kernel = nullptr; if (mode == GGML_SCALE_MODE_NEAREST) { @@ -4484,18 +4485,22 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3]; - const int ne00_src = src0->ne[0]; - const int ne01_src = src0->ne[1]; + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; - const int ne10_dst = dst->ne[0]; - const int ne11_dst = dst->ne[1]; - const int ne12_dst = dst->ne[2]; - const int ne13_dst = dst->ne[3]; + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + const int ne3 = dst->ne[3]; - const float sf0 = (float)dst->ne[0] / src0->ne[0]; - const float sf1 = (float)dst->ne[1] / src0->ne[1]; - const float sf2 = (float)dst->ne[2] / src0->ne[2]; - const float sf3 = (float)dst->ne[3] / src0->ne[3]; + float sf0 = (float)ne0 / ne00; + float sf1 = (float)ne1 / ne01; + float sf2 = (float)ne2 / ne02; + float sf3 = (float)ne3 / ne03; + + float pixel_offset = 0.5f; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); @@ -4507,29 +4512,36 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03)); if (mode == GGML_SCALE_MODE_NEAREST) { - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne3)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3)); } else if (mode == GGML_SCALE_MODE_BILINEAR) { - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst)); + if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) { + sf0 = (float)(ne0 - 1) / (ne00 - 1); + sf1 = (float)(ne1 - 1) / (ne01 - 1); + pixel_offset = 0.0f; + } + + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne3)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &pixel_offset)); } - size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst; + size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3; if (dst_total_elements == 0) { return; } diff --git a/ggml/src/ggml-opencl/kernels/upscale.cl b/ggml/src/ggml-opencl/kernels/upscale.cl index 219d31dbb..25c68351b 100644 --- a/ggml/src/ggml-opencl/kernels/upscale.cl +++ b/ggml/src/ggml-opencl/kernels/upscale.cl @@ -60,7 +60,8 @@ kernel void kernel_upscale_bilinear( float sf0, float sf1, float sf2, - float sf3 + float sf3, + float pixel_offset ) { global const char * src_base = (global const char *)p_src0 + off_src0; global float * dst_base = (global float *)((global char *)p_dst + off_dst); @@ -80,8 +81,6 @@ kernel void kernel_upscale_bilinear( int i02_src = (int)(i12_dst / sf2); int i03_src = (int)(i13_dst / sf3); - const float pixel_offset = 0.5f; - float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset; long y0_src = (long)floor(y_src_f); long y1_src = y0_src + 1; From c8a4e470f65c3a932e18eddc5fba1844876d7463 Mon Sep 17 00:00:00 2001 From: Eric Zhang <34133756+EZForever@users.noreply.github.com> Date: Wed, 2 Jul 2025 19:00:04 +0800 Subject: [PATCH 19/20] opencl : skip empty nodes on cgraph compute (#14491) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 518a18ef3..f747f7568 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2187,7 +2187,7 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm // dependencies. sync_with_other_backends(backend); - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } From d7f5f4e578d1f60b0835d1734a50438c309b3e5c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 2 Jul 2025 14:12:07 +0300 Subject: [PATCH 20/20] simple-chat : fix context-exceeded condition (#14494) * simple-chat : fix context-exceeded condition ggml-ci * cont : fix n_ctx_used computation ggml-ci --- examples/simple-chat/simple-chat.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index cf1178043..57195df33 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -113,15 +113,16 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0); + int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1; if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); exit(0); } - if (llama_decode(ctx, batch)) { - GGML_ABORT("failed to decode\n"); + int ret = llama_decode(ctx, batch); + if (ret != 0) { + GGML_ABORT("failed to decode, ret = %d\n", ret); } // sample the next token