From b3b6d862cfdf190e1b9ad961639a25f5ebc0c7e3 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Thu, 24 Apr 2025 07:18:33 +0000
Subject: [PATCH 1/2] vulkan: matmul gcn tuning (#13016)

* tune matmul for gcn

* this one is more power efficient

* Update ggml/src/ggml-vulkan/ggml-vulkan.cpp

Co-authored-by: 0cc4m

* disable this tune for the proprietary driver

---------

Co-authored-by: 0cc4m
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 39f3cd343..c0bdb9e17 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -246,6 +246,7 @@ struct vk_device_struct {
     bool pipeline_robustness;
     vk::Device device;
     uint32_t vendor_id;
+    vk::DriverId driver_id;
     vk_device_architecture architecture;
     vk_queue compute_queue;
     vk_queue transfer_queue;
@@ -1740,6 +1741,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
         m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, 2, 2, 1, subgroup_size_8 };
         s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, 32, 32, 2, 2, 1, 1, subgroup_size_8 };
 
+        // chip specific tuning
+        if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
+            m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
+        }
+
         l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
         m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
         s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 };
@@ -2658,6 +2664,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
     device->physical_device.getProperties2(&props2);
     device->properties = props2.properties;
     device->vendor_id = device->properties.vendorID;
+    device->driver_id = driver_props.driverID;
 
     const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");
 

From 7604a7d6b80e78eef8f275fc700d0f64820d672f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 24 Apr 2025 10:38:30 +0300
Subject: [PATCH 2/2] metal : fix floating-point range of attention scores in FA kernels (#13090)

ggml-ci
---
 ggml/src/ggml-metal/ggml-metal.metal | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 8d6e99e62..9f4147e93 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -3192,7 +3192,7 @@ kernel void kernel_flash_attn_ext(
 
     {
         float S[Q] = { [0 ... Q-1] = 0.0f };
-        float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
+        float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };
 
         // thread indices inside the simdgroup
         // TODO: see if we can utilize quad-group functions for better performance
@@ -3452,7 +3452,7 @@ kernel void kernel_flash_attn_ext(
     // reduce the warps sequentially
     for (ushort sg = 1; sg < nsg; ++sg) {
         float S = { 0.0f };
-        float M = { -__FLT16_MAX__/2 };
+        float M = { -__FLT_MAX__/2 };
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
 
@@ -3699,7 +3699,7 @@ kernel void kernel_flash_attn_ext_vec(
 
     {
         float S = 0.0f;
-        float M = -__FLT16_MAX__/2;
+        float M = -__FLT_MAX__/2;
 
         // thread indices inside the simdgroup
        const short tx = tiisg%NL;
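
Note on [PATCH 1/2]: the new warptile tune is gated on both the GPU architecture (AMD GCN) and the reported Vulkan driver, so it takes effect on non-proprietary drivers (e.g. RADV) but is skipped on AMD's proprietary driver. The sketch below shows one way such a gate can be expressed with vulkan-hpp by chaining VkPhysicalDeviceDriverProperties (Vulkan 1.2 / VK_KHR_driver_properties) into the properties query; it is a minimal standalone illustration only, and the helper name use_gcn_mmq_tune and the is_amd_gcn flag are assumptions, not part of the patch, which instead stores the ID in vk_device_struct::driver_id.

    // Illustrative sketch only: query the driver ID and gate a tuning path on it,
    // mirroring the condition used in the patch. Names here are hypothetical.
    #include <vulkan/vulkan.hpp>

    static bool use_gcn_mmq_tune(vk::PhysicalDevice physical_device, bool is_amd_gcn) {
        vk::PhysicalDeviceDriverProperties driver_props;
        vk::PhysicalDeviceProperties2     props2;
        props2.pNext = &driver_props;            // chain the driver-properties struct
        physical_device.getProperties2(&props2);

        // Apply the larger 256-wide MMQ warptile only on GCN and only when the
        // driver is not AMD's proprietary one.
        return is_amd_gcn && driver_props.driverID != vk::DriverId::eAmdProprietary;
    }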
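
Note on [PATCH 2/2]: the change widens the initial value of the running maximum M from -__FLT16_MAX__/2 (about -3.3e4) to -__FLT_MAX__/2 (about -1.7e38). Because the flash-attention kernels track M in 32-bit float, a half-precision sentinel is not guaranteed to lie below every representable attention score, which can skew the online-softmax normalization. The sketch below is a scalar illustration of that running-max update with the wider sentinel; the variable roles (M = running max, S = running sum) follow the kernel, but the surrounding code is hypothetical and is not the Metal kernel itself.

    // Illustrative sketch only: scalar online-softmax accumulation as used in
    // flash-attention style kernels.
    #include <cfloat>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // fp32 scores can fall far below the fp16 range (|x| <= 65504),
        // e.g. heavily masked positions; the sentinel must stay below them.
        const std::vector<float> scores = { -1.0e5f, -2.0e5f, -0.5f };

        float M = -FLT_MAX/2; // running max, matching the patched initialization
        float S = 0.0f;       // running sum of exp(score - M)

        for (float s : scores) {
            const float M_new = std::fmax(M, s);
            S = S*std::exp(M - M_new) + std::exp(s - M_new); // rescale old sum, add new term
            M = M_new;
        }

        std::printf("running max = %g, normalizer = %g\n", M, S);
        return 0;
    }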