Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	cmake/common.cmake
#	docs/backend/SYCL.md
#	examples/main/README.md
#	examples/speculative/speculative.cpp
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-musa/CMakeLists.txt
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
#	tests/test-backend-ops.cpp
This commit is contained in: commit 0c90d2ebcf
58 changed files with 4222 additions and 1537 deletions
@@ -33,6 +33,7 @@
#include "ggml-vulkan-shaders.cpp"

#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1))
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

#define VK_VENDOR_ID_AMD 0x1002
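The two helpers do what their names suggest; as a quick standalone illustration (example values are mine, not from the diff):

// Standalone sketch of the two macros above; values chosen only for illustration.
#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1))
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

// Rounding 70 rows up to the next multiple of a 64-wide tile gives 128,
// and 70 rows need 2 tiles of 64. N must be a power of two for ROUNDUP_POW2.
static_assert(ROUNDUP_POW2(70, 64) == 128, "round up to multiple of 64");
static_assert(CEIL_DIV(70, 64) == 2, "ceiling division");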
@@ -153,6 +154,66 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf);

static constexpr uint32_t mul_mat_vec_max_cols = 8;

enum vk_device_architecture {
    OTHER,
    AMD_GCN,
    AMD_RDNA1,
    AMD_RDNA2,
    AMD_RDNA3,
};

static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
    vk::PhysicalDeviceProperties props = device.getProperties();

    if (props.vendorID == VK_VENDOR_ID_AMD) {
        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();

        bool amd_shader_core_properties = false;
        bool integer_dot_product = false;
        bool subgroup_size_control = false;

        for (const auto& properties : ext_props) {
            if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) {
                amd_shader_core_properties = true;
            } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) {
                integer_dot_product = true;
            } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
                subgroup_size_control = true;
            }
        }

        if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) {
            return vk_device_architecture::OTHER;
        }

        vk::PhysicalDeviceProperties2 props2;
        vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd;
        vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
        vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;

        props2.pNext = &shader_core_props_amd;
        shader_core_props_amd.pNext = &integer_dot_props;
        integer_dot_props.pNext = &subgroup_size_control_props;

        device.getProperties2(&props2);

        if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) {
            return vk_device_architecture::AMD_GCN;
        }
        if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) {
            // RDNA
            if (shader_core_props_amd.wavefrontsPerSimd == 20) {
                return vk_device_architecture::AMD_RDNA1;
            }
            if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated) {
                return vk_device_architecture::AMD_RDNA3;
            }
            return vk_device_architecture::AMD_RDNA2;
        }
    }
    return vk_device_architecture::OTHER;
}

struct vk_device_struct {
    std::mutex mutex;
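For context, a minimal standalone sketch of how the new detection helper above might be exercised outside of ggml; it assumes the vk_device_architecture enum and get_device_architecture() from this hunk are in scope, and it skips validation layers and error handling:

#include <vulkan/vulkan.hpp>
#include <cstdio>

int main() {
    // Hypothetical probe: create a bare instance and report each device's detected architecture.
    vk::ApplicationInfo app_info("arch-probe", 1, nullptr, 0, VK_API_VERSION_1_2);
    vk::InstanceCreateInfo create_info({}, &app_info);
    vk::Instance instance = vk::createInstance(create_info);

    for (const vk::PhysicalDevice & dev : instance.enumeratePhysicalDevices()) {
        vk_device_architecture arch = get_device_architecture(dev);
        printf("%s -> %d\n", dev.getProperties().deviceName.data(), (int) arch);
    }

    instance.destroy();
    return 0;
}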
@@ -165,6 +226,7 @@ struct vk_device_struct {
    bool pipeline_robustness;
    vk::Device device;
    uint32_t vendor_id;
    vk_device_architecture architecture;
    vk_queue compute_queue;
    vk_queue transfer_queue;
    bool single_queue;
@@ -246,6 +308,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_group_norm_f32;
    vk_pipeline pipeline_rms_norm_f32;
    vk_pipeline pipeline_rms_norm_back_f32;
    vk_pipeline pipeline_l2_norm_f32;
    vk_pipeline pipeline_gelu_f32;
    vk_pipeline pipeline_gelu_quick_f32;
    vk_pipeline pipeline_silu_f32;
@@ -270,6 +333,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_timestep_embedding_f32;
    vk_pipeline pipeline_pool2d_f32;
    vk_pipeline pipeline_rwkv_wkv6_f32;
    vk_pipeline pipeline_rwkv_wkv7_f32;
    vk_pipeline pipeline_opt_step_adamw_f32;

    // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
@@ -372,6 +436,7 @@ struct vk_mat_mat_push_constants {
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t k_split;
    uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
    uint32_t padded_N;
};
struct vk_mat_vec_push_constants {
    uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
@@ -384,6 +449,7 @@ struct vk_mat_mat_id_push_constants {
    uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
    uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
    uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
    uint32_t padded_N;
};
struct vk_mat_vec_id_push_constants {
    uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
@@ -569,6 +635,13 @@ struct vk_op_rwkv_wkv6_push_constants {
    uint32_t H;
};

struct vk_op_rwkv_wkv7_push_constants {
    uint32_t B;
    uint32_t T;
    uint32_t C;
    uint32_t H;
};

// Allow pre-recording command buffers
struct vk_staging_memcpy {
    vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
@@ -1449,6 +1522,73 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
    return supported;
}

struct GpuPipelineConfig {
    // GPU architecture identifier.
    // Example: vk_device_architecture::AMD_GCN
    vk_device_architecture arch;

    // Mapping of pipeline names to their specific subgroup sizes.
    // Example: {"soft_max_f32", 64}
    std::unordered_map<std::string, uint32_t> pipelines;

    // Default subgroup size for this GPU.
    // Defaults to 0 if not explicitly provided.
    uint32_t default_subgroup_size = 0;
};

// Pipeline configuration for RDNA1 GPUs.
static const std::unordered_map<std::string, uint32_t> rdna1_pipelines = {
    {"soft_max", 64}, {"im2col", 64},
    {"argmax", 64}, {"mul_mat_vec", 64},
    {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32}
};

// Pipeline configuration for RDNA2 GPUs.
static const std::unordered_map<std::string, uint32_t> rdna2_pipelines = {
    {"soft_max", 64}, {"im2col", 64},
};

static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;

// Define configurations for different GPUs.
static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
    {
        vk_device_architecture::AMD_RDNA1,
        {
            rdna1_pipelines,
        },
        RDNA_DEFAULT_SUBGROUP_SIZE
    },
    {
        vk_device_architecture::AMD_RDNA2,
        {
            rdna2_pipelines,
        },
        RDNA_DEFAULT_SUBGROUP_SIZE
    },
};

static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) {
    for (const auto &config : gpu_pipeline_configs) {
        if (config.arch == arch) {
            auto pipIt = config.pipelines.find(pipeline_name);
            if (pipIt != config.pipelines.end()) {
                return pipIt->second;
            }
            std::vector<std::pair<std::string, uint32_t>> sorted_pipelines(config.pipelines.begin(), config.pipelines.end());
            std::sort(sorted_pipelines.begin(), sorted_pipelines.end(),
                      [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); });
            for (const auto &entry : sorted_pipelines) {
                if (pipeline_name.find(entry.first) != std::string::npos) {
                    return entry.second;
                }
            }
            return config.default_subgroup_size;
        }
    }
    return 0; // If no matching configuration is found
}

static void ggml_vk_load_shaders(vk_device& device) {
    VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
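To make the lookup order concrete, a small hypothetical check (assuming the definitions above are in scope): an exact pipeline-name hit wins, otherwise the longest key that appears as a substring, otherwise the per-architecture default, and finally 0 when the architecture has no config at all:

#include <cassert>

// Hypothetical sanity check of the lookup order implemented by get_subgroup_size().
static void check_subgroup_size_lookup() {
    // "soft_max_f32" has no exact entry, but contains the "soft_max" key -> 64.
    assert(get_subgroup_size("soft_max_f32", vk_device_architecture::AMD_RDNA1) == 64);
    // "mul_mat_vec_f16_f32" matches the longer "mul_mat_vec_f16" key before "mul_mat_vec" -> 32.
    assert(get_subgroup_size("mul_mat_vec_f16_f32", vk_device_architecture::AMD_RDNA1) == 32);
    // No key matches "matmul_f32_f32", so the RDNA default of 32 is used.
    assert(get_subgroup_size("matmul_f32_f32", vk_device_architecture::AMD_RDNA1) == RDNA_DEFAULT_SUBGROUP_SIZE);
    // Unknown architectures have no config entry at all -> 0 (keep the driver default).
    assert(get_subgroup_size("soft_max_f32", vk_device_architecture::OTHER) == 0);
}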
@@ -1470,36 +1610,36 @@ static void ggml_vk_load_shaders(vk_device& device) {
    uint32_t l_align, m_align, s_align;
    if (device->coopmat2) {
        // spec constants and tile sizes for non-quant matmul/matmul_id
        l_warptile = { 256, 128, 256, 64 };
        m_warptile = { 256, 128, 128, 64 };
        s_warptile = { 128, 64, 64, 64 };
        l_warptile = { 256, 128, 256, 64, 1 };
        m_warptile = { 256, 128, 128, 64, 0 };
        s_warptile = { 128, 64, 64, 64, 0 };
        l_wg_denoms = {128, 256, 1 };
        m_wg_denoms = {128, 128, 1 };
        s_wg_denoms = { 64, 64, 1 };

        // spec constants and tile sizes for quant matmul (non-Qi_K)
        l_warptile_mmq = { 256, 128, 256, 64 };
        m_warptile_mmq = { 256, 128, 128, 64 };
        s_warptile_mmq = { 256, 128, 128, 64 };
        l_warptile_mmq = { 256, 128, 256, 64, 1 };
        m_warptile_mmq = { 256, 128, 128, 64, 1 };
        s_warptile_mmq = { 256, 32, 64, 128, 0 };
        l_mmq_wg_denoms = { 128, 256, 1 };
        m_mmq_wg_denoms = { 128, 128, 1 };
        s_mmq_wg_denoms = { 128, 128, 1 };
        s_mmq_wg_denoms = { 32, 64, 1 };

        // spec constants and tile sizes for quant matmul (Qi_K)
        l_warptile_mmq_k = { 256, 128, 512, 16 };
        m_warptile_mmq_k = { 256, 128, 256, 16 };
        s_warptile_mmq_k = { 256, 32, 128, 64 };
        l_mmq_wg_denoms_k = { 128, 512, 1 };
        m_mmq_wg_denoms_k = { 128, 256, 1 };
        s_mmq_wg_denoms_k = { 32, 128, 1 };
        l_warptile_mmq_k = { 256, 64, 128, 64, 1 };
        m_warptile_mmq_k = { 256, 32, 64, 64, 0 };
        s_warptile_mmq_k = { 256, 32, 32, 128, 0 };
        l_mmq_wg_denoms_k = { 64, 128, 1 };
        m_mmq_wg_denoms_k = { 32, 64, 1 };
        s_mmq_wg_denoms_k = { 32, 32, 1 };

        // spec constants and tile sizes for quant matmul_id
        l_warptile_mmqid = { 256, 128, 128, 16 };
        m_warptile_mmqid = { 256, 128, 64, 16 };
        s_warptile_mmqid = { 256, 64, 64, 16 };
        l_mmqid_wg_denoms = { 128, 128, 1 };
        l_warptile_mmqid = { 256, 128, 64, 16, 0 };
        m_warptile_mmqid = { 256, 128, 64, 16, 0 };
        s_warptile_mmqid = { 256, 128, 64, 16, 0 };
        l_mmqid_wg_denoms = { 128, 64, 1 };
        m_mmqid_wg_denoms = { 128, 64, 1 };
        s_mmqid_wg_denoms = { 64, 64, 1 };
        s_mmqid_wg_denoms = { 128, 64, 1 };

        l_align = 128;
        m_align = 64;
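Roughly how these tables are consumed (a sketch with made-up sizes, not code from this commit): each wg_denoms entry is the output tile covered by one workgroup, so the dispatch grid is the output shape divided by those denominators, rounded up:

#include <array>
#include <cstdint>
#include <cstdio>

// Illustrative only: workgroup grid for one matmul shader variant and one output shape.
int main() {
    const std::array<uint32_t, 3> l_mmq_wg_denoms = { 128, 256, 1 };  // from the table above
    const uint32_t m = 4096, n = 4096;                                // hypothetical output shape
    const uint32_t groups_x = (m + l_mmq_wg_denoms[0] - 1) / l_mmq_wg_denoms[0];
    const uint32_t groups_y = (n + l_mmq_wg_denoms[1] - 1) / l_mmq_wg_denoms[1];
    printf("%u x %u workgroups\n", groups_x, groups_y);  // prints: 32 x 16 workgroups
    return 0;
}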
@@ -1575,6 +1715,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
            uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
            uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {

        if (!require_full_subgroups && required_subgroup_size == 0) {
            required_subgroup_size = get_subgroup_size(name, device->architecture);
        }

        if (!pipeline) {
            pipeline = std::make_shared<vk_pipeline_struct>();
            pipeline->name = name;
@@ -2132,6 +2276,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@@ -2243,6 +2388,8 @@ static void ggml_vk_load_shaders(vk_device& device) {

    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

    for (auto &c : compiles) {
@@ -2251,7 +2398,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    device->need_compiles = false;
}

static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);

static vk_device ggml_vk_get_device(size_t idx) {
    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
@@ -2280,6 +2427,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
        device->physical_device = physical_devices[dev_num];
        const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();

        device->architecture = get_device_architecture(device->physical_device);

        const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
        device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
@@ -2292,7 +2441,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
        bool coopmat2_support = false;
        device->coopmat_support = false;

        // Check if maintenance4 is supported
        for (const auto& properties : ext_props) {
            if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
                maintenance4_support = true;
@@ -2380,13 +2528,9 @@ static vk_device ggml_vk_get_device(size_t idx) {

        if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
            device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
#if defined(_WIN32)
        } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) {
        } else {
            // Limit batching of allocations to 1GB by default to avoid fragmentation issues
            device->suballocation_block_size = 1024*1024*1024;
#endif
        } else {
            device->suballocation_block_size = device->max_memory_allocation_size;
        }
        device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
@@ -2405,7 +2549,7 @@ static vk_device ggml_vk_get_device(size_t idx) {

        device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;

        if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) {
        if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture)) {
            device->coopmat_support = false;
        }
@@ -2787,7 +2931,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
    subgroup_props.pNext = &driver_props;
    physical_device.getProperties2(&props2);

    const size_t subgroup_size = subgroup_props.subgroupSize;
    vk_device_architecture arch = get_device_architecture(physical_device);
    uint32_t default_subgroup_size = get_subgroup_size("", arch);
    const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;

    const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;

    bool fp16_storage = false;
@@ -2813,7 +2960,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
        }
    }

    if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) {
    const vk_device_architecture device_architecture = get_device_architecture(physical_device);

    if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture)) {
        coopmat_support = false;
    }
@@ -3858,10 +4007,14 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");

    if (ctx->device->coopmat2) {
        if ((ctx->device->mul_mat_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
        // Use large shader when the N dimension is greater than the medium shader's tile size
        uint32_t crossover_large = mmp->m->wg_denoms[1];
        if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
            return aligned ? mmp->a_l : mmp->l;
        }
        if ((ctx->device->mul_mat_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s[src0_type]) {
        // Use medium shader when the N dimension is greater than the small shader's tile size
        uint32_t crossover_medium = mmp->s->wg_denoms[1];
        if ((ctx->device->mul_mat_m[src0_type] && (n > crossover_medium)) || !ctx->device->mul_mat_s[src0_type]) {
            return aligned ? mmp->a_m : mmp->m;
        }
        return aligned ? mmp->a_s : mmp->s;
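A compact restatement of the new selection rule with the coopmat2 tile sizes from earlier in this file hard-coded (illustrative only; it assumes the large, medium and small variants all exist):

#include <cstdint>
#include <cstdio>

// Sketch of the crossover logic above: pick the shader size purely from N,
// using m_wg_denoms[1] = 128 and s_wg_denoms[1] = 64 as the crossover points.
static const char * pick_matmul_shader(uint32_t n) {
    const uint32_t crossover_large  = 128;  // medium shader's N tile size
    const uint32_t crossover_medium = 64;   // small shader's N tile size
    if (n > crossover_large)  return "large";
    if (n > crossover_medium) return "medium";
    return "small";
}

int main() {
    printf("n=512 -> %s, n=96 -> %s, n=48 -> %s\n",
           pick_matmul_shader(512), pick_matmul_shader(96), pick_matmul_shader(48));
    return 0;
}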
@@ -3886,18 +4039,19 @@ static void ggml_vk_matmul(
        vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer,
        uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
        uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
        uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) {
        uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3,
        uint32_t padded_n) {
    VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")");
    ggml_vk_sync_buffers(subctx);
    if (split_k == 1) {
        const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 };
        const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch });
        return;
    }

    GGML_ASSERT(batch_stride_d == m * n);

    const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3 };
    const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
    // Make sure enough workgroups get assigned for split k to work
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
    ggml_vk_sync_buffers(subctx);
@@ -3906,13 +4060,17 @@ static void ggml_vk_matmul(
}

static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) {
    VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");
    VK_LOG_DEBUG("ggml_vk_guess_matmul_id_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")");

    if (ctx->device->coopmat2) {
        if ((ctx->device->mul_mat_id_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) {
        // Use large shader when the N dimension is greater than the medium shader's tile size
        uint32_t crossover_large = mmp->m->wg_denoms[1];
        if ((ctx->device->mul_mat_id_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) {
            return aligned ? mmp->a_l : mmp->l;
        }
        if ((ctx->device->mul_mat_id_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s[src0_type]) {
        // Use medium shader when the N dimension is greater than the small shader's tile size
        uint32_t crossover_medium = mmp->s->wg_denoms[1];
        if ((ctx->device->mul_mat_id_m[src0_type] && (n > crossover_medium)) || !ctx->device->mul_mat_id_s[src0_type]) {
            return aligned ? mmp->a_m : mmp->m;
        }
        return aligned ? mmp->a_s : mmp->s;
@@ -3937,14 +4095,15 @@ static void ggml_vk_matmul_id(
        vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids,
        uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d,
        uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
        uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) {
        uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11,
        uint32_t padded_n) {
    VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " <<
        "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " <<
        "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " <<
        "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")");
    ggml_vk_sync_buffers(subctx);
    const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
                                              nei0, nei1, nbi1, ne11 };
                                              nei0, nei1, nbi1, ne11, padded_n };
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as });
}

@@ -4106,15 +4265,17 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
    // Not implemented
    GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT

    const int x_ne = ne01 * ne00;
    const int y_ne = ne11 * ne10;
    const int d_ne = ne11 * ne01;

    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
    const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8;

    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);

    // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
    uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11;
    const int x_ne = ne01 * ne00;
    const int y_ne = padded_n * ne10;
    const int d_ne = ne11 * ne01;

    const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline);

    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
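What the padding buys, with made-up sizes: when src1 must be dequantized, its N dimension is rounded up to the pipeline's N tile size so the matmul shader can read full tiles of Y without bounds checks, at the cost of a slightly larger scratch buffer:

#include <cstdint>
#include <cstdio>

#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1))

// Hypothetical numbers: ne10 = K = 4096, ne11 = N = 70, N tile (wg_denoms[1]) = 128.
int main() {
    const uint32_t ne10 = 4096, ne11 = 70, wg_denom_n = 128;
    const bool qy_needs_dequant = true;
    const uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, wg_denom_n) : ne11;  // 128
    const uint64_t y_ne = (uint64_t) padded_n * ne10;  // element count of the padded dequant buffer
    printf("padded_n = %u, y_ne = %llu\n", padded_n, (unsigned long long) y_ne);
    return 0;
}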
@@ -4237,7 +4398,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
        { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
        ne01, ne11, ne10,
        ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21,
        split_k, ne12*ne13, ne02, ne12, r2, r3
        split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n
    ); // NOLINT
}

@@ -4688,15 +4849,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
    // Not implemented
    GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT

    const uint64_t x_ne = ne01 * ne00;
    const uint64_t y_ne = ne11 * ne10;
    const uint64_t d_ne = ne21 * ne20;

    const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? GGML_TYPE_F16 : src0->type));
    const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8;

    vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type);

    // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking
    uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11;
    const uint64_t x_ne = ne01 * ne00;
    const uint64_t y_ne = padded_n * ne10;
    const uint64_t d_ne = ne21 * ne20;

    const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
    const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne;
@@ -4815,7 +4978,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
        { d_D, d_buf_offset, d_sz * ne22 * ne23 }, { d_ids, ids_buf_offset, ids_sz },
        ne01, ne21, ne10, ne10, ne10, ne01,
        stride_batch_x, stride_batch_y, ne20*ne21,
        n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11
        n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n
    ); // NOLINT
}

@@ -5326,6 +5489,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_rms_norm_back_f32;
        }
        return nullptr;
    case GGML_OP_L2_NORM:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_l2_norm_f32;
        }
        return nullptr;
    case GGML_OP_UNARY:
        switch (ggml_get_unary_op(dst)) {
        case GGML_UNARY_OP_SILU:
@@ -5465,6 +5633,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_rwkv_wkv6_f32;
        }
        return nullptr;
    case GGML_OP_RWKV_WKV7:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_rwkv_wkv7_f32;
        }
        return nullptr;
    case GGML_OP_OPT_STEP_ADAMW:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_opt_step_adamw_f32;
@@ -5712,6 +5885,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    case GGML_OP_NORM:
    case GGML_OP_RMS_NORM:
    case GGML_OP_RMS_NORM_BACK:
    case GGML_OP_L2_NORM:
    case GGML_OP_SOFT_MAX:
    case GGML_OP_SOFT_MAX_BACK:
    case GGML_OP_SUM_ROWS:
@@ -5961,23 +6135,17 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const
    }, dryrun);
}

static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, bool dryrun = false) {
    const ggml_tensor * k = dst->src[0];
    const ggml_tensor * v = dst->src[1];
    const ggml_tensor * r = dst->src[2];
    const ggml_tensor * tf = dst->src[3];
    const ggml_tensor * td = dst->src[4];
    const ggml_tensor * state = dst->src[5];
static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version, bool dryrun = false) {
    GGML_ASSERT(version == 6 || version == 7);
    int num_srcs = version == 6 ? 6 : 7;

    for (int i = 0; i < num_srcs; i++) {
        GGML_ASSERT(!ggml_is_quantized(dst->src[i]->type));
    }

    GGML_ASSERT(!ggml_is_quantized(k->type));
    GGML_ASSERT(!ggml_is_quantized(v->type));
    GGML_ASSERT(!ggml_is_quantized(r->type));
    GGML_ASSERT(!ggml_is_quantized(tf->type));
    GGML_ASSERT(!ggml_is_quantized(td->type));
    GGML_ASSERT(!ggml_is_quantized(state->type));
    GGML_ASSERT(dst->buffer != nullptr);

    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, k, v, r, dst, GGML_OP_RWKV_WKV6);
    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op);
    GGML_ASSERT(pipeline != nullptr);

    if (dryrun) {
@@ -5986,89 +6154,73 @@ static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subc
    }

    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
    ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context;
    ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context;
    ggml_backend_vk_buffer_context * r_buf_ctx = (ggml_backend_vk_buffer_context *)r->buffer->context;
    ggml_backend_vk_buffer_context * tf_buf_ctx = (ggml_backend_vk_buffer_context *)tf->buffer->context;
    ggml_backend_vk_buffer_context * td_buf_ctx = (ggml_backend_vk_buffer_context *)td->buffer->context;
    ggml_backend_vk_buffer_context * state_buf_ctx = (ggml_backend_vk_buffer_context *)state->buffer->context;
    ggml_backend_vk_buffer_context * src_buf_ctxs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
    for (int i = 0; i < num_srcs; i++) {
        src_buf_ctxs[i] = (ggml_backend_vk_buffer_context *)dst->src[i]->buffer->context;
    }

    ggml_vk_sync_buffers(subctx);

    vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr;
    size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0;
    bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
    vk_buffer d_D = nullptr, d_srcs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
    size_t dst_offset = 0, src_offsets[7] = { 0, 0, 0, 0, 0, 0, 0 };
    bool dst_uma = false, srcs_uma[7] = { false, false, false, false, false, false, false };

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx->device, k->data, d_K, k_offset);
        ggml_vk_host_get(ctx->device, v->data, d_V, v_offset);
        ggml_vk_host_get(ctx->device, r->data, d_R, r_offset);
        ggml_vk_host_get(ctx->device, tf->data, d_TF, tf_offset);
        ggml_vk_host_get(ctx->device, td->data, d_TD, td_offset);
        ggml_vk_host_get(ctx->device, state->data, d_State, state_offset);
        for (int i = 0; i < num_srcs; i++) {
            ggml_vk_host_get(ctx->device, dst->src[i]->data, d_srcs[i], src_offsets[i]);
            srcs_uma[i] = d_srcs[i] != nullptr;
        }

        ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset);

        K_uma = d_K != nullptr;
        V_uma = d_V != nullptr;
        R_uma = d_R != nullptr;
        TF_uma = d_TF != nullptr;
        TD_uma = d_TD != nullptr;
        STATE_uma = d_State != nullptr;
        DST_uma = d_D != nullptr;
        dst_uma = d_D != nullptr;
    }

    if (!K_uma) {
        d_K = k_buf_ctx->dev_buffer;
        k_offset = vk_tensor_offset(k) + k->view_offs;
    uint64_t src_sizes[7] = { 0, 0, 0, 0, 0, 0, 0 };
    for (int i = 0; i < num_srcs; i++) {
        src_sizes[i] = ggml_nbytes(dst->src[i]);
        if (!srcs_uma[i]) {
            d_srcs[i] = src_buf_ctxs[i]->dev_buffer;
            src_offsets[i] = vk_tensor_offset(dst->src[i]) + dst->src[i]->view_offs;
        }
    }
    if (!V_uma) {
        d_V = v_buf_ctx->dev_buffer;
        v_offset = vk_tensor_offset(v) + v->view_offs;
    }
    if (!R_uma) {
        d_R = r_buf_ctx->dev_buffer;
        r_offset = vk_tensor_offset(r) + r->view_offs;
    }
    if (!TF_uma) {
        d_TF = tf_buf_ctx->dev_buffer;
        tf_offset = vk_tensor_offset(tf) + tf->view_offs;
    }
    if (!TD_uma) {
        d_TD = td_buf_ctx->dev_buffer;
        td_offset = vk_tensor_offset(td) + td->view_offs;
    }
    if (!STATE_uma) {
        d_State = state_buf_ctx->dev_buffer;
        state_offset = vk_tensor_offset(state) + state->view_offs;
    }
    if (!DST_uma) {

    const uint64_t dst_size = ggml_nbytes(dst);
    if (!dst_uma) {
        d_D = dst_buf_ctx->dev_buffer;
        dst_offset = vk_tensor_offset(dst) + dst->view_offs;
    }

    const uint64_t k_size = ggml_nbytes(k);
    const uint64_t v_size = ggml_nbytes(v);
    const uint64_t r_size = ggml_nbytes(r);
    const uint64_t tf_size = ggml_nbytes(tf);
    const uint64_t td_size = ggml_nbytes(td);
    const uint64_t state_size = ggml_nbytes(state);
    const uint64_t dst_size = ggml_nbytes(dst);

    std::array<uint32_t, 3> elements = {
        (uint32_t)(pc.B * pc.H),
        1,
        1
    };

    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
        vk_subbuffer{ d_K, k_offset, k_size },
        vk_subbuffer{ d_V, v_offset, v_size },
        vk_subbuffer{ d_R, r_offset, r_size },
        vk_subbuffer{ d_TF, tf_offset, tf_size },
        vk_subbuffer{ d_TD, td_offset, td_size },
        vk_subbuffer{ d_State, state_offset, state_size },
        vk_subbuffer{ d_D, dst_offset, dst_size }
    }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
    if (version == 6) {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
            vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
            vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] },
            vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] },
            vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] },
            vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
            vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
            vk_subbuffer{ d_D, dst_offset, dst_size }
        }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements);
    } else if (version == 7) {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
            vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
            vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] },
            vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] },
            vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] },
            vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
            vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
            vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
            vk_subbuffer{ d_D, dst_offset, dst_size }
        }, sizeof(vk_op_rwkv_wkv7_push_constants), &pc, elements);
    } else {
        // shouldn't happen
        GGML_ASSERT(false);
    }
}

static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
@@ -6077,7 +6229,7 @@ static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx,
    const size_t n_heads = dst->src[0]->ne[1];
    const size_t n_seqs = dst->src[5]->ne[1];

    ggml_vk_op_f32_rwkv6(
    ggml_vk_op_f32_wkv(
        ctx, subctx, dst,
        {
            (uint32_t)n_seqs,
@@ -6085,6 +6237,26 @@ static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx,
            (uint32_t)n_embed,
            (uint32_t)n_heads,
        },
        6,
        dryrun
    );
}

static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
    const size_t seq_length = dst->src[0]->ne[2];
    const size_t n_embed = dst->ne[0];
    const size_t n_heads = dst->src[0]->ne[1];
    const size_t n_seqs = dst->src[6]->ne[1];

    ggml_vk_op_f32_wkv(
        ctx, subctx, dst,
        {
            (uint32_t)n_seqs,
            (uint32_t)seq_length,
            (uint32_t)n_embed,
            (uint32_t)n_heads,
        },
        7,
        dryrun
    );
}
@@ -6386,6 +6558,11 @@ static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& sub
    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
}

static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
    float * op_params = (float *)dst->op_params;
    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
}

static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
}
@@ -6775,7 +6952,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
            ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
            m, n, k,
            k, k, m, k*m, k*n, m*n,
            split_k, batch, batch, batch, 1, 1
            split_k, batch, batch, batch, 1, 1, n
        );
    }
    ggml_vk_ctx_end(subctx);
@@ -7120,7 +7297,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
            ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
            m, n, k,
            k, k, m, k*m, k*n, m*n,
            split_k, batch, batch, batch, 1, 1
            split_k, batch, batch, batch, 1, 1, n
        );
    }
    ggml_vk_ctx_end(subctx);
@@ -7381,6 +7558,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_GROUP_NORM:
    case GGML_OP_RMS_NORM:
    case GGML_OP_RMS_NORM_BACK:
    case GGML_OP_L2_NORM:
    case GGML_OP_DIAG_MASK_INF:
    case GGML_OP_SOFT_MAX:
    case GGML_OP_SOFT_MAX_BACK:
@@ -7397,6 +7575,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_TIMESTEP_EMBEDDING:
    case GGML_OP_POOL_2D:
    case GGML_OP_RWKV_WKV6:
    case GGML_OP_RWKV_WKV7:
    case GGML_OP_LEAKY_RELU:
    case GGML_OP_FLASH_ATTN_EXT:
    case GGML_OP_OPT_STEP_ADAMW:
@@ -7443,6 +7622,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_GROUP_NORM:
    case GGML_OP_RMS_NORM:
    case GGML_OP_RMS_NORM_BACK:
    case GGML_OP_L2_NORM:
    case GGML_OP_UNARY:
    case GGML_OP_DIAG_MASK_INF:
    case GGML_OP_SOFT_MAX:
@@ -7560,6 +7740,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_RMS_NORM_BACK:
        ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun);

        break;
    case GGML_OP_L2_NORM:
        ggml_vk_l2_norm(ctx, compute_ctx, src0, node, dryrun);

        break;
    case GGML_OP_UNARY:
        switch (ggml_get_unary_op(node)) {
@@ -7650,6 +7834,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

        break;

    case GGML_OP_RWKV_WKV7:
        ggml_vk_rwkv_wkv7(ctx, compute_ctx, node, dryrun);

        break;

    case GGML_OP_OPT_STEP_ADAMW:
        ggml_vk_opt_step_adamw(ctx, compute_ctx, node, dryrun);

@@ -7723,6 +7912,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
    case GGML_OP_GROUP_NORM:
    case GGML_OP_RMS_NORM:
    case GGML_OP_RMS_NORM_BACK:
    case GGML_OP_L2_NORM:
    case GGML_OP_DIAG_MASK_INF:
    case GGML_OP_SOFT_MAX:
    case GGML_OP_SOFT_MAX_BACK:
@@ -7742,6 +7932,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
    case GGML_OP_TIMESTEP_EMBEDDING:
    case GGML_OP_POOL_2D:
    case GGML_OP_RWKV_WKV6:
    case GGML_OP_RWKV_WKV7:
    case GGML_OP_LEAKY_RELU:
    case GGML_OP_REPEAT:
    case GGML_OP_REPEAT_BACK:
@@ -8253,8 +8444,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
    VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

    uint64_t total_mat_mul_bytes = 0;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
        if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
            total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
        }
    }
    if (ctx->device->need_compiles) {
        ggml_vk_load_shaders(ctx->device);
@@ -8275,17 +8470,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
    bool first_node_in_batch = true; // true if next node will be first node in a batch
    int submit_node_idx = 0; // index to first node in a batch

    // Submit work every nodes_per_submit nodes to overlap CPU cmdbuffer generation with GPU execution.
    // Start with a smaller count to get work submitted right away, and increase it after each submit.
    int nodes_per_submit = 20;
    // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
    // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
    // (and scaled down based on model size, so smaller models submit earlier).
    // Also submit at least every 100 nodes, in case there are workloads without as much matmul.
    int nodes_per_submit = 100;
    int submitted_nodes = 0;
    int submit_count = 0;
    uint64_t mul_mat_bytes = 0;
    uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), total_mat_mul_bytes / 40u);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        if (first_node_in_batch) {
            submit_node_idx = i;
        }

        bool submit = (submitted_nodes >= nodes_per_submit) || (i == last_node);
        if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
            mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
        }

        bool submit = (submitted_nodes >= nodes_per_submit) ||
                      (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
                      (i == last_node);

        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
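For a sense of scale (illustrative numbers only): the per-submit threshold is min(100 MB, total/40), so small models submit sooner, and per the next hunk it doubles after each of the first three submits:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Illustrative only: how the per-submit matmul-byte threshold scales with model size.
int main() {
    const uint64_t totals[] = { 500ull*1000*1000, 2000ull*1000*1000, 8000ull*1000*1000 };
    for (uint64_t total_mat_mul_bytes : totals) {
        uint64_t per_submit = std::min(uint64_t(100*1000*1000), total_mat_mul_bytes / 40u);
        printf("total %llu MB of weights -> first submit after ~%llu MB of matmul work\n",
               (unsigned long long)(total_mat_mul_bytes / 1000000),
               (unsigned long long)(per_submit / 1000000));
    }
    return 0;
}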
@@ -8302,13 +8507,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
        if (submit) {
            first_node_in_batch = true;
            submitted_nodes = 0;
            switch (submit_count) {
                case 0:
                    nodes_per_submit = 50;
                    break;
                default:
                    nodes_per_submit = 100;
                    break;
            mul_mat_bytes = 0;
            if (submit_count < 3) {
                mul_mat_bytes_per_submit *= 2;
            }
            submit_count++;
        }
@@ -8659,6 +8860,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_NORM:
        case GGML_OP_GROUP_NORM:
        case GGML_OP_RMS_NORM:
        case GGML_OP_L2_NORM:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_ADD:
        case GGML_OP_SUB:
@@ -8688,6 +8890,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_POOL_2D:
        case GGML_OP_RWKV_WKV6:
        case GGML_OP_RWKV_WKV7:
        case GGML_OP_LEAKY_RELU:
        case GGML_OP_OPT_STEP_ADAMW:
            return true;
@@ -8834,7 +9037,7 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
    UNUSED(instance_extensions);
}

static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
    switch (props.vendorID) {
    case VK_VENDOR_ID_INTEL:
        // Intel drivers don't support coopmat properly yet
@@ -8842,10 +9045,7 @@ static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDevicePrope
    case VK_VENDOR_ID_AMD:
        if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
            // Workaround for AMD proprietary driver reporting support on all GPUs
            const std::string name = props.deviceName;
            return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 || // RDNA 3 consumer GPUs
                   name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs
                   name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0; // RDNA 3 APUs
            return arch == vk_device_architecture::AMD_RDNA3;
        }
        return true;
    default:
@@ -9075,6 +9275,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
        tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps);
    } else if (tensor->op == GGML_OP_SILU_BACK) {
        tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]);
    } else if (tensor->op == GGML_OP_L2_NORM) {
        const float eps = ((float *) tensor->op_params)[0];
        tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps);
    } else if (tensor->op == GGML_OP_SOFT_MAX) {
        if (src1 != nullptr) {
            tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
@@ -9194,6 +9397,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
    } else if (tensor->op == GGML_OP_RWKV_WKV6) {
        tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1],
            src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
    } else if (tensor->op == GGML_OP_RWKV_WKV7) {
        tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3],
            src_clone[4], src_clone[5], src_clone[6]);
    } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
        src_clone[0]->flags = src0->flags;
        tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],