Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 09:34:37 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.devops/cuda.Dockerfile
#	.devops/musa.Dockerfile
#	.github/workflows/build.yml
#	README.md
#	docs/docker.md
#	examples/imatrix/imatrix.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/main/README.md
#	examples/perplexity/perplexity.cpp
#	examples/server/README.md
#	ggml/src/ggml-cpu/ggml-cpu.c
#	ggml/src/ggml-cuda/CMakeLists.txt
#	models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja
#	models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja
#	scripts/get_chat_template.py
#	scripts/sync-ggml.last
#	tests/test-chat.cpp
#	tests/test-gguf.cpp
#	tests/test-sampling.cpp
commit 754fef5204
27 changed files with 1845 additions and 548 deletions
@@ -1434,6 +1434,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
     // some shaders have a minimum subgroup size
+    const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u);
     const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
     const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
 
@@ -1496,13 +1497,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
     const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1;
     const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1;
 
-    l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size };
-    m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size };
-    s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
+    l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 };
+    m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
+    s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 };
 
-    l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size };
-    m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size };
-    s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size };
+    l_warptile_mmq = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 };
+    m_warptile_mmq = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
+    s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 };
 
     l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
     m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
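
The added subgroup_size_8 line follows the same clamping pattern as the existing subgroup_size_16 and subgroup_size_32 declarations: std::max raises the device's reported subgroup size to the minimum a shader needs, so the warptile entries that previously used device->subgroup_size directly can no longer fall below 8. A minimal standalone sketch of that behavior (illustrative only, not part of this commit; the device_subgroup_size value is a hypothetical example):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical device that reports a small subgroup size.
    const uint32_t device_subgroup_size = 8;

    // Clamp to each shader family's minimum, as the diff does.
    const uint32_t subgroup_size_8  = std::max(device_subgroup_size, 8u);
    const uint32_t subgroup_size_16 = std::max(device_subgroup_size, 16u);
    const uint32_t subgroup_size_32 = std::max(device_subgroup_size, 32u);

    // Prints "8 16 32": each value is at least the shader's minimum,
    // regardless of what the device reports.
    std::printf("%u %u %u\n", subgroup_size_8, subgroup_size_16, subgroup_size_32);
    return 0;
}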