Merge commit '4ccea213bc' into concedo_experimental

# Conflicts:
#	.devops/cpu.Dockerfile
#	.devops/cuda.Dockerfile
#	.devops/intel.Dockerfile
#	.devops/musa.Dockerfile
#	.devops/rocm.Dockerfile
#	.github/workflows/bench.yml.disabled
#	.github/workflows/build.yml
#	.github/workflows/server.yml
#	CMakeLists.txt
#	build-xcframework.sh
#	ci/run.sh
#	common/CMakeLists.txt
#	examples/llama.android/llama/build.gradle.kts
#	examples/perplexity/perplexity.cpp
#	examples/run/CMakeLists.txt
#	examples/server/tests/README.md
#	examples/sycl/win-build-sycl.bat
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/aclnn_ops.h
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/ggml-cpu.c
#	licenses/LICENSE-linenoise
#	scripts/sync-ggml.last
#	tests/CMakeLists.txt
This commit is contained in:
Concedo 2025-04-08 21:26:23 +08:00
commit b99ee451f8
29 changed files with 11032 additions and 12914 deletions

View file

@@ -1842,6 +1842,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
// can't use 256 for D==80.
uint32_t wg_size = (small_rows && (D % 32) == 0) ? 256 : 128;
auto rows_cols = fa_rows_cols(D, clamp, type, small_rows);
// mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
return {wg_size, rows_cols[0], rows_cols[1], (D), clamp};
};
@@ -5528,6 +5530,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
// the "aligned" shader variant will forcibly align strides, for performance
(q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;
// mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
GGML_ASSERT((nem1 % GGML_KQ_MASK_PAD) == 0);
vk_pipeline pipeline = pipelines[aligned];
assert(pipeline);