diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ebadff907..e22576d34 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -351,7 +351,8 @@ struct clip_ctx {
     std::vector<ggml_backend_t> backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
 
-    ggml_backend_ptr backend;
+    ggml_backend_t backend;
+    ggml_backend_buffer_ptr buf;
 
     ggml_backend_sched_ptr sched;
 
@@ -363,31 +364,35 @@ struct clip_ctx {
         if(enable_gpu_clip) {
 #ifdef GGML_USE_CUDA
-            backend = ggml_backend_ptr(ggml_backend_cuda_init(0));
+            backend = ggml_backend_cuda_init(0);
             LOG_INF("%s: CLIP using CUDA backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_METAL
-            backend = ggml_backend_ptr(ggml_backend_metal_init());
+            backend = ggml_backend_metal_init();
             LOG_INF("%s: CLIP using Metal backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_VULKAN
-            backend = ggml_backend_ptr(ggml_backend_vk_init(0));
+            backend = ggml_backend_vk_init(0);
             LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif
         }
 
         if (!backend) {
-            backend = ggml_backend_ptr(ggml_backend_cpu_init());
+            backend = ggml_backend_cpu_init();
             LOG_INF("%s: CLIP using CPU backend\n", __func__);
         }
 
-        backend_ptrs.push_back(backend.get());
-        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend.get()));
+        backend_ptrs.push_back(backend);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
         sched.reset(
             ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
         );
     }
+
+    ~clip_ctx() {
+        ggml_backend_free(backend);
+    }
 };
 
 static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
@@ -1555,7 +1560,7 @@ struct clip_model_loader {
         }
 
         // alloc memory and offload data
-        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend.get());
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
         ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
         ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         for (auto & t : tensors_to_load) {
@@ -2950,8 +2955,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         if (window_mask) ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
     }
 
-    if (ggml_backend_is_cpu(ctx->backend.get())) {
-        ggml_backend_cpu_set_n_threads(ctx->backend.get(), n_threads);
+    if (ggml_backend_is_cpu(ctx->backend)) {
+        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
 
     auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
index a8f4bc417..e1baa85f9 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -201,6 +201,11 @@ void main() {
     uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
     uint32_t k_stride = p.nb11;
     uint32_t v_stride = p.nb21;
+    // When using grouped query attention, all rows use the same mask (stride 0).
+    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
+    // that prevents the compiler from folding the "&" through the select
+    // and breaking the alignment detection.
+    uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
 
     // hint to the compiler that strides are aligned for the aligned variant of the shader
     if (Clamp != gl_CooperativeMatrixClampModeConstantNV) {
@@ -209,6 +214,7 @@ void main() {
         k_stride &= ~7;
         v_stride &= ~7;
 #endif
+        m_stride &= ~7;
     }
     tensorLayoutQ = setTensorLayoutStrideNV(tensorLayoutQ, q_stride, 1);
     tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
@@ -261,10 +267,7 @@ void main() {
         if (p.mask != 0) {
             tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
             tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
-            // When using grouped query attention, all rows use the same mask.
-            if (p.gqa_ratio > 1) {
-                tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, 0, 1);
-            }
+            tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
 
             coopmat mv;