From 5a8ae3053ced350ed300ba91600519fcad1c6ba7 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 5 Jun 2025 00:17:58 -0500 Subject: [PATCH 01/10] vulkan: automatically deduce size of push constants (#13936) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 82 ++++++++++++++++++---------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a4026f88f..1d25d79d5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4079,7 +4079,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo return s; } -static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array elements) { +template size_t push_constant_size(const T &t) { + static_assert(std::is_class::value, "T must be a struct/class"); + GGML_UNUSED(t); + return sizeof(T); +} +template size_t push_constant_size(const std::vector &t) { + GGML_UNUSED(t); + return sizeof(T) * t.size(); +} +template size_t push_constant_size(const std::array &t) { + GGML_UNUSED(t); + return sizeof(T) * N; +} + +template const T *push_constant_data(const T &t) { + static_assert(std::is_class::value, "T must be a struct/class"); + return &t; +} +template const T *push_constant_data(const std::vector &t) { + return t.data(); +} +template const T *push_constant_data(const std::array &t) { + return t.data(); +} + +template +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list const& descriptor_buffer_infos, const T &push_constants, std::array elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); @@ -4095,7 +4121,7 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); - subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); + subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants)); subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeline->layout, @@ -4558,7 +4584,7 @@ static void ggml_vk_matmul( ggml_vk_sync_buffers(subctx); if (split_k == 1) { const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch }); return; } @@ -4566,10 +4592,10 @@ static void ggml_vk_matmul( const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, 
padded_n }; // Make sure enough workgroups get assigned for split k to work - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); ggml_vk_sync_buffers(subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 }); } static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) { @@ -4617,7 +4643,7 @@ static void ggml_vk_matmul_id( ggml_vk_sync_buffers(subctx); const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, nei0, nei1, nbi1, ne11, padded_n }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as }); } static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { @@ -4738,7 +4764,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& }; init_pushconst_fastdiv(pc); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); } static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) { @@ -4757,7 +4783,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(uint32_t), &ne, { ne, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array{ne}, { ne, 1, 1 }); } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -4957,7 +4983,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -5173,7 +5199,7 @@ static void 
ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, - sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); + pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); } static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5261,7 +5287,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, workgroups_z }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z }); } static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5344,7 +5370,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, - { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5560,7 +5586,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, - { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, 
VK_WHOLE_SIZE }); @@ -5780,7 +5806,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, - sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); + pc, { groups_x, (uint32_t)nei0, groups_z }); } static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { @@ -6130,7 +6156,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // there's no more than one tile of rows (i.e. workgroups_x would have been // one). We reuse workgroups_x to mean the number of splits, so we need to // cancel out the divide by wg_denoms[0]. - sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); + pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); ggml_vk_sync_buffers(subctx); const std::array pc2 = { D, (uint32_t)ne1, split_k }; @@ -6139,7 +6165,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 }); + pc2, { (uint32_t)ne1, 1, 1 }); } else { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { @@ -6149,7 +6175,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); + pc, { workgroups_x, workgroups_y, workgroups_z }); } } @@ -6827,7 +6853,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) { // Empty src2 is possible in rope, but the shader needs a buffer vk_subbuffer subbuf_z; @@ -6838,26 +6864,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_IM2COL) { // im2col uses only src1 and dst buffers ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, 
y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_COUNT_EQUAL) { ggml_vk_sync_buffers(subctx); // count_equal assumes that destination buffer is initialized with zeroes ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (use_src2) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } } @@ -7026,7 +7052,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] }, vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, vk_subbuffer{ d_D, dst_offset, dst_size } - }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements); + }, pc, elements); } else if (version == 7) { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] }, @@ -7037,7 +7063,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] }, vk_subbuffer{ d_D, dst_offset, dst_size } - }, sizeof(vk_op_rwkv_wkv7_push_constants), &pc, elements); + }, pc, elements); } else { // shouldn't happen GGML_ASSERT(false); @@ -7174,7 +7200,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont vk_subbuffer{ d_GM, gm_offset, gm_size }, vk_subbuffer{ d_GV, gv_offset, gv_size }, vk_subbuffer{ d_P, p_offset, p_size }, - }, sizeof(vk_op_push_constants), &pc, elements); + }, pc, elements); } static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { @@ -8063,7 +8089,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); 
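As an editorial aside on PATCH 01: the template parameter lists of the new helpers do not survive in the quoted diff above (the `<typename ...>` headers and the element types on std::vector/std::array were stripped). The standalone sketch below restores the intended shape of the size/data deduction pattern so it can be compiled and checked in isolation; the struct name demo_push_constants and the main() harness are invented for illustration, and size_t is used for the array extent.

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>
#include <vector>

// Generic case: a plain push-constant struct contributes sizeof(T) bytes.
template <typename T> size_t push_constant_size(const T & t) {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    (void) t;
    return sizeof(T);
}
// Overloads for dynamically and statically sized element arrays.
template <typename T> size_t push_constant_size(const std::vector<T> & t) {
    return sizeof(T) * t.size();
}
template <typename T, size_t N> size_t push_constant_size(const std::array<T, N> & t) {
    (void) t;
    return sizeof(T) * N;
}

// Matching overloads returning the pointer that would be handed to pushConstants().
template <typename T> const T * push_constant_data(const T & t) { return &t; }
template <typename T> const T * push_constant_data(const std::vector<T> & t) { return t.data(); }
template <typename T, size_t N> const T * push_constant_data(const std::array<T, N> & t) { return t.data(); }

struct demo_push_constants { uint32_t m, n, k; };

int main() {
    demo_push_constants     pc  = { 1, 2, 3 };
    std::vector<uint32_t>   pcv = { 1, 2, 3, 4 };
    std::array<uint32_t, 2> pca = { 5, 6 };
    // Prints 12 16 8: struct size, vector element bytes, array element bytes.
    std::printf("%zu %zu %zu\n",
                push_constant_size(pc), push_constant_size(pcv), push_constant_size(pca));
    return 0;
}

The dispatch call site can then pass the push-constant object directly and let overload resolution pick the right size and pointer, which is what the patch does for ggml_vk_dispatch_pipeline.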
ggml_vk_ctx_begin(ctx->device, subctx); const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; - ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1}); ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); From 9e31bec4fd53634c9e5b04650488a09a055f5dab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Jun 2025 09:06:29 +0300 Subject: [PATCH 02/10] context : fix pos_min initialization upon error decode (#14008) ggml-ci --- src/llama-context.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7c1a642c1..f1b43b9cc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1058,7 +1058,10 @@ int llama_context::decode(llama_batch & inp_batch) { if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache - llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES] = { std::numeric_limits::max() }; + llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES]; + for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) { + pos_min[s] = std::numeric_limits::max(); + } for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { const auto & seq_id = ubatch.seq_id[i][0]; From 9f47fa5792bae5312615439e68dbf1826913f7ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 5 Jun 2025 09:29:18 +0200 Subject: [PATCH 03/10] vocab : warn about missing mask token (#14022) --- src/llama-vocab.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index b51976699..ba2e1864e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2098,7 +2098,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"}) || _contains_any(general_arch, {"nomic-bert-moe"}) ) { - _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true); + if (token_to_id.count("") == 0) { + LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__); + } else { + _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true); + } } else if (_contains_any(model_name, {"phi-3", "phi3"})) { for (auto id : cache_special_tokens) { _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true); From d01d112abb055549d33bb8aac1755eb0c40918ad Mon Sep 17 00:00:00 2001 From: Olexandr88 Date: Thu, 5 Jun 2025 10:50:55 +0300 Subject: [PATCH 04/10] readme : add badge (#13938) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 91401fa98..385ac04d8 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases) [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project 
status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) From 3a077146a4761fdbd24bdd8eb098f46b8adc4dda Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Thu, 5 Jun 2025 02:57:42 -0700 Subject: [PATCH 05/10] llama : allow using mmap without PrefetchVirtualMemory, apply GGML_WIN_VER to llama.cpp sources (#14013) --- CMakeLists.txt | 5 +++++ ggml/CMakeLists.txt | 2 +- ggml/src/CMakeLists.txt | 1 - src/llama-mmap.cpp | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ac3e90903..f73470dff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,6 +159,11 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML) # ... otherwise assume ggml is added by a parent CMakeLists.txt endif() +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + # # build the library # diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 3d01184a2..e186fdf3c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -137,7 +137,7 @@ set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC") -if (WIN32) +if (MINGW) set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version") endif() diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 76b24bd9d..7dcb031f0 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -125,7 +125,6 @@ if (NOT MSVC) endif() if (MINGW) - # Target Windows 8 for PrefetchVirtualMemory add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) endif() diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 9da97f1bc..47497cf95 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -401,7 +401,7 @@ struct llama_mmap::impl { } } #else - throw std::runtime_error("PrefetchVirtualMemory unavailable"); + LLAMA_LOG_DEBUG("skipping PrefetchVirtualMemory because _WIN32_WINNT < 0x602\n"); #endif } } From 7f37b6cf1e2c1b90bf0d9c8d91904b4b6c512748 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Jun 2025 15:29:22 +0300 Subject: [PATCH 06/10] memory : migrate from llama_kv_cache to more generic llama_memory (#14006) * memory : merge llama_kv_cache into llama_memory + new `llama_memory` API ggml-ci * context : fix casts ggml-ci --- include/llama.h | 100 ++++++++++++-- src/CMakeLists.txt | 1 - src/llama-context.cpp | 223 ++++++++++++++++++------------ src/llama-context.h | 8 +- src/llama-graph.h | 2 +- src/llama-kv-cache-recurrent.h | 28 ++-- src/llama-kv-cache-unified-iswa.h | 28 ++-- src/llama-kv-cache-unified.h | 30 ++-- src/llama-kv-cache.cpp | 1 - src/llama-kv-cache.h | 41 ------ src/llama-memory.h | 82 +++++++---- 11 files changed, 324 insertions(+), 220 deletions(-) delete mode 100644 src/llama-kv-cache.cpp delete mode 100644 src/llama-kv-cache.h diff --git a/include/llama.h b/include/llama.h index da0f652cf..21808c881 100644 --- a/include/llama.h +++ b/include/llama.h @@ -61,7 +61,10 @@ extern "C" { struct llama_model; struct llama_context; struct llama_sampler; - struct llama_kv_cache; + + typedef struct llama_memory_i * llama_memory_t; + + struct llama_kv_cache; // DEPRECATED (use llama_memory instead) typedef int32_t llama_pos; typedef int32_t llama_token; @@ -493,9 +496,11 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct 
llama_model * llama_get_model (const struct llama_context * ctx); - LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); + LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type + DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); + LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); @@ -609,7 +614,78 @@ extern "C" { int32_t il_end); // - // KV cache + // Memory + // + + // Clear the memory contents + LLAMA_API void llama_memory_clear(llama_memory_t mem); + + // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails + // seq_id < 0 : match any sequence + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) + LLAMA_API bool llama_memory_seq_rm( + llama_memory_t mem, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); + + // Copy all tokens that belong to the specified sequence to another sequence + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) + LLAMA_API void llama_memory_seq_cp( + llama_memory_t mem, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); + + // Removes all tokens that do not belong to the specified sequence + LLAMA_API void llama_memory_seq_keep( + llama_memory_t mem, + llama_seq_id seq_id); + + // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) + LLAMA_API void llama_memory_seq_add( + llama_memory_t mem, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); + + // Integer division of the positions by factor of `d > 1` + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) + LLAMA_API void llama_memory_seq_div( + llama_memory_t mem, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); + + // Returns the smallest position present in the memory for the specified sequence + // This is typically non-zero only for SWA caches + // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory + // Return -1 if the sequence is empty + LLAMA_API llama_pos llama_memory_seq_pos_min( + llama_memory_t mem, + llama_seq_id seq_id); + + // Returns the largest position present in the memory for the specified sequence + // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory + // Return -1 if the sequence is empty + LLAMA_API llama_pos llama_memory_seq_pos_max( + llama_memory_t mem, + llama_seq_id seq_id); + + // Check if the memory supports shifting + LLAMA_API bool llama_memory_can_shift(llama_memory_t mem); + + // + // KV cache for self-attention (TODO: deprecate in favor of llama_memory) // // Returns the number of tokens in the KV cache (slow, use only for debug) @@ -623,7 +699,7 @@ extern "C" { // Clear the KV cache - both cell info is erased and KV data is zeroed LLAMA_API void llama_kv_self_clear( - struct llama_context * ctx); + struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails @@ -694,14 +770,14 @@ extern "C" { // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() - LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx), + DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); // Check if the context supports KV cache shifting LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx); // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) - LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx), + DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), "simply remove this call, updates are applied lazily on the next llama_decode()"); // @@ -709,7 +785,7 @@ extern "C" { // // Returns the *actual* size in bytes of the state - // (logits, embedding and kv_cache) + // (logits, embedding and memory) // Only use when saving the state, not when restoring it, otherwise the size may be too small. LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx), @@ -765,12 +841,12 @@ extern "C" { size_t n_token_count), "use llama_state_save_file instead"); - // Get the exact size needed to copy the KV cache of a single sequence + // Get the exact size needed to copy the state of a single sequence LLAMA_API size_t llama_state_seq_get_size( struct llama_context * ctx, llama_seq_id seq_id); - // Copy the KV cache of a single sequence into the specified buffer + // Copy the state of a single sequence into the specified buffer LLAMA_API size_t llama_state_seq_get_data( struct llama_context * ctx, uint8_t * dst, @@ -836,16 +912,16 @@ extern "C" { // For encode-decoder contexts, processes the batch using the encoder. // Can store the encoder output internally for later use by the decoder's cross-attention layers. // 0 - success - // < 0 - error. the KV cache state is restored to the state before this call + // < 0 - error. the memory state is restored to the state before this call LLAMA_API int32_t llama_encode( struct llama_context * ctx, struct llama_batch batch); // Process a batch of tokens. - // Requires KV cache. + // Requires the context to have a memory. // For encode-decoder contexts, processes the batch using the decoder. // Positive return values does not mean a fatal error, but rather a warning. 
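For callers, the practical effect of PATCH 06 is that sequence manipulation moves from the now-deprecated llama_kv_self_* helpers to the llama_memory_* functions declared above. A minimal migration sketch, not part of the patch: ctx is assumed to be an already-created llama_context, seq a sequence id in use, and drop_tail an invented helper name.

#include "llama.h"
#include <cstdio>

// Trim a sequence back to positions [0, keep_until) and report the remaining range.
void drop_tail(llama_context * ctx, llama_seq_id seq, llama_pos keep_until) {
    llama_memory_t mem = llama_get_memory(ctx);

    // previously: llama_kv_self_seq_rm(ctx, seq, keep_until, -1);
    llama_memory_seq_rm(mem, seq, keep_until, -1);

    // previously: llama_kv_self_seq_pos_min/max(ctx, seq);
    std::printf("seq %d now spans [%d, %d]\n", (int) seq,
                (int) llama_memory_seq_pos_min(mem, seq),
                (int) llama_memory_seq_pos_max(mem, seq));
}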
- // Upon non-zero return values, the KV cache state is restored to the state before this call + // Upon non-zero return values, the memory state is restored to the state before this call // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) // 2 - aborted diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d20bd4fe2..70be604e4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,7 +20,6 @@ add_library(llama llama-hparams.cpp llama-impl.cpp llama-io.cpp - llama-kv-cache.cpp llama-kv-cache-unified.cpp llama-kv-cache-unified-iswa.cpp llama-kv-cache-recurrent.cpp diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f1b43b9cc..c29fe7e4c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2,9 +2,9 @@ #include "llama-impl.h" #include "llama-io.h" +#include "llama-memory.h" #include "llama-mmap.h" #include "llama-model.h" -#include "llama-kv-cache.h" #include #include @@ -277,10 +277,9 @@ llama_context::llama_context( int n_nodes_tg = -1; // simulate full KV cache - llama_kv_cache * kv_self = static_cast(memory.get()); - const auto kv_state = kv_self->init_full(); - if (!kv_state) { + const auto mstate = memory->init_full(); + if (!mstate) { throw std::runtime_error("failed to initialize KV cache"); } @@ -288,7 +287,7 @@ llama_context::llama_context( // reserve pp graph first so that buffers are only allocated once { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -299,7 +298,7 @@ llama_context::llama_context( // reserve with tg graph to get the number of splits and nodes { - auto * gf = graph_reserve(1, 1, 1, kv_state.get()); + auto * gf = graph_reserve(1, 1, 1, mstate.get()); if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -310,7 +309,7 @@ llama_context::llama_context( // reserve again with pp graph to avoid ggml-alloc reallocations during inference { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -419,14 +418,8 @@ uint32_t llama_context::n_threads_batch() const { return cparams.n_threads_batch; } -llama_kv_cache * llama_context::get_kv_self() { - llama_kv_cache * kv_self = static_cast(memory.get()); - return kv_self; -} - -const llama_kv_cache * llama_context::get_kv_self() const { - llama_kv_cache * kv_self = static_cast(memory.get()); - return kv_self; +llama_memory_t llama_context::get_memory() const { + return memory.get(); } void llama_context::kv_self_defrag_sched() { @@ -442,15 +435,13 @@ bool llama_context::kv_self_update(bool optimize) { return false; } - llama_kv_cache * kv_self = static_cast(memory.get()); - { // TODO: remove in the future optimize |= memory_force_optimize; memory_force_optimize = false; - const auto kv_state = kv_self->init_update(this, optimize); - switch (kv_state->get_status()) { + const auto mstate = memory->init_update(this, optimize); + switch (mstate->get_status()) { case LLAMA_MEMORY_STATUS_SUCCESS: { // noop @@ -468,23 +459,25 @@ bool llama_context::kv_self_update(bool optimize) { } } - if (!kv_state->apply()) { + if (!mstate->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__); } } - // if the KV cache did any computation, we have to 
reserve a new worst-case graph - const auto kv_state = kv_self->init_full(); - if (!kv_state) { - throw std::runtime_error("failed to initialize memory state"); - } + // if the memory module did any computation, we have to reserve a new worst-case graph + { + const auto mstate = memory->init_full(); + if (!mstate) { + throw std::runtime_error("failed to initialize memory state"); + } - const uint32_t n_seqs = cparams.n_seq_max; - const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + const uint32_t n_seqs = cparams.n_seq_max; + const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get()); - if (!gf) { - LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get()); + if (!gf) { + LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__); + } } return true; @@ -912,10 +905,8 @@ int llama_context::decode(llama_batch & inp_batch) { } } - llama_kv_cache * kv_self = static_cast(memory.get()); - // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0) + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : memory->seq_pos_max(0) + 1); const llama_batch & batch = batch_allocr.batch; @@ -977,21 +968,21 @@ int llama_context::decode(llama_batch & inp_batch) { // handle any pending defrags/shifts kv_self_update(false); - llama_memory_state_ptr kv_state; + llama_memory_state_ptr mstate; while (true) { - kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all); - if (!kv_state) { + mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all); + if (!mstate) { return -2; } - switch (kv_state->get_status()) { + switch (mstate->get_status()) { case LLAMA_MEMORY_STATUS_SUCCESS: { } break; case LLAMA_MEMORY_STATUS_NO_UPDATE: { - LLAMA_LOG_ERROR("%s: unexpected memory state status: %d\n", __func__, kv_state->get_status()); + LLAMA_LOG_ERROR("%s: unexpected memory state status: %d\n", __func__, mstate->get_status()); return -2; } @@ -1031,7 +1022,7 @@ int llama_context::decode(llama_batch & inp_batch) { int64_t n_outputs_prev = 0; do { - const auto & ubatch = kv_state->get_ubatch(); + const auto & ubatch = mstate->get_ubatch(); // count the outputs in this u_batch { @@ -1054,7 +1045,7 @@ int llama_context::decode(llama_batch & inp_batch) { ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); ggml_status status; - const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, kv_state.get(), status); + const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mstate.get(), status); if (!res) { // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache @@ -1076,7 +1067,7 @@ int llama_context::decode(llama_batch & inp_batch) { LLAMA_LOG_WARN("%s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]); - llama_kv_self_seq_rm(this, s, pos_min[s], -1); + memory->seq_rm(s, pos_min[s], -1); } switch (status) { @@ -1170,7 +1161,7 @@ int llama_context::decode(llama_batch & inp_batch) { } n_outputs_prev += n_outputs; - } while (kv_state->next()); + } while (mstate->next()); // set to total number of outputs in the batch, for use in llama_get_logits_ith n_outputs = 
n_outputs_all; @@ -1179,7 +1170,7 @@ int llama_context::decode(llama_batch & inp_batch) { { bool sorted_output = true; - auto & out_ids = kv_state->out_ids(); + auto & out_ids = mstate->out_ids(); GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all); @@ -1847,11 +1838,9 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { } } - llama_kv_cache * kv_self = static_cast(memory.get()); - - if (kv_self != nullptr) { + if (memory != nullptr) { LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__); - kv_self->state_write(io); + memory->state_write(io); } return io.n_bytes(); @@ -1938,9 +1927,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) { if (memory) { LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__); - llama_kv_cache * kv_self = static_cast(memory.get()); - - kv_self->state_read(io); + memory->state_read(io); } return io.n_bytes(); @@ -1950,9 +1937,7 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s GGML_UNUSED(seq_id); if (memory) { - llama_kv_cache * kv_self = static_cast(memory.get()); - - kv_self->state_write(io, seq_id); + memory->state_write(io, seq_id); } return io.n_bytes(); @@ -1962,9 +1947,7 @@ size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq GGML_UNUSED(seq_id); if (memory) { - llama_kv_cache * kv_self = static_cast(memory.get()); - - kv_self->state_read(io, seq_id); + memory->state_read(io, seq_id); } return io.n_bytes(); @@ -2069,9 +2052,7 @@ void llama_context::opt_epoch_iter( const uint32_t n_batch = std::min(this->n_batch(), n_ctx); const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch); - llama_kv_cache * kv_self = static_cast(memory.get()); - - kv_self->clear(); + memory->clear(); for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) { batch.n_tokens = n_batch; @@ -2094,8 +2075,8 @@ void llama_context::opt_epoch_iter( int64_t n_outputs_all = n_tokens_all; - auto kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true); - if (!kv_state || kv_state->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { + auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true); + if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__); break; } @@ -2108,17 +2089,17 @@ void llama_context::opt_epoch_iter( uint32_t pos_batch = 0; do { - const auto & ubatch = kv_state->get_ubatch(); + const auto & ubatch = mstate->get_ubatch(); n_outputs = ubatch.n_tokens; - if (!kv_state->apply()) { + if (!mstate->apply()) { LLAMA_LOG_ERROR("%s: failed to update the memory state\n", __func__); break; } auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, kv_state.get()); + auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mstate.get()); struct ggml_context * ctx_compute_opt; { @@ -2153,7 +2134,7 @@ void llama_context::opt_epoch_iter( ggml_free(ctx_compute_opt); pos_batch += ubatch.n_tokens; - } while (kv_state->next()); + } while (mstate->next()); } } @@ -2314,8 +2295,9 @@ const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->get_model(); } +// deprecated llama_kv_cache * llama_get_kv_self(llama_context * ctx) { - return ctx->get_kv_self(); + return dynamic_cast(ctx->get_memory()); } // deprecated @@ -2435,13 +2417,82 @@ int32_t llama_apply_adapter_cvec( return res ? 
0 : -1; } +// +// memory +// + +llama_memory_t llama_get_memory(const struct llama_context * ctx) { + return ctx->get_memory(); +} + +void llama_memory_clear(llama_memory_t mem) { + mem->clear(); +} + +bool llama_memory_seq_rm( + llama_memory_t mem, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return mem->seq_rm(seq_id, p0, p1); +} + +void llama_memory_seq_cp( + llama_memory_t mem, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + mem->seq_cp(seq_id_src, seq_id_dst, p0, p1); +} + +void llama_memory_seq_keep( + llama_memory_t mem, + llama_seq_id seq_id) { + mem->seq_keep(seq_id); +} + +void llama_memory_seq_add( + llama_memory_t mem, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + mem->seq_add(seq_id, p0, p1, delta); +} + +void llama_memory_seq_div( + llama_memory_t mem, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + mem->seq_div(seq_id, p0, p1, d); +} + +llama_pos llama_memory_seq_pos_min( + llama_memory_t mem, + llama_seq_id seq_id) { + return mem->seq_pos_min(seq_id); +} + +llama_pos llama_memory_seq_pos_max( + llama_memory_t mem, + llama_seq_id seq_id) { + return mem->seq_pos_max(seq_id); +} + +bool llama_memory_can_shift(llama_memory_t mem) { + return mem->get_can_shift(); +} + // // kv cache // // deprecated int32_t llama_kv_self_n_tokens(const llama_context * ctx) { - const auto * kv = ctx->get_kv_self(); + const auto * kv = llama_get_memory(ctx); if (!kv) { return 0; } @@ -2463,7 +2514,7 @@ int32_t llama_kv_self_n_tokens(const llama_context * ctx) { // deprecated // note: this is the same as above - will be removed anyway, so it's ok int32_t llama_kv_self_used_cells(const llama_context * ctx) { - const auto * kv = ctx->get_kv_self(); + const auto * kv = llama_get_memory(ctx); if (!kv) { return 0; } @@ -2483,12 +2534,12 @@ int32_t llama_kv_self_used_cells(const llama_context * ctx) { } void llama_kv_self_clear(llama_context * ctx) { - auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return; } - kv->clear(); + llama_memory_clear(kv); } bool llama_kv_self_seq_rm( @@ -2496,12 +2547,12 @@ bool llama_kv_self_seq_rm( llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return true; } - return kv->seq_rm(seq_id, p0, p1); + return llama_memory_seq_rm(kv, seq_id, p0, p1); } void llama_kv_self_seq_cp( @@ -2510,21 +2561,21 @@ void llama_kv_self_seq_cp( llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return; } - kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); + llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1); } void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { - auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return; } - kv->seq_keep(seq_id); + llama_memory_seq_keep(kv, seq_id); } void llama_kv_self_seq_add( @@ -2533,12 +2584,12 @@ void llama_kv_self_seq_add( llama_pos p0, llama_pos p1, llama_pos delta) { - auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return; } - kv->seq_add(seq_id, p0, p1, delta); + llama_memory_seq_add(kv, seq_id, p0, p1, delta); } void llama_kv_self_seq_div( @@ -2547,30 +2598,30 @@ void llama_kv_self_seq_div( llama_pos p0, llama_pos p1, int d) { - auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return; } - kv->seq_div(seq_id, p0, p1, d); + 
llama_memory_seq_div(kv, seq_id, p0, p1, d); } llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) { - const auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return -1; } - return kv->seq_pos_min(seq_id); + return llama_memory_seq_pos_min(kv, seq_id); } llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - const auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return -1; } - return kv->seq_pos_max(seq_id); + return llama_memory_seq_pos_max(kv, seq_id); } // deprecated @@ -2580,12 +2631,12 @@ void llama_kv_self_defrag(llama_context * ctx) { } bool llama_kv_self_can_shift(const llama_context * ctx) { - const auto * kv = ctx->get_kv_self(); + auto * kv = llama_get_memory(ctx); if (!kv) { return false; } - return kv->get_can_shift(); + return llama_memory_can_shift(kv); } // llama state API diff --git a/src/llama-context.h b/src/llama-context.h index c1c7efb31..2e0da8c83 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -13,13 +13,12 @@ #include struct llama_model; -struct llama_kv_cache; class llama_io_read_i; class llama_io_write_i; -class llama_memory_i; -class llama_memory_state_i; +struct llama_memory_i; +struct llama_memory_state_i; struct llama_context { // init scheduler and compute buffers, reserve worst-case graphs @@ -47,8 +46,7 @@ struct llama_context { uint32_t n_threads() const; uint32_t n_threads_batch() const; - llama_kv_cache * get_kv_self(); - const llama_kv_cache * get_kv_self() const; + llama_memory_t get_memory() const; // return true of the KV cache was updated // TODO: remove diff --git a/src/llama-graph.h b/src/llama-graph.h index d1c5dd1bf..2b1cfa5b7 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -17,7 +17,7 @@ struct ggml_tensor; struct llama_ubatch; struct llama_cparams; -class llama_memory_state_i; +struct llama_memory_state_i; class llama_kv_cache_unified_state; class llama_kv_cache_unified_iswa_state; diff --git a/src/llama-kv-cache-recurrent.h b/src/llama-kv-cache-recurrent.h index b32f258fb..cb813dfe8 100644 --- a/src/llama-kv-cache-recurrent.h +++ b/src/llama-kv-cache-recurrent.h @@ -2,7 +2,7 @@ #include "llama-batch.h" #include "llama-graph.h" -#include "llama-kv-cache.h" +#include "llama-memory.h" #include #include @@ -13,7 +13,7 @@ // TODO: extract the KV cache state used for graph computation into llama_kv_cache_recurrent_state_i // see the implementation of llama_kv_cache_unified_state_i for an example how to do it -class llama_kv_cache_recurrent : public llama_kv_cache { +class llama_kv_cache_recurrent : public llama_memory_i { public: llama_kv_cache_recurrent( const llama_model & model, @@ -29,6 +29,16 @@ public: // llama_memory_i // + llama_memory_state_ptr init_batch( + const llama_batch & batch, + uint32_t n_ubatch, + bool embd_pooled, + bool logits_all) override; + + llama_memory_state_ptr init_full() override; + + llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override; + void clear() override; bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; @@ -40,20 +50,6 @@ public: llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; - // - // llama_kv_cache - // - - llama_memory_state_ptr init_batch( - const llama_batch & batch, - uint32_t n_ubatch, - bool embd_pooled, - bool logits_all) override; - - llama_memory_state_ptr init_full() override; - - llama_memory_state_ptr init_update(llama_context * lctx, bool 
optimize) override; - bool prepare(const std::vector & ubatches); // find a contiguous slot of kv cells and emplace the ubatch there diff --git a/src/llama-kv-cache-unified-iswa.h b/src/llama-kv-cache-unified-iswa.h index cba5bbe95..3fabcd6b8 100644 --- a/src/llama-kv-cache-unified-iswa.h +++ b/src/llama-kv-cache-unified-iswa.h @@ -11,7 +11,7 @@ // utilizes two instances of llama_kv_cache_unified // the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers -class llama_kv_cache_unified_iswa : public llama_kv_cache { +class llama_kv_cache_unified_iswa : public llama_memory_i { public: llama_kv_cache_unified_iswa( const llama_model & model, @@ -31,21 +31,6 @@ public: // llama_memory_i // - void clear() override; - - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; - void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; - void seq_keep(llama_seq_id seq_id) override; - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; - - llama_pos seq_pos_min(llama_seq_id seq_id) const override; - llama_pos seq_pos_max(llama_seq_id seq_id) const override; - - // - // llama_kv_cache - // - llama_memory_state_ptr init_batch( const llama_batch & batch, uint32_t n_ubatch, @@ -58,6 +43,17 @@ public: bool get_can_shift() const override; + void clear() override; + + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; + + llama_pos seq_pos_min(llama_seq_id seq_id) const override; + llama_pos seq_pos_max(llama_seq_id seq_id) const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h index 6ff388a88..d01a9abd7 100644 --- a/src/llama-kv-cache-unified.h +++ b/src/llama-kv-cache-unified.h @@ -2,8 +2,8 @@ #include "llama-batch.h" #include "llama-graph.h" -#include "llama-kv-cache.h" #include "llama-kv-cells.h" +#include "llama-memory.h" #include #include @@ -17,7 +17,7 @@ struct llama_context; // llama_kv_cache_unified // -class llama_kv_cache_unified : public llama_kv_cache { +class llama_kv_cache_unified : public llama_memory_i { public: static uint32_t get_padding(const llama_cparams & cparams); @@ -56,21 +56,6 @@ public: // llama_memory_i // - void clear() override; - - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; - void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; - void seq_keep(llama_seq_id seq_id) override; - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; - - llama_pos seq_pos_min(llama_seq_id seq_id) const override; - llama_pos seq_pos_max(llama_seq_id seq_id) const override; - - // - // llama_kv_cache - // - llama_memory_state_ptr init_batch( const llama_batch & batch, uint32_t n_ubatch, @@ -83,6 +68,17 @@ public: bool get_can_shift() const override; + void clear() override; + + bool seq_rm (llama_seq_id seq_id, 
llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; + + llama_pos seq_pos_min(llama_seq_id seq_id) const override; + llama_pos seq_pos_max(llama_seq_id seq_id) const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp deleted file mode 100644 index aefd23e32..000000000 --- a/src/llama-kv-cache.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "llama-kv-cache.h" diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h deleted file mode 100644 index 17a5e5cb8..000000000 --- a/src/llama-kv-cache.h +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once - -#include "llama.h" -#include "llama-memory.h" - -class llama_io_write_i; -class llama_io_read_i; - -struct llama_kv_cache : public llama_memory_i { - virtual ~llama_kv_cache() = default; - - // TODO: move the init_ interfaces to llama_memory_i - - // split the input batch into a set of ubatches and verify that they can fit into the cache - // return a state object containing the ubatches and KV cache state required to process them - // check the llama_memory_state_i::get_status() for the result - virtual llama_memory_state_ptr init_batch( - const llama_batch & batch, - uint32_t n_ubatch, - bool embd_pooled, - bool logits_all) = 0; - - // simulate full cache, used for allocating worst-case compute buffers - virtual llama_memory_state_ptr init_full() = 0; - - // prepare for any pending memory updates, such as shifts, defrags, etc. 
- // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update - virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0; - - // getters - virtual bool get_can_shift() const = 0; - - bool get_can_edit() const override { return get_can_shift(); } - - // - // state write/read - // - - virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0; - virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0; -}; diff --git a/src/llama-memory.h b/src/llama-memory.h index ab0d399c4..5993b59be 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -7,6 +7,9 @@ struct llama_ubatch; +class llama_io_write_i; +class llama_io_read_i; + struct llama_memory_params { // kv cache ggml_type type_k; @@ -16,28 +19,6 @@ struct llama_memory_params { bool swa_full; }; -// general concept of LLM memory -// the KV cache is a type of LLM memory, but there can be other types -class llama_memory_i { -public: - virtual ~llama_memory_i() = default; - - virtual void clear() = 0; - - virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; - virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; - virtual void seq_keep(llama_seq_id seq_id) = 0; - virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0; - virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; - - virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0; - virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0; - - virtual bool get_can_edit() const = 0; -}; - -using llama_memory_ptr = std::unique_ptr; - enum llama_memory_status { LLAMA_MEMORY_STATUS_SUCCESS = 0, LLAMA_MEMORY_STATUS_NO_UPDATE, @@ -58,8 +39,7 @@ llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_me // the only method that can mutate the memory and the memory state is llama_memory_i::apply() // // TODO: rename to llama_memory_context_i ? -class llama_memory_state_i { -public: +struct llama_memory_state_i { virtual ~llama_memory_state_i() = default; // consume the current ubatch from the state and proceed to the next one @@ -81,3 +61,57 @@ public: }; using llama_memory_state_ptr = std::unique_ptr; + +// general concept of LLM memory +// the KV cache is a type of LLM memory, but there can be other types +struct llama_memory_i { + virtual ~llama_memory_i() = default; + + // split the input batch into a set of ubatches and verify that they can fit into the cache + // return a state object containing the ubatches and KV cache state required to process them + // check the llama_memory_state_i::get_status() for the result + virtual llama_memory_state_ptr init_batch( + const llama_batch & batch, + uint32_t n_ubatch, + bool embd_pooled, + bool logits_all) = 0; + + // simulate full cache, used for allocating worst-case compute buffers + virtual llama_memory_state_ptr init_full() = 0; + + // prepare for any pending memory updates, such as shifts, defrags, etc. 
+ // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update + virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0; + + // getters + virtual bool get_can_shift() const = 0; + + // + // ops + // + + virtual void clear() = 0; + + virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; + virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; + virtual void seq_keep(llama_seq_id seq_id) = 0; + virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0; + virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; + + virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0; + virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0; + + // + // state write/read + // + + virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0; + virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0; +}; + +using llama_memory_ptr = std::unique_ptr; + +// TODO: temporary until the llama_kv_cache is removed from the public API +struct llama_kv_cache : public llama_memory_i { + virtual ~llama_kv_cache() = default; +}; From 146b88e8b3e16d22cea22f8be982a4d45e913b1e Mon Sep 17 00:00:00 2001 From: pockers21 <134406831+pockers21@users.noreply.github.com> Date: Thu, 5 Jun 2025 06:25:29 -0700 Subject: [PATCH 07/10] ci: fix CUDA build failure on autodl cloud machines (#14005) Replace CMAKE_CUDA_ARCHITECTURES=native with nvidia-smi detection as 'native' fails on autodl cloud environments. Co-authored-by: pockers21 --- ci/run.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index b49a3a5f8..2968a7dd4 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -46,7 +46,20 @@ if [ ! -z ${GG_BUILD_METAL} ]; then fi if [ ! -z ${GG_BUILD_CUDA} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON" + + if command -v nvidia-smi >/dev/null 2>&1; then + CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.') + if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then + CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}" + else + echo "Warning: Using fallback CUDA architectures" + CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89" + fi + else + echo "Error: nvidia-smi not found, cannot build with CUDA" + exit 1 + fi fi if [ ! 
-z ${GG_BUILD_SYCL} ]; then From 669c13e0f67edfba891c5bd7105eb4376bcf8626 Mon Sep 17 00:00:00 2001 From: Masato Nakasaka Date: Thu, 5 Jun 2025 23:00:29 +0900 Subject: [PATCH 08/10] vulkan: Enable VK_KHR_cooperative_matrix extension for Intel Xe2 GPUs (#14001) * allowing B580 and U9-288V * experimenting code to detect Xe2 * allowing coopmat only for Xe2 GPUs * fixed comment wording * fixed comment wording * removed unnecessary driver check --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 34 ++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 1d25d79d5..3e43b03bc 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -196,6 +196,7 @@ enum vk_device_architecture { AMD_RDNA1, AMD_RDNA2, AMD_RDNA3, + INTEL_XE2, }; static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { @@ -246,6 +247,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& } return vk_device_architecture::AMD_RDNA2; } + } else if (props.vendorID == VK_VENDOR_ID_INTEL) { + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool subgroup_size_control = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { + subgroup_size_control = true; + } + } + + if (!subgroup_size_control) { + return vk_device_architecture::OTHER; + } + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + + props2.pNext = &subgroup_size_control_props; + device.getProperties2(&props2); + + if (subgroup_size_control_props.minSubgroupSize == 16) { + // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8. + // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value. + // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html + // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html + return vk_device_architecture::INTEL_XE2; + } } return vk_device_architecture::OTHER; } @@ -10263,8 +10292,9 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: - // Intel drivers don't support coopmat properly yet - return false; + // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost, + // while some older hardware (ex. 
Arc A770) has performance regressions + return arch == vk_device_architecture::INTEL_XE2; case VK_VENDOR_ID_AMD: if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { // Workaround for AMD proprietary driver reporting support on all GPUs From 1caae7fc6c77551cb1066515e0f414713eebb367 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 5 Jun 2025 17:42:31 +0200 Subject: [PATCH 09/10] gguf-py : add add_classifier_output_labels method to writer (#14031) * add add_classifier_output_labels * use add_classifier_output_labels --- convert_hf_to_gguf.py | 3 +-- gguf-py/gguf/gguf_writer.py | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ec3b5697d..7b9893c8a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3709,8 +3709,7 @@ class BertModel(TextModel): self._try_set_pooling_type() if self.cls_out_labels: - key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch]) - self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())]) + self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())]) def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index de6e45ae8..adc673e38 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -935,6 +935,9 @@ class GGUFWriter: def add_eom_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOM_ID, id) + def add_classifier_output_labels(self, labels: Sequence[str]) -> None: + self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels) + # for vision models def add_clip_has_vision_encoder(self, value: bool) -> None: From d17a809ef0af09b16625e991a76f6fe80d9c332e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 6 Jun 2025 09:03:25 +0200 Subject: [PATCH 10/10] llama : support multiple classifier outputs and labels (#13940) --- examples/embedding/embedding.cpp | 19 ++++++++-- include/llama.h | 9 ++++- src/llama-context.cpp | 7 ++-- src/llama-model-loader.cpp | 59 +++++++++++++++++++++++--------- src/llama-model.cpp | 28 ++++++++++++++- src/llama-model.h | 3 ++ 6 files changed, 101 insertions(+), 24 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 71f700877..8bef7f8f6 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -236,9 +236,24 @@ int main(int argc, char ** argv) { LOG("\n"); } } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) { + const uint32_t n_cls_out = llama_model_n_cls_out(model); + std::vector cls_out_labels; + + for (uint32_t i = 0; i < n_cls_out; i++) { + const char * label = llama_model_cls_label(model, i); + const std::string label_i(label == nullptr ? "" : label); + cls_out_labels.emplace_back(label_i.empty() ? 
std::to_string(i) : label_i); + } + for (int j = 0; j < n_embd_count; j++) { - // NOTE: if you change this log - update the tests in ci/run.sh - LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); + for (uint32_t i = 0; i < n_cls_out; i++) { + // NOTE: if you change this log - update the tests in ci/run.sh + if (n_cls_out == 1) { + LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); + } else { + LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str()); + } + } } } else { // print the first part of the embeddings or for a single prompt, the full embedding diff --git a/include/llama.h b/include/llama.h index 21808c881..aa5330e2a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -514,6 +514,13 @@ extern "C" { // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); + // Returns the number of classifier outputs (only valid for classifier models) + // Undefined behavior for non-classifier models + LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model); + + // Returns label of classifier output by index ( bool llama_model_loader::get_arr(const std::string & key, std::vector & result, bool required) { - const int kid = gguf_find_key(meta.get(), key.c_str()); + const gguf_context * ctx = meta.get(); + const int kid = gguf_find_key(ctx, key.c_str()); - if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { + if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { if (required) { throw std::runtime_error(format("array key not found in model: %s", key.c_str())); } @@ -298,28 +299,40 @@ namespace GGUFMeta { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta.get(), kid); + GGUFMeta::GKV::get_kv(ctx, kid); switch (arr_info.gt) { case GGUF_TYPE_UINT32: - case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || - (std::is_same::value)); break; - case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; + case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || + (std::is_same::value)); break; + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; + case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same::value)); break; default: - throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); } - result.resize(arr_info.length); - result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); + if constexpr (std::is_same::value) { + const size_t n_items = gguf_get_arr_n(ctx, kid); + result.clear(); + + for (size_t i = 0; i < n_items; i++) { + const T value = gguf_get_arr_str(ctx, kid, i); + result.emplace_back(value); + } + } else { + result.resize(arr_info.length); + result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); + } return true; } template bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { - const int kid = gguf_find_key(meta.get(), key.c_str()); + const gguf_context * ctx = meta.get(); + const int kid = gguf_find_key(ctx, key.c_str()); - if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { + if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) { if (required) { throw std::runtime_error(format("array key not found in model: %s", key.c_str())); } @@ -327,22 +340,32 @@ namespace GGUFMeta { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta.get(), 
kid); + GGUFMeta::GKV::get_kv(ctx, kid); switch (arr_info.gt) { case GGUF_TYPE_UINT32: - case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || - (std::is_same::value)); break; - case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; + case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || + (std::is_same::value)); break; + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; + case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same::value)); break; default: - throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); } if (arr_info.length > N_MAX) { throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); } - std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); + if constexpr (std::is_same::value) { + const size_t n_items = gguf_get_arr_n(ctx, kid); + + for (size_t i = 0; i < n_items; i++) { + const T value = gguf_get_arr_str(ctx, kid, i); + result[i] = value; + } + } else { + std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); + } return true; } @@ -352,6 +375,8 @@ namespace GGUFMeta { return get_arr(llm_kv(kid), result, required); } + template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); + template bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { auto it = kv_overrides.find(key); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index afef84870..915d5a927 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -543,6 +543,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { uint32_t n_vocab = 0; ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false); + // for classifier models + ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false); + if (!classifier_labels.empty()) { + hparams.n_cls_out = classifier_labels.size(); + } + // arch-specific KVs switch (arch) { case LLM_ARCH_LLAMA: @@ -686,7 +692,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); - ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false); switch (hparams.n_layer) { case 3: @@ -4362,6 +4367,15 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms); + + if (!classifier_labels.empty()) { + LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out); + + size_t i = 0; + for (auto label : classifier_labels) { + LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str()); + } + } } LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str()); @@ -13602,6 +13616,18 @@ int32_t llama_model_n_swa(const llama_model * model) { return model->hparams.n_swa; } +uint32_t llama_model_n_cls_out(const struct llama_model * model) { + return model->hparams.n_cls_out; +} + +const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) { + if (i < model->classifier_labels.size()) { + return 
model->classifier_labels[i].c_str(); + } + + return nullptr; +} + // deprecated int32_t llama_n_ctx_train(const llama_model * model) { return llama_model_n_ctx_train(model); diff --git a/src/llama-model.h b/src/llama-model.h index cbea2cb33..18b714620 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -329,6 +329,9 @@ struct llama_model { llama_hparams hparams = {}; llama_vocab vocab; + // for classifier models + std::vector classifier_labels; + struct ggml_tensor * tok_embd = nullptr; struct ggml_tensor * type_embd = nullptr; struct ggml_tensor * pos_embd = nullptr;