diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 2d376a602..945652263 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -676,6 +676,15 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + // Ensure we don't try to read more data than is available in the source buffer 'data' + // or write more than the tensor can hold. + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + + // Calculate how many full rows and how many remaining bytes we need to process. + const int64_t n_full_rows = n_bytes_to_copy / row_size; + const size_t n_rem_bytes = n_bytes_to_copy % row_size; + void * buf_pd = ggml_aligned_malloc(row_size_pd); GGML_ASSERT(buf_pd != NULL); @@ -687,7 +696,8 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros - for (int64_t i = 0; i < nrows; i++) { + // 1. Process all the full rows + for (int64_t i = 0; i < n_full_rows; i++) { const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -696,6 +706,25 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) memcpy(dst, buf_rp, row_size); } + // 2. Process the final, potentially partial, row + if (n_rem_bytes > 0) { + const int64_t i = n_full_rows; + const uint8_t * src = (const uint8_t *) data + (i * row_size); + uint8_t * dst = (uint8_t *) t->data + (i * row_size); + + // re-init the row because we are potentially copying a partial row + init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); + + // Copy only the remaining bytes from the source. + memcpy(buf_pd, src, n_rem_bytes); + + // Repack the entire buffer + repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]); + + // Write only the corresponding remaining bytes to the destination tensor. + memcpy(dst, buf_rp, n_rem_bytes); + } + ggml_aligned_free(buf_pd, row_size_pd); ggml_aligned_free(buf_rp, row_size_rp); } @@ -708,6 +737,14 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + // Ensure we don't try to copy more data than the tensor actually contains. + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + + // Calculate how many full rows and how many remaining bytes we need to process. + const int64_t n_full_rows = n_bytes_to_copy / row_size; + const size_t n_rem_bytes = n_bytes_to_copy % row_size; + void * buf_pd = ggml_aligned_malloc(row_size_pd); GGML_ASSERT(buf_pd != NULL); @@ -719,7 +756,8 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros - for (int64_t i = 0; i < nrows; i++) { + // 1. Process all the full rows + for (int64_t i = 0; i < n_full_rows; i++) { const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -728,6 +766,20 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) memcpy(dst, buf_rp, row_size); } + // 2. Process the final, potentially partial, row + if (n_rem_bytes > 0) { + const int64_t i = n_full_rows; + const uint8_t * src = (const uint8_t *) t->data + (i * row_size); + uint8_t * dst = (uint8_t *) data + (i * row_size); + + // We still need to read and unpack the entire source row because quantization is block-based. + memcpy(buf_pd, src, row_size); + unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); + + // But we only copy the remaining number of bytes to the destination. + memcpy(dst, buf_rp, n_rem_bytes); + } + ggml_aligned_free(buf_pd, row_size_pd); ggml_aligned_free(buf_rp, row_size_rp); } @@ -950,6 +1002,15 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + // Ensure we don't try to read more data than is available in the source buffer 'data' + // or write more than the tensor can hold. + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + + // Calculate how many full rows and how many remaining bytes we need to process. + const int64_t n_full_rows = n_bytes_to_copy / row_size; + const size_t n_rem_bytes = n_bytes_to_copy % row_size; + void * buf_pd = ggml_aligned_malloc(row_size_pd); GGML_ASSERT(buf_pd != NULL); @@ -961,7 +1022,8 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros - for (int64_t i = 0; i < nrows; i++) { + // 1. Process all the full rows + for (int64_t i = 0; i < n_full_rows; i++) { const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -970,6 +1032,25 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) memcpy(dst, buf_rp, row_size); } + // 2. Process the final, potentially partial, row + if (n_rem_bytes > 0) { + const int64_t i = n_full_rows; + const uint8_t * src = (const uint8_t *) data + (i * row_size); + uint8_t * dst = (uint8_t *) t->data + (i * row_size); + + // re-init the row because we are potentially copying a partial row + init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); + + // Copy only the remaining bytes from the source. + memcpy(buf_pd, src, n_rem_bytes); + + // Repack the entire buffer + repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]); + + // Write only the corresponding remaining bytes to the destination tensor. + memcpy(dst, buf_rp, n_rem_bytes); + } + ggml_aligned_free(buf_pd, row_size_pd); ggml_aligned_free(buf_rp, row_size_rp); } @@ -982,6 +1063,14 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + // Ensure we don't try to copy more data than the tensor actually contains. + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + + // Calculate how many full rows and how many remaining bytes we need to process. + const int64_t n_full_rows = n_bytes_to_copy / row_size; + const size_t n_rem_bytes = n_bytes_to_copy % row_size; + void * buf_pd = ggml_aligned_malloc(row_size_pd); GGML_ASSERT(buf_pd != NULL); @@ -993,7 +1082,8 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros - for (int64_t i = 0; i < nrows; i++) { + // 1. Process all the full rows + for (int64_t i = 0; i < n_full_rows; i++) { const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1002,6 +1092,20 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) memcpy(dst, buf_rp, row_size); } + // 2. Process the final, potentially partial, row + if (n_rem_bytes > 0) { + const int64_t i = n_full_rows; + const uint8_t * src = (const uint8_t *) t->data + (i * row_size); + uint8_t * dst = (uint8_t *) data + (i * row_size); + + // We still need to read and unpack the entire source row because quantization is block-based. + memcpy(buf_pd, src, row_size); + unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); + + // But we only copy the remaining number of bytes to the destination. + memcpy(dst, buf_rp, n_rem_bytes); + } + ggml_aligned_free(buf_pd, row_size_pd); ggml_aligned_free(buf_rp, row_size_rp); } @@ -1249,6 +1353,15 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + // Ensure we don't try to read more data than is available in the source buffer 'data' + // or write more than the tensor can hold. + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + + // Calculate how many full rows and how many remaining bytes we need to process. + const int64_t n_full_rows = n_bytes_to_copy / row_size; + const size_t n_rem_bytes = n_bytes_to_copy % row_size; + void * buf_pd = ggml_aligned_malloc(row_size_pd); GGML_ASSERT(buf_pd != NULL); @@ -1260,7 +1373,8 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros - for (int64_t i = 0; i < nrows; i++) { + // 1. Process all the full rows + for (int64_t i = 0; i < n_full_rows; i++) { const uint8_t * src = (const uint8_t *) data + (i * row_size); uint8_t * dst = (uint8_t *) t->data + (i * row_size); @@ -1269,6 +1383,25 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si memcpy(dst, buf_rp, row_size); } + // 2. Process the final, potentially partial, row + if (n_rem_bytes > 0) { + const int64_t i = n_full_rows; + const uint8_t * src = (const uint8_t *) data + (i * row_size); + uint8_t * dst = (uint8_t *) t->data + (i * row_size); + + // re-init the row because we are potentially copying a partial row + init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); + + // Copy only the remaining bytes from the source. + memcpy(buf_pd, src, n_rem_bytes); + + // Repack the entire buffer (partial data + zero padding). + repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]); + + // Write only the corresponding remaining bytes to the destination tensor. + memcpy(dst, buf_rp, n_rem_bytes); + } + ggml_aligned_free(buf_pd, row_size_pd); ggml_aligned_free(buf_rp, row_size_rp); } @@ -1281,6 +1414,14 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any) + // Ensure we don't try to copy more data than the tensor actually contains. + const size_t total_tensor_size = (size_t)nrows * row_size; + const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; + + // Calculate how many full rows and how many remaining bytes we need to process. + const int64_t n_full_rows = n_bytes_to_copy / row_size; + const size_t n_rem_bytes = n_bytes_to_copy % row_size; + void * buf_pd = ggml_aligned_malloc(row_size_pd); GGML_ASSERT(buf_pd != NULL); @@ -1292,7 +1433,8 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros - for (int64_t i = 0; i < nrows; i++) { + // 1. Process all the full rows + for (int64_t i = 0; i < n_full_rows; i++) { const uint8_t * src = (const uint8_t *) t->data + (i * row_size); uint8_t * dst = (uint8_t *) data + (i * row_size); @@ -1301,6 +1443,20 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si memcpy(dst, buf_rp, row_size); } + // 2. Process the final, potentially partial, row + if (n_rem_bytes > 0) { + const int64_t i = n_full_rows; + const uint8_t * src = (const uint8_t *) t->data + (i * row_size); + uint8_t * dst = (uint8_t *) data + (i * row_size); + + // We still need to read and unpack the entire source row because the format is block-based. + memcpy(buf_pd, src, row_size); + unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); + + // But we only copy the remaining number of bytes to the destination to respect the size limit. + memcpy(dst, buf_rp, n_rem_bytes); + } + ggml_aligned_free(buf_pd, row_size_pd); ggml_aligned_free(buf_rp, row_size_rp); } @@ -1319,19 +1475,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, switch (tensor->type) { case GGML_TYPE_Q4_0: GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); repack_q4_0_q4x4x2(tensor, data, size); break; case GGML_TYPE_Q8_0: GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); repack_q8_0_q8x4x2(tensor, data, size); break; case GGML_TYPE_MXFP4: GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); repack_mxfp4_mxfp4x4x2(tensor, data, size); break; @@ -1355,19 +1511,19 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, switch (tensor->type) { case GGML_TYPE_Q4_0: GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); repack_q4x4x2_q4_0(data, tensor, size); break; case GGML_TYPE_Q8_0: GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); repack_q8x4x2_q8_0(data, tensor, size); break; case GGML_TYPE_MXFP4: GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); repack_mxfp4x4x2_mxfp4(data, tensor, size); break; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f96db4d85..631501656 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -161,8 +161,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline); struct vk_matmul_pipeline_struct { vk_pipeline l, m, s; vk_pipeline a_l, a_m, a_s; + // Returns true when all unaligned pipelines are null. + // We only check for unaligned variants since one of the unaligned pipelines must exist + // while aligned pipelines are optional + bool is_empty() const { + return l == nullptr && m == nullptr && s == nullptr; + } }; - typedef std::shared_ptr vk_matmul_pipeline; struct vk_matmul_pipeline2 { @@ -5109,7 +5114,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte if (src1_type == GGML_TYPE_Q8_1) { vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_q8_1[src0_type].f32acc; - if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) { + if (pipelines->is_empty()) { return nullptr; } @@ -5258,7 +5263,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co if (src1_type == GGML_TYPE_Q8_1) { vk_matmul_pipeline pipelines = ctx->device->pipeline_dequant_mul_mat_mat_id_q8_1[src0_type].f32acc; - if (pipelines->s == nullptr && pipelines->m == nullptr && pipelines->l == nullptr) { + if (pipelines->is_empty()) { return nullptr; } @@ -5293,16 +5298,17 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co return nullptr; } + vk_matmul_pipeline2& mmp = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type]; // XXX TODO 'prec' is not actually allowed in mul_mat_id. bool prefer_fp16acc = ctx->device->fp16 /*&& prec == GGML_PREC_DEFAULT*/; - bool support_fp16acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc != nullptr; - bool support_fp32acc = ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc != nullptr; + bool support_fp16acc = !mmp.f16acc->is_empty(); + bool support_fp32acc = !mmp.f32acc->is_empty(); if (support_fp16acc && (prefer_fp16acc || !support_fp32acc)) { - return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc; + return mmp.f16acc; } else { GGML_ASSERT(support_fp32acc); - return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc; + return mmp.f32acc; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 03fac5e09..fc4560fba 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -331,7 +331,8 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p // disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734 // disable spirv-opt for bf16 shaders for https://github.com/ggml-org/llama.cpp/issues/15344 - std::string opt_level = (coopmat || name.find("bf16") != std::string::npos) ? "" : "-O"; + // disable spirv-opt for rope shaders for https://github.com/ggml-org/llama.cpp/issues/16860 + std::string opt_level = (coopmat || name.find("bf16") != std::string::npos || name.find("rope") != std::string::npos) ? "" : "-O"; std::cout << "string_to_spv: " << name << "\n"; diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 0d4939fdb..86a1a4ba1 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -261,15 +261,29 @@ bool llama_batch_allocr::init( const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1; - if (p0 >= 0 && p0 >= seq_pos_min(s)) { - LLAMA_LOG_ERROR( - "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n" - " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n" - " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n" - " for M-RoPE, it is required that the position satisfies: X < Y\n", - __func__, s, s, p0, s, seq_pos_min(s)); + if (batch.token) { + if (p0 >= 0 && p0 >= seq_pos_min(s)) { + LLAMA_LOG_ERROR( + "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n" + " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n" + " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n" + " for M-RoPE, it is required that the position satisfies: X < Y\n", + __func__, s, s, p0, s, seq_pos_min(s)); - return false; + return false; + } + } else { + // embedding inputs can have overlapping positions + if (p0 >= 0 && p0 > seq_pos_min(s)) { + LLAMA_LOG_ERROR( + "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n" + " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n" + " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n" + " for M-RoPE, it is required that the position satisfies: X <= Y\n", + __func__, s, s, p0, s, seq_pos_min(s)); + + return false; + } } } } else { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d56468595..92d30664e 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3608,7 +3608,7 @@ struct server_context { slot.task->params.sampling.preserved_tokens.find(token) != slot.task->params.sampling.preserved_tokens.end(); }; - // frist, add sampled tokens from any ongoing sequences + // first, add sampled tokens from any ongoing sequences for (auto & slot : slots) { if (slot.state != SLOT_STATE_GENERATING) { continue; @@ -3796,7 +3796,7 @@ struct server_context { // when the prompt prefix does not match, print the tokens around the mismatch // this is useful for debugging prompt caching - { + if (slots_debug) { const int np0 = std::max(n_past - 4, 0); const int np1 = std::min(n_past + 6, std::min(slot.prompt.tokens.size(), slot.task->tokens.size())); @@ -3950,7 +3950,7 @@ struct server_context { // If using an alora, there may be uncached tokens that come // before the invocation sequence. When this happens, the // tokens before the invocation sequence need to be - // processed without the adpter in a separate batch, then + // processed without the adapter in a separate batch, then // the adapter needs to be enabled for the remaining tokens. if (lora_all_alora(slot.lora) && slot.alora_invocation_start - 1 > slot.prompt.n_tokens()) { SLT_DBG(slot, "processing pre-alora tokens without the adapter (n_tokens = %d, alora_invocation_start = %d)\n", slot.prompt.n_tokens(), slot.alora_invocation_start);