From 532802f938c6a18cc6a704057ab571f253fd77ed Mon Sep 17 00:00:00 2001 From: Christian Kastner Date: Wed, 11 Jun 2025 19:07:44 +0000 Subject: [PATCH 01/24] Implement GGML_CPU_ALL_VARIANTS for ARM (#14080) * ggml-cpu: Factor out feature detection build from x86 * ggml-cpu: Add ARM feature detection and scoring This is analogous to cpu-feats-x86.cpp. However, to detect compile-time activation of features, we rely on GGML_USE_ which need to be set in cmake, instead of GGML_ that users would set for x86. This is because on ARM, users specify features with GGML_CPU_ARM_ARCH, rather than with individual flags. * ggml-cpu: Implement GGML_CPU_ALL_VARIANTS for ARM Like x86, however to pass around arch flags within cmake, we use GGML_INTERNAL_ as we don't have GGML_. Some features are optional, so we may need to build multiple backends per arch version (armv8.2_1, armv8.2_2, ...), and let the scoring function sort out which one can be used. * ggml-cpu: Limit ARM GGML_CPU_ALL_VARIANTS to Linux for now The other platforms will need their own specific variants. This also fixes the bug that the the variant-building branch was always being executed as the else-branch of GGML_NATIVE=OFF. The branch is moved to an elseif-branch which restores the previous behavior. --- ggml/src/CMakeLists.txt | 42 ++++++++--- ggml/src/ggml-cpu/CMakeLists.txt | 70 +++++++++++++++--- ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp | 94 ++++++++++++++++++++++++ 3 files changed, 183 insertions(+), 23 deletions(-) create mode 100644 ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index d91dbc46f..726da5e04 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -270,17 +270,23 @@ endfunction() function(ggml_add_cpu_backend_variant tag_name) set(GGML_CPU_TAG_NAME ${tag_name}) # other: OPENMP LLAMAFILE CPU_HBM - foreach (feat NATIVE - SSE42 - AVX AVX2 BMI2 AVX_VNNI FMA F16C - AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 - AMX_TILE AMX_INT8 AMX_BF16) - set(GGML_${feat} OFF) - endforeach() + if (GGML_SYSTEM_ARCH STREQUAL "x86") + foreach (feat NATIVE + SSE42 + AVX AVX2 BMI2 AVX_VNNI FMA F16C + AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 + AMX_TILE AMX_INT8 AMX_BF16) + set(GGML_${feat} OFF) + endforeach() - foreach (feat ${ARGN}) - set(GGML_${feat} ON) - endforeach() + foreach (feat ${ARGN}) + set(GGML_${feat} ON) + endforeach() + elseif (GGML_SYSTEM_ARCH STREQUAL "ARM") + foreach (feat ${ARGN}) + set(GGML_INTERNAL_${feat} ON) + endforeach() + endif() ggml_add_cpu_backend_variant_impl(${tag_name}) endfunction() @@ -290,6 +296,8 @@ ggml_add_backend(CPU) if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") + elseif (GGML_CPU_ARM_ARCH) + message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS") endif() if (GGML_SYSTEM_ARCH STREQUAL "x86") ggml_add_cpu_backend_variant(x64) @@ -303,8 +311,20 @@ if (GGML_CPU_ALL_VARIANTS) # MSVC doesn't support AMX ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) endif() + elseif(GGML_SYSTEM_ARCH STREQUAL "ARM" AND CMAKE_SYSTEM_NAME MATCHES "Linux") + # Many of these features are optional so we build versions with popular + # combinations and name the backends based on the version they were + # first released with + ggml_add_cpu_backend_variant(armv8.0_1) + ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD) + ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC) + ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE) + ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8) + ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2) + ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME) + ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME) else() - message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}") + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}") endif() elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 77dfc10df..e4c0fa8d0 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,3 +1,17 @@ +function(ggml_add_cpu_backend_features cpu_name arch) + # The feature detection code is compiled as a separate target so that + # it can be built without the architecture flags + # Since multiple variants of the CPU backend may be included in the same + # build, using set_source_files_properties() to set the arch flags is not possible + set(GGML_CPU_FEATS_NAME ${cpu_name}-feats) + add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp) + target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN}) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) + set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME}) +endfunction() + function(ggml_add_cpu_backend_variant_impl tag_name) if (tag_name) set(GGML_CPU_NAME ggml-cpu-${tag_name}) @@ -143,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name) else() if (GGML_CPU_ARM_ARCH) list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) + elseif(GGML_CPU_ALL_VARIANTS) + if (CMAKE_SYSTEM_NAME MATCHES "Linux") + # Begin with the lowest baseline + set(ARM_MCPU "armv8-a") + set(ARCH_TAGS "") + set(ARCH_DEFINITIONS "") + + # When a feature is selected, bump the MCPU to the first + # version that supported it + if (GGML_INTERNAL_DOTPROD) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+dotprod") + list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD) + endif() + if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+fp16") + list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC) + endif() + if (GGML_INTERNAL_SVE) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+sve") + list(APPEND ARCH_DEFINITIONS GGML_USE_SVE) + endif() + if (GGML_INTERNAL_MATMUL_INT8) + set(ARM_MCPU "armv8.6-a") + set(ARCH_TAGS "${ARCH_TAGS}+i8mm") + list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8) + endif() + if (GGML_INTERNAL_SVE2) + set(ARM_MCPU "armv8.6-a") + set(ARCH_TAGS "${ARCH_TAGS}+sve2") + list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2) + endif() + if (GGML_INTERNAL_SME) + set(ARM_MCPU "armv9.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+sme") + list(APPEND ARCH_DEFINITIONS GGML_USE_SME) + endif() + + list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}") + ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS}) + endif() endif() endif() @@ -306,18 +363,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS") endif() - - # The feature detection code is compiled as a separate target so that - # it can be built without the architecture flags - # Since multiple variants of the CPU backend may be included in the same - # build, using set_source_files_properties() to set the arch flags is not possible - set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) - add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/x86/cpu-feats.cpp) - target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include) - target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) - target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) - set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) + ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS}) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC") message(STATUS "PowerPC detected") diff --git a/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp b/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp new file mode 100644 index 000000000..67369147c --- /dev/null +++ b/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp @@ -0,0 +1,94 @@ +#include "ggml-backend-impl.h" + +#if defined(__aarch64__) + +#if defined(__linux__) +#include +#elif defined(__APPLE__) +#include +#endif + +#if !defined(HWCAP2_I8MM) +#define HWCAP2_I8MM (1 << 13) +#endif + +#if !defined(HWCAP2_SME) +#define HWCAP2_SME (1 << 23) +#endif + +struct aarch64_features { + // has_neon not needed, aarch64 has NEON guaranteed + bool has_dotprod = false; + bool has_fp16_va = false; + bool has_sve = false; + bool has_sve2 = false; + bool has_i8mm = false; + bool has_sme = false; + + aarch64_features() { +#if defined(__linux__) + uint32_t hwcap = getauxval(AT_HWCAP); + uint32_t hwcap2 = getauxval(AT_HWCAP2); + + has_dotprod = !!(hwcap & HWCAP_ASIMDDP); + has_fp16_va = !!(hwcap & HWCAP_FPHP); + has_sve = !!(hwcap & HWCAP_SVE); + has_sve2 = !!(hwcap2 & HWCAP2_SVE2); + has_i8mm = !!(hwcap2 & HWCAP2_I8MM); + has_sme = !!(hwcap2 & HWCAP2_SME); +#elif defined(__APPLE__) + int oldp = 0; + size_t size = sizeof(oldp); + + if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) { + has_dotprod = static_cast(oldp); + } + + if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) { + has_i8mm = static_cast(oldp); + } + + if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) { + has_sme = static_cast(oldp); + } + + // Apple apparently does not implement SVE yet +#endif + } +}; + +static int ggml_backend_cpu_aarch64_score() { + int score = 1; + aarch64_features af; + +#ifdef GGML_USE_DOTPROD + if (!af.has_dotprod) { return 0; } + score += 1<<1; +#endif +#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC + if (!af.has_fp16_va) { return 0; } + score += 1<<2; +#endif +#ifdef GGML_USE_SVE + if (!af.has_sve) { return 0; } + score += 1<<3; +#endif +#ifdef GGML_USE_MATMUL_INT8 + if (!af.has_i8mm) { return 0; } + score += 1<<4; +#endif +#ifdef GGML_USE_SVE2 + if (!af.has_sve2) { return 0; } + score += 1<<5; +#endif +#ifdef GGML_USE_SME + if (!af.has_sme) { return 0; } + score += 1<<6; +#endif + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score) + +# endif // defined(__aarch64__) From 2e89f76b7af2c0b827be785e445f2e2b3e52e1ca Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:19:44 -0300 Subject: [PATCH 02/24] common: fix issue with regex_escape routine on windows (#14133) --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 218f1e1dc..e23887c70 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_ std::string regex_escape(const std::string & s) { static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); - return std::regex_replace(s, special_chars, "\\$0"); + return std::regex_replace(s, special_chars, "\\$&"); } std::string string_join(const std::vector & values, const std::string & separator) { From a20b2b05bce6622c585459ebf46f142f113d021c Mon Sep 17 00:00:00 2001 From: compilade Date: Thu, 12 Jun 2025 02:56:04 -0400 Subject: [PATCH 03/24] context : round n_tokens to next multiple of n_seqs when reserving (#14140) This fixes RWKV inference which otherwise failed when the worst case ubatch.n_seq_tokens rounded to 0. --- src/llama-context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b130b484b..525a00d8a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1332,7 +1332,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); if (n_tokens % n_seqs != 0) { - n_tokens = (n_tokens / n_seqs) * n_seqs; + n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs n_outputs = std::min(n_outputs, n_tokens); LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); From 9596506965f65be5d802ecef6a315fe43d2391a8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Jun 2025 10:02:15 +0300 Subject: [PATCH 04/24] kv-cache : fix split_equal handling in unified implementation (#14130) ggml-ci --- src/llama-context.cpp | 2 + src/llama-kv-cache-unified-iswa.cpp | 83 ++++++++++++++------ src/llama-kv-cache-unified.cpp | 114 +++++++++++++++++----------- 3 files changed, 128 insertions(+), 71 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 525a00d8a..8cea21d69 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -877,6 +877,8 @@ int llama_context::encode(llama_batch & inp_batch) { memcpy(cross.v_embd.data(), embd, ggml_nbytes(t_embd)); // remember the sequence ids used during the encoding - needed for cross attention later + // TODO: the seuqence indexing here is likely not correct in the general case + // probably works only for split_simple cross.seq_ids_enc.resize(n_tokens); for (int32_t i = 0; i < n_tokens; i++) { cross.seq_ids_enc[i].clear(); diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp index 28d182654..caa58ea9a 100644 --- a/src/llama-kv-cache-unified-iswa.cpp +++ b/src/llama-kv-cache-unified-iswa.cpp @@ -98,33 +98,66 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { GGML_UNUSED(embd_pooled); - // TODO: if we fail with split_simple, we should attempt different splitting strategies + // first try simple split + do { + auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); + + std::vector ubatches; + + while (sbatch.n_tokens > 0) { + auto ubatch = sbatch.split_simple(n_ubatch); + + ubatches.push_back(ubatch); + } + + auto heads_base = kv_base->prepare(ubatches); + if (heads_base.empty()) { + break; + } + + auto heads_swa = kv_swa->prepare(ubatches); + if (heads_swa.empty()) { + break; + } + + assert(heads_base.size() == heads_swa.size()); + + return std::make_unique( + this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches)); + } while (false); + + // if it fails, try equal split + do { + auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all); + + std::vector ubatches; + + while (sbatch.n_tokens > 0) { + auto ubatch = sbatch.split_equal(n_ubatch); + + ubatches.push_back(ubatch); + } + + auto heads_base = kv_base->prepare(ubatches); + if (heads_base.empty()) { + break; + } + + auto heads_swa = kv_swa->prepare(ubatches); + if (heads_swa.empty()) { + break; + } + + assert(heads_base.size() == heads_swa.size()); + + return std::make_unique( + this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches)); + } while (false); + + // TODO: if we fail again, we should attempt different splitting strategies // but to do that properly, we first have to refactor the batches to be more flexible - auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); - - std::vector ubatches; - - while (sbatch.n_tokens > 0) { - auto ubatch = sbatch.split_simple(n_ubatch); - - ubatches.push_back(ubatch); - } - - auto heads_base = kv_base->prepare(ubatches); - if (heads_base.empty()) { - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); - } - - auto heads_swa = kv_swa->prepare(ubatches); - if (heads_swa.empty()) { - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); - } - - assert(heads_base.size() == heads_swa.size()); - - return std::make_unique( - this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches)); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() { diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index 1a9f4e315..ddeb138f3 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -314,20 +314,24 @@ llama_memory_state_ptr llama_kv_cache_unified::init_batch( bool logits_all) { GGML_UNUSED(embd_pooled); - auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); + do { + auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); - std::vector ubatches; - while (sbatch.n_tokens > 0) { - ubatches.push_back(sbatch.split_simple(n_ubatch)); - } + std::vector ubatches; + while (sbatch.n_tokens > 0) { + ubatches.push_back(sbatch.split_simple(n_ubatch)); + } - auto heads = prepare(ubatches); - if (heads.empty()) { - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); - } + auto heads = prepare(ubatches); + if (heads.empty()) { + break; + } - return std::make_unique( - this, std::move(sbatch), std::move(heads), std::move(ubatches)); + return std::make_unique( + this, std::move(sbatch), std::move(heads), std::move(ubatches)); + } while (false); + + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } llama_memory_state_ptr llama_kv_cache_unified::init_full() { @@ -521,7 +525,6 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { } if (debug > 0) { - LLAMA_LOG_CONT("\n"); LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa); if ((debug == 2 && n_swa > 0) || debug > 2) { @@ -530,7 +533,13 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { if (cells.is_empty(i)) { ss += '.'; } else { - ss += std::to_string(cells.seq_get(i)); + assert(cells.seq_count(i) >= 1); + + if (cells.seq_count(i) == 1) { + ss += std::to_string(cells.seq_get(i)); + } else { + ss += 'M'; + } } if (i%256 == 255) { ss += " *"; @@ -636,6 +645,12 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { } void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) { + if (debug > 0) { + LLAMA_LOG_DEBUG("%s: ubatch info:\n", __func__); + LLAMA_LOG_DEBUG("%s: n_tokens = %d, equal_seqs = %d\n", __func__, ubatch.n_tokens, ubatch.equal_seqs); + LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d, n_seqs = %d\n", __func__, ubatch.n_seq_tokens, ubatch.n_seqs); + } + // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES]; @@ -643,22 +658,26 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch seq_pos_max_rm[s] = -1; } - for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { - if (!cells.is_empty(head_cur + i)) { - assert(cells.seq_count(head_cur + i) == 1); + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + for (uint32_t j = 0; j < ubatch.n_seq_tokens; ++j) { + const uint32_t idx = s*ubatch.n_seq_tokens + j; - const llama_seq_id seq_id = cells.seq_get(head_cur + i); - const llama_pos pos = cells.pos_get(head_cur + i); + if (!cells.is_empty(head_cur + idx)) { + assert(cells.seq_count(head_cur + idx) == 1); - seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos); + const llama_seq_id seq_id = cells.seq_get(head_cur + idx); + const llama_pos pos = cells.pos_get(head_cur + idx); - cells.rm(head_cur + i); - } + seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos); - cells.pos_set(head_cur + i, ubatch.pos[i]); + cells.rm(head_cur + idx); + } - for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) { - cells.seq_add(head_cur + i, ubatch.seq_id[i][j]); + cells.pos_set(head_cur + idx, ubatch.pos[idx]); + + for (int32_t i = 0; i < ubatch.n_seq_id[s]; i++) { + cells.seq_add(head_cur + idx, ubatch.seq_id[s][i]); + } } } @@ -677,7 +696,6 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1); } } - // move the head at the end of the slot head = head_cur + ubatch.n_tokens; } @@ -774,14 +792,14 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_ } void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; + const uint32_t n_tokens = ubatch->n_tokens; + const uint32_t n_seq_tokens = ubatch->n_seq_tokens; + const uint32_t n_seqs = ubatch->n_seqs; GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); float * data = (float *) dst->data; - const auto n_kv = dst->ne[0]; + const int64_t n_kv = dst->ne[0]; // Use only the previous KV cells of the correct sequence for each token of the ubatch. // It's assumed that if a token in the batch has multiple sequences, they are equivalent. @@ -795,12 +813,14 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub // xxxxx----- // xxxxx----- // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615 - for (int h = 0; h < 1; ++h) { - for (int s = 0; s < n_seqs; ++s) { + for (uint32_t h = 0; h < 1; ++h) { + for (uint32_t s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; - for (int j = 0; j < n_seq_tokens; ++j) { - const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j]; + for (uint32_t j = 0; j < n_seq_tokens; ++j) { + const uint32_t idx = s*n_seq_tokens + j; + + const llama_pos p1 = ubatch->pos[idx]; for (uint32_t i = 0; i < n_kv; ++i) { float f = 0.0f; @@ -830,16 +850,16 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub f = -INFINITY; } - data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + data[h*(n_kv*n_tokens) + idx*n_kv + i] = f; } } } // mask padded tokens if (data) { - for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (uint32_t j = 0; j < n_kv; ++j) { - data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + for (uint32_t j = n_tokens; j < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++j) { + for (uint32_t i = 0; i < n_kv; ++i) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; } } } @@ -1490,9 +1510,11 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell seq_rm(dest_seq_id, -1, -1); llama_sbatch sbatch; - llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + llama_ubatch ubatch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - batch.n_tokens = cell_count; + ubatch.n_tokens = cell_count; + ubatch.n_seq_tokens = cell_count; + ubatch.n_seqs = 1; for (uint32_t i = 0; i < cell_count; ++i) { llama_pos pos; @@ -1512,18 +1534,18 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell io.read_to(&seq_id, sizeof(seq_id)); } - batch.pos[i] = pos; - batch.n_seq_id[i] = n_seq_id; - batch.seq_id[i] = &dest_seq_id; + ubatch.pos[i] = pos; + ubatch.n_seq_id[i] = n_seq_id; + ubatch.seq_id[i] = &dest_seq_id; } - const auto head_cur = find_slot(batch); + const auto head_cur = find_slot(ubatch); if (head_cur < 0) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } - apply_ubatch(head_cur, batch); + apply_ubatch(head_cur, ubatch); // keep the head at the old position because we will read the KV data into it in state_read_data() head = head_cur; @@ -1531,8 +1553,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells GGML_ASSERT(head_cur + cell_count <= cells.size()); - GGML_ASSERT(cells.pos_get(head_cur) == batch.pos[0]); - GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == batch.pos[cell_count - 1]); + GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]); + GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]); GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id)); GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id)); } else { From e2c0b6e46a5596665569ae765f0993cea2619af6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Jun 2025 10:14:24 +0300 Subject: [PATCH 05/24] cmake : handle whitepsaces in path during metal build (#14126) * cmake : handle whitepsaces in path during metal build ggml-ci * cont : proper fix ggml-ci --------- Co-authored-by: Daniel Bevenius --- ggml/src/ggml-metal/CMakeLists.txt | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index e22232780..77187efc1 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -44,21 +44,22 @@ if (GGML_METAL_EMBED_LIBRARY) set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp") add_custom_command( - OUTPUT ${METALLIB_EMBED_ASM} + OUTPUT "${METALLIB_EMBED_ASM}" COMMAND echo "Embedding Metal library" - COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP} - COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED} - COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}" + COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}" + COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}" + COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}" + COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}" + COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}" + COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}" + COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}" DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h COMMENT "Generate assembly for embedded Metal library" + VERBATIM ) - target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM}) + target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}") else() if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: From c3ee46fab49a765d2e32e171e9ed7a5fa121dd9c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Jun 2025 11:49:26 +0300 Subject: [PATCH 06/24] batch : remove logits_all flag (#14141) ggml-ci --- src/llama-batch.cpp | 10 ++-------- src/llama-batch.h | 4 +--- src/llama-context.cpp | 6 +++--- src/llama-kv-cache-recurrent.cpp | 4 ++-- src/llama-kv-cache-recurrent.h | 3 +-- src/llama-kv-cache-unified-iswa.cpp | 6 +++--- src/llama-kv-cache-unified-iswa.h | 3 +-- src/llama-kv-cache-unified.cpp | 5 ++--- src/llama-kv-cache-unified.h | 3 +-- src/llama-memory.h | 3 +-- 10 files changed, 17 insertions(+), 30 deletions(-) diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 6a19a2431..58787fdba 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -105,12 +105,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s ubatch.seq_id = batch->seq_id + seq.offset; } } - if (logits_all) { - for (size_t i = 0; i < length; ++i) { - ubatch.output[ubatch.n_tokens + i] = 1; - out_ids.push_back(ids[seq.offset + i]); - } - } else if (batch->logits) { + if (batch->logits) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { size_t id = ids[seq.offset + i]; @@ -197,11 +192,10 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) { return ubatch; } -llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) { +llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split) { GGML_ASSERT(batch.n_tokens >= 0); this->batch = &batch; this->n_embd = n_embd; - this->logits_all = logits_all; n_tokens = batch.n_tokens; ids.resize(n_tokens); diff --git a/src/llama-batch.h b/src/llama-batch.h index b8260b94f..989fb6cf9 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -39,8 +39,6 @@ struct llama_sbatch { size_t n_embd; - bool logits_all; // TODO: remove once lctx.logits_all is removed too - // sorted indices into the batch std::vector ids; // batch indices of the output @@ -76,7 +74,7 @@ struct llama_sbatch { llama_ubatch split_seq(size_t n_ubatch); llama_sbatch() = default; - llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false); + llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false); }; // temporary allocate memory for the input batch if needed diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8cea21d69..ebcba6993 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -764,7 +764,7 @@ int llama_context::encode(llama_batch & inp_batch) { const int64_t n_embd = hparams.n_embd; - llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true); const llama_ubatch ubatch = sbatch.split_simple(n_tokens); @@ -976,7 +976,7 @@ int llama_context::decode(llama_batch & inp_batch) { llama_memory_state_ptr mstate; while (true) { - mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all); + mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled); if (!mstate) { return -2; } @@ -2080,7 +2080,7 @@ void llama_context::opt_epoch_iter( int64_t n_outputs_all = n_tokens_all; - auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true); + auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled); if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__); break; diff --git a/src/llama-kv-cache-recurrent.cpp b/src/llama-kv-cache-recurrent.cpp index f8cdd5280..de23b4ad2 100644 --- a/src/llama-kv-cache-recurrent.cpp +++ b/src/llama-kv-cache-recurrent.cpp @@ -359,10 +359,10 @@ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const { return result; } -llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { +llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled) { GGML_UNUSED(embd_pooled); - auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all); + auto sbatch = llama_sbatch(batch, hparams.n_embd, false); std::vector ubatches; diff --git a/src/llama-kv-cache-recurrent.h b/src/llama-kv-cache-recurrent.h index 4b33bafd7..d7c02ea87 100644 --- a/src/llama-kv-cache-recurrent.h +++ b/src/llama-kv-cache-recurrent.h @@ -32,8 +32,7 @@ public: llama_memory_state_ptr init_batch( const llama_batch & batch, uint32_t n_ubatch, - bool embd_pooled, - bool logits_all) override; + bool embd_pooled) override; llama_memory_state_ptr init_full() override; diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp index caa58ea9a..9814f7663 100644 --- a/src/llama-kv-cache-unified-iswa.cpp +++ b/src/llama-kv-cache-unified-iswa.cpp @@ -95,12 +95,12 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { return kv_swa->seq_pos_max(seq_id); } -llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { +llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled) { GGML_UNUSED(embd_pooled); // first try simple split do { - auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); + auto sbatch = llama_sbatch(batch, hparams.n_embd, true); std::vector ubatches; @@ -128,7 +128,7 @@ llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch // if it fails, try equal split do { - auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all); + auto sbatch = llama_sbatch(batch, hparams.n_embd, false); std::vector ubatches; diff --git a/src/llama-kv-cache-unified-iswa.h b/src/llama-kv-cache-unified-iswa.h index 3dbf33ed7..d114c7378 100644 --- a/src/llama-kv-cache-unified-iswa.h +++ b/src/llama-kv-cache-unified-iswa.h @@ -34,8 +34,7 @@ public: llama_memory_state_ptr init_batch( const llama_batch & batch, uint32_t n_ubatch, - bool embd_pooled, - bool logits_all) override; + bool embd_pooled) override; llama_memory_state_ptr init_full() override; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index ddeb138f3..89606c598 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -310,12 +310,11 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { llama_memory_state_ptr llama_kv_cache_unified::init_batch( const llama_batch & batch, uint32_t n_ubatch, - bool embd_pooled, - bool logits_all) { + bool embd_pooled) { GGML_UNUSED(embd_pooled); do { - auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); + auto sbatch = llama_sbatch(batch, hparams.n_embd, true); std::vector ubatches; while (sbatch.n_tokens > 0) { diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h index cf4c691ba..d6dcd19f2 100644 --- a/src/llama-kv-cache-unified.h +++ b/src/llama-kv-cache-unified.h @@ -59,8 +59,7 @@ public: llama_memory_state_ptr init_batch( const llama_batch & batch, uint32_t n_ubatch, - bool embd_pooled, - bool logits_all) override; + bool embd_pooled) override; llama_memory_state_ptr init_full() override; diff --git a/src/llama-memory.h b/src/llama-memory.h index 991aae781..42e226dc0 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -73,8 +73,7 @@ struct llama_memory_i { virtual llama_memory_state_ptr init_batch( const llama_batch & batch, uint32_t n_ubatch, - bool embd_pooled, - bool logits_all) = 0; + bool embd_pooled) = 0; // simulate full cache, used for allocating worst-case compute buffers virtual llama_memory_state_ptr init_full() = 0; From f6e1a7aa8787b5c00acba6370cb70a0beff48b1e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Jun 2025 11:50:01 +0300 Subject: [PATCH 07/24] context : simplify output counting logic during decode (#14142) * batch : remove logits_all flag ggml-ci * context : simplify output counting logic during decode ggml-ci * cont : fix comments --- src/llama-batch.cpp | 7 ++++--- src/llama-batch.h | 2 +- src/llama-context.cpp | 42 +++++++++++++++++++++++------------------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 58787fdba..69e0d7549 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -306,9 +306,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 batch.seq_id = seq_id.data(); } if (!batch.logits) { - logits.resize(batch.n_tokens); - logits[logits.size() - 1] = true; - batch.logits = logits.data(); + // by default return the output only for the last token + output.resize(batch.n_tokens); + output[output.size() - 1] = true; + batch.logits = output.data(); } } diff --git a/src/llama-batch.h b/src/llama-batch.h index 989fb6cf9..7ad82b528 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -85,7 +85,7 @@ struct llama_batch_allocr { std::vector pos; std::vector n_seq_id; std::vector seq_id; - std::vector logits; + std::vector output; // optionally fulfill the batch returned by llama_batch_get_one llama_batch_allocr(struct llama_batch in_batch, llama_pos p0); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index ebcba6993..2e551bf6e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -758,6 +758,7 @@ int llama_context::encode(llama_batch & inp_batch) { t_compute_start_us = ggml_time_us(); } + // TODO: this clear of the buffer can easily be forgotten - need something better embd_seq.clear(); n_queued_tokens += n_tokens; @@ -940,6 +941,25 @@ int llama_context::decode(llama_batch & inp_batch) { } } + // this indicates we are doing pooled embedding + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + int64_t n_outputs_all = 0; + + // count outputs + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs_all += batch.logits[i] != 0; + } + + if (embd_pooled) { + // require that all tokens are output + if (n_outputs_all != n_tokens_all) { + LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %" PRId64 ", n_tokens_all = %" PRId64 ")\n", + __func__, n_outputs_all, n_tokens_all); + return -1; + } + } + GGML_ASSERT(n_tokens_all <= cparams.n_batch); GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); @@ -949,25 +969,9 @@ int llama_context::decode(llama_batch & inp_batch) { } n_queued_tokens += n_tokens_all; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens - const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - + // TODO: this clear of the buffer can easily be forgotten - need something better embd_seq.clear(); - int64_t n_outputs_all = 0; - - // count outputs - if (batch.logits && !embd_pooled) { - for (uint32_t i = 0; i < n_tokens_all; ++i) { - n_outputs_all += batch.logits[i] != 0; - } - } else if (embd_pooled) { - n_outputs_all = n_tokens_all; - } else { - // keep last output only - n_outputs_all = 1; - } - bool did_optimize = false; // handle any pending defrags/shifts @@ -1029,7 +1033,7 @@ int llama_context::decode(llama_batch & inp_batch) { do { const auto & ubatch = mstate->get_ubatch(); - // count the outputs in this u_batch + // count the outputs in this ubatch { int32_t n_outputs_new = 0; @@ -2073,7 +2077,7 @@ void llama_context::opt_epoch_iter( n_queued_tokens += n_tokens_all; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens + // this indicates we are doing pooled embedding const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; embd_seq.clear(); From 7d516443dd7766569110b38e6374649bee6eb1c4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Jun 2025 11:51:38 +0300 Subject: [PATCH 08/24] server : re-enable SWA speculative decoding (#14131) ggml-ci --- tools/server/server.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 1b1cf439b..8efb8b704 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2017,11 +2017,6 @@ struct server_context { params_base.n_cache_reuse = 0; SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled"); } - - if (!params_base.speculative.model.path.empty()) { - SRV_ERR("%s\n", "err: speculative decode is not supported by this context"); - return false; - } } return true; From a681b4ba83a61dce71a4f24e558efe7278d8b1a9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Jun 2025 14:43:09 +0300 Subject: [PATCH 09/24] readme : remove project status link (#14149) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 385ac04d8..928100f3c 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases) [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) -[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ From ed52f3668e633423054a4eab61bb7efee47025ab Mon Sep 17 00:00:00 2001 From: Anton Mitkov Date: Thu, 12 Jun 2025 14:15:11 +0100 Subject: [PATCH 10/24] sycl: Remove not needed copy f16->f32 for dnnl mul mat (#14125) --- ggml/src/ggml-sycl/gemm.hpp | 3 +++ ggml/src/ggml-sycl/ggml-sycl.cpp | 9 +++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index 6cbc7e0f6..5efe03d36 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -65,6 +65,9 @@ public: dnnl::primitive_attr primitive_attr; primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); +#ifdef GGML_SYCL_F16 + primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16); +#endif auto a_mem = dnnl::memory(a_in_md, eng, const_cast(a)); auto b_mem = dnnl::memory(b_in_md, eng, const_cast(b)); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 3693b0a43..feb30304f 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2127,21 +2127,18 @@ inline void ggml_sycl_op_mul_mat_sycl( const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16 ? (const sycl::half *)src1->data + src1_padded_row_size : src1_as_f16.get(); - ggml_sycl_pool_alloc dst_f16(ctx.pool(), row_diff * src1_ncols); #if GGML_SYCL_DNNL if (!g_ggml_sycl_disable_dnn) { DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt(), src0_ptr, DnnlGemmWrapper::to_dt(), - dst_f16.get(), DnnlGemmWrapper::to_dt(), stream); - scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, - " : converting dst to fp32"); - const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst); - to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream); + dst_dd_i, DnnlGemmWrapper::to_dt(), stream); } else #endif { + ggml_sycl_pool_alloc dst_f16(ctx.pool(), row_diff * src1_ncols); + const sycl::half alpha_f16 = 1.0f; const sycl::half beta_f16 = 0.0f; SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( From c33fe8b8c4427202706b1434e9fc8ab5752c9cac Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 13 Jun 2025 08:03:54 +0300 Subject: [PATCH 11/24] vocab : prevent heap overflow when vocab is too small (#14145) ggml-ci --- src/llama-vocab.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ba2e1864e..d8c9d9730 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2768,26 +2768,26 @@ void llama_vocab::impl::print_info() const { LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); // special tokens - if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); } - if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); } - if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); } - if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); } - if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); } - if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); } - if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); } - if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); } + if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); } + if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); } + if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); } + if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); } + if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); } + if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); } + if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); } + if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); } - if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); } + if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); } - if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); } - if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); } - if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); } - if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); } - if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); } - if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); } + if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); } + if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); } + if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); } + if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); } + if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); } + if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); } for (const auto & id : special_eog_ids) { - LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() ); + LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() ); } LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len); From 09cf2c7c655c90e53e100f29b830a788bab0653d Mon Sep 17 00:00:00 2001 From: Christian Kastner Date: Fri, 13 Jun 2025 06:51:34 +0000 Subject: [PATCH 12/24] cmake : Improve build-info.cpp generation (#14156) * cmake: Simplify build-info.cpp generation The rebuild of build-info.cpp still gets triggered when .git/index gets changes. * cmake: generate build-info.cpp in build dir --- common/CMakeLists.txt | 24 +++++++----------------- common/cmake/build-info-gen-cpp.cmake | 24 ------------------------ 2 files changed, 7 insertions(+), 41 deletions(-) delete mode 100644 common/cmake/build-info-gen-cpp.cmake diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 8ba02c146..f43a630c9 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -23,31 +23,21 @@ if(EXISTS "${PROJECT_SOURCE_DIR}/.git") endif() if(EXISTS "${GIT_DIR}/index") - set(GIT_INDEX "${GIT_DIR}/index") + # For build-info.cpp below + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index") else() message(WARNING "Git index not found in git repository.") - set(GIT_INDEX "") endif() else() message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") - set(GIT_INDEX "") endif() -# Add a custom command to rebuild build-info.cpp when .git/index changes -add_custom_command( - OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp" - COMMENT "Generating build details from Git" - COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} - -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR} - -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake" - WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}" - DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} - VERBATIM -) +set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in") +set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp") +configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) + set(TARGET build_info) -add_library(${TARGET} OBJECT build-info.cpp) +add_library(${TARGET} OBJECT ${OUTPUT_FILE}) if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() diff --git a/common/cmake/build-info-gen-cpp.cmake b/common/cmake/build-info-gen-cpp.cmake deleted file mode 100644 index fbc92b52c..000000000 --- a/common/cmake/build-info-gen-cpp.cmake +++ /dev/null @@ -1,24 +0,0 @@ -include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) - -set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in") -set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp") - -# Only write the build info if it changed -if(EXISTS ${OUTPUT_FILE}) - file(READ ${OUTPUT_FILE} CONTENTS) - string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS}) - set(OLD_COMMIT ${CMAKE_MATCH_1}) - string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS}) - set(OLD_COMPILER ${CMAKE_MATCH_1}) - string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS}) - set(OLD_TARGET ${CMAKE_MATCH_1}) - if ( - NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR - NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR - NOT OLD_TARGET STREQUAL BUILD_TARGET - ) - configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) - endif() -else() - configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) -endif() From c61285e7396c8e526fe7794c19e8d4f1c99bfc51 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 13 Jun 2025 08:45:37 +0100 Subject: [PATCH 13/24] SYCL: Bump oneMath commit (#14152) Update oneMath commit to merged PR https://github.com/uxlfoundation/oneMath/pull/669 which adds SYCL-Graph support for recording CUDA BLAS commands. With this change the `MUL_MAT` tests now pass on DPC++ CUDA backends with SYCL-Graph enabled. Prior to this change, an error would be thrown. ``` $ GGML_SYCL_DISABLE_GRAPH=0 ./bin/test-backend-ops -b SYCL0 -o MUL_MAT -p type_a=f16,type_b=f32,m=16,n=1,k=256,bs=\\[1,1\\],nr=\\[2 UR CUDA ERROR: Value: 700 Name: CUDA_ERROR_ILLEGAL_ADDRESS Description: an illegal memory access was encountered Function: operator() Source Location: $HOME/dpcpp/unified-runtime/source/adapters/cuda/queue.cpp:154 Native API failed. Native API returns: 2147483646 (UR_RESULT_ERROR_UNKNOWN) Exception caught at file:$HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp, line:3598, func:operator() SYCL error: CHECK_TRY_ERROR((stream)->wait()): Meet error in this line code! in function ggml_backend_sycl_synchronize at $HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp:3598 $HOME/llama.cpp/ggml/src/ggml-sycl/../ggml-sycl/common.hpp:118: SYCL error Could not attach to process. If your uid matches the uid of the target process, check the setting of /proc/sys/kernel/yama/ptrace_scope, or try again as the root user. For more details, see /etc/sysctl.d/10-ptrace.conf ptrace: Operation not permitted. No stack. The program is not being run. ``` --- ggml/src/ggml-sycl/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 2a0045bcc..efd78b912 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -142,7 +142,7 @@ else() FetchContent_Declare( ONEMATH GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git - GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a + GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142 ) FetchContent_MakeAvailable(ONEMATH) # Create alias to match with find_package targets name From 0889eba570126f8a2f5a0e88fde776bbc91cca66 Mon Sep 17 00:00:00 2001 From: Anton Mitkov Date: Fri, 13 Jun 2025 08:51:39 +0100 Subject: [PATCH 14/24] sycl: Adding additional cpy dbg print output (#14034) --- ggml/src/ggml-sycl/common.hpp | 41 +++++++++++++++----------------- ggml/src/ggml-sycl/cpy.cpp | 3 +-- ggml/src/ggml-sycl/ggml-sycl.cpp | 26 ++++++++++---------- 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 4f17699a5..753b4af14 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -513,9 +513,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) { bool gpu_has_xmx(sycl::device &dev); -template void debug_print_array(const std::string & prefix, const T array[N]) { +template std::string debug_get_array_str(const std::string & prefix, const T array[N]) { if (LIKELY(!g_ggml_sycl_debug)) { - return; + return ""; } std::stringstream ss; ss << prefix << "=["; @@ -526,29 +526,26 @@ template void debug_print_array(const std::string & prefix, con ss << array[N - 1]; } ss << "]"; - GGML_SYCL_DEBUG("%s", ss.str().c_str()); + return ss.str(); } -inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor, - const std::string & suffix = "") { - if (LIKELY(!g_ggml_sycl_debug)) { - return; - } - GGML_SYCL_DEBUG("%s=", prefix.c_str()); +inline std::string debug_get_tensor_str(const std::string &prefix, + const ggml_tensor *tensor, const std::string &suffix = "") { + std::stringstream ss; + if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); } + ss << prefix.c_str() << "="; if (tensor) { - GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type)); - debug_print_array(";ne", tensor->ne); - debug_print_array(";nb", tensor->nb); - if (!ggml_is_contiguous(tensor)) { - GGML_SYCL_DEBUG(";strided"); - } - if (ggml_is_permuted(tensor)) { - GGML_SYCL_DEBUG(";permuted"); - } + ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type); + ss << debug_get_array_str(";ne", tensor->ne); + ss << debug_get_array_str(";nb", tensor->nb); + + if (!ggml_is_contiguous(tensor)) { ss << ";strided"; } + if (ggml_is_permuted(tensor)) { ss << ";permuted"; } } else { - GGML_SYCL_DEBUG("nullptr"); + ss << "nullptr"; } - GGML_SYCL_DEBUG("%s", suffix.c_str()); + ss << suffix; + return ss.str(); } // Use scope_op_debug_print to log operations coming from running a model @@ -564,10 +561,10 @@ struct scope_op_debug_print { return; } GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data()); - debug_print_tensor(" dst", dst); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str()); if (dst) { for (std::size_t i = 0; i < num_src; ++i) { - debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str()); } } GGML_SYCL_DEBUG("%s\n", suffix.data()); diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp index 56373b4d0..bec137140 100644 --- a/ggml/src/ggml-sycl/cpy.cpp +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -723,8 +723,7 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field - scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, - std::string(" src0 type=") + ggml_type_name(src0->type)); + scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0)); const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index feb30304f..4b7610362 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -347,7 +347,7 @@ static enum ggml_status ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor, "\n"); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str()); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; if (tensor->view_src != NULL) { @@ -385,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); @@ -413,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -444,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, ggml_tensor *dst) try { bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer); GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": dst=", dst); - debug_print_tensor(" src=", src); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str()); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str()); GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); if (is_cpy_supported) { ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; @@ -525,7 +525,7 @@ catch (sycl::exception const &exc) { static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; SYCL_CHECK(ggml_sycl_set_device(ctx->device)); @@ -805,7 +805,7 @@ static enum ggml_status ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor, "\n"); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str()); GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; @@ -891,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); @@ -947,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); @@ -3863,7 +3863,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -3884,7 +3884,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -3907,8 +3907,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer); GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": dst=", dst); - debug_print_tensor(" src=", src); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str()); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str()); GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); if (is_cpy_supported) { /* From ffad04397399ea1650fda6560c7c753059804876 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 13 Jun 2025 11:18:25 +0300 Subject: [PATCH 15/24] server : fix SWA condition for full context reprocess (#14163) ggml-ci --- tools/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 8efb8b704..b439d8b19 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3217,7 +3217,7 @@ struct server_context { } const auto n_swa = llama_model_n_swa(model); - if (pos_min > slot.n_past - n_swa) { + if (pos_min > std::max(0, slot.n_past - n_swa)) { SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa); SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n", "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); From d714dadb57d8feaa03d13b79345a4c3382172d61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C4=90inh=20Tr=E1=BB=8Dng=20Huy?= <77562200+huydt84@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:34:08 +0900 Subject: [PATCH 16/24] pooling : make cls_b and cls_out_b optional (#14165) Co-authored-by: dinhhuy --- src/llama-graph.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e74c9ff53..4493fb164 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1556,23 +1556,30 @@ void llm_graph_context::build_pooling( ggml_tensor * inp_cls = build_inp_cls(); inp = ggml_get_rows(ctx0, inp, inp_cls); - if (cls != nullptr && cls_b != nullptr) { + if (cls) { // classification head // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b); + cur = ggml_mul_mat(ctx0, cls, inp); + if (cls_b) { + cur = ggml_add(ctx0, cur, cls_b); + } cur = ggml_tanh(ctx0, cur); // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 if (cls_out) { - GGML_ASSERT(cls_out_b != nullptr); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b); + cur = ggml_mul_mat(ctx0, cls_out, cur); + if (cls_out_b) { + cur = ggml_add(ctx0, cur, cls_out_b); + } } } else if (cls_out) { // Single layer classification head (direct projection) // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476 - GGML_ASSERT(cls_out_b != nullptr); - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b); + cur = ggml_mul_mat(ctx0, cls_out, inp); + if (cls_out_b) { + cur = ggml_add(ctx0, cur, cls_out_b); + } } else { GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b"); } From cc8d08187918c6f643c3ffabb7b1ac21aa19f3d1 Mon Sep 17 00:00:00 2001 From: Christian Kastner Date: Fri, 13 Jun 2025 08:38:52 +0000 Subject: [PATCH 17/24] cmake: Add ability to pass in LLAMA_BUILD_NUMBER/COMMIT (#14167) * cmake: Add ability to pass in LLAMA_BUILD_NUMBER/COMMIT * cmake: Pass on LLAMA_BUILD_* to GGML_BUILD_* --- CMakeLists.txt | 14 ++++++++++---- common/build-info.cpp.in | 4 ++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f73470dff..50801cdc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) +if (NOT DEFINED LLAMA_BUILD_NUMBER) + set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) +endif() +if (NOT DEFINED LLAMA_BUILD_COMMIT) + set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) +endif() +set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) + # override ggml options set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) @@ -155,6 +163,8 @@ if (LLAMA_USE_SYSTEM_GGML) endif() if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML) + set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER}) + set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT}) add_subdirectory(ggml) # ... otherwise assume ggml is added by a parent CMakeLists.txt endif() @@ -204,10 +214,6 @@ endif() include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) -set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) -set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) - set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") diff --git a/common/build-info.cpp.in b/common/build-info.cpp.in index 0b945aa68..aee9d7eaf 100644 --- a/common/build-info.cpp.in +++ b/common/build-info.cpp.in @@ -1,4 +1,4 @@ -int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; -char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; +int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@; +char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@"; char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; From b7cc7745e38141060845145af0a1fd489d8e3e33 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 13 Jun 2025 11:55:44 +0300 Subject: [PATCH 18/24] readme : remove survey link (#14168) --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 928100f3c..90c7364df 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md) -- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9) - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 From 60c666347becacff81cd4bc9a52038ba71038e41 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 13 Jun 2025 13:47:55 +0300 Subject: [PATCH 19/24] batch : rework llama_batch_allocr (#14153) * batch : rework llama_batch_allocr ggml-ci * cont : move validation inside class ggml-ci * cont : move output counting to class ggml-ci * cont : minor ggml-ci * batch : add TODOs ggml-ci --- src/llama-batch.cpp | 68 +++++++++++++++- src/llama-batch.h | 27 ++++-- src/llama-context.cpp | 145 +++++++++++++-------------------- src/llama-context.h | 14 ++-- src/llama-graph.cpp | 7 ++ src/llama-graph.h | 6 +- src/llama-kv-cache-unified.cpp | 1 + 7 files changed, 162 insertions(+), 106 deletions(-) diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 69e0d7549..9066d5a9b 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -1,5 +1,9 @@ #include "llama-batch.h" +#include "llama-impl.h" +#include "llama-cparams.h" +#include "llama-vocab.h" + #include #include #include @@ -279,9 +283,42 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple ); } -llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) { - batch = in_batch; +llama_batch_allocr::llama_batch_allocr() = default; + +bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0) { + clear(); + + batch = batch_inp; + GGML_ASSERT(batch.n_tokens > 0); + + if (!batch.pos) { + if (batch.seq_id) { + LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__); + return false; + } + } + + if (batch.token) { + for (int32_t i = 0; i < batch.n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return false; + } + } + } + + if (batch.seq_id) { + for (int32_t i = 0; i < batch.n_tokens; ++i) { + for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) { + if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { + LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_PARALLEL_SEQUENCES); + return false; + } + } + } + } + if (!batch.pos) { assert(p0 >= 0); pos.resize(batch.n_tokens); @@ -290,6 +327,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 } batch.pos = pos.data(); } + if (!batch.n_seq_id) { n_seq_id.resize(batch.n_tokens); for (int32_t i = 0; i < batch.n_tokens; i++) { @@ -297,6 +335,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 } batch.n_seq_id = n_seq_id.data(); } + if (!batch.seq_id) { seq_id.resize(batch.n_tokens + 1); seq_id[batch.n_tokens] = NULL; @@ -305,12 +344,37 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 } batch.seq_id = seq_id.data(); } + if (!batch.logits) { // by default return the output only for the last token output.resize(batch.n_tokens); output[output.size() - 1] = true; batch.logits = output.data(); } + + for (int32_t i = 0; i < batch.n_tokens; ++i) { + n_outputs += batch.logits[i] != 0; + } + + return true; +} + +const llama_batch & llama_batch_allocr::get_batch() const { + return batch; +} + +uint32_t llama_batch_allocr::get_n_outputs() const { + return n_outputs; +} + +void llama_batch_allocr::clear() { + n_outputs = 0; + + batch = {}; + pos.clear(); + n_seq_id.clear(); + seq_id.clear(); + output.clear(); } // diff --git a/src/llama-batch.h b/src/llama-batch.h index 7ad82b528..24340b00f 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -18,8 +18,8 @@ struct llama_ubatch { llama_token * token; // [n_tokens] float * embd; // [n_embd, n_tokens] llama_pos * pos; // [n_tokens] - int32_t * n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence - llama_seq_id ** seq_id; // [n_seqs] // TODO: become llama_seq_id * seq_id; + int32_t * n_seq_id; // [n_seqs] + llama_seq_id ** seq_id; // [n_seqs] int8_t * output; // [n_tokens] }; @@ -78,15 +78,28 @@ struct llama_sbatch { }; // temporary allocate memory for the input batch if needed -struct llama_batch_allocr { - struct llama_batch batch; +class llama_batch_allocr { +public: + llama_batch_allocr(); + + // optionally fulfill the batch returned by llama_batch_get_one + bool init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0); + + const llama_batch & get_batch() const; + + uint32_t get_n_outputs() const; + +private: + void clear(); + + llama_batch batch; + + uint32_t n_outputs; std::array seq_id_0 = { 0 }; // default sequence id + std::vector pos; std::vector n_seq_id; std::vector seq_id; std::vector output; - - // optionally fulfill the batch returned by llama_batch_get_one - llama_batch_allocr(struct llama_batch in_batch, llama_pos p0); }; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2e551bf6e..ec1e1189b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1,6 +1,7 @@ #include "llama-context.h" #include "llama-impl.h" +#include "llama-batch.h" #include "llama-io.h" #include "llama-memory.h" #include "llama-mmap.h" @@ -18,7 +19,8 @@ llama_context::llama_context( const llama_model & model, llama_context_params params) : - model(model) { + model(model), + batch_allocr(std::make_unique()) { LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); t_start_us = model.t_start_us; @@ -494,7 +496,7 @@ float * llama_context::get_logits() { } float * llama_context::get_logits_ith(int32_t i) { - int32_t j = -1; + int64_t j = -1; try { if (logits == nullptr) { @@ -517,7 +519,7 @@ float * llama_context::get_logits_ith(int32_t i) { } if (j >= n_outputs) { // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs)); } return logits + j*model.vocab.n_tokens(); @@ -536,7 +538,7 @@ float * llama_context::get_embeddings() { } float * llama_context::get_embeddings_ith(int32_t i) { - int32_t j = -1; + int64_t j = -1; try { if (embd == nullptr) { @@ -559,7 +561,7 @@ float * llama_context::get_embeddings_ith(int32_t i) { } if (j >= n_outputs) { // This should not happen - throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); + throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs)); } return embd + j*model.hparams.n_embd; @@ -719,40 +721,27 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, return res; } -int llama_context::encode(llama_batch & inp_batch) { - if (inp_batch.n_tokens == 0) { +int llama_context::encode(const llama_batch & batch_inp) { + if (batch_inp.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } // temporary allocate memory for the input batch if needed // note: during encode, we always pass the full sequence starting from pos = 0 - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0); + if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : 0)) { + LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); + return -1; + } - const llama_batch & batch = batch_allocr.batch; - const int32_t n_tokens = batch.n_tokens; + const llama_batch & batch = batch_allocr->get_batch(); - const auto & hparams = model.hparams; + const uint32_t n_tokens = batch.n_tokens; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - // TODO: move the validation to the llama_batch_allocr - if (batch.token) { - for (int32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - - if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { - LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES); - throw -1; - } - } - } - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); + GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); if (t_compute_start_us == 0) { t_compute_start_us = ggml_time_us(); @@ -763,6 +752,8 @@ int llama_context::encode(llama_batch & inp_batch) { n_queued_tokens += n_tokens; + const auto & hparams = model.hparams; + const int64_t n_embd = hparams.n_embd; llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true); @@ -775,7 +766,7 @@ int llama_context::encode(llama_batch & inp_batch) { return -2; }; - for (int32_t i = 0; i < n_tokens; ++i) { + for (uint32_t i = 0; i < n_tokens; ++i) { output_ids[i] = i; } @@ -831,7 +822,8 @@ int llama_context::encode(llama_batch & inp_batch) { GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - for (int32_t i = 0; i < n_tokens; i++) { + // TODO: fix indexing [UBATCH_IDX] + for (uint32_t i = 0; i < n_tokens; i++) { const llama_seq_id seq_id = ubatch.seq_id[i][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { continue; @@ -846,6 +838,7 @@ int llama_context::encode(llama_batch & inp_batch) { auto & embd_seq_out = embd_seq; const uint32_t n_cls_out = hparams.n_cls_out; + // TODO: fix indexing [UBATCH_IDX] for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { @@ -878,13 +871,11 @@ int llama_context::encode(llama_batch & inp_batch) { memcpy(cross.v_embd.data(), embd, ggml_nbytes(t_embd)); // remember the sequence ids used during the encoding - needed for cross attention later - // TODO: the seuqence indexing here is likely not correct in the general case - // probably works only for split_simple cross.seq_ids_enc.resize(n_tokens); - for (int32_t i = 0; i < n_tokens; i++) { + for (uint32_t i = 0; i < n_tokens; i++) { cross.seq_ids_enc[i].clear(); - for (int s = 0; s < ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; + for (int s = 0; s < batch.n_seq_id[i]; s++) { + llama_seq_id seq_id = batch.seq_id[i][s]; cross.seq_ids_enc[i].insert(seq_id); } } @@ -893,68 +884,44 @@ int llama_context::encode(llama_batch & inp_batch) { return 0; } -int llama_context::decode(llama_batch & inp_batch) { +int llama_context::decode(const llama_batch & batch_inp) { if (!memory) { LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__); - return encode(inp_batch); + return encode(batch_inp); } - if (inp_batch.n_tokens == 0) { + if (batch_inp.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } - if (!inp_batch.pos) { - if (inp_batch.seq_id) { - LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__); - return -1; - } + // temporary allocate memory for the input batch if needed + if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : memory->seq_pos_max(0) + 1)) { + LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); + return -1; } - // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : memory->seq_pos_max(0) + 1); - - const llama_batch & batch = batch_allocr.batch; + const llama_batch & batch = batch_allocr->get_batch(); const auto & vocab = model.vocab; const auto & hparams = model.hparams; const int32_t n_vocab = vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens_all = batch.n_tokens; - const int64_t n_embd = hparams.n_embd; + const uint32_t n_tokens_all = batch.n_tokens; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - // TODO: move the validation to the llama_batch_allocr - if (batch.token) { - for (int64_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - return -1; - } - - if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { - LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES); - return -1; - } - } - } - // this indicates we are doing pooled embedding const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; - int64_t n_outputs_all = 0; - - // count outputs - for (uint32_t i = 0; i < n_tokens_all; ++i) { - n_outputs_all += batch.logits[i] != 0; - } + const uint32_t n_outputs_all = batch_allocr->get_n_outputs(); if (embd_pooled) { // require that all tokens are output if (n_outputs_all != n_tokens_all) { - LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %" PRId64 ", n_tokens_all = %" PRId64 ")\n", + LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n", __func__, n_outputs_all, n_tokens_all); return -1; } @@ -1024,7 +991,7 @@ int llama_context::decode(llama_batch & inp_batch) { // reserve output buffer if (output_reserve(n_outputs_all) < n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all); return -2; }; @@ -1063,6 +1030,7 @@ int llama_context::decode(llama_batch & inp_batch) { pos_min[s] = std::numeric_limits::max(); } + // TODO: fix sequence indexing for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { const auto & seq_id = ubatch.seq_id[i][0]; @@ -1176,14 +1144,14 @@ int llama_context::decode(llama_batch & inp_batch) { n_outputs = n_outputs_all; // set output mappings - { + if (n_outputs > 0) { bool sorted_output = true; auto & out_ids = mstate->out_ids(); - GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all); + GGML_ASSERT(out_ids.size() == (size_t) n_outputs); - for (int64_t i = 0; i < n_outputs_all; ++i) { + for (int64_t i = 0; i < n_outputs; ++i) { int64_t out_id = out_ids[i]; output_ids[out_id] = i; if (out_id != i) { @@ -1195,20 +1163,22 @@ int llama_context::decode(llama_batch & inp_batch) { // note: this is mostly relevant for recurrent models atm if (!sorted_output) { const uint32_t n_vocab = model.vocab.n_tokens(); - const uint32_t n_embd = model.hparams.n_embd; + const uint64_t n_embd = model.hparams.n_embd; GGML_ASSERT((size_t) n_outputs == out_ids.size()); // TODO: is there something more efficient which also minimizes swaps? // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { + for (uint32_t i = 0; i < n_outputs - 1; ++i) { + uint32_t j_min = i; + for (uint32_t j = i + 1; j < n_outputs; ++j) { if (out_ids[j] < out_ids[j_min]) { j_min = j; } } - if (j_min == i) { continue; } + if (j_min == i) { + continue; + } std::swap(out_ids[i], out_ids[j_min]); if (logits_size > 0) { for (uint32_t k = 0; k < n_vocab; k++) { @@ -1221,8 +1191,10 @@ int llama_context::decode(llama_batch & inp_batch) { } } } + std::fill(output_ids.begin(), output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { + + for (uint32_t i = 0; i < n_outputs; ++i) { output_ids[out_ids[i]] = i; } } @@ -1242,7 +1214,7 @@ int llama_context::decode(llama_batch & inp_batch) { // output // -int32_t llama_context::output_reserve(int32_t n_outputs) { +uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; @@ -1308,8 +1280,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); - this->n_outputs = 0; - this->n_outputs_max = n_outputs_max; + this->n_outputs = 0; return n_outputs_max; } @@ -1800,14 +1771,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { std::vector w_output_pos; - GGML_ASSERT(n_outputs <= n_outputs_max); - w_output_pos.resize(n_outputs); // build a more compact representation of the output ids for (size_t i = 0; i < n_batch(); ++i) { // map an output id to a position in the batch - int32_t pos = output_ids[i]; + int64_t pos = output_ids[i]; if (pos >= 0) { GGML_ASSERT(pos < n_outputs); w_output_pos[pos] = i; @@ -2082,7 +2051,7 @@ void llama_context::opt_epoch_iter( embd_seq.clear(); - int64_t n_outputs_all = n_tokens_all; + uint32_t n_outputs_all = n_tokens_all; auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled); if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { @@ -2092,7 +2061,7 @@ void llama_context::opt_epoch_iter( // reserve output buffer if (output_reserve(n_outputs_all) < n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all); GGML_ABORT("TODO: handle this error"); }; diff --git a/src/llama-context.h b/src/llama-context.h index 2e0da8c83..040f03ae4 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -1,7 +1,6 @@ #pragma once #include "llama.h" -#include "llama-batch.h" #include "llama-cparams.h" #include "llama-graph.h" #include "llama-adapter.h" @@ -13,6 +12,7 @@ #include struct llama_model; +class llama_batch_allocr; class llama_io_read_i; class llama_io_write_i; @@ -102,8 +102,8 @@ struct llama_context { llama_memory_state_i * mstate, ggml_status & ret); - int encode(llama_batch & inp_batch); - int decode(llama_batch & inp_batch); + int encode(const llama_batch & batch_inp); + int decode(const llama_batch & batch_inp); // // state save/load @@ -181,7 +181,7 @@ private: // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. - int32_t output_reserve(int32_t n_outputs); + uint32_t output_reserve(int32_t n_outputs); // // graph @@ -246,8 +246,10 @@ private: // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch - int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers + // reuse the batch_allocr to avoid unnecessary memory allocations + std::unique_ptr batch_allocr; + + uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch std::vector output_ids; // map batch token positions to ids of the logits and embd buffers diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 4493fb164..337fb5cb0 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -139,6 +139,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { std::vector sum(n_tokens, 0); + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; @@ -156,6 +157,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) { } } + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; @@ -180,6 +182,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { uint32_t * data = (uint32_t *) cls->data; memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; @@ -210,6 +213,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch->seq_id[s][0]; @@ -283,6 +287,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { const int32_t ti = s0*n_seq_tokens + i; float f = -INFINITY; + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) { if (hparams.use_alibi) { @@ -322,6 +327,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { const int32_t ti = s0*n_seq_tokens + i; float f = -INFINITY; + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { if (ubatch->seq_id[s0][s] == seq_id) { if (hparams.use_alibi) { @@ -377,6 +383,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_enc; ++i) { float f = -INFINITY; + // TODO: fix indexing [UBATCH_IDX] for (int s = 0; s < ubatch->n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch->seq_id[j][s]; if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) { diff --git a/src/llama-graph.h b/src/llama-graph.h index 88fb77f1d..87813119b 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -378,7 +378,7 @@ struct llm_graph_params { const llama_memory_state_i * mstate; const llama_cross * cross; - int32_t n_outputs; + uint32_t n_outputs; const llm_graph_cb & cb; }; @@ -412,8 +412,8 @@ struct llm_graph_context { const float norm_eps; const float norm_rms_eps; - const int32_t n_tokens; - const int32_t n_outputs; + const int64_t n_tokens; + const int64_t n_outputs; const int32_t n_ctx_orig; // yarn const enum llama_pooling_type pooling_type; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index 89606c598..d4e92eab3 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -674,6 +674,7 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch cells.pos_set(head_cur + idx, ubatch.pos[idx]); + // TODO: fix indexing [UBATCH_IDX] for (int32_t i = 0; i < ubatch.n_seq_id[s]; i++) { cells.seq_add(head_cur + idx, ubatch.seq_id[s][i]); } From 26ff3685bfbaa4c8838d7afd988b17dd5eb99f92 Mon Sep 17 00:00:00 2001 From: ddpasa <112642920+ddpasa@users.noreply.github.com> Date: Fri, 13 Jun 2025 15:17:53 +0200 Subject: [PATCH 20/24] docs : Update multimodal.md (#14122) * Update multimodal.md * Update multimodal.md --- docs/multimodal.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/multimodal.md b/docs/multimodal.md index e849c2a0b..edbd081df 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -107,3 +107,7 @@ NOTE: some models may require large context window, for example: `-c 8192` (tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF (tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF ``` + +## Finding more models: + +GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf From 80709b70a2f87c13ccaf1480b799393109996789 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 13 Jun 2025 18:35:00 +0300 Subject: [PATCH 21/24] batch : add LLAMA_BATCH_DEBUG environment variable (#14172) * batch : add LLAMA_BATCH_DEBUG environment variable ggml-ci * cont : improve seq_id display --- src/llama-batch.cpp | 53 ++++++++++++++++++++++++++++++++++++++++++++- src/llama-batch.h | 2 ++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 9066d5a9b..bdbf76626 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -7,6 +7,7 @@ #include #include #include +#include llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { // clear empty sequences @@ -283,7 +284,10 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple ); } -llama_batch_allocr::llama_batch_allocr() = default; +llama_batch_allocr::llama_batch_allocr() { + const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG"); + debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0; +} bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0) { clear(); @@ -356,6 +360,53 @@ bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab & n_outputs += batch.logits[i] != 0; } + if (debug > 0) { + LLAMA_LOG_DEBUG("%s: input batch info (p0 = %d):\n", __func__, p0); + LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, batch.n_tokens); + LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) batch.token); + LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) batch.embd); + LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) batch.pos); + LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) batch.n_seq_id); + LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) batch.seq_id); + LLAMA_LOG_DEBUG("%s: logits = %p\n", __func__, (void *) batch.logits); + LLAMA_LOG_DEBUG("%s: n_outputs = %d\n", __func__, n_outputs); + + if (debug > 1) { + int seq_id_max = 0; + for (int32_t i = 0; i < batch.n_tokens; ++i) { + for (int s = 0; s < batch.n_seq_id[i]; ++s) { + for (int s = 0; s < batch.n_seq_id[i]; ++s) { + seq_id_max = std::max(seq_id_max, batch.seq_id[i][s]); + } + } + } + ++seq_id_max; + + LLAMA_LOG_DEBUG("%s: token = [\n", __func__); + for (int32_t i = 0; i < batch.n_tokens; ++i) { + std::vector seq_id(seq_id_max); + + for (int s = 0; s < batch.n_seq_id[i]; ++s) { + seq_id[batch.seq_id[i][s]] = 1; + } + + std::stringstream ss; + for (int s = 0; s < seq_id_max; ++s) { + if (seq_id[s]) { + ss << s%10; + } else { + ss << "."; + } + } + + LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n", + __func__, i, batch.token[i], vocab.token_to_piece(batch.token[i]).c_str(), + batch.pos[i], batch.n_seq_id[i], ss.str().c_str(), batch.logits[i]); + } + LLAMA_LOG_DEBUG("%s: ]\n", __func__); + } + } + return true; } diff --git a/src/llama-batch.h b/src/llama-batch.h index 24340b00f..1e0be8ac2 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -102,4 +102,6 @@ private: std::vector n_seq_id; std::vector seq_id; std::vector output; + + int debug; }; From 3cfbbdb44e08fd19429fed6cc85b982a91f0efd5 Mon Sep 17 00:00:00 2001 From: Guy Goldenberg Date: Fri, 13 Jun 2025 19:20:25 +0300 Subject: [PATCH 22/24] Merge commit from fork * vocab : prevent integer overflow during load * Add static cast and GGML_ABORT --------- Co-authored-by: Georgi Gerganov --- src/llama-vocab.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d8c9d9730..07e692208 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -19,6 +19,7 @@ #include #include #include +#include // // helpers @@ -2572,6 +2573,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t // copy piece chars to output text buffer // skip up to 'lstrip' leading spaces before copying auto _try_copy = [=] (const char * token, size_t size) -> int32_t { + if (size >= static_cast(std::numeric_limits::max())) { + GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size); + } + for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { token++; size--; From 40643edb86eb10b471b0f57d4f3f7eb0e06a0df7 Mon Sep 17 00:00:00 2001 From: Svetlozar Georgiev <55534064+sgeor255@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:32:56 +0100 Subject: [PATCH 23/24] sycl: fix docker image (#14144) --- .devops/intel.Dockerfile | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 8cad66052..9ce80a71e 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -49,19 +49,23 @@ COPY --from=build /app/full /app WORKDIR /app -RUN apt-get update \ - && apt-get install -y \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete +RUN apt-get update && \ + apt-get install -y \ + git \ + python3 \ + python3-pip \ + python3-venv && \ + python3 -m venv /opt/venv && \ + . /opt/venv/bin/activate && \ + pip install --upgrade pip setuptools wheel && \ + pip install -r requirements.txt && \ + apt autoremove -y && \ + apt clean -y && \ + rm -rf /tmp/* /var/tmp/* && \ + find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ + find /var/cache -type f -delete +ENV PATH="/opt/venv/bin:$PATH" ENTRYPOINT ["/app/tools.sh"] From fb85a288d72abbd5e5daa8de96e6f8bfa7b5ab46 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 13 Jun 2025 20:03:05 +0300 Subject: [PATCH 24/24] vocab : fix build (#14175) ggml-ci --- src/llama-vocab.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 07e692208..905d7c428 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -9,17 +9,16 @@ #include #include +#include #include -#include #include #include #include +#include #include #include #include #include -#include -#include // // helpers