From 3979f2bb08dd003fd979549600b835dc00a56e50 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Mon, 6 Apr 2026 14:02:37 +0200
Subject: [PATCH 1/7] docs: add hunyuan-ocr gguf, also add test [no ci]
 (#21490)

---
 docs/multimodal.md  | 1 +
 tools/mtmd/tests.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/multimodal.md b/docs/multimodal.md
index f2fc1510c..744347f62 100644
--- a/docs/multimodal.md
+++ b/docs/multimodal.md
@@ -37,6 +37,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
 > - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
 > - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
 > - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
+> - HunyuanOCR: https://github.com/ggml-org/llama.cpp/pull/21395
 
 ## Pre-quantized models

diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
index e081bde87..d6a6b03c8 100755
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -89,6 +89,7 @@ add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
 add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
 add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
 add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
+add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
 
 add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
 add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"

From 482d862bcbf813f5a8393ac05a2c31c647fc2984 Mon Sep 17 00:00:00 2001
From: lainon1 <271530700+lainon1@users.noreply.github.com>
Date: Mon, 6 Apr 2026 13:03:02 +0100
Subject: [PATCH 2/7] server : handle unsuccessful sink.write in chunked
 stream provider (#21478)

Check the return value of sink.write() in the chunked content provider
and return false when the write fails, matching cpp-httplib's own
streaming contract. This prevents logging chunks as sent when the sink
rejected them and properly aborts the stream on connection failure.
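For reference, the contract the fix below relies on looks like this in isolation. A minimal cpp-httplib sketch, not llama.cpp code (route and payload invented): `DataSink::write` returns false once the peer is gone, and returning false from the provider aborts the stream.

```cpp
// Minimal cpp-httplib chunked provider (sketch; route and payload invented).
#include <httplib.h>
#include <string>

int main() {
    httplib::Server svr;

    svr.Get("/stream", [](const httplib::Request &, httplib::Response & res) {
        res.set_chunked_content_provider("text/plain",
            [](size_t /*offset*/, httplib::DataSink & sink) {
                const std::string chunk = "hello\n";
                if (!sink.write(chunk.data(), chunk.size())) {
                    return false; // sink rejected the chunk -> abort the stream
                }
                sink.done();      // no more chunks
                return true;
            });
    });

    svr.listen("127.0.0.1", 8080);
}
```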
---
 tools/server/server-http.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index be2af2622..37e7cbe9c 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -397,8 +397,9 @@ static void process_handler_response(server_http_req_ptr && request, server_http
             std::string chunk;
             bool has_next = response->next(chunk);
             if (!chunk.empty()) {
-                // TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed()
-                sink.write(chunk.data(), chunk.size());
+                if (!sink.write(chunk.data(), chunk.size())) {
+                    return false;
+                }
                 SRV_DBG("http: streamed chunk: %s\n", chunk.c_str());
             }
             if (!has_next) {

From 941146b3f1ebcd54f125f1f80598f29231155989 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Mon, 6 Apr 2026 14:05:18 +0200
Subject: [PATCH 3/7] convert : fix block_ff_dim retrieval for lfm2 (#21508)

---
 convert_hf_to_gguf.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c1737bb2c..09f6e7ae2 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -11818,10 +11818,8 @@ class LFM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.LFM2
 
     def _add_feed_forward_length(self):
-        ff_dim = self.hparams["block_ff_dim"]
-
+        ff_dim = self.find_hparam(["block_ff_dim", "intermediate_size"])
         auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
-        ff_dim = self.hparams["block_ff_dim"]
         ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
         multiple_of = self.hparams["block_multiple_of"]

From 4aa962e2b0d04e6be6944f899edbdbe26177e492 Mon Sep 17 00:00:00 2001
From: Aldehir Rojas
Date: Mon, 6 Apr 2026 09:08:37 -0500
Subject: [PATCH 4/7] vocab : add byte token handling to BPE detokenizer for
 Gemma4 (#21488)

---
 src/llama-vocab.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 75dbaa91e..de9a9466b 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2813,7 +2813,9 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
             return strtol(buf.c_str(), NULL, 16);
         }
         case LLAMA_VOCAB_TYPE_BPE: {
-            GGML_ABORT("fatal error");
+            // Gemma4 uses BPE with SPM-style byte fallback tokens (<0xXX>)
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
         }
         case LLAMA_VOCAB_TYPE_WPM: {
             GGML_ABORT("fatal error");
@@ -3294,6 +3296,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                     std::string result = llama_decode_text(token_text);
                     return _try_copy(result.data(), result.size());
                 }
+                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+                    char byte = (char) token_to_byte(token);
+                    return _try_copy((char*) &byte, 1);
+                }
                 break;
             }
             case LLAMA_VOCAB_TYPE_RWKV: {
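To make the byte-fallback parsing above concrete: an SPM-style byte token's text is literally `<0xXX>`, so the two hex digits sit at offset 3. A standalone sketch (token value invented):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
    const std::string token_text = "<0xC3>";          // SPM-style byte fallback token
    const std::string hex = token_text.substr(3, 2);  // skip "<0x", keep "C3"
    const uint8_t byte = (uint8_t) strtol(hex.c_str(), NULL, 16);
    printf("token %s -> byte 0x%02X\n", token_text.c_str(), byte); // prints 0xC3
}
```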
From 94ca829b6001019622c0f67fcd48e9ec6bd7dce8 Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Mon, 6 Apr 2026 22:26:02 +0800
Subject: [PATCH 5/7] llama-bench: add `-fitc` and `-fitt` to arguments
 (#21304)

* llama-bench: add `-fitc` and `-fitt` to arguments

* update README.md

* address review comments

* update compare-llama-bench.py
---
 scripts/compare-llama-bench.py    |   7 +-
 tools/llama-bench/README.md       |   2 +
 tools/llama-bench/llama-bench.cpp | 104 ++++++++++++++++++++++++++++--
 3 files changed, 107 insertions(+), 6 deletions(-)

diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index f43d24ebf..5a6cc7dbb 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -29,7 +29,8 @@ LLAMA_BENCH_DB_FIELDS = [
    "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode",
    "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
    "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth",
-   "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "n_cpu_moe"
+   "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "n_cpu_moe",
+   "fit_target", "fit_min_ctx"
 ]
 
 LLAMA_BENCH_DB_TYPES = [
@@ -39,6 +40,7 @@ LLAMA_BENCH_DB_TYPES = [
    "TEXT", "INTEGER", "INTEGER", "INTEGER", "TEXT", "TEXT",
    "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
    "TEXT", "INTEGER", "INTEGER", "REAL", "REAL", "INTEGER",
+   "INTEGER", "INTEGER"
 ]
 
 # All test-backend-ops SQL fields
@@ -61,7 +63,8 @@ assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES)
 LLAMA_BENCH_KEY_PROPERTIES = [
    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "n_cpu_moe", "tensor_buft_overrides",
    "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
-   "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
+   "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth",
+   "fit_target", "fit_min_ctx"
 ]
 
 # Properties by which to differentiate results per commit for test-backend-ops:

diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md
index c837bb6d2..70355920b 100644
--- a/tools/llama-bench/README.md
+++ b/tools/llama-bench/README.md
@@ -62,6 +62,8 @@ test parameters:
   -ot --override-tensors <tensor name pattern>=<buffer type>;...
                                            (default: disabled)
   -nopo, --no-op-offload <0|1>             (default: 0)
+  -fitt, --fit-target <MiB>                fit model to device memory with this margin per device in MiB (default: off)
+  -fitc, --fit-ctx <n>                     minimum ctx size for --fit-target (default: 4096)
 
 Multiple values can be given for each parameter by separating them with ','
 or by specifying the parameter multiple times. Ranges can be given as

diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 0a23f6985..0b395b460 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -342,6 +342,8 @@ struct cmd_params {
     std::vector<bool>     embeddings;
     std::vector<bool>     no_op_offload;
     std::vector<bool>     no_host;
+    std::vector<size_t>   fit_params_target;
+    std::vector<uint32_t> fit_params_min_ctx;
     ggml_numa_strategy    numa;
     int                   reps;
     ggml_sched_priority   prio;
@@ -384,6 +386,8 @@ static const cmd_params cmd_params_defaults = {
     /* embeddings         */ { false },
     /* no_op_offload      */ { false },
     /* no_host            */ { false },
+    /* fit_params_target  */ { 0 },
+    /* fit_params_min_ctx */ { 0 },
     /* numa               */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps               */ 5,
     /* prio               */ GGML_SCHED_PRIO_NORMAL,
@@ -410,6 +414,8 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -v, --verbose                         verbose output\n");
     printf("  --progress                            print test progress indicators\n");
     printf("  --no-warmup                           skip warmup runs before benchmarking\n");
+    printf("  -fitt, --fit-target <MiB>             fit model to device memory with this margin per device in MiB (default: off)\n");
+    printf("  -fitc, --fit-ctx <n>                  minimum ctx size for --fit-target (default: 4096)\n");
     if (llama_supports_rpc()) {
         printf("  -rpc, --rpc <rpc_servers>             register RPC devices (comma separated)\n");
     }
@@ -958,6 +964,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             params.progress = true;
         } else if (arg == "--no-warmup") {
             params.no_warmup = true;
+        } else if (arg == "-fitt" || arg == "--fit-target") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            for (const auto & v : p) {
+                params.fit_params_target.push_back(std::stoull(v));
+            }
+        } else if (arg == "-fitc" || arg == "--fit-ctx") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            for (const auto & v : p) {
+                params.fit_params_min_ctx.push_back(std::stoul(v));
+            }
         } else {
             invalid_param = true;
             break;
@@ -1078,6 +1102,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.poll.empty()) {
         params.poll = cmd_params_defaults.poll;
     }
+    if (params.fit_params_target.empty()) {
+        params.fit_params_target = cmd_params_defaults.fit_params_target;
+    }
+    if (params.fit_params_min_ctx.empty()) {
+        params.fit_params_min_ctx = cmd_params_defaults.fit_params_min_ctx;
+    }
 
     return params;
 }
@@ -1109,6 +1139,8 @@ struct cmd_params_instance {
     bool          embeddings;
     bool          no_op_offload;
     bool          no_host;
+    size_t        fit_target;
+    uint32_t      fit_min_ctx;
 
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -1197,6 +1229,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     // this ordering minimizes the number of times that each model needs to be reloaded
     // clang-format off
     for (const auto & m : params.model)
+    for (const auto & fpt : params.fit_params_target)
+    for (const auto & fpc : params.fit_params_min_ctx)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & ncmoe : params.n_cpu_moe)
     for (const auto & sm : params.split_mode)
@@ -1251,6 +1285,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .embeddings   = */ embd,
                 /* .no_op_offload= */ nopo,
                 /* .no_host      = */ noh,
+                /* .fit_target   = */ fpt,
+                /* .fit_min_ctx  = */ fpc,
             };
             instances.push_back(instance);
         }
@@ -1286,6 +1322,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .embeddings   = */ embd,
                 /* .no_op_offload= */ nopo,
                 /* .no_host      = */ noh,
+                /* .fit_target   = */ fpt,
+                /* .fit_min_ctx  = */ fpc,
             };
             instances.push_back(instance);
         }
@@ -1321,6 +1359,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .embeddings   = */ embd,
                 /* .no_op_offload= */ nopo,
                 /* .no_host      = */ noh,
+                /* .fit_target   = */ fpt,
+                /* .fit_min_ctx  = */ fpc,
             };
             instances.push_back(instance);
         }
@@ -1361,6 +1401,8 @@ struct test {
     bool embeddings;
     bool no_op_offload;
     bool no_host;
+    size_t fit_target;
+    uint32_t fit_min_ctx;
     int n_prompt;
     int n_gen;
     int n_depth;
@@ -1399,6 +1441,8 @@ struct test {
         embeddings = inst.embeddings;
         no_op_offload = inst.no_op_offload;
         no_host = inst.no_host;
+        fit_target = inst.fit_target;
+        fit_min_ctx = inst.fit_min_ctx;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
         n_depth = inst.n_depth;
@@ -1456,7 +1500,8 @@ struct test {
             "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", "main_gpu",
             "no_kv_offload", "flash_attn", "devices", "tensor_split", "tensor_buft_overrides",
             "use_mmap", "use_direct_io", "embeddings",
-            "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
+            "no_op_offload", "no_host", "fit_target", "fit_min_ctx",
+            "n_prompt", "n_gen", "n_depth",
             "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
         };
         return fields;
@@ -1468,7 +1513,8 @@ struct test {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
             field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
             field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
-            field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") {
+            field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe" ||
+            field == "fit_target" || field == "fit_min_ctx") {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1549,6 +1595,8 @@ struct test {
             std::to_string(embeddings),
             std::to_string(no_op_offload),
             std::to_string(no_host),
+            std::to_string(fit_target),
+            std::to_string(fit_min_ctx),
             std::to_string(n_prompt),
             std::to_string(n_gen),
             std::to_string(n_depth),
@@ -1792,6 +1840,12 @@ struct markdown_printer : public printer {
         if (field == "tensor_buft_overrides") {
             return "ot";
         }
+        if (field == "fit_target") {
+            return "fitt";
+        }
+        if (field == "fit_min_ctx") {
+            return "fitc";
+        }
         return field;
     }
 
@@ -1870,6 +1924,12 @@ struct markdown_printer : public printer {
         if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) {
             fields.emplace_back("no_host");
         }
+        if (params.fit_params_target.size() > 1 || params.fit_params_target != cmd_params_defaults.fit_params_target) {
+            fields.emplace_back("fit_target");
+        }
+        if (params.fit_params_min_ctx.size() > 1 || params.fit_params_min_ctx != cmd_params_defaults.fit_params_min_ctx) {
+            fields.emplace_back("fit_min_ctx");
+        }
         fields.emplace_back("test");
         fields.emplace_back("t/s");
 
@@ -2141,13 +2201,49 @@ int main(int argc, char ** argv) {
         if (params.progress) {
             fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
         }
+        auto mparams = inst.to_llama_mparams();
+        auto cparams = inst.to_llama_cparams();
+
+        bool do_fit = inst.fit_target != cmd_params_defaults.fit_params_target[0] ||
+                      inst.fit_min_ctx != cmd_params_defaults.fit_params_min_ctx[0];
+
+        std::vector<float> fit_tensor_split(llama_max_devices(), 0.0f);
+        std::vector<llama_model_tensor_buft_override> fit_overrides(llama_max_tensor_buft_overrides(), {nullptr, nullptr});
+
+        if (do_fit) {
+            // free the previous model so fit sees full free VRAM
+            if (lmodel) {
+                llama_model_free(lmodel);
+                lmodel    = nullptr;
+                prev_inst = nullptr;
+            }
+
+            // use default n_gpu_layers and n_ctx so llama_params_fit can adjust them
+            mparams.n_gpu_layers          = llama_model_default_params().n_gpu_layers;
+            mparams.tensor_split          = fit_tensor_split.data();
+            mparams.tensor_buft_overrides = fit_overrides.data();
+            cparams.n_ctx                 = 0;
+
+            std::vector<size_t> margins(llama_max_devices(), inst.fit_target * 1024 * 1024);
+
+            uint32_t n_ctx_needed = inst.n_prompt + inst.n_gen + inst.n_depth;
+            cparams.n_ctx = std::max(cparams.n_ctx, n_ctx_needed);
+
+            llama_params_fit(inst.model.c_str(), &mparams, &cparams,
+                             fit_tensor_split.data(),
+                             fit_overrides.data(),
+                             margins.data(),
+                             inst.fit_min_ctx,
+                             params.verbose ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+        }
+
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
             if (lmodel) {
                 llama_model_free(lmodel);
             }
-            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            lmodel = llama_model_load_from_file(inst.model.c_str(), mparams);
             if (lmodel == NULL) {
                 fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                 return 1;
@@ -2155,7 +2251,7 @@ int main(int argc, char ** argv) {
             prev_inst = &inst;
         }
 
-        llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
+        llama_context * ctx = llama_init_from_model(lmodel, cparams);
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
             llama_model_free(lmodel);
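Putting the pieces together, the fitting path added above could be driven directly like this. This is a hypothetical sketch: `llama_params_fit` and `llama_max_tensor_buft_overrides` are introduced by this PR, so their signatures are taken from the diff rather than from an existing header, and the margin mirrors what `-fitt 1024` computes (MiB converted to bytes):

```cpp
// Hypothetical sketch of the fitting path, outside llama-bench.
// llama_params_fit() and llama_max_tensor_buft_overrides() come from this PR;
// their signatures below are assumed from the diff, not from an older llama.h.
#include "llama.h"
#include <vector>

int main() {
    const char * path = "model.gguf"; // placeholder path

    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(
        llama_max_tensor_buft_overrides(), {nullptr, nullptr});
    mparams.tensor_split          = tensor_split.data();
    mparams.tensor_buft_overrides = overrides.data();

    // what `-fitt 1024` would request: keep 1024 MiB free on every device
    std::vector<size_t> margins(llama_max_devices(), 1024ull * 1024 * 1024);

    llama_params_fit(path, &mparams, &cparams,
                     tensor_split.data(), overrides.data(), margins.data(),
                     /*fit_min_ctx=*/4096, GGML_LOG_LEVEL_ERROR);

    llama_model *   model = llama_model_load_from_file(path, mparams);
    llama_context * ctx   = llama_init_from_model(model, cparams);
    // ... benchmark ...
    llama_free(ctx);
    llama_model_free(model);
}
```

From the command line the equivalent run would be something like `llama-bench -m model.gguf -fitt 1024 -fitc 8192`; as with other llama-bench parameters, both flags accept comma-separated lists.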
From 15f786e6581598638840276948a7e6183fc96a83 Mon Sep 17 00:00:00 2001
From: Gaurav Garg
Date: Tue, 7 Apr 2026 00:04:29 +0530
Subject: [PATCH 6/7] [CUDA] Write an optimized flash_attn_stream_k_fixup
 kernel (#21159)

* Write an optimized flash_attn_stream_k_fixup kernel

Write a specialized and more optimized kernel for cases where
nblocks_stream_k is a multiple of ntiles_dst.

Make nblocks_stream_k a multiple of ntiles_dst if
nblocks_stream_k > 2 * ntiles_dst

* Use the new kernel only for nblocks_stream_k_raw > 4 * ntiles_dst to
  make sure we have enough concurrency on GPUs

* Address review comments

* Address review comments

* Revert variable names to original
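To see how the rounding plays out, here is a small standalone calculation (SM and tile counts invented; the 5% threshold matches `max_efficiency_loss_percent` in the diff below):

```cpp
// Numeric sketch of the rounding logic added in launch_fattn (values invented):
// with 132 stream-k blocks and 40 output tiles, rounding down to a multiple of
// the tile count costs ~9% occupancy, so the raw count is kept and the general
// fixup path is used instead of the uniform one.
#include <cstdio>

int main() {
    const int nblocks_stream_k_raw = 132; // e.g. one block per SM
    const int ntiles_dst           = 40;  // number of output tiles

    const int rounded      = (nblocks_stream_k_raw / ntiles_dst) * ntiles_dst;          // 120
    const int loss_percent = 100 * (nblocks_stream_k_raw - rounded) / nblocks_stream_k_raw; // 9

    const int nblocks = loss_percent <= 5 ? rounded : nblocks_stream_k_raw;
    printf("rounded=%d loss=%d%% -> use %d blocks (%s fixup)\n",
           rounded, loss_percent, nblocks,
           nblocks % ntiles_dst == 0 ? "uniform" : "general");
}
```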
---
 ggml/src/ggml-cuda/fattn-common.cuh | 178 ++++++++++++++++++++++++----
 1 file changed, 153 insertions(+), 25 deletions(-)

diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index c59a4db39..beeb52389 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -676,9 +676,96 @@ static __global__ void flash_attn_mask_to_KV_max(
 
 template<int D, int ncols1, int ncols2> // D == head size
 __launch_bounds__(D, 1)
-static __global__ void flash_attn_stream_k_fixup(
-    float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03,
-    const int ne11, const int ne12, const int nbatch_fa) {
+static __global__ void flash_attn_stream_k_fixup_uniform(
+    float * __restrict__ dst,
+    const float2 * __restrict__ dst_fixup,
+    const int ne01, const int ne02,
+    const int ne12, const int nblocks_stream_k,
+    const int gqa_ratio,
+    const int blocks_per_tile,
+    const uint3 fd_iter_j_z_ne12,
+    const uint3 fd_iter_j_z,
+    const uint3 fd_iter_j) {
+    constexpr int ncols = ncols1*ncols2;
+
+    const int tile_idx = blockIdx.x; // One block per output tile.
+    const int j   = blockIdx.y;
+    const int c   = blockIdx.z;
+    const int jc  = j*ncols2 + c;
+    const int tid = threadIdx.x;
+
+    // nblocks_stream_k is a multiple of ntiles_dst (== gridDim.x), so each tile gets the same number of blocks.
+    const int b_first = tile_idx * blocks_per_tile;
+    const int b_last  = b_first + blocks_per_tile - 1;
+
+    const float * dst_fixup_data = ((const float *) dst_fixup) + nblocks_stream_k*(2*2*ncols);
+
+    // z_KV == K/V head index, zt_gqa = Q head start index per K/V head, jt = token position start index
+    const uint2 dm0 = fast_div_modulo(tile_idx, fd_iter_j_z_ne12);
+    const uint2 dm1 = fast_div_modulo(dm0.y, fd_iter_j_z);
+    const uint2 dm2 = fast_div_modulo(dm1.y, fd_iter_j);
+
+    const int sequence = dm0.x;
+    const int z_KV     = dm1.x;
+    const int zt_gqa   = dm2.x;
+    const int jt       = dm2.y;
+
+    const int zt_Q = z_KV*gqa_ratio + zt_gqa*ncols2; // Global Q head start index.
+
+    if (jt*ncols1 + j >= ne01 || zt_gqa*ncols2 + c >= gqa_ratio) {
+        return;
+    }
+
+    dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt_Q*D + (j*ne02 + c)*D + tid;
+
+    // Load the partial result that needs a fixup
+    float dst_val = *dst;
+    float max_val;
+    float rowsum;
+    {
+        const float2 tmp = dst_fixup[b_last*ncols + jc];
+        max_val = tmp.x;
+        rowsum  = tmp.y;
+    }
+
+    // Combine with all previous blocks in this tile.
+    for (int bidx = b_last - 1; bidx >= b_first; --bidx) {
+        const float dst_add = dst_fixup_data[bidx*ncols*D + jc*D + tid];
+
+        const float2 tmp = dst_fixup[(nblocks_stream_k + bidx)*ncols + jc];
+
+        const float max_val_new = fmaxf(max_val, tmp.x);
+
+        const float diff_val = max_val - max_val_new;
+        const float diff_add = tmp.x  - max_val_new;
+
+        const float scale_val = diff_val >= SOFTMAX_FTZ_THRESHOLD ? expf(diff_val) : 0.0f;
+        const float scale_add = diff_add >= SOFTMAX_FTZ_THRESHOLD ? expf(diff_add) : 0.0f;
+
+        dst_val = scale_val*dst_val + scale_add*dst_add;
+        rowsum  = scale_val*rowsum  + scale_add*tmp.y;
+
+        max_val = max_val_new;
+    }
+
+    // Write back final result:
+    *dst = dst_val / rowsum;
+}
+
+// General fixup kernel for the case where the number of blocks per tile is not uniform across tiles
+// (blocks_num.x not a multiple of ntiles_dst)
+template<int D, int ncols1, int ncols2> // D == head size
+__launch_bounds__(D, 1)
+static __global__ void flash_attn_stream_k_fixup_general(
+    float * __restrict__ dst,
+    const float2 * __restrict__ dst_fixup,
+    const int ne01, const int ne02,
+    const int gqa_ratio,
+    const int total_work,
+    const uint3 fd_iter_k_j_z_ne12,
+    const uint3 fd_iter_k_j_z,
+    const uint3 fd_iter_k_j,
+    const uint3 fd_iter_k) {
     constexpr int ncols = ncols1*ncols2;
 
     const int bidx0 = blockIdx.x;
@@ -689,27 +776,26 @@ static __global__ void flash_attn_stream_k_fixup(
 
     const float * dst_fixup_data = ((const float *) dst_fixup) + gridDim.x*(2*2*ncols);
 
-    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-
-    const int iter_k     = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
-    const int iter_j     = (ne01 + (ncols1 - 1)) / ncols1;
-    const int iter_z_gqa = (gqa_ratio + (ncols2 - 1)) / ncols2;
-
-    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*iter_z_gqa*ne12*ne03) / gridDim.x;
-    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*iter_z_gqa*ne12*ne03) / gridDim.x;
+    const int kbc0      = int64_t(bidx0 + 0)*total_work / gridDim.x;
+    const int kbc0_stop = int64_t(bidx0 + 1)*total_work / gridDim.x;
 
     const bool did_not_have_any_data   = kbc0 == kbc0_stop;
-    const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
-    const bool did_not_write_last      = kbc0/iter_k == kbc0_stop/iter_k && kbc0_stop % iter_k != 0;
+    const bool wrote_beginning_of_tile = fastmodulo(kbc0, fd_iter_k) == 0;
+    const bool did_not_write_last      = fastdiv(kbc0, fd_iter_k) == fastdiv(kbc0_stop, fd_iter_k) && fastmodulo(kbc0_stop, fd_iter_k) != 0;
     if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
         return;
     }
 
     // z_KV == K/V head index, zt_gqa = Q head start index per K/V head, jt = token position start index
-    const int sequence = kbc0 /(iter_k*iter_j*iter_z_gqa*ne12);
-    const int z_KV     = (kbc0 - iter_k*iter_j*iter_z_gqa*ne12 * sequence)/(iter_k*iter_j*iter_z_gqa);
-    const int zt_gqa   = (kbc0 - iter_k*iter_j*iter_z_gqa*ne12 * sequence - iter_k*iter_j*iter_z_gqa * z_KV)/(iter_k*iter_j);
-    const int jt       = (kbc0 - iter_k*iter_j*iter_z_gqa*ne12 * sequence - iter_k*iter_j*iter_z_gqa * z_KV - iter_k*iter_j * zt_gqa) / iter_k;
+    const uint2 dm0 = fast_div_modulo(kbc0, fd_iter_k_j_z_ne12);
+    const uint2 dm1 = fast_div_modulo(dm0.y, fd_iter_k_j_z);
+    const uint2 dm2 = fast_div_modulo(dm1.y, fd_iter_k_j);
+    const uint2 dm3 = fast_div_modulo(dm2.y, fd_iter_k);
+
+    const int sequence = dm0.x;
+    const int z_KV     = dm1.x;
+    const int zt_gqa   = dm2.x;
+    const int jt       = dm3.x;
 
     const int zt_Q = z_KV*gqa_ratio + zt_gqa*ncols2; // Global Q head start index.
 
@@ -733,10 +819,11 @@ static __global__ void flash_attn_stream_k_fixup(
 
     // Iterate over previous blocks and compute the combined results.
     // All CUDA blocks that get here must have a previous block that needs a fixup.
+    const int tile_kbc0 = fastdiv(kbc0, fd_iter_k);
     int bidx = bidx0 - 1;
     int kbc_stop = kbc0;
     while(true) {
-        const int kbc = int64_t(bidx)*(iter_k*iter_j*iter_z_gqa*ne12*ne03) / gridDim.x;
+        const int kbc = int64_t(bidx)*total_work / gridDim.x;
         if (kbc == kbc_stop) { // Did not have any data.
             bidx--;
             kbc_stop = kbc;
@@ -762,7 +849,7 @@ static __global__ void flash_attn_stream_k_fixup(
         max_val = max_val_new;
 
         // If this block started in a previous tile we are done and don't need to combine additional partial results.
-        if (kbc % iter_k == 0 || kbc/iter_k < kbc0/iter_k) {
+        if (fastmodulo(kbc, fd_iter_k) == 0 || fastdiv(kbc, fd_iter_k) < tile_kbc0) {
             break;
         }
         bidx--;
@@ -976,14 +1063,28 @@ void launch_fattn(
     const int tiles_nwaves             = (ntiles_dst + max_blocks - 1) / max_blocks;
     const int tiles_efficiency_percent = 100 * ntiles_dst / (max_blocks*tiles_nwaves);
 
-    const int nblocks_stream_k = std::min(max_blocks, ntiles_KV*ntiles_dst);
-
     const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75;
 
-    blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_dst;
+    blocks_num.x = ntiles_dst;
     blocks_num.y = 1;
     blocks_num.z = 1;
 
+    if(use_stream_k) {
+        const int nblocks_stream_k_raw = std::min(max_blocks, ntiles_KV*ntiles_dst);
+        // Round down to a multiple of ntiles_dst so that each output tile gets the same number of blocks (avoids fixup).
+        // Only do this if the occupancy loss from rounding is acceptable.
+        const int nblocks_stream_k_rounded = (nblocks_stream_k_raw / ntiles_dst) * ntiles_dst;
+        const int max_efficiency_loss_percent = 5;
+        const int efficiency_loss_percent = nblocks_stream_k_rounded > 0
+            ? 100 * (nblocks_stream_k_raw - nblocks_stream_k_rounded) / nblocks_stream_k_raw
+            : 100;
+        const int nblocks_stream_k = efficiency_loss_percent <= max_efficiency_loss_percent
+            ? nblocks_stream_k_rounded
+            : nblocks_stream_k_raw;
+
+        blocks_num.x = nblocks_stream_k;
+    }
+
     if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
         dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
     }
@@ -1063,13 +1164,40 @@ void launch_fattn(
     CUDA_CHECK(cudaGetLastError());
 
     if (stream_k) {
-        if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
+        if ((int)blocks_num.x % ntiles_dst == 0 && (int)blocks_num.x > ntiles_dst) {
+            // Optimized fixup: nblocks_stream_k is a multiple of ntiles_dst, launch one block per tile.
+            const int nblocks_sk = (int)blocks_num.x;
+            const int bpt        = nblocks_sk / ntiles_dst;
+
+            const uint3 fd0 = init_fastdiv_values(ntiles_x * ntiles_z_gqa * K->ne[2]);
+            const uint3 fd1 = init_fastdiv_values(ntiles_x * ntiles_z_gqa);
+            const uint3 fd2 = init_fastdiv_values(ntiles_x);
+
+            const dim3 block_dim_combine(DV, 1, 1);
+            const dim3 blocks_num_combine = {(unsigned)ntiles_dst, ncols1, ncols2};
+
+            flash_attn_stream_k_fixup_uniform<DV, ncols1, ncols2>
+                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
+                ((float *) KQV->data, dst_tmp_meta.ptr,
+                 Q->ne[1], Q->ne[2], K->ne[2], nblocks_sk,
+                 gqa_ratio, bpt, fd0, fd1, fd2);
+        } else if (ntiles_dst % blocks_num.x != 0) {
+            // General fixup for the cases where nblocks_stream_k < ntiles_dst.
+            const int total_work = ntiles_KV * ntiles_dst;
+
+            const uint3 fd_k_j_z_ne12 = init_fastdiv_values(ntiles_KV * ntiles_x * ntiles_z_gqa * K->ne[2]);
+            const uint3 fd_k_j_z      = init_fastdiv_values(ntiles_KV * ntiles_x * ntiles_z_gqa);
+            const uint3 fd_k_j        = init_fastdiv_values(ntiles_KV * ntiles_x);
+            const uint3 fd_k          = init_fastdiv_values(ntiles_KV);
+
             const dim3 block_dim_combine(DV, 1, 1);
             const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
 
-            flash_attn_stream_k_fixup<DV, ncols1, ncols2>
+            flash_attn_stream_k_fixup_general<DV, ncols1, ncols2>
                 <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
-                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1], K->ne[2], nbatch_fa);
+                ((float *) KQV->data, dst_tmp_meta.ptr,
+                 Q->ne[1], Q->ne[2], gqa_ratio, total_work,
+                 fd_k_j_z_ne12, fd_k_j_z, fd_k_j, fd_k);
         }
     } else if (parallel_blocks > 1) {
         const dim3 block_dim_combine(DV, 1, 1);

From 506200cf8b5c8419ce97d16dc8c50f4634e21ebe Mon Sep 17 00:00:00 2001
From: Bipin Yadav <83943505+bipinyadav3175@users.noreply.github.com>
Date: Tue, 7 Apr 2026 00:24:06 +0530
Subject: [PATCH 7/7] cli: fix stripping of \n in multiline input (#21485)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* llama-cli: fix stripping of \n in multiline input

* Change & string to string_view

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret

* Fix EditorConfig linter error

---------

Co-authored-by: Sigbjørn Skjæret
---
 common/console.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/common/console.cpp b/common/console.cpp
index a770416ab..36f645f33 100644
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -700,13 +700,13 @@ namespace console {
         std::vector<std::string> entries;
         size_t viewing_idx = SIZE_MAX;
         std::string backup_line; // current line before viewing history
 
-        void add(const std::string & line) {
+        void add(std::string_view line) {
             if (line.empty()) {
                 return;
             }
             // avoid duplicates with the last entry
             if (entries.empty() || entries.back() != line) {
-                entries.push_back(line);
+                entries.emplace_back(line);
             }
             // also clear viewing state
             end_viewing();
@@ -1031,11 +1031,12 @@ namespace console {
 
         if (!end_of_stream && !line.empty()) {
             // remove the trailing newline for history storage
+            std::string_view hline = line;
             if (!line.empty() && line.back() == '\n') {
-                line.pop_back();
+                hline.remove_suffix(1);
             }
             // TODO: maybe support multiline history entries?
-            history.add(line);
+            history.add(hline);
         }
 
         fflush(out);
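The `std::string_view` pattern used in this fix is worth spelling out: it strips the trailing newline for history storage without mutating or copying the line that the rest of the CLI still consumes. A standalone sketch:

```cpp
// Standalone sketch of the std::string_view trick above: strip a suffix for
// one consumer without mutating (or copying) the original string.
#include <cstdio>
#include <string>
#include <string_view>

int main() {
    std::string line = "first line\nsecond line\n"; // what the model should see
    std::string_view hline = line;                  // what history should store
    if (!hline.empty() && hline.back() == '\n') {
        hline.remove_suffix(1); // drops the trailing '\n' from the view only
    }
    printf("line  : %zu chars\n", line.size());  // 23, unchanged
    printf("hline : %zu chars\n", hline.size()); // 22
}
```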