diff --git a/common/arg.cpp b/common/arg.cpp index 584900eb8..6bfec91e8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3501,7 +3501,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_MIN")); add_opt(common_arg( - {"--spec--draft-p-split", "--draft-p-split"}, "P", + {"--spec-draft-p-split", "--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.draft.p_split), [](common_params & params, const std::string & value) { params.speculative.draft.p_split = std::stof(value); diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp index 74fce5367..c6e1f86c9 100644 --- a/common/reasoning-budget.cpp +++ b/common/reasoning-budget.cpp @@ -232,34 +232,6 @@ static struct llama_sampler * common_reasoning_budget_init_state( ); } -struct llama_sampler * common_reasoning_budget_init( - const struct llama_vocab * vocab, - const std::vector & start_tokens, - const std::vector & end_tokens, - const std::vector & forced_tokens, - int32_t budget, - const std::vector & prefill_tokens) { - // Determine initial state from prefill: COUNTING if the prefill begins with - // the start sequence but does not also contain the end sequence after it. - common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE; - if (!prefill_tokens.empty() && !start_tokens.empty() && - prefill_tokens.size() >= start_tokens.size() && - std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) { - initial_state = REASONING_BUDGET_COUNTING; - // If the end sequence also follows the start in the prefill, reasoning - // was opened and immediately closed — stay IDLE. - if (!end_tokens.empty() && - prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) { - auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size(); - if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() && - std::equal(end_tokens.begin(), end_tokens.end(), end_start)) { - initial_state = REASONING_BUDGET_IDLE; - } - } - } - return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state); -} - struct llama_sampler * common_reasoning_budget_init( const struct llama_vocab * vocab, const std::vector & start_tokens, diff --git a/common/reasoning-budget.h b/common/reasoning-budget.h index ee1a30ed3..ef37f46ee 100644 --- a/common/reasoning-budget.h +++ b/common/reasoning-budget.h @@ -29,10 +29,7 @@ enum common_reasoning_budget_state { // end_tokens - token sequence for natural deactivation // forced_tokens - token sequence forced when budget expires // budget - max tokens allowed in the reasoning block -// prefill_tokens - tokens already present in the prompt (generation prompt); -// used to determine the initial state: COUNTING if they begin -// with start_tokens (but don't also end with end_tokens), -// IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING. +// initial_state - initial state // struct llama_sampler * common_reasoning_budget_init( const struct llama_vocab * vocab, @@ -40,16 +37,6 @@ struct llama_sampler * common_reasoning_budget_init( const std::vector & end_tokens, const std::vector & forced_tokens, int32_t budget, - const std::vector & prefill_tokens = {}); - -// Variant that takes an explicit initial state (used by tests and clone). 
-// COUNTING with budget <= 0 is promoted to FORCING. -struct llama_sampler * common_reasoning_budget_init( - const struct llama_vocab * vocab, - const std::vector & start_tokens, - const std::vector & end_tokens, - const std::vector & forced_tokens, - int32_t budget, - common_reasoning_budget_state initial_state); + common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE); common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl); diff --git a/common/sampling.cpp b/common/sampling.cpp index b2e6d8e8d..d4a2fdcda 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -260,32 +260,35 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st } } + // Compute prefill tokens from the generation prompt + std::vector prefill_tokens; + if (!params.generation_prompt.empty()) { + GGML_ASSERT(vocab != nullptr); + auto tokens = common_tokenize(vocab, params.generation_prompt, false, true); + for (size_t i = 0; i < tokens.size(); i++) { + std::string piece = common_token_to_piece(vocab, tokens[i], true); + if (i == 0 && std::isspace(piece[0]) && !std::isspace(params.generation_prompt[0])) { + // Some tokenizers will add a space before the first special token, need to exclude + continue; + } + LOG_DBG("%s: prefill token: %d = %s\n", __func__, tokens[i], piece.c_str()); + prefill_tokens.push_back(tokens[i]); + } + } + // Feed generation prompt tokens to the grammar sampler so it advances past // tokens the template already placed in the prompt. // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled. - std::vector prefill_tokens; - if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) { - GGML_ASSERT(vocab != nullptr); - prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true); - if (!prefill_tokens.empty()) { - std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true); - if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) { - // Some tokenizers will add a space before the first special token, need to remove - prefill_tokens = std::vector(prefill_tokens.begin() + 1, prefill_tokens.end()); - } - } - - if (grmr && !params.grammar_lazy) { - try { - for (const auto & token : prefill_tokens) { - llama_sampler_accept(grmr, token); - LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token); - } - } catch (std::exception &e) { - LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__, - common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str()); - throw e; + if (grmr && !params.grammar_lazy && common_grammar_needs_prefill(params.grammar)) { + try { + for (const auto & token : prefill_tokens) { + llama_sampler_accept(grmr, token); + LOG_DBG("%s: grammar accepted prefill token (%d)\n", __func__, token); } + } catch (std::exception &e) { + LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__, + common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str()); + throw e; } } @@ -296,8 +299,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st params.reasoning_budget_start, params.reasoning_budget_end, params.reasoning_budget_forced, - params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens, - prefill_tokens); + params.reasoning_budget_tokens < 0 ? 
INT_MAX : params.reasoning_budget_tokens); + + for (const auto & token : prefill_tokens) { + llama_sampler_accept(rbudget, token); + LOG_DBG("%s: reasoning-budget accepted prefill token (%d)\n", __func__, token); + } } if (params.has_logit_bias()) { @@ -431,7 +438,7 @@ static bool grammar_should_apply(struct common_sampler * gsmpl) { return true; } -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated) { if (!gsmpl) { return; } @@ -439,9 +446,11 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo const auto tm = gsmpl->tm(); // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept - accept_grammar = accept_grammar && grammar_should_apply(gsmpl); + const auto accept_grammar = is_generated && grammar_should_apply(gsmpl); - llama_sampler_accept(gsmpl->rbudget, token); + if (gsmpl->rbudget && is_generated) { + llama_sampler_accept(gsmpl->rbudget, token); + } if (gsmpl->grmr && accept_grammar) { llama_sampler_accept(gsmpl->grmr, token); diff --git a/common/sampling.h b/common/sampling.h index 5b57ad658..49506a00c 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st void common_sampler_free(struct common_sampler * gsmpl); -// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); +// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated); void common_sampler_reset (struct common_sampler * gsmpl); struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); diff --git a/common/speculative.cpp b/common/speculative.cpp index bda9993b1..bbf88fa6e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -167,8 +167,6 @@ struct common_speculative_checkpoint { size_t size() const { return data.size(); } - - size_t ckpt_size = 0; }; struct common_speculative_state_draft : public common_speculative_state { @@ -176,7 +174,7 @@ struct common_speculative_state_draft : public common_speculative_state { llama_context * ctx_dft; bool use_ckpt = false; - struct common_speculative_checkpoint ckpt; + common_speculative_checkpoint ckpt; common_sampler * smpl; @@ -249,26 +247,16 @@ struct common_speculative_state_draft : public common_speculative_state { llama_batch_free(batch); } - void begin(const llama_tokens & prompt) override { - if (use_ckpt && ckpt.size() > 0) { - // delete checkpoint - LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n", - __func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024); - ckpt.pos_min = 0; - ckpt.pos_max = 0; - ckpt.n_tokens = 0; - ckpt.ckpt_size = 0; - ckpt.data.clear(); - } + void begin(const llama_tokens & /*prompt*/) override { } - size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) { + size_t create_checkpoint(int n_tokens_prompt) { int slot_id = 0; const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); ckpt.pos_min = 
llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id); ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id); - ckpt.n_tokens = n_tokens_prompt - n_tokens_batch; + ckpt.n_tokens = n_tokens_prompt; ckpt.data.resize(checkpoint_size); const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); @@ -281,13 +269,13 @@ struct common_speculative_state_draft : public common_speculative_state { return n; } - size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) { + size_t restore_checkpoint() { int slot_id = 0; LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max); const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - if (n != ckpt_size_part_expected) { - GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu", - __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n); + if (n != ckpt.size()) { + GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu", + __func__, ckpt.pos_min, ckpt.pos_max, ckpt.size()); } llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1); @@ -346,13 +334,18 @@ struct common_speculative_state_draft : public common_speculative_state { const int i_start = std::max(0, (int) prompt_cur.size() - n_ctx); + if (use_ckpt && i_start > 0) { + LOG_WRN("%s: context shift is not supported with checkpoint-based contexts - skipping\n", __func__); + return; + } + // reuse as much as possible from the old draft context // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt for (int i = 0; i < (int) prompt_dft.size(); ++i) { int cur = 0; while (i_start + cur < (int) prompt_cur.size() && - i + cur < (int) prompt_dft.size() && - prompt_cur[i_start + cur] == prompt_dft[i + cur]) { + i + cur < (int) prompt_dft.size() && + prompt_cur[i_start + cur] == prompt_dft[i + cur]) { cur++; } @@ -360,21 +353,26 @@ struct common_speculative_state_draft : public common_speculative_state { reuse_i = i; reuse_n = cur; } + + if (use_ckpt) { + break; + } } LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n", __func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size()); - if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) { - LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n", - __func__, reuse_i, reuse_n); + if (use_ckpt && ckpt.n_tokens > reuse_n) { + LOG_DBG("%s: checkpoint (n_tokens = %d) is outdated -> delete it\n", __func__, (int) ckpt.n_tokens); + reuse_i = 0; reuse_n = 0; + + ckpt = {}; } result.clear(); result.reserve(sparams.n_max); - bool needs_ckpt = use_ckpt && prompt_dft.size() > 0; if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) { llama_memory_clear(mem_dft, false); prompt_dft.clear(); @@ -393,50 +391,38 @@ struct common_speculative_state_draft : public common_speculative_state { return; } - bool do_restore = false; - if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) { - // This can happen after a partial acceptance (speculative decoding with checkpoints) - LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n", - __func__, prompt_dft.size(), prompt_cur.size()); - prompt_dft.resize(prompt_cur.size()); - do_restore = true; - } - if (reuse_i > 0) { + 
GGML_ASSERT(!use_ckpt); + bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i); if (!is_removed) { LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i); + return; } llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i); prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i); } - if (reuse_n < (int) prompt_dft.size() || do_restore) { + if (reuse_n < (int) prompt_dft.size()) { if (use_ckpt) { - if (ckpt.n_tokens > (int64_t) prompt_dft.size()) { - LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n", - __func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size()); + if (ckpt.n_tokens > 0) { + LOG_DBG("%s: restoring checkpoint, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size()); + restore_checkpoint(); + reuse_n = ckpt.n_tokens; + prompt_dft.resize(reuse_n); } - draft_restore_checkpoint(ckpt.ckpt_size); - reuse_n = ckpt.n_tokens; - prompt_dft.resize(reuse_n); - needs_ckpt = false; } else { - bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1); + const bool is_removed = llama_memory_seq_rm(mem_dft, 0, reuse_n, -1); if (!is_removed) { - LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n", - __func__, reuse_n, prompt_dft.size()); + LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size()); + return; } prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end()); } } } - if (needs_ckpt) { - ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens); - } - // prepare a batch to evaluate any new tokens in the prompt common_batch_clear(batch); @@ -450,12 +436,17 @@ struct common_speculative_state_draft : public common_speculative_state { // we should rarely end-up here during normal decoding if (batch.n_tokens > 0) { //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str()); + LOG_DBG("%s: draft prompt batch: %d tokens\n", __func__, batch.n_tokens); int ret = llama_decode(ctx_dft, batch); if (ret != 0 && ret != 1) { LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n", __func__, ret, prompt_cur.size()); } + + if (use_ckpt) { + create_checkpoint(prompt_dft.size()); + } } const llama_pos n_past = prompt_dft.size(); @@ -784,17 +775,15 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { } void accept(uint16_t n_accepted) override { - if (verbose) { - LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last); - } - // compute acceptance fraction if we have a recorded draft length if (n_draft_last > 0) { const double f_acc = (double)n_accepted / (double)n_draft_last; if (f_acc < 0.5) { n_low++; if (n_low >= 3) { - LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low); + if (verbose) { + LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low); + } mod.reset(); n_low = 0; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 90c2b7094..5287c4df9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6658,7 +6658,7 @@ class BertModel(TextModel): tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment] + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size if isinstance(tokenizer, 
SentencePieceProcessor): for token_id in range(tokenizer.vocab_size()): diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh index 928b856f9..585f2c228 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cuh +++ b/ggml/src/ggml-cuda/fattn-tile.cuh @@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64) - GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 64, 64) + GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 64, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64) @@ -130,7 +130,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128) GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64) - GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 32, 64) + GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 32, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 32, 64) GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 32, 64) @@ -1124,7 +1124,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm constexpr size_t nbytes_shared = 0; #ifdef GGML_USE_HIP - if constexpr (DV <= 128) { + if constexpr (DKQ <= 128) { if (Q->ne[1] > 32/ncols2) { constexpr int cols_per_block = 64; const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size; @@ -1138,7 +1138,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm #endif // GGML_USE_HIP #ifndef GGML_USE_HIP - if constexpr (DV <= 256) + if constexpr (DKQ <= 256) #endif // GGML_USE_HIP { if (Q->ne[1] > 16/ncols2) { @@ -1220,11 +1220,22 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX; const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0; - if constexpr (DKQ == 320) { // Mistral Small 4 + if constexpr (DKQ == 320) { + // This branch is only used for Mistral Small 4 which has a GQA ratio of 32. + // On AMD, simply use that GQA ratio with 32 columns / block since we always have enough SRAM. + // On NVIDIA however, the tile kernel is only used for GPUs that can't use the mma kernel (Pascal and older). + // Therefore, use a GQA ratio of 16 with 16 columns / block to stay below 48 kiB of SRAM / block. 
+#ifdef GGML_USE_HIP if (use_gqa_opt && gqa_ratio % 32 == 0) { launch_fattn_tile_switch_ncols1(ctx, dst); return; } +#else + if (use_gqa_opt && gqa_ratio % 16 == 0) { + launch_fattn_tile_switch_ncols1(ctx, dst); + return; + } +#endif // GGML_USE_HIP GGML_ABORT("flash-attn tile (320/256): expected GQA ratio multiple of 32"); } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 66dc5f70c..c79077a15 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3572,6 +3572,9 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_SILU) { const ggml_tensor * ssm_conv = cgraph->nodes[node_idx]; const ggml_tensor * silu = cgraph->nodes[node_idx+1]; + if (ggml_get_unary_op(silu) != unary_ops.begin()[0]) { + return false; + } if (ssm_conv->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) { return false; @@ -3580,6 +3583,31 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, return true; } + if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SSM_CONV && ops.begin()[1] == GGML_OP_ADD + && ops.begin()[2] == GGML_OP_UNARY && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_SILU) { + const ggml_tensor * ssm_conv = cgraph->nodes[node_idx]; + const ggml_tensor * add = cgraph->nodes[node_idx+1]; + const ggml_tensor * silu = cgraph->nodes[node_idx+2]; + if (ggml_get_unary_op(silu) != unary_ops.begin()[0]) { + return false; + } + + if (ssm_conv->type != GGML_TYPE_F32 || add->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) { + return false; + } + + // ADD must consume ssm_conv's output and broadcast a 1-D channel-wise bias. + const ggml_tensor * bias = (add->src[0] == ssm_conv) ? add->src[1] : add->src[0]; + if (bias->type != GGML_TYPE_F32 || !ggml_is_contiguous(bias)) { + return false; + } + if (ggml_nelements(bias) != ssm_conv->ne[0] || bias->ne[0] != ssm_conv->ne[0]) { + return false; + } + + return true; + } + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_MUL && unary_ops.size() == 1 && (unary_ops.begin()[0] == GGML_UNARY_OP_SILU || unary_ops.begin()[0] == GGML_UNARY_OP_SIGMOID || unary_ops.begin()[0] == GGML_UNARY_OP_SOFTPLUS)) { const ggml_tensor * unary = cgraph->nodes[node_idx]; @@ -3982,8 +4010,13 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph return 1; } + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_ADD, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) { + ggml_cuda_op_ssm_conv(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]); + return 2; + } + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) { - ggml_cuda_op_ssm_conv(*cuda_ctx, node, cgraph->nodes[i + 1]); + ggml_cuda_op_ssm_conv(*cuda_ctx, node, /*bias_add_node=*/ nullptr, cgraph->nodes[i + 1]); return 1; } diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index b77cdc1c1..4841389fb 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -3,6 +3,7 @@ template static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1, + const float * __restrict__ bias, const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, const int64_t n_t) { @@ -27,6 +28,8 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float w[j] 
= w_block[tid * stride_w + j]; } + float b = bias != nullptr ? bias[bidy * split_d_inner + tid] : 0.0f; + for (int64_t i = 0; i < n_t; i++) { float sumf = 0.0f; @@ -42,12 +45,14 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float for (size_t j = 0; j < d_conv; j++) { sumf += x[(i + j) % d_conv] * w[j]; } + sumf += b; y_block[i * stride_y + tid] = apply_silu ? ggml_cuda_op_silu_single(sumf) : sumf; } } template static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const float * __restrict__ src1, + const float * __restrict__ bias, const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, const int64_t n_t) { @@ -97,6 +102,8 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, w[j] = w_block[tid * stride_w + j]; } + float b = bias != nullptr ? bias[bidy * split_d_inner + tid] : 0.0f; + // Compute from shared memory for (int64_t i = 0; i < local_n_t; i++) { float sumf = 0.0f; @@ -104,12 +111,13 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, for (size_t j = 0; j < d_conv; j++) { sumf += smem[tid * n_cols + i + j] * w[j]; } + sumf += b; y_block[i * stride_y + tid] = apply_silu ? ggml_cuda_op_silu_single(sumf) : sumf; } } template -static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1, +static void ssm_conv_f32_cuda(const float * src0, const float * src1, const float * bias, const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, const int64_t nc, const int64_t nr, const int64_t n_t, const int64_t n_s, cudaStream_t stream) { @@ -120,14 +128,14 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int constexpr int kNC = decltype(NC)::value; if (n_t <= 32) { const dim3 blocks(n_s, (nr + threads - 1) / threads, 1); - ssm_conv_f32<<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, + ssm_conv_f32<<>>(src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { const int64_t split_n_t = 32; dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); const size_t smem_size = threads * (kNC - 1 + split_n_t) * sizeof(float); ssm_conv_long_token_f32<<>>( - src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); + src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } }; @@ -140,11 +148,18 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int } } -void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * silu_dst) { +void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * bias_add_node, ggml_tensor * silu_dst) { const struct ggml_tensor * src0 = dst->src[0]; // conv_x const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight + const bool fuse_bias = bias_add_node != nullptr; const bool fuse_silu = silu_dst != nullptr; + // bias always comes with silu. + GGML_ASSERT(!fuse_bias || fuse_silu); + + // The bias (when fused) is the non-conv operand of the ADD node. + const struct ggml_tensor * bias = fuse_bias ? (bias_add_node->src[0] == dst ? 
bias_add_node->src[1] : bias_add_node->src[0]) : nullptr; + // When fusing, write to silu_dst (the node downstream references). const struct ggml_tensor * out = fuse_silu ? silu_dst : dst; @@ -160,16 +175,23 @@ void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, g const float * src0_d = (const float *) src0->data; const float * src1_d = (const float *) src1->data; + const float * bias_d = fuse_bias ? (const float *) bias->data : nullptr; float * dst_d = (float *) out->data; cudaStream_t stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(out->type == GGML_TYPE_F32); + if (fuse_bias) { + GGML_ASSERT(bias->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(bias)); + GGML_ASSERT(ggml_nelements(bias) == nr); + } + if (fuse_silu) { - ssm_conv_f32_cuda(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1], + ssm_conv_f32_cuda(src0_d, src1_d, bias_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1], out->nb[2], nc, nr, n_t, n_s, stream); } else { - ssm_conv_f32_cuda(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1], + ssm_conv_f32_cuda(src0_d, src1_d, bias_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1], out->nb[2], nc, nr, n_t, n_s, stream); } } diff --git a/ggml/src/ggml-cuda/ssm-conv.cuh b/ggml/src/ggml-cuda/ssm-conv.cuh index f96a1cd24..8514ca849 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cuh +++ b/ggml/src/ggml-cuda/ssm-conv.cuh @@ -1,3 +1,3 @@ #include "common.cuh" -void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * silu_dst = nullptr); +void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * bias_add_node = nullptr, ggml_tensor * silu_dst = nullptr); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ee8366d28..2d3003f03 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -680,6 +680,7 @@ private: // slots / clients std::vector slots; + int trace = 0; int slots_debug = 0; int n_empty_consecutive = 0; @@ -918,12 +919,21 @@ private: slot.reset(); } + { + const char * LLAMA_TRACE = getenv("LLAMA_TRACE"); + trace = LLAMA_TRACE ? atoi(LLAMA_TRACE) : 0; + + if (trace) { + SRV_WRN("LLAMA_TRACE = %d\n", trace); + } + } + { const char * LLAMA_SERVER_SLOTS_DEBUG = getenv("LLAMA_SERVER_SLOTS_DEBUG"); slots_debug = LLAMA_SERVER_SLOTS_DEBUG ? 
atoi(LLAMA_SERVER_SLOTS_DEBUG) : 0; if (slots_debug) { - SRV_WRN("slots debug = %d\n", slots_debug); + SRV_WRN("LLAMA_SERVER_SLOTS_DEBUG = %d\n", slots_debug); } } @@ -2974,13 +2984,15 @@ private: auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft); slot.spec_i_batch.clear(); - SLT_DBG(slot, "%s: n_draft=%zu, accepted=%zu\n", __func__, slot.spec_draft.size(), accepted.size()); - GGML_ASSERT(accepted.size() >= 1); // check for partial draft acceptance if (accepted.size() < slot.spec_draft.size() + 1) { if (use_ckpt) { + if (trace > 0) { + SLT_INF(slot, "accepted %2zu/%2zu draft tokens (restore checkpoint)\n", accepted.size() - 1, slot.spec_draft.size()); + } + // partial acceptance is not supported by the context -> truncate the draft and restore the state slot.spec_draft = std::move(accepted); @@ -3002,8 +3014,10 @@ private: continue; } + } - LOG_DBG("%s: partial acceptance: %zu < %zu\n", __func__, accepted.size(), slot.spec_draft.size()); + if (trace > 0) { + SLT_INF(slot, "accepted %2zu/%2zu draft tokens\n", accepted.size() - 1, n_draft); } common_speculative_accept(slot.spec.get(), accepted.size() - 1); diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp index 95bf0eb1b..66c0b6ebd 100644 --- a/vendor/cpp-httplib/httplib.cpp +++ b/vendor/cpp-httplib/httplib.cpp @@ -1464,8 +1464,9 @@ bool mmap::open(const char *path) { auto wpath = u8string_to_wstring(path); if (wpath.empty()) { return false; } - hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ, - OPEN_EXISTING, NULL); + hFile_ = + ::CreateFile2(wpath.c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE, OPEN_EXISTING, NULL); if (hFile_ == INVALID_HANDLE_VALUE) { return false; } @@ -2052,56 +2053,50 @@ int getaddrinfo_with_timeout(const char *node, const char *service, return 0; #elif defined(_GNU_SOURCE) && defined(__GLIBC__) && \ (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)) - // Linux implementation using getaddrinfo_a for asynchronous DNS resolution - struct gaicb request; + // #2431: gai_cancel() is non-blocking and may return EAI_NOTCANCELED while + // the resolver worker still references the stack-local gaicb. The cancel + // path therefore waits (gai_suspend with no timeout) for the worker to + // actually finish before letting the stack frame go. The trade-off is that + // a wedged DNS server can hold this thread for the system resolver timeout + // (~30s by default) past the caller's connection timeout. 
+ struct gaicb request {}; struct gaicb *requests[1] = {&request}; - struct sigevent sevp; - struct timespec timeout; + struct sigevent sevp {}; + struct timespec timeout { + timeout_sec, 0 + }; - // Initialize the request structure - memset(&request, 0, sizeof(request)); request.ar_name = node; request.ar_service = service; request.ar_request = hints; - - // Set up timeout - timeout.tv_sec = timeout_sec; - timeout.tv_nsec = 0; - - // Initialize sigevent structure (not used, but required) - memset(&sevp, 0, sizeof(sevp)); sevp.sigev_notify = SIGEV_NONE; - // Start asynchronous resolution - int start_result = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp); - if (start_result != 0) { return start_result; } + int rc = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp); + if (rc != 0) { return rc; } - // Wait for completion with timeout - int wait_result = - gai_suspend((const struct gaicb *const *)requests, 1, &timeout); + auto cleanup = scope_exit([&] { + if (request.ar_result) { freeaddrinfo(request.ar_result); } + }); + + int wait_result = gai_suspend(requests, 1, &timeout); if (wait_result == 0 || wait_result == EAI_ALLDONE) { - // Completed successfully, get the result int gai_result = gai_error(&request); if (gai_result == 0) { *res = request.ar_result; + request.ar_result = nullptr; return 0; - } else { - // Clean up on error - if (request.ar_result) { freeaddrinfo(request.ar_result); } - return gai_result; } - } else if (wait_result == EAI_AGAIN) { - // Timeout occurred, cancel the request - gai_cancel(&request); - return EAI_AGAIN; - } else { - // Other error occurred - gai_cancel(&request); - return wait_result; + return gai_result; } + + gai_cancel(&request); + while (gai_error(&request) == EAI_INPROGRESS) { + gai_suspend(requests, 1, nullptr); + } + return wait_result; #else - // Fallback implementation using thread-based timeout for other Unix systems + // Fallback implementation using thread-based timeout for other Unix systems. struct GetAddrInfoState { ~GetAddrInfoState() { @@ -14142,6 +14137,9 @@ ssize_t read(session_t session, void *buf, size_t len, TlsError &err) { err.code = impl::map_mbedtls_error(ret, err.sys_errno); err.backend_code = static_cast(-ret); impl::mbedtls_last_error() = ret; + // mbedTLS signals a clean close_notify via a negative error code rather + // than 0; surface it as a clean EOF the way OpenSSL/wolfSSL do. + if (err.code == ErrorCode::PeerClosed) { return 0; } return -1; } diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h index 8581d1695..7e530961b 100644 --- a/vendor/cpp-httplib/httplib.h +++ b/vendor/cpp-httplib/httplib.h @@ -8,8 +8,8 @@ #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.43.1" -#define CPPHTTPLIB_VERSION_NUM "0x002b01" +#define CPPHTTPLIB_VERSION "0.43.2" +#define CPPHTTPLIB_VERSION_NUM "0x002b02" #ifdef _WIN32 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
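
Note on the reasoning-budget API change: the prefill-inspecting overload of `common_reasoning_budget_init` is removed in favour of an explicit `initial_state` parameter, with the caller feeding prefill tokens through `llama_sampler_accept`. A caller that still wants the old automatic COUNTING/IDLE inference can reconstruct it from the removed logic; a sketch follows (`derive_initial_state` is a hypothetical helper, not part of the patch; the enum and `llama_token` come from the project headers):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

#include "reasoning-budget.h" // common_reasoning_budget_state, llama_token

// COUNTING when the prefill opens with start_tokens and the end sequence does
// not already close the block, IDLE otherwise. COUNTING with budget <= 0 is
// still promoted to FORCING inside the sampler itself, so that rule is
// unaffected here.
static common_reasoning_budget_state derive_initial_state(
        const std::vector<llama_token> & start_tokens,
        const std::vector<llama_token> & end_tokens,
        const std::vector<llama_token> & prefill_tokens) {
    if (prefill_tokens.empty() || start_tokens.empty() ||
        prefill_tokens.size() < start_tokens.size() ||
        !std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
        return REASONING_BUDGET_IDLE;
    }
    // The prefill opens a reasoning block; check whether it also closes it.
    if (!end_tokens.empty() &&
        prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
        const auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size();
        if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
            std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
            return REASONING_BUDGET_IDLE; // opened and immediately closed
        }
    }
    return REASONING_BUDGET_COUNTING;
}
```

The two remaining call shapes are then: explicit state (tests, clone) via the `initial_state` argument, or default IDLE plus an accept loop over the prefill, which is what `common_sampler_init` now does (`vocab`, `start`, `end`, `forced`, `budget`, `prefill_tokens` assumed in scope):

```cpp
llama_sampler * smpl = common_reasoning_budget_init(vocab, start, end, forced, budget);
for (const llama_token tok : prefill_tokens) {
    llama_sampler_accept(smpl, tok); // advances IDLE -> COUNTING if prefill opens a block
}
```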
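
The prefill loop in `common_sampler_init` skips a tokenizer-inserted leading space on the first token. Isolated as a standalone predicate for clarity (a sketch; `skip_leading_space_token` is an illustrative name, and it adds the `unsigned char` cast that `std::isspace` formally requires):

```cpp
#include <cctype>
#include <string>

// True only when the tokenizer prepended whitespace (common with
// SentencePiece-style tokenizers) that the original generation prompt does
// not itself start with.
static bool skip_leading_space_token(const std::string & piece, const std::string & prompt) {
    return !piece.empty() && !prompt.empty() &&
           std::isspace((unsigned char) piece[0]) &&
           !std::isspace((unsigned char) prompt[0]);
}
```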
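
The speculative-decoding rework is easier to follow as one control-flow summary. Condensed from the diff for the checkpoint path (`use_ckpt == true`); method names are real, the flow is simplified and not a verbatim transcription:

```cpp
// prepare(prompt_cur):
//   reuse_n = common prefix of prompt_dft and prompt_cur     // only at i == 0 when use_ckpt
//   if (ckpt.n_tokens > reuse_n) ckpt = {};                  // checkpoint past the divergence -> drop it
//   if (reuse_n < prompt_dft.size() && ckpt.n_tokens > 0) {  // draft holds tokens the target rejected
//       restore_checkpoint();                                // roll the partial (recurrent/SWA) state back
//       reuse_n = ckpt.n_tokens;
//       prompt_dft.resize(reuse_n);
//   }
//   decode(new prompt tokens past reuse_n);                  // evaluate only what is missing
//   if (batch.n_tokens > 0) create_checkpoint(prompt_dft.size()); // snapshot after the decode
//
// On partial acceptance, the server truncates the draft to the accepted tokens
// and the next prepare() restores from the checkpoint.
```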
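
The new `SSM_CONV + ADD + UNARY(SILU)` fusion folds a per-channel bias into the convolution epilogue before the SiLU; the fusion check enforces exactly one bias element per inner channel (`ne[0]` of the conv output). A host-side reference of what the fused kernel computes for one sequence, with layouts simplified to dense row-major for clarity (the CUDA kernel works from the ggml strides instead):

```cpp
#include <cmath>
#include <vector>

static std::vector<float> ssm_conv_bias_silu_ref(
        const std::vector<float> & x,    // [d_inner][n_t + d_conv - 1] sliding-window input
        const std::vector<float> & w,    // [d_inner][d_conv] per-channel conv weights
        const std::vector<float> & bias, // [d_inner] channel-wise bias (the fused ADD operand)
        int d_inner, int d_conv, int n_t) {
    std::vector<float> y((size_t) d_inner * n_t);
    for (int c = 0; c < d_inner; ++c) {
        for (int t = 0; t < n_t; ++t) {
            float s = 0.0f;
            for (int j = 0; j < d_conv; ++j) {
                s += x[(size_t) c * (n_t + d_conv - 1) + t + j] * w[(size_t) c * d_conv + j];
            }
            s += bias[c];                                         // fused ADD: one value per channel
            y[(size_t) c * n_t + t] = s / (1.0f + std::exp(-s));  // fused SiLU
        }
    }
    return y;
}
```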
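
`scope_exit` in the rewritten `getaddrinfo_with_timeout` is an httplib-internal RAII guard that frees `request.ar_result` on every return path. For readers outside that codebase, the general shape is as follows (a minimal sketch of the pattern, not httplib's actual definition):

```cpp
#include <type_traits>
#include <utility>

// Runs the stored callable when the enclosing scope exits, whether by return
// or by exception. Move-aware so it can be returned from a factory function.
template <typename F>
class scope_exit_t {
  public:
    explicit scope_exit_t(F f) : f_(std::move(f)) {}
    scope_exit_t(scope_exit_t && other) noexcept
        : f_(std::move(other.f_)), active_(other.active_) {
        other.active_ = false;
    }
    scope_exit_t(const scope_exit_t &) = delete;
    scope_exit_t & operator=(const scope_exit_t &) = delete;
    ~scope_exit_t() {
        if (active_) { f_(); }
    }
  private:
    F    f_;
    bool active_ = true;
};

template <typename F>
scope_exit_t<typename std::decay<F>::type> scope_exit(F && f) {
    return scope_exit_t<typename std::decay<F>::type>(std::forward<F>(f));
}

// Usage as in the patch:
//   auto cleanup = scope_exit([&] {
//       if (request.ar_result) { freeaddrinfo(request.ar_result); }
//   });
```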
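
The mbedTLS change maps a clean `close_notify` to a return of 0 instead of -1 with `ErrorCode::PeerClosed`, so callers see the same EOF convention across all three TLS backends. A self-contained sketch of the caller-side effect; `TlsError` and `tls_read` here are stand-ins for the real httplib backend types, not its API:

```cpp
#include <cstddef>
#include <cstdio>
#include <sys/types.h> // ssize_t

struct TlsError { int code = 0; };

// Stub reader: pretend the peer sent close_notify on the first call.
static ssize_t tls_read(void * /*session*/, void * /*buf*/, size_t /*len*/, TlsError & /*err*/) {
    return 0;
}

static void drain(void * session) {
    char buf[4096];
    for (;;) {
        TlsError err{};
        const ssize_t n = tls_read(session, buf, sizeof(buf), err);
        if (n > 0) { std::fwrite(buf, 1, (size_t) n, stdout); continue; }
        if (n == 0) { break; }        // clean TLS shutdown -> normal EOF
        /* inspect err.code */ break; // genuine failure
    }
}

int main() { drain(nullptr); }
```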