Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 17:44:38 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .github/workflows/build.yml
#   ci/run.sh
#   docs/function-calling.md
#   examples/gritlm/gritlm.cpp
#   ggml/CMakeLists.txt
#   ggml/cmake/common.cmake
#   ggml/src/CMakeLists.txt
#   ggml/src/ggml-cpu/CMakeLists.txt
#   ggml/src/ggml-cpu/ggml-cpu.c
#   ggml/src/ggml-hip/CMakeLists.txt
#   ggml/src/ggml-vulkan/CMakeLists.txt
#   ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
#   requirements/requirements-compare-llama-bench.txt
#   scripts/compare-llama-bench.py
#   tests/CMakeLists.txt
commit 4356a00f4a
47 changed files with 1526 additions and 301 deletions
@@ -29,8 +29,8 @@ llama_context::llama_context(
     const auto & hparams = model.hparams;
 
     cparams.n_seq_max = std::max(1u, params.n_seq_max);
-    if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {
-        throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES));
+    if (cparams.n_seq_max > LLAMA_MAX_SEQ) {
+        throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
     }
 
     cparams.n_threads = params.n_threads;
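Note: the hunks in this file all touch the llama_context implementation (upstream's src/llama-context.cpp, as far as the function names indicate). In this first hunk only the constant is renamed, from LLAMA_MAX_PARALLEL_SEQUENCES to LLAMA_MAX_SEQ; the clamp-and-validate logic around n_seq_max is unchanged. A minimal standalone sketch of that pattern, with a placeholder limit of 64 standing in for whatever LLAMA_MAX_SEQ is actually defined to elsewhere in the tree:

    // clamp_validate.cpp -- illustrative only; kMaxSeq stands in for LLAMA_MAX_SEQ
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    static constexpr uint32_t kMaxSeq = 64; // assumption for illustration, not the real limit

    static uint32_t validate_n_seq_max(uint32_t requested) {
        const uint32_t n_seq_max = std::max(1u, requested); // never allow 0 sequences
        if (n_seq_max > kMaxSeq) {
            throw std::runtime_error("n_seq_max must be <= " + std::to_string(kMaxSeq));
        }
        return n_seq_max;
    }

    int main() {
        std::printf("clamped: %u\n", validate_n_seq_max(0)); // prints 1
        return 0;
    }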
@@ -727,9 +727,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
         return -1;
     }
 
     // temporary allocate memory for the input batch if needed
-    // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : 0)) {
+    if (!batch_allocr->init(batch_inp, model.vocab, nullptr, true)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -895,8 +894,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
         return -1;
     }
 
-    // temporary allocate memory for the input batch if needed
-    if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : memory->seq_pos_max(0) + 1)) {
+    // when computing embeddings, all tokens are output
+    const bool embd_all = cparams.embeddings;
+
+    if (!batch_allocr->init(batch_inp, model.vocab, memory.get(), embd_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
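Here decode() stops computing a starting position itself and instead hands the memory object (memory.get()) plus a single embd_all flag, derived from cparams.embeddings, to the batch allocator. From the public API side that flag is ultimately driven by the embeddings field of llama_context_params; a hedged usage sketch, with field and function names as I understand llama.h around this period (treat them as assumptions if the header has since moved on):

    #include "llama.h"

    int main() {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings = true; // decode() will now treat every token of a batch as an output
        cparams.n_seq_max  = 4;    // validated against LLAMA_MAX_SEQ as in the constructor hunk above
        // ... load a model and pass cparams when creating the context,
        //     e.g. via llama_init_from_model(model, cparams)
        return 0;
    }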
@@ -913,12 +914,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
-    // this indicates we are doing pooled embedding
-    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
     const uint32_t n_outputs_all = batch_allocr->get_n_outputs();
 
-    if (embd_pooled) {
+    if (embd_all) {
         // require that all tokens are output
         if (n_outputs_all != n_tokens_all) {
             LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
@@ -947,7 +945,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     llama_memory_state_ptr mstate;
 
     while (true) {
-        mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled);
+        mstate = memory->init_batch(batch, cparams.n_ubatch, embd_all);
         if (!mstate) {
             return -2;
         }
@@ -1025,8 +1023,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
         if (!res) {
             // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
-            llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
-            for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            llama_pos pos_min[LLAMA_MAX_SEQ];
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
                 pos_min[s] = std::numeric_limits<llama_pos>::max();
             }
 
@@ -1037,7 +1035,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
             }
 
-            for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
                 if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {
                     continue;
                 }
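These two hunks only rename the constant; the rollback logic stays the same: when the last ubatch fails, decode() records the smallest position touched per sequence (sentinel: numeric max), then clears only the sequences that were actually touched. A compact standalone sketch of that bookkeeping, with the ubatch layout simplified to (seq_id, pos) pairs and the sequence count passed in as a parameter:

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <utility>
    #include <vector>

    using llama_pos = int32_t;

    // Returns the minimum position seen per sequence; entries left at max() mean "untouched".
    static std::vector<llama_pos> min_pos_per_seq(
            const std::vector<std::pair<int, llama_pos>> & tokens, int n_seq) {
        std::vector<llama_pos> pos_min(n_seq, std::numeric_limits<llama_pos>::max());
        for (const auto & [seq_id, pos] : tokens) {
            pos_min[seq_id] = std::min(pos_min[seq_id], pos);
        }
        return pos_min;
    }

    int main() {
        const auto pos_min = min_pos_per_seq({{0, 12}, {0, 7}, {2, 3}}, 4);
        // pos_min[0] == 7, pos_min[2] == 3; sequences 1 and 3 remain untouched
        return (pos_min[0] == 7 && pos_min[2] == 3) ? 0 : 1;
    }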
@@ -1060,7 +1058,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
         // ggml_graph_dump_dot(gf, NULL, "llama.dot");
         //}
 
-        auto * t_logits = cparams.embeddings ? nullptr : res->get_logits();
+        auto * t_logits = res->get_logits();
         auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
 
         if (t_embd && res->get_embd_pooled()) {
@@ -1224,9 +1222,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto n_vocab = vocab.n_tokens();
     const auto n_embd = hparams.n_embd;
 
-    // TODO: use a per-batch flag for logits presence instead
-    bool has_logits = !cparams.embeddings;
-    bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    bool has_logits = true;
+    bool has_embd = cparams.embeddings;
 
     // TODO: hacky enc-dec support
     if (model.arch == LLM_ARCH_T5) {
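output_reserve() previously sized the output buffer for either logits or non-pooled embeddings, keyed off cparams.embeddings and the pooling type; after this change it always reserves logits and adds embedding rows whenever embeddings are enabled. A rough sketch of the sizing arithmetic implied by these two flags (any buffer layout detail beyond this multiplication is an assumption):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static size_t output_bytes(uint32_t n_outputs, uint32_t n_vocab, uint32_t n_embd, bool embeddings) {
        const bool has_logits = true;        // after this change: always reserved
        const bool has_embd   = embeddings;  // previously also required pooling_type == NONE
        const size_t floats_per_output = (has_logits ? n_vocab : 0) + (has_embd ? n_embd : 0);
        return static_cast<size_t>(n_outputs) * floats_per_output * sizeof(float);
    }

    int main() {
        // e.g. 32 outputs, 32000-token vocab, 4096-dim embeddings, embeddings enabled
        std::printf("%zu bytes\n", output_bytes(32, 32000, 4096, true));
        return 0;
    }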
@@ -2046,14 +2043,11 @@ void llama_context::opt_epoch_iter(
 
         n_queued_tokens += n_tokens_all;
 
-        // this indicates we are doing pooled embedding
-        const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
         embd_seq.clear();
 
         uint32_t n_outputs_all = n_tokens_all;
 
-        auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled);
+        auto mstate = memory->init_batch(batch, cparams.n_ubatch, true);
         if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
             LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
             break;