Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	examples/embedding/embedding.cpp
#	tools/imatrix/imatrix.cpp
#	tools/perplexity/perplexity.cpp
Concedo committed 2025-05-08 23:41:02 +08:00
commit 2439014a03
40 changed files with 2058 additions and 429 deletions

src/llama-context.cpp

@@ -116,8 +116,6 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    logits_all = params.logits_all;
-
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
@@ -253,7 +251,7 @@ llama_context::llama_context(
     }
 
     // reserve worst-case graph
-    if (!hparams.vocab_only) {
+    if (!hparams.vocab_only && memory) {
         const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
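Context for the new `memory` guard: a context can now be created without a KV cache at all (e.g. for encoder-only embedding models), so the worst-case graph reservation must be skipped when `memory` is null. A minimal sketch of setting up such a cache-less context; the helper name and the mean-pooling choice are illustrative, not part of this commit:

    #include "llama.h"

    // sketch: embeddings-only context; for encoder-only models no KV cache
    // ("memory") is allocated, so the reservation above is skipped
    llama_context * make_embedding_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                    // extract embeddings
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // illustrative choice
        return llama_init_from_model(model, cparams);
    }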
@@ -702,6 +700,8 @@ int llama_context::encode(llama_batch & inp_batch) {
         t_compute_start_us = ggml_time_us();
     }
 
+    embd_seq.clear();
+
     n_queued_tokens += n_tokens;
 
     const int64_t n_embd = hparams.n_embd;
@ -763,12 +763,12 @@ int llama_context::encode(llama_batch & inp_batch) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
GGML_ASSERT(embd != nullptr);
switch (cparams.pooling_type) {
case LLAMA_POOLING_TYPE_NONE:
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
} break;
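For reference, roughly how a caller consumes the per-token embeddings that the NONE branch copies out; a sketch assuming `ctx` was created with `cparams.embeddings = true`, `model` is the loaded model, and `batch` has already gone through llama_encode():

    const int     n_embd = llama_model_n_embd(model);
    const float * embd   = llama_get_embeddings(ctx);   // n_tokens * n_embd floats
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        const float * row = embd + (size_t) i * n_embd; // embedding of token i
        // ... consume row[0 .. n_embd - 1]
    }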
@@ -793,11 +793,18 @@ int llama_context::encode(llama_batch & inp_batch) {
                 } break;
             case LLAMA_POOLING_TYPE_RANK:
                 {
-                    // TODO: this likely should be the same logic as in llama_decoder_internal, but better to
-                    //       wait for an encoder model that requires this pooling type in order to test it
-                    //       https://github.com/ggerganov/llama.cpp/pull/9510
-                    GGML_ABORT("RANK pooling not implemented yet");
-                }
+                    // extract the rerank score - a single float per sequence
+                    auto & embd_seq_out = embd_seq;
+
+                    for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+                        const llama_seq_id seq_id = ubatch.seq_id[s][0];
+                        if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+                            continue;
+                        }
+                        embd_seq_out[seq_id].resize(1);
+                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
+                    }
+                } break;
             case LLAMA_POOLING_TYPE_UNSPECIFIED:
                 {
                     GGML_ABORT("unknown pooling type");
@@ -835,6 +842,11 @@ int llama_context::encode(llama_batch & inp_batch) {
 }
 
 int llama_context::decode(llama_batch & inp_batch) {
+    if (!memory) {
+        LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__);
+        return encode(inp_batch);
+    }
+
     if (inp_batch.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
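With this guard, llama_decode() on a cache-less context no longer aborts; it logs the warning and takes the encode path, so older call sites keep working. A sketch, assuming `ctx` is a cache-less embeddings context:

    // assume `tokens` holds a tokenized prompt (std::vector<llama_token>)
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    llama_encode(ctx, batch); // preferred for a context without a KV cache
    llama_decode(ctx, batch); // now warns, then behaves like llama_encode()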
@@ -890,7 +902,7 @@ int llama_context::decode(llama_batch & inp_batch) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs_all += batch.logits[i] != 0;
         }
-    } else if (logits_all || embd_pooled) {
+    } else if (embd_pooled) {
         n_outputs_all = n_tokens_all;
     } else {
         // keep last output only
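With `logits_all` removed, callers request outputs per token through `llama_batch::logits`; leaving the array unset keeps the default of last-token-only. A sketch, assuming `prompt_tokens` holds `n_prompt` tokens on sequence 0:

    llama_batch batch = llama_batch_init(n_prompt, /*embd=*/ 0, /*n_seq_max=*/ 1);
    for (int32_t i = 0; i < n_prompt; ++i) {
        batch.token   [i]    = prompt_tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = (i == n_prompt - 1); // output for the last token only
    }
    batch.n_tokens = n_prompt;

    if (llama_decode(ctx, batch) == 0) {
        const float * logits = llama_get_logits_ith(ctx, n_prompt - 1);
        // ... sample from logits
    }
    llama_batch_free(batch);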
@@ -1853,13 +1865,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data   =*/ nullptr,
         /*.type_k              =*/ GGML_TYPE_F16,
         /*.type_v              =*/ GGML_TYPE_F16,
-        /*.logits_all          =*/ false,
-        /*.abort_callback      =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
         /*.no_perf             =*/ true,
+        /*.abort_callback      =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
     };
 
     return result;