kv-cache : support V-less cache (#19067)

* kv-cache : support V-less cache
* cuda : better check for V_is_K_view
* cuda : improve V_is_K_view check
* graph : add comments
* hparams : refactor
2026-05-09 19:46:11 +00:00 · 2026-01-25 15:48:56 +02:00 · 2026-01-25 15:48:56 +02:00 · d9c6ce46f7
commit d9c6ce46f7
parent 70d860824a
11 changed files with 246 additions and 53 deletions
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -793,7 +793,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
            throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
        }

-        const uint32_t n_embd_out = model.hparams.get_n_embd_out();
+        const uint32_t n_embd_out = model.hparams.n_embd_out();
        return embd + j*n_embd_out;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
@@ -1279,7 +1279,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
                {
                    // extract token embeddings
                    GGML_ASSERT(embd != nullptr);
-                    const uint32_t n_embd_out = hparams.get_n_embd_out();
+                    const uint32_t n_embd_out = hparams.n_embd_out();

                    GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
                    ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
@@ -1688,7 +1688,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                    {
                        // extract token embeddings
                        GGML_ASSERT(embd != nullptr);
-                        const uint32_t n_embd_out = hparams.get_n_embd_out();
+                        const uint32_t n_embd_out = hparams.n_embd_out();
                        float * embd_out = embd + n_outputs_prev*n_embd_out;

                        if (n_outputs) {
@@ -1821,7 +1821,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba

    const auto n_batch    = cparams.n_batch;
    const auto n_vocab    = vocab.n_tokens();
-    const auto n_embd_out = hparams.get_n_embd_out();
+    const auto n_embd_out = hparams.n_embd_out();

    bool has_logits = true;
    bool has_embd   = cparams.embeddings;