traded my ink for a pen

2025-09-09 16:44:35 +00:00 · 2025-03-14 11:58:15 +08:00 · 2025-03-14 11:58:15 +08:00 · 0db4ae6237
commit 0db4ae6237
parent 52cf1ded0c e0dbec0bc6
25 changed files with 13853 additions and 12134 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -959,8 +959,8 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }
-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
-        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }
@ -1064,7 +1064,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
-        llama_kv_cache_clear(lctx);
+        llama_kv_self_clear(lctx);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
    }
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@ -173,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
    result.reserve(params.n_draft);
    if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        prompt.clear();
    } else {
@ -192,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
        }
        if (reuse_i > 0) {
-            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
+            llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+            llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
        }
        if (reuse_n < (int) prompt.size()) {
-            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+            llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
            prompt.erase(prompt.begin() + reuse_n, prompt.end());
        }
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
            }
            if (line == "/clear") {
                ctx.n_past = 0;
-                llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
+                llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
                LOG("Chat history cleared\n\n");
                continue;
            }
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -355,7 +355,7 @@ int main(int argc, char ** argv) {
        }
        // remove any "future" tokens that we might have inherited from the previous session
-        llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+        llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
    }
    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@ -603,8 +603,8 @@ int main(int argc, char ** argv) {
                    LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                            n_past, n_left, n_ctx, params.n_keep, n_discard);
-                    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_kv_self_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+                    llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
                    n_past -= n_discard;
@ -627,9 +627,9 @@ int main(int argc, char ** argv) {
                    LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
-                    llama_kv_cache_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
+                    llama_kv_self_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
-                    llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
+                    llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
-                    llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
+                    llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
                    n_past -= bd;
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -2040,6 +2040,18 @@ struct server_context {
        return ret;
    }
    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
        for (const auto & token : tokens) {
            if (token < 0 || token >= n_vocab) {
                return false;
            }
        }
        return true;
    }
    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
        slot.reset();
        slot.id_task       = task.id;
@ -2054,6 +2066,11 @@ struct server_context {
            slot.lora = task.params.lora;
        }
        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
        if (!can_detokenize) {
            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
            return false;
        }
        SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
        if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@ -2096,7 +2113,7 @@ struct server_context {
        SRV_DBG("%s", "clearing KV cache\n");
        // clear the entire KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        clean_kv_cache = false;
    }
@ -2638,8 +2655,8 @@ struct server_context {
                    res->n_tasks_deferred    = queue_tasks.queue_tasks_deferred.size();
                    res->t_start             = metrics.t_start;
-                    res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
+                    res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
-                    res->kv_cache_used_cells   = llama_get_kv_cache_used_cells(ctx);
+                    res->kv_cache_used_cells   = llama_kv_self_used_cells(ctx);
                    res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                    res->t_prompt_processing_total       = metrics.t_prompt_processing_total;
@ -2755,7 +2772,7 @@ struct server_context {
                    // Erase token cache
                    const size_t n_erased = slot->cache_tokens.size();
-                    llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+                    llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
                    slot->cache_tokens.clear();
                    auto res = std::make_unique<server_task_result_slot_erase>();
@ -2823,8 +2840,8 @@ struct server_context {
                SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
-                llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+                llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
+                llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
                if (slot.params.cache_prompt) {
                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@ -3015,8 +3032,8 @@ struct server_context {
                                            const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
-                                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                                            llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
-                                            llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+                                            llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
                                            for (size_t i = 0; i < n_match; i++) {
                                                slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@ -3054,9 +3071,9 @@ struct server_context {
                    }
                    // keep only the common part
-                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+                    if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                        // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+                        llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
                        // there is no common part left
                        slot.n_past = 0;
@ -3296,7 +3313,7 @@ struct server_context {
                slot.cache_tokens.push_back(id);
                slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
-                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
                for (size_t i = 0; i < ids.size(); ++i) {
                    completion_token_output result;
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@ -302,7 +302,7 @@ class ServerPreset:
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "tinyllamas/stories260K.gguf"
        server.model_alias = "tinyllama-2"
-        server.n_ctx = 256
+        server.n_ctx = 512
        server.n_batch = 32
        server.n_slots = 2
        server.n_predict = 64
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@ -216,7 +216,7 @@ static void TokenizeString(const std::string & str_to_tokenize, std::vector<int>
            output_tokens = ::common_tokenize(llama_ctx_v4, str_to_tokenize, add_bos, true);
            if(add_bos)
            {
-                const llama_vocab * tmpvocab = llama_model_get_vocab(&(llama_ctx_v4->model));
+                const llama_vocab * tmpvocab = llama_model_get_vocab(llama_get_model(llama_ctx_v4));
                llama_token bostoadd = llama_vocab_bos(tmpvocab);
                if(bostoadd != LLAMA_TOKEN_NULL) //if bos does not exist, do not add it
                {
@ -249,7 +249,7 @@ static int GetEosID(FileFormat file_format, int32_t n_vocab)
    {
        if(file_format == FileFormat::GGUF_GENERIC)
        {
-            const llama_vocab * tmpvocab = llama_model_get_vocab(&(llama_ctx_v4->model));
+            const llama_vocab * tmpvocab = llama_model_get_vocab(llama_get_model(llama_ctx_v4));
            eosID = llama_vocab_eos(tmpvocab);
        }
        else if(file_format == FileFormat::GGJT_3)
@ -301,7 +301,7 @@ static int GetEotID(FileFormat file_format)
 {
    if(file_format == FileFormat::GGUF_GENERIC)
    {
-        const llama_vocab * tmpvocab = llama_model_get_vocab(&(llama_ctx_v4->model));
+        const llama_vocab * tmpvocab = llama_model_get_vocab(llama_get_model(llama_ctx_v4));
        return llama_vocab_eot(tmpvocab);
    }
    return -1;
@ -500,10 +500,10 @@ void ContextRewind(std::vector<int> &embd, std::vector<int> &current_context_tok
    if (file_format == FileFormat::GGUF_GENERIC)
    {
-        llama_kv_cache_seq_rm(llama_ctx_v4, 0, n_past, -1);
+        llama_kv_self_seq_rm(llama_ctx_v4, 0, n_past, -1);
        if(draft_ctx)
        {
-            llama_kv_cache_seq_rm(draft_ctx, 0, n_past, -1);
+            llama_kv_self_seq_rm(draft_ctx, 0, n_past, -1);
        }
    }
@ -1801,12 +1801,12 @@ void PurgeMissingTokens(llama_context * ctx, llama_context * draft_ctx, std::vec
            //extract the unwanted tokens out from context and KV
            int diff = found - trimstart;
-            llama_kv_cache_seq_rm(ctx, 0, trimstart, trimstart + diff);
+            llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
-            llama_kv_cache_seq_add(ctx, 0, trimstart + diff, -1, -diff);
+            llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);
            if(draft_ctx)
            {
-                llama_kv_cache_seq_rm(draft_ctx, 0, trimstart, trimstart + diff);
+                llama_kv_self_seq_rm(draft_ctx, 0, trimstart, trimstart + diff);
-                llama_kv_cache_seq_add(draft_ctx, 0, trimstart + diff, -1, -diff);
+                llama_kv_self_seq_add(draft_ctx, 0, trimstart + diff, -1, -diff);
            }
            for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
@ -2298,7 +2298,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
        //determine mem per token
        std::vector<int> tmp = {1, 2, 3, 4};
-        llama_kv_cache_clear(llama_ctx_v4);
+        llama_kv_self_clear(llama_ctx_v4);
        auto er = llama_decode(llama_ctx_v4, llama_batch_get_one(tmp.data(), tmp.size()));
        if(er!=0)
        {
@ -2696,13 +2696,13 @@ std::string gpttype_get_chat_template()
    // copied from examples/server/utils.hpp::llama_get_chat_template
    std::string template_key = "tokenizer.chat_template";
    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(&llama_ctx_v4->model, template_key.c_str(), NULL, 0);
+    int32_t res = llama_model_meta_val_str(llama_get_model(llama_ctx_v4), template_key.c_str(), NULL, 0);
    if (res < 0) {
        return "";
    }
    std::vector<char> model_template(res + 1, 0);
-    llama_model_meta_val_str(&llama_ctx_v4->model, template_key.c_str(), model_template.data(), model_template.size());
+    llama_model_meta_val_str(llama_get_model(llama_ctx_v4), template_key.c_str(), model_template.data(), model_template.size());
    return std::string(model_template.data(), model_template.size() - 1);
 }
@ -2885,7 +2885,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    bool add_bos_token = true; //if set to false, mmproj handling breaks
    // if(file_format == FileFormat::GGUF_GENERIC && mmproj_filename == "")
    // {
-    //     const llama_vocab * tmpvocab = llama_model_get_vocab(&(llama_ctx_v4->model));
+    //     const llama_vocab * tmpvocab = llama_model_get_vocab(llama_get_model(llama_ctx_v4));
    //     add_bos_token = llama_vocab_get_add_bos(tmpvocab);
    //     if(!add_bos_token && debugmode==1)
    //     {
@ -3296,10 +3296,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        {
            if(n_past==0)
            {
-                llama_kv_cache_clear(llama_ctx_v4);
+                llama_kv_self_clear(llama_ctx_v4);
                if(draft_ctx)
                {
-                    llama_kv_cache_clear(draft_ctx);
+                    llama_kv_self_clear(draft_ctx);
                }
            }
            else if(embd_inp.size()==0)
@ -3326,10 +3326,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        }
        if(file_format == FileFormat::GGUF_GENERIC)
        {
-            llama_kv_cache_seq_rm(llama_ctx_v4, 0, n_past, -1);
+            llama_kv_self_seq_rm(llama_ctx_v4, 0, n_past, -1);
            if(draft_ctx)
            {
-                llama_kv_cache_seq_rm(draft_ctx, 0, n_past, -1);
+                llama_kv_self_seq_rm(draft_ctx, 0, n_past, -1);
            }
        }
    }
@ -3861,9 +3861,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            //if we have somehow skipped ahead (e.g drafting), ensure that all tokens after npast are purged
            if (file_format == FileFormat::GGUF_GENERIC && draft_used)
            {
-                llama_kv_cache_seq_rm(llama_ctx_v4, 0, n_past, -1);
+                llama_kv_self_seq_rm(llama_ctx_v4, 0, n_past, -1);
                if (draft_ctx) {
-                    llama_kv_cache_seq_rm(draft_ctx, 0, n_past, -1);
+                    llama_kv_self_seq_rm(draft_ctx, 0, n_past, -1);
                }
            }
--- a/include/llama.h
+++ b/include/llama.h
@ -62,6 +62,7 @@ extern "C" {
    struct llama_model;
    struct llama_context;
    struct llama_sampler;
    struct llama_kv_cache;
    typedef int32_t llama_pos;
    typedef int32_t llama_token;
@ -471,7 +472,8 @@ extern "C" {
    DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API    struct llama_kv_cache * llama_get_kv_self (      struct llama_context * ctx);
    LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@ -588,7 +590,7 @@ extern "C" {
    // KV cache
    //
-    // TODO: remove llama_kv_cache_view_* API
+    // TODO: start using struct llama_kv_cache
    // Information associated with an individual cell in the KV cache view.
    struct llama_kv_cache_view_cell {
@ -643,13 +645,19 @@ extern "C" {
    // Returns the number of tokens in the KV cache (slow, use only for debug)
    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
            "use llama_kv_self_n_tokens instead");
    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
            "use llama_kv_self_used_cells instead");
    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
+    LLAMA_API void llama_kv_self_clear(
            struct llama_context * ctx);
    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@ -657,7 +665,7 @@ extern "C" {
    // seq_id < 0 : match any sequence
    // p0 < 0     : [0,  p1]
    // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_self_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -667,7 +675,7 @@ extern "C" {
    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_cp(
+    LLAMA_API void llama_kv_self_seq_cp(
            struct llama_context * ctx,
                    llama_seq_id   seq_id_src,
                    llama_seq_id   seq_id_dst,
@ -675,17 +683,17 @@ extern "C" {
                       llama_pos   p1);
    // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_cache_seq_keep(
+    LLAMA_API void llama_kv_self_seq_keep(
            struct llama_context * ctx,
                    llama_seq_id   seq_id);
    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
+    LLAMA_API void llama_kv_self_seq_add(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -695,10 +703,10 @@ extern "C" {
    // Integer division of the positions by factor of `d > 1`
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
+    LLAMA_API void llama_kv_self_seq_div(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -706,24 +714,76 @@ extern "C" {
                             int   d);
    // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
            struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+                     llama_seq_id   seq_id);
    // TODO: the llama_kv_self_defrag and llama_kv_self_update APIs tightly couple llama_context with llama_kv_cache;
    //       how can this be avoided?
    // Defragment the KV cache
    // This will be applied:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
    // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
            struct llama_context * ctx),
            "use llama_kv_self_clear instead");
    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1),
            "use llama_kv_self_seq_rm instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
            struct llama_context * ctx,
                    llama_seq_id   seq_id_src,
                    llama_seq_id   seq_id_dst,
                       llama_pos   p0,
                       llama_pos   p1),
            "use llama_kv_self_seq_cp instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
            struct llama_context * ctx,
                    llama_seq_id   seq_id),
            "use llama_kv_self_seq_keep instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
                       llama_pos   delta),
            "use llama_kv_self_seq_add instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
                             int   d),
            "use llama_kv_self_seq_div instead");
    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
            struct llama_context * ctx,
                    llama_seq_id   seq_id),
            "use llama_kv_self_seq_pos_max instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
            "use llama_kv_self_defrag instead");
    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
            "use llama_kv_self_can_shift instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
            "use llama_kv_self_update instead");
    //
    // State / sessions
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@ -607,9 +607,9 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
    std::vector<llama_token> codes;
    std::vector<llama_token> guide_tokens;
-    const llama_model * model_ttc = &(ttc_ctx->model);
+    const llama_model * model_ttc = llama_get_model(ttc_ctx);
    const llama_vocab * ttcvocab = llama_model_get_vocab(model_ttc);
-    const llama_model * model_cts = &(cts_ctx->model);
+    const llama_model * model_cts = llama_get_model(cts_ctx);
    const llama_vocab * ctsvocab = llama_model_get_vocab(model_cts);
    const int ttc_n_vocab = llama_vocab_n_tokens(ttcvocab);
    std::string prompt = inputs.prompt;
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@ -4,14 +4,13 @@
 #include "llama-mmap.h"
 #include "llama-model.h"
 #include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>
 // vec
-struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }
@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    return tensors[il];
 }
-struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const {
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int  il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
    return true;
 }
-int32_t llama_adapter_cvec::apply(
+bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
-        return 0;
+        return true;
    }
    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
+        return false;
    }
    if (tensors.empty()) {
        if (!init(model)) {
-            return 1;
+            return false;
        }
    }
@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
        }
    }
-    return 0;
+    return true;
 }
 // lora
-llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);
    const auto pos = ab_map.find(name);
@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
    return nullptr;
 }
-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
    ggml_context * ctx_init;
-    struct gguf_init_params meta_gguf_params = {
+    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };
@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
@ -264,7 +263,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }
-        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        // validate tensor shape
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@ -281,8 +280,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
        }
        // save tensor to adapter
-        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
-        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@ -308,7 +307,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
@ -327,8 +326,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    struct llama_adapter_lora * adapter = new llama_adapter_lora();
+    llama_adapter_lora * adapter = new llama_adapter_lora();
    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@ -342,6 +341,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
    return nullptr;
 }
-void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    delete adapter;
 }
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@ -15,11 +15,11 @@
 //
 struct llama_adapter_cvec {
-    struct ggml_tensor * tensor_for(int il) const;
+    ggml_tensor * tensor_for(int il) const;
-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const;
+    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int  il) const;
-    int32_t apply(
+    bool apply(
            const llama_model & model,
            const float * data,
            size_t len,
@ -36,7 +36,7 @@ private:
    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
-    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<ggml_tensor *> tensors; // per layer
 };
 //
@ -44,8 +44,8 @@ private:
 //
 struct llama_adapter_lora_weight {
-    struct ggml_tensor * a = nullptr;
+    ggml_tensor * a = nullptr;
-    struct ggml_tensor * b = nullptr;
+    ggml_tensor * b = nullptr;
    // get actual scale based on rank and alpha
    float get_scale(float alpha, float adapter_scale) const {
@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
    }
    llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
 };
 struct llama_adapter_lora {
    // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
@ -70,5 +70,7 @@ struct llama_adapter_lora {
    llama_adapter_lora() = default;
    ~llama_adapter_lora() = default;
-    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 };
 using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@ -42,9 +42,9 @@ struct llama_sbatch {
    bool logits_all; // TODO: remove once lctx.logits_all is removed too
    // sorted indices into the batch
-    std::vector<size_t> ids;
+    std::vector<int64_t> ids;
    // batch indices of the output
-    std::vector<size_t> out_ids;
+    std::vector<int64_t> out_ids;
    std::vector<llama_sbatch_seq> seq;
    const llama_batch * batch = nullptr;
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
--- a/src/llama-context.h
+++ b/src/llama-context.h
@ -3,66 +3,210 @@
 #include "llama.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-model.h"
+#include "llama-graph.h"
 #include "llama-kv-cache.h"
 #include "llama-adapter.h"
 #include "ggml-cpp.h"
 #include <map>
 #include <unordered_map>
 #include <vector>
-#include <set>
+
 struct llama_model;
 struct llama_kv_cache;
 class llama_io_read_i;
 class llama_io_write_i;
 struct llama_context {
-    llama_context(const llama_model & model)
+    // init scheduler and compute buffers, reserve worst-case graphs
-        : model(model)
+    llama_context(
-        , t_start_us(model.t_start_us)
+            const llama_model & model,
-        , t_load_us(model.t_load_us) {}
+                  llama_context_params params);
-    const struct llama_model & model;
+    ~llama_context();
-    struct llama_cparams      cparams;
+    void synchronize();
    struct llama_sbatch       sbatch;  // TODO: revisit if needed
    struct llama_kv_cache     kv_self;
    struct llama_adapter_cvec cvec;
-    std::unordered_map<struct llama_adapter_lora *, float> lora;
+    const llama_model & get_model() const;
-    std::vector<ggml_backend_ptr> backends;
+    uint32_t n_ctx()         const;
-    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
+    uint32_t n_ctx_per_seq() const;
    uint32_t n_batch()       const;
    uint32_t n_ubatch()      const;
    uint32_t n_seq_max()     const;
-    ggml_backend_t backend_cpu = nullptr;
+    uint32_t n_threads()       const;
    uint32_t n_threads_batch() const;
-    ggml_threadpool_t threadpool       = nullptr;
+          llama_kv_cache * get_kv_self();
-    ggml_threadpool_t threadpool_batch = nullptr;
+    const llama_kv_cache * get_kv_self() const;
-    bool has_evaluated_once = false;
+    void kv_self_update();
-    mutable int64_t t_start_us;
+    enum llama_pooling_type pooling_type() const;
    mutable int64_t t_load_us;
    mutable int64_t t_p_eval_us = 0;
    mutable int64_t t_eval_us   = 0;
-    mutable int64_t t_compute_start_us = 0;
+    float * get_logits();
-    mutable int64_t n_queued_tokens = 0;
+    float * get_logits_ith(int32_t i);
-    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+    float * get_embeddings();
-    mutable int32_t n_eval   = 0; // number of eval calls
+    float * get_embeddings_ith(int32_t i);
    float * get_embeddings_seq(llama_seq_id seq_id);
-    // host buffer for the model output (logits and embeddings)
+    void attach_threadpool(
-    ggml_backend_buffer_ptr buf_output;
+            ggml_threadpool_t threadpool,
            ggml_threadpool_t threadpool_batch);
    void detach_threadpool();
    void set_n_threads(int32_t n_threads, int32_t n_threads_batch);
    void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data);
    void set_embeddings (bool value);
    void set_causal_attn(bool value);
    void set_adapter_lora(
            llama_adapter_lora * adapter,
            float scale);
    bool rm_adapter_lora(
            llama_adapter_lora * adapter);
    void clear_adapter_lora();
    bool apply_adapter_cvec(
            const float * data,
                 size_t   len,
                int32_t   n_embd,
                int32_t   il_start,
                int32_t   il_end);
    int encode(llama_batch & inp_batch);
    int decode(llama_batch & inp_batch);
    //
    // state save/load
    //
    size_t state_get_size();
    size_t state_get_data(      uint8_t * dst, size_t size);
    size_t state_set_data(const uint8_t * src, size_t size);
    size_t state_seq_get_size(llama_seq_id seq_id);
    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
    bool state_load_file(
            const char * filepath,
           llama_token * tokens_out,
                size_t   n_token_capacity,
                size_t * n_token_count_out);
    bool state_save_file(
            const char * filepath,
     const llama_token * tokens,
                size_t   n_token_count);
    size_t state_seq_load_file(
          llama_seq_id   seq_id,
            const char * filepath,
           llama_token * tokens_out,
                size_t   n_token_capacity,
                size_t * n_token_count_out);
    size_t state_seq_save_file(
          llama_seq_id   seq_id,
            const char * filepath,
     const llama_token * tokens,
                size_t   n_token_count);
    //
    // perf
    //
    llama_perf_context_data perf_get_data() const;
    void perf_reset();
 private:
    //
    // output
    //
    // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
    int32_t output_reserve(int32_t n_outputs);
    // make the outputs have the same order they had in the user-provided batch
    // TODO: maybe remove this
    void output_reorder();
    //
    // graph
    //
    int32_t graph_max_nodes() const;
    // zero-out inputs and create the ctx_compute for the compute graph
    ggml_cgraph * graph_init();
    llm_graph_result_ptr graph_build(
            ggml_context * ctx,
             ggml_cgraph * gf,
      const llama_ubatch & ubatch,
          llm_graph_type   gtype);
    // returns the result of ggml_backend_sched_graph_compute_async execution
    ggml_status graph_compute(
            ggml_cgraph * gf,
                   bool   batched);
    llm_graph_cb graph_get_cb() const;
    // used by kv_self_update()
    ggml_tensor * build_rope_shift(
        ggml_context * ctx0,
        ggml_tensor * cur,
        ggml_tensor * shift,
        ggml_tensor * factors,
        ggml_backend_buffer * bbuf) const;
    llm_graph_result_ptr build_kv_self_shift(
            ggml_context * ctx0,
            ggml_cgraph * gf) const;
    llm_graph_result_ptr build_kv_self_defrag(
            ggml_context * ctx0,
            ggml_cgraph * gf) const;
    // TODO: read/write lora adapters and cvec
    size_t state_write_data(llama_io_write_i & io);
    size_t state_read_data (llama_io_read_i  & io);
    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id);
    //
    // members
    //
    const llama_model & model;
    llama_cparams       cparams;
    llama_adapter_cvec  cvec;
    llama_adapter_loras loras;
    llama_sbatch        sbatch;
    llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
    std::unique_ptr<llama_kv_cache_unified> kv_self;
    // TODO: remove
    bool logits_all = false;
    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t  logits_size = 0; // capacity (of floats) for logits
    float * logits      = nullptr;
    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
    size_t  output_size = 0; // capacity (of tokens positions) for the output buffers
    int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch
    bool logits_all = false;
    // embeddings output (2-dimensional array: [n_outputs][n_embd])
    // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
    size_t  embd_size = 0; // capacity (of floats) for embeddings
@ -72,57 +216,47 @@ struct llama_context {
    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
    std::map<llama_seq_id, std::vector<float>> embd_seq;
-    // whether we are computing encoder output or decoder output
+    int32_t n_outputs     = 0; // number of actually-used outputs in the current ubatch or last logical batch
-    bool is_encoding = false;
+    int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers
-    // TODO: find a better way to accommodate mutli-dimension position encoding methods
+    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
    // number of position id each token get, 1 for each token in most cases.
    // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
    int n_pos_per_token = 1;
    // output of the encoder part of the encoder-decoder models
    std::vector<float> embd_enc;
    std::vector<std::set<llama_seq_id>> seq_ids_enc;
    // memory buffers used to evaluate the model
    std::vector<uint8_t> buf_compute_meta;
    ggml_backend_sched_ptr sched;
    ggml_backend_t backend_cpu = nullptr;
    std::vector<ggml_backend_ptr> backends;
    ggml_context_ptr ctx_compute;
    ggml_threadpool_t threadpool       = nullptr;
    ggml_threadpool_t threadpool_batch = nullptr;
    ggml_abort_callback abort_callback      = nullptr;
    void *              abort_callback_data = nullptr;
-    // input tensors
+    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
-    struct ggml_tensor * inp_tokens;        // I32 [n_batch]
+
-    struct ggml_tensor * inp_embd;          // F32 [n_embd, n_batch]
+    // buffer types used for the compute buffer of each backend
-    struct ggml_tensor * inp_pos;           // I32 [n_batch]
+    std::vector<ggml_backend_t>             backend_ptrs;
-    struct ggml_tensor * inp_out_ids;       // I32 [n_outputs]
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
-    struct ggml_tensor * inp_KQ_mask;       // F32 [kv_size, n_batch]
+
-    struct ggml_tensor * inp_KQ_mask_swa;   // F32 [kv_size, n_batch]
+    // memory buffers used to evaluate the model
-    struct ggml_tensor * inp_K_shift;       // I32 [kv_size]
+    std::vector<uint8_t> buf_compute_meta;
-    struct ggml_tensor * inp_mean;          // F32 [n_batch, n_batch]
+
-    struct ggml_tensor * inp_cls;           // I32 [n_batch]
+    // host buffer for the model output (logits and embeddings)
-    struct ggml_tensor * inp_s_copy;        // I32 [kv_size]
+    ggml_backend_buffer_ptr buf_output;
-    struct ggml_tensor * inp_s_mask;        // F32 [1, n_kv]
+
-    struct ggml_tensor * inp_s_seq;         // I32 [n_kv, n_batch]
+    bool has_evaluated_once = false;
-    struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
+
-    struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
+    // perf
-    struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+    mutable int64_t t_start_us  = 0;
    mutable int64_t t_load_us   = 0;
    mutable int64_t t_p_eval_us = 0;
    mutable int64_t t_eval_us   = 0;
    mutable int64_t t_compute_start_us = 0;
    mutable int64_t n_queued_tokens    = 0;
    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    mutable int32_t n_eval   = 0; // number of eval calls
 };
 // TODO: make these methods of llama_context
 void llama_set_k_shift(struct llama_context & lctx);
 void llama_set_s_copy(struct llama_context & lctx);
 void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
 // Make sure enough space is available for outputs.
 // Returns max number of outputs for which space was reserved.
 size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
 // make the outputs have the same order they had in the user-provided batch
 void llama_output_reorder(struct llama_context & ctx);
 // For internal test use
 // TODO: remove
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@ -0,0 +1,576 @@
 #pragma once
 #include "llama-arch.h"
 #include "llama-hparams.h"
 #include "llama-adapter.h"
 #include <cstdint>
 #include <vector>
 #include <memory>
 #include <set>
 #include <functional>
 struct ggml_cgraph;
 struct ggml_context;
 struct ggml_tensor;
 struct llama_ubatch;
 struct llama_cparams;
 class llama_memory_i;
 class llama_kv_cache_unified;
 // Selects which kind of compute graph is being built.
 // Certain models (typically multi-modal) can produce different types of graphs.
 // The numeric values are written out explicitly; they equal the implicit numbering.
 enum llm_graph_type {
    LLM_GRAPH_TYPE_DEFAULT = 0,
    LLM_GRAPH_TYPE_ENCODER = 1,
    LLM_GRAPH_TYPE_DECODER = 2,
 };
 // Operation selector for feed-forward blocks (the `type_op` argument of build_ffn / build_moe_ffn).
 // The numeric values are written out explicitly; they equal the implicit numbering.
 enum llm_ffn_op_type {
    LLM_FFN_SILU     = 0,
    LLM_FFN_GELU     = 1,
    LLM_FFN_RELU     = 2,
    LLM_FFN_RELU_SQR = 3,
    LLM_FFN_SWIGLU   = 4,
 };
 // Gate arrangement selector for feed-forward blocks (the `type_gate` argument of build_ffn).
 // The numeric values are written out explicitly; they equal the implicit numbering.
 enum llm_ffn_gate_type {
    LLM_FFN_SEQ = 0,
    LLM_FFN_PAR = 1, // ffn_gate is parallel to ffn_up
 };
 // Normalization selector (the `type` argument of build_norm).
 // The numeric values are written out explicitly; they equal the implicit numbering.
 enum llm_norm_type {
    LLM_NORM       = 0,
    LLM_NORM_RMS   = 1,
    LLM_NORM_GROUP = 2,
 };
 // Temporary carrier for data handed from the encoder to the decoder (cross-attention).
 // TODO: tmp - need something better to pass the data from the encoder to the decoder
 struct llama_cross {
    // the output embeddings from the encoder as a ggml tensor
    // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
    //       ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
    //ggml_tensor * t_embd = nullptr;
    // NOTE(review): presumably the embedding width and the number of encoder outputs
    //               describing the layout of v_embd - confirm against the code that fills it
    int64_t n_embd = 0;
    int64_t n_enc  = 0;
    // embeddings data copied to host memory (tmp)
    std::vector<float> v_embd;
    // needed to construct the cross-attention mask in the decoder
    std::vector<std::set<llama_seq_id>> seq_ids_enc;
 };
 //
 // llm_graph_input
 //
 // Abstract interface for a graph input.
 // Concrete inputs implement set_input(), which receives the current ubatch.
 class llm_graph_input_i {
 public:
    // virtual so that inputs can be destroyed through llm_graph_input_ptr
    virtual ~llm_graph_input_i() = default;
    // implemented by each concrete input type
    virtual void set_input(const llama_ubatch * ubatch) = 0;
 };
 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
 // Graph input that references a token-id tensor and an embedding tensor.
 // Both pointers start out null; set_input() is defined out of line.
 class llm_graph_input_embd : public llm_graph_input_i {
 public:
    llm_graph_input_embd()          = default;
    virtual ~llm_graph_input_embd() = default;
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
 };
 // Graph input that references the position tensor.
 // n_pos_per_token is fixed at construction; set_input() is defined out of line.
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
    virtual ~llm_graph_input_pos() = default;
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * pos = nullptr; // I32 [n_batch]
    // number of position ids per token (the in-class default of 1 is always replaced by the constructor)
    const int64_t n_pos_per_token = 1;
 };
 // Graph input that references the position-bucket tensor (batch x batch variant).
 // Keeps a non-owning reference to the hparams, which must outlive this object.
 class llm_graph_input_pos_bucket : public llm_graph_input_i {
 public:
    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
    virtual ~llm_graph_input_pos_bucket() = default;
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
    const llama_hparams & hparams;
 };
 // Graph input that references the position-bucket tensor (kv x batch variant).
 // Keeps a non-owning reference to the hparams and a non-owning pointer to the unified KV cache;
 // both must outlive this object.
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 public:
    llm_graph_input_pos_bucket_kv(
            const llama_hparams & hparams,
            const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {}
    virtual ~llm_graph_input_pos_bucket_kv() = default;
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
    const llama_hparams & hparams;
    const llama_kv_cache_unified * kv_self;
 };
 // Graph input that references the output-ids tensor.
 // Keeps non-owning references to the hparams and cparams (they must outlive this object)
 // and the number of outputs given at construction. set_input() is defined out of line.
 class llm_graph_input_out_ids : public llm_graph_input_i {
 public:
    llm_graph_input_out_ids(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
    virtual ~llm_graph_input_out_ids() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // starts out null (like the tensor members of llm_graph_input_embd) instead of indeterminate,
    // so a not-yet-assigned tensor can be detected
    ggml_tensor * out_ids = nullptr; // I32 [n_outputs]
    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const int32_t n_outputs;
 };
 // Graph input that references the `mean` tensor.
 // Keeps a non-owning reference to the cparams, which must outlive this object.
 class llm_graph_input_mean : public llm_graph_input_i {
 public:
    llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
    virtual ~llm_graph_input_mean() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // starts out null instead of indeterminate, matching the other graph inputs
    ggml_tensor * mean = nullptr; // F32 [n_batch, n_batch]
    const llama_cparams & cparams;
 };
 // Graph input that references the `cls` tensor.
 // Keeps a non-owning reference to the cparams, which must outlive this object.
 class llm_graph_input_cls : public llm_graph_input_i {
 public:
    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
    virtual ~llm_graph_input_cls() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // starts out null instead of indeterminate, matching the other graph inputs
    ggml_tensor * cls = nullptr; // I32 [n_batch]
    const llama_cparams & cparams;
 };
 // Graph input that references the `s_copy` tensor.
 // Keeps a non-owning pointer to the unified KV cache, which must outlive this object.
 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_copy() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // starts out null instead of indeterminate, matching the other graph inputs
    ggml_tensor * s_copy = nullptr; // I32 [kv_size]
    const llama_kv_cache_unified * kv_self;
 };
 // Graph input that references the `s_mask` tensor.
 // Keeps a non-owning pointer to the unified KV cache, which must outlive this object.
 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_mask() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // starts out null instead of indeterminate, matching the other graph inputs
    ggml_tensor * s_mask = nullptr; // F32 [1, n_kv]
    const llama_kv_cache_unified * kv_self;
 };
 // Graph input that references the encoder-output embedding tensor used by the decoder.
 // Keeps a non-owning pointer to the llama_cross data, which must outlive this object.
 class llm_graph_input_cross_embd : public llm_graph_input_i {
 public:
    llm_graph_input_cross_embd(
            const llama_cross * cross) : cross(cross) {}
    virtual ~llm_graph_input_cross_embd() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // starts out null instead of indeterminate, matching the other graph inputs
    ggml_tensor * cross_embd = nullptr; // F32 [n_embd, n_outputs_enc]
    const llama_cross * cross;
 };
 // Attention input for graphs built without a KV cache.
 // Keeps non-owning references to the hparams and cparams, which must outlive this object.
 class llm_graph_input_attn_no_cache : public llm_graph_input_i {
 public:
    llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
        hparams(hparams),
        cparams(cparams) {
    }
    ~llm_graph_input_attn_no_cache() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // note: returns the `_cnv` tensor, not kq_mask itself
    ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
    // NOTE(review): `_cnv` presumably is a converted copy of kq_mask - confirm in the .cpp
    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch]
    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch]
    const llama_hparams & hparams;
    const llama_cparams & cparams;
 };
 // Attention input for graphs that use the unified KV cache.
 // Keeps non-owning references to the hparams and cparams and a non-owning pointer to the cache;
 // all of them must outlive this object.
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
 public:
    llm_graph_input_attn_kv_unified(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_unified * kv_self) :
        hparams(hparams),
        cparams(cparams),
        kv_self(kv_self) {
    }
    ~llm_graph_input_attn_kv_unified() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // note: both getters return the `_cnv` tensors, not the plain masks
    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
    // NOTE(review): `_cnv` presumably are converted copies of the masks - confirm in the .cpp
    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch]
    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch]
    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch]
    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch]
    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const llama_kv_cache_unified * kv_self;
 };
 // Attention input for cross-attention.
 // Keeps a non-owning pointer to the llama_cross data, which must outlive this object.
 class llm_graph_input_attn_cross : public llm_graph_input_i {
 public:
    llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
    ~llm_graph_input_attn_cross() = default;
    void set_input(const llama_ubatch * ubatch) override;
    // note: returns the `_cnv` tensor, not cross_kq_mask itself
    ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch]
    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch]
    const llama_cross * cross = nullptr;
 };
 //
 // llm_graph_result
 //
 // these objects deliver the result from the graph build process back to the llama_context
 // note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
 //   specific data, by calling the set_inputs() method
 // along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
 //   these are used by the llama_context to extract the relevant data, based on the compute parameters
 class llm_graph_result_i {
 public:
    // virtual so that results can be destroyed through llm_graph_result_ptr
    virtual ~llm_graph_result_i() = default;
    // accessors for the commonly used output tensors
    virtual ggml_tensor * get_logits()      = 0;
    virtual ggml_tensor * get_embd()        = 0;
    virtual ggml_tensor * get_embd_pooled() = 0;
    // populate the data of the graph's input tensors for the given ubatch
    virtual void set_inputs(const llama_ubatch * ubatch) = 0;
 };
 using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
 class llm_graph_result : public llm_graph_result_i {
 public:
    virtual ~llm_graph_result() = default;
    ggml_tensor * get_logits()      override { return t_logits; }
    ggml_tensor * get_embd()        override { return t_embd; }
    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
    void set_inputs(const llama_ubatch * ubatch) override {
        for (auto & input : inputs) {
            input->set_input(ubatch);
        }
    }
    llm_graph_input_i * add_input(llm_graph_input_ptr input) {
        inputs.emplace_back(std::move(input));
        return inputs.back().get();
    }
    // important graph nodes
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
    std::vector<llm_graph_input_ptr> inputs;
 };
 //
 // llm_graph_context
 //
 // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
 using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
 // Bundle of everything passed to the llm_graph_context constructor.
 // All pointers and references are non-owning; the referenced objects must outlive the graph build.
 // NOTE: the member order is part of the contract if this struct is brace-initialized positionally.
 struct llm_graph_params {
    ggml_context * ctx;
    const llm_arch arch;
    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;
    ggml_backend_sched * sched;
    ggml_backend * backend_cpu;
    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
    const llama_memory_i      * memory;
    const llama_cross         * cross;
    int32_t n_outputs;
    // per-tensor callback (see llm_graph_cb)
    const llm_graph_cb & cb;
 };
 // State and helper builders used while constructing a compute graph.
 // Constructed from an llm_graph_params; the constructor and all build_* helpers are defined out of line.
 // Pointer and reference members are non-owning, except `res`, which is owned through a unique_ptr.
 struct llm_graph_context {
    const llm_arch arch;
    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;
    const int64_t n_embd;
    const int64_t n_layer;
    const int64_t n_rot;
    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
    const int64_t n_ctx_per_seq;
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head_k;
    const int64_t n_embd_k_gqa;
    const int64_t n_embd_head_v;
    const int64_t n_embd_v_gqa;
    const int64_t n_expert;
    const int64_t n_expert_used;
    const float freq_base;
    const float freq_scale;
    const float ext_factor;
    const float attn_factor;
    const float beta_fast;
    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;
    const int32_t n_tokens;
    const int32_t n_outputs;
    const int32_t n_ctx_orig; // yarn
    const enum llama_pooling_type pooling_type;
    const enum llama_rope_type    rope_type;
    ggml_context * ctx0 = nullptr;
    ggml_backend_sched * sched;
    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
    const llama_memory_i      * memory;
    const llama_cross         * cross;
    // per-tensor callback (see llm_graph_cb); invoked through cb() below
    const llm_graph_cb & cb_func;
    // graph result owned by this context
    std::unique_ptr<llm_graph_result> res;
    llm_graph_context(const llm_graph_params & params);
    int64_t n_pos_per_token() const;
    void cb(ggml_tensor * cur, const char * name, int il) const;
    //
    // common
    //
    ggml_tensor * build_cvec(
             ggml_tensor * cur,
                     int   il) const;
    // do mat_mul, while optionally apply lora
    ggml_tensor * build_lora_mm(
              ggml_tensor * w,
              ggml_tensor * cur) const;
    // do mat_mul_id, while optionally apply lora
    ggml_tensor * build_lora_mm_id(
              ggml_tensor * w,   // ggml_tensor * as
              ggml_tensor * cur, // ggml_tensor * b
              ggml_tensor * ids) const;
    ggml_tensor * build_norm(
             ggml_tensor * cur,
             ggml_tensor * mw,
             ggml_tensor * mb,
           llm_norm_type   type,
                     int   il) const;
    ggml_tensor * build_ffn(
             ggml_tensor * cur,
             ggml_tensor * up,
             ggml_tensor * up_b,
             ggml_tensor * up_s,
             ggml_tensor * gate,
             ggml_tensor * gate_b,
             ggml_tensor * gate_s,
             ggml_tensor * down,
             ggml_tensor * down_b,
             ggml_tensor * down_s,
             ggml_tensor * act_scales,
         llm_ffn_op_type   type_op,
       llm_ffn_gate_type   type_gate,
                     int   il) const;
    ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,
             ggml_tensor * up_exps,
             ggml_tensor * gate_exps,
             ggml_tensor * down_exps,
             ggml_tensor * exp_probs_b,
                 int64_t   n_expert,
                 int64_t   n_expert_used,
         llm_ffn_op_type   type_op,
                    bool   norm_w,
                    bool   scale_w,
                   float   w_scale,
            llama_expert_gating_func_type gating_op,
                     int   il) const;
    //
    // inputs
    //
    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
    ggml_tensor * build_inp_pos() const;
    ggml_tensor * build_inp_out_ids() const;
    ggml_tensor * build_inp_mean() const;
    ggml_tensor * build_inp_cls() const;
    ggml_tensor * build_inp_s_copy() const;
    ggml_tensor * build_inp_s_mask() const;
    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
    ggml_tensor * build_inp_pos_bucket_dec() const;
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
    //
    // attention
    //
    ggml_tensor * build_attn_mha(
             ggml_cgraph * gf,
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
                    bool   v_trans,
                   float   kq_scale) const;
    // the three build_attn overloads below are selected by the type of the attention input object
    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
    ggml_tensor * build_attn(
            llm_graph_input_attn_no_cache * inp,
            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
                  float   kq_scale,
                    int   il) const;
    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified(
            bool causal,
            bool swa) const;
    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified * inp,
            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
                  float   kq_scale,
                    int   il) const;
    llm_graph_input_attn_cross * build_attn_inp_cross() const;
    ggml_tensor * build_attn(
            llm_graph_input_attn_cross * inp,
            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
                  float   kq_scale,
                    int   il) const;
    //
    // recurrent
    //
    ggml_tensor * build_copy_mask_state(
             ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
                 int32_t   n_state,
                 int32_t   n_seqs) const;
    ggml_tensor * build_rwkv_token_shift_load(
             ggml_cgraph * gf,
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const;
    ggml_tensor * build_rwkv_token_shift_store(
             ggml_tensor * token_shift,
      const llama_ubatch & ubatch,
                     int   il) const;
    //
    // pooling
    //
    void build_pooling(
            ggml_cgraph * gf,
            ggml_tensor * cls,
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
 };
--- a/src/llama-io.cpp
+++ b/src/llama-io.cpp
@ -0,0 +1,15 @@
 #include "llama-io.h"
 // Serialize a string as a 32-bit length followed by the raw characters (no terminator).
 // The length is stored in a uint32_t, so the size is converted from size_t to 32 bits.
 void llama_io_write_i::write_string(const std::string & str) {
    const uint32_t n_chars = static_cast<uint32_t>(str.size());
    // length prefix first ...
    write(&n_chars, sizeof(n_chars));
    // ... then the payload
    write(str.data(), n_chars);
 }
 // Deserialize a string written as a 32-bit length followed by the raw characters.
 // `str` is overwritten with the characters obtained from read().
 void llama_io_read_i::read_string(std::string & str) {
    uint32_t n_chars;
    // the length prefix is copied into the local variable
    read_to(&n_chars, sizeof(n_chars));
    // the payload is then requested from read() and copied into `str`
    const uint8_t * bytes = read(n_chars);
    str.assign(reinterpret_cast<const char *>(bytes), n_chars);
 }
--- a/src/llama-io.h
+++ b/src/llama-io.h
@ -0,0 +1,35 @@
 #pragma once
 #include <cstddef>
 #include <cstdint>
 #include <string>
 struct ggml_tensor;
 // Abstract sink for serialized data.
 // Implementations provide raw writes, tensor writes and a byte counter;
 // write_string() is a non-virtual helper defined in llama-io.cpp on top of write().
 class llama_io_write_i {
 public:
    llama_io_write_i() = default;
    virtual ~llama_io_write_i() = default;
    // write `size` bytes starting at `src`
    virtual void write(const void * src, size_t size) = 0;
    // NOTE(review): presumably writes `size` bytes of the tensor's data starting at `offset` - confirm in implementations
    virtual void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) = 0;
    // bytes written so far
    virtual size_t n_bytes() = 0;
    // writes a uint32_t length followed by the string's characters
    void write_string(const std::string & str);
 };
 // Abstract source of serialized data.
 // Implementations provide pointer-returning reads, copying reads and a byte counter;
 // read_string() is a non-virtual helper defined in llama-io.cpp on top of read_to() and read().
 class llama_io_read_i {
 public:
    llama_io_read_i() = default;
    virtual ~llama_io_read_i() = default;
    // returns a pointer to the next `size` bytes
    // NOTE(review): lifetime of the returned pointer is implementation-defined - confirm before holding on to it
    virtual const uint8_t * read(size_t size) = 0;
    // copies the next `size` bytes into `dst`
    virtual void read_to(void * dst, size_t size) = 0;
    // bytes read so far
    virtual size_t n_bytes() = 0;
    // reads a uint32_t length followed by that many characters into `str`
    void read_string(std::string & str);
 };
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@ -1,12 +1,29 @@
 #pragma once
 #include "llama.h"
 #include "llama-io.h"
 #include "llama-memory.h"
 #include "ggml-cpp.h"
 #include <functional>
 #include <set>
 #include <vector>
-#include <algorithm>
+
 struct llama_cparams;
 struct llama_hparams;
 struct llama_ubatch;
 // Abstract KV cache interface: extends llama_memory_i with token/cell counters
 // and a "can shift" query. Concrete caches derive from this.
 struct llama_kv_cache : public llama_memory_i {
    using llama_memory_i::llama_memory_i;
    virtual int32_t  get_n_tokens()   const = 0;
    virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
    virtual bool get_can_shift() const = 0;
    // for a KV cache, "can edit" (llama_memory_i) is answered by "can shift"
    bool get_can_edit() const override { return get_can_shift(); }
 };
 struct llama_kv_cell {
    llama_pos pos   = -1;
@ -29,11 +46,105 @@ struct llama_kv_cell {
    }
 };
 // Result of a slot search in the KV cache: whether a slot was found and,
 // if so, the half-open cell range [begin, end) that it occupies.
 struct llama_kv_cache_slot_info {
    // slot boundaries [begin, end); stays value-initialized when only the flag is given
    std::pair<uint32_t, uint32_t> boundaries;

    // the slot was found
    bool found = false;

    // result that carries only the found / not-found flag
    explicit llama_kv_cache_slot_info(bool found_)
        : found(found_) {
    }

    // successful result covering cells [begin, end)
    llama_kv_cache_slot_info(uint32_t begin, uint32_t end)
        : boundaries(begin, end)
        , found(true) {
    }

    // lets the result be tested directly in a condition
    operator bool() const {
        return found;
    }
 };
 // ring-buffer of cached KV data
-struct llama_kv_cache {
+// TODO: pimpl
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
    // can be used to query data from the model if needed
    struct callbacks {
        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
    };
    llama_kv_cache_unified(
            const llama_hparams & hparams,
            callbacks             cbs);
    virtual ~llama_kv_cache_unified() = default;
    // TODO: become constructor
    bool init(
            const llama_model & model,   // TODO: do not reference the model
          const llama_cparams & cparams,
                    ggml_type   type_k,
                    ggml_type   type_v,
                     uint32_t   kv_size,
                         bool   offload);
    int32_t  get_n_tokens()   const override;
    uint32_t get_used_cells() const override;
    size_t total_size() const;
    // TODO: better data structures to reduce the cost of this operation
    llama_pos pos_max() const;
    void clear() override;
    void defrag() override;
    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
    void seq_keep(llama_seq_id seq_id) override;
    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
    llama_pos seq_pos_max(llama_seq_id seq_id) override;
    bool get_can_shift() const override;
    // find an empty slot of size "n_tokens" in the cache
    // updates the cache head
    // returns a structure holding information about the slot found
    // Note: On success, it's important that cache.head points
    // to the first cell of the slot.
    llama_kv_cache_slot_info find_slot(const llama_ubatch & batch);
    // TODO: maybe not needed
    uint32_t get_padding(const llama_cparams & cparams) const;
    // find how many cells are currently in use
    uint32_t cell_max() const;
    size_t size_k_bytes() const;
    size_t size_v_bytes() const;
    // defrag
    struct {
        std::vector<uint32_t> ids;
    } defrag_info;
    // return true if cells have been moved
    bool defrag_prepare(int32_t n_max_nodes);
    // state save/load
    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);
    // members
    const llama_hparams & hparams;
    callbacks cbs;
    bool has_shift = false;
    bool do_defrag = false;
    // TODO: remove this and implement llama_kv_cache_recurrent instead
    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
    bool v_trans   = true;  // the value tensor is transposed
    bool can_shift = false;
@ -47,124 +158,30 @@ struct llama_kv_cache {
    // computed before each graph build
    uint32_t n = 0;
    std::vector<llama_kv_cell> cells;
    std::vector<ggml_tensor *> k_l; // per layer
    std::vector<ggml_tensor *> v_l;
 private:
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;
-    std::vector<llama_kv_cell> cells;
+    std::vector<ggml_context_ptr>        ctxs;
    std::vector<struct ggml_tensor *> k_l; // per layer
    std::vector<struct ggml_tensor *> v_l;
    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
-    size_t total_size() const {
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
-        size_t size = 0;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
        for (const auto & buf : bufs) {
            size += ggml_backend_buffer_get_size(buf.get());
        }
-        return size;
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    }
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
    // TODO: better data structures to reduce the cost of this operation
    llama_pos max_pos() const {
        llama_pos max_pos = -1;
        for (const auto & cell : cells) {
            max_pos = std::max(max_pos, cell.pos);
        }
        return max_pos;
    }
 };
-// a structure holds information about the slot found in llama_kv_cache_find_slot
+// TODO: temporarily reusing llama_kv_cache_unified -- implement a recurrent cache and simplify llama_kv_cache_unified
-struct llama_kv_cache_slot_info {
+//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
-    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+//public:
-    bool found = false;                       // the slot was found
+//    using llama_kv_cache_unified::llama_kv_cache_unified;
-
+//};
    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
    operator bool() const { return found; }
 };
 // TODO: maybe not needed
 uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
 bool llama_kv_cache_init(
        struct llama_kv_cache & cache,
            const llama_model & model,
          const llama_cparams & cparams,
                    ggml_type   type_k,
                    ggml_type   type_v,
                     uint32_t   kv_size,
                         bool   offload);
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
 // returns a structure holding information about the slot found
 // Note: On success, it's important that cache.head points
 // to the first cell of the slot.
 struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
           struct llama_kv_cache & cache,
       const struct llama_ubatch & batch);
 // find how many cells are currently in use
 uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
 void llama_kv_cache_clear(struct llama_kv_cache & cache);
 bool llama_kv_cache_seq_rm(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1);
 void llama_kv_cache_seq_cp(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id_src,
                 llama_seq_id   seq_id_dst,
                    llama_pos   p0,
                    llama_pos   p1);
 void llama_kv_cache_seq_keep(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id);
 void llama_kv_cache_seq_add(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1,
                    llama_pos   delta);
 void llama_kv_cache_seq_div(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1,
                          int   d);
 llama_pos llama_kv_cache_seq_pos_max(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id);
 void llama_kv_cache_defrag(struct llama_kv_cache & cache);
 int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
 int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
 bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
 //
 // kv cache view
 //
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
 void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
 //
 // kv cache restore
@ -184,13 +201,15 @@ struct llama_kv_slot_restorer {
    bool do_restore = false;
-    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+    llama_kv_cache_unified & cache;
    explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) {
        old_state.head = cache.head;
        old_state.n    = cache.n;
    }
    // saves a slot information for future restoration
-    void save(const struct llama_kv_cache_slot_info & slot) {
+    void save(const llama_kv_cache_slot_info & slot) {
        if (slot) {
            do_restore = true;
            if (slot.boundaries.first != slot.boundaries.second) {
@ -201,19 +220,68 @@ struct llama_kv_slot_restorer {
    // must be explicitly called to restore the kv_cache state
    // and rollback changes from all llama_kv_cache_find_slot calls
-    void restore(struct llama_kv_cache & cache) {
+    void restore() {
        if (do_restore) {
            cache.head = old_state.head;
            cache.n    = old_state.n;
            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
-                llama_kv_cache_seq_rm(cache, -1, -1, -1);
+                cache.seq_rm(-1, -1, -1);
            } else {
                for (auto & slot : slot_boundaries) {
-                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+                    cache.seq_rm(-1, slot.first, slot.second);
                }
            }
        }
    }
 };
 // Free-function counterparts of the llama_kv_cache / llama_memory_i member
 // functions with matching names; each takes the cache by pointer.
 // Only the prototypes live here. Presumably each forwards to the member of the
 // same name -- confirm in the definitions (including how a null `kv` is handled).
 // TODO: maybe become part of the public llama_kv_cache in the future
 int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv);
 int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv);
 void llama_kv_cache_clear(llama_kv_cache * kv);
 bool llama_kv_cache_seq_rm(
        llama_kv_cache * kv,
          llama_seq_id   seq_id,
             llama_pos   p0,
             llama_pos   p1);
 void llama_kv_cache_seq_cp(
        llama_kv_cache * kv,
          llama_seq_id   seq_id_src,
          llama_seq_id   seq_id_dst,
             llama_pos   p0,
             llama_pos   p1);
 void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id);
 void llama_kv_cache_seq_add(
        llama_kv_cache * kv,
          llama_seq_id   seq_id,
             llama_pos   p0,
             llama_pos   p1,
             llama_pos   delta);
 void llama_kv_cache_seq_div(
        llama_kv_cache * kv,
          llama_seq_id   seq_id,
             llama_pos   p0,
             llama_pos   p1,
                   int   d);
 llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id);
 void llama_kv_cache_defrag(llama_kv_cache * kv);
 bool llama_kv_cache_can_shift(const llama_kv_cache * kv);
 //
 // kv cache view
 //
 // NOTE(review): view_init takes the cache by const reference while view_update
 // takes it by const pointer -- confirm whether this asymmetry is intentional.
 llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);
 void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);
--- a/src/llama-memory.cpp
+++ b/src/llama-memory.cpp
@ -0,0 +1 @@
 #include "llama-memory.h"
--- a/src/llama-memory.h
+++ b/src/llama-memory.h
@ -0,0 +1,21 @@
 #pragma once
 #include "llama.h"
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 //
 // Pure interface: every operation below is implemented by the concrete memory type.
 class llama_memory_i {
 public:
    // polymorphic base: instances can be held and destroyed through a llama_memory_i *,
    // which is only well-defined when the destructor is virtual
    virtual ~llama_memory_i() = default;

    // whole-memory operations
    virtual void clear() = 0;
    virtual void defrag() = 0;

    // per-sequence operations over the positions selected by p0 / p1
    // (the exact range convention and the meaning of negative arguments are defined
    //  by the implementations -- not visible from this interface)
    virtual bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) = 0;
    virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
    virtual void seq_keep(llama_seq_id seq_id) = 0;
    virtual void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) = 0;
    virtual void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) = 0;

    virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0;

    // whether this memory supports being edited
    virtual bool get_can_edit() const = 0;
 };
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
--- a/src/llama-model.h
+++ b/src/llama-model.h
@ -2,7 +2,9 @@
 #include "llama.h"
 #include "llama-arch.h"
 #include "llama-graph.h"
 #include "llama-hparams.h"
 #include "llama-memory.h"
 #include "llama-vocab.h"
 #include <memory>
@ -10,6 +12,8 @@
 #include <unordered_map>
 #include <vector>
 struct llama_cparams;
 struct llama_ubatch;
 struct llama_model_loader;
 // available models
@ -347,7 +351,7 @@ struct llama_model {
    std::string desc() const;
    size_t size() const;
-    size_t max_nodes() const;
+    size_t n_tensors() const;
    size_t n_devices() const;
    // total number of parameters in the model
@ -362,9 +366,22 @@ struct llama_model {
    const struct ggml_tensor * get_tensor(const char * name) const;
    // TODO: move this to new llm_arch_model_i interface
    llama_memory_i * create_memory() const; // TODO: params
    // TODO: move this to new llm_arch_model_i interface
    llm_graph_result_ptr build_graph(
            const llm_graph_params & params,
                       ggml_cgraph * gf,
                    llm_graph_type   type) const;
 private:
    struct impl;
    std::unique_ptr<impl> pimpl;
 };
 const char * llm_type_name(llm_type type);
 // For internal test use
 // TODO: remove
 const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
--- a/src/llama.cpp
+++ b/src/llama.cpp