From bfaddd7a3bbfdbdb7532b2251d2e25b93c7a89bf Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 10 May 2026 23:39:03 +0800
Subject: [PATCH] added support for added memory and gemma and glm prompt
 fixes for batching mode

---
 gpttype_adapter.cpp | 26 ++++++++++++++++++++++----
 koboldcpp.py        |  2 +-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 24f5868fe..290b7c946 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -3411,6 +3411,7 @@ struct BatchGenerateRequest
     int slot = -1;
     BatchState state = BatchState::WAITING;
     std::string prompt;
+    std::string prompt_added_memory;
     std::vector<std::string> stop_sequences;
     int max_context_length = 0;
     int max_length = 0;
@@ -3557,10 +3558,6 @@ static bool batch_inputs_eligible(const generation_inputs & inputs)
     {
         return false;
     }
-    if(inputs.memory && std::string(inputs.memory).size() > 0)
-    {
-        return false;
-    }
     if(inputs.negative_prompt && std::string(inputs.negative_prompt).size() > 0)
     {
         return false;
@@ -3831,12 +3828,23 @@ static bool batch_claim_waiting_locked()
     req->slot = slot;
     req->state = BatchState::PREFILL;
     batch_touched_since_legacy = true;
+
+    ApplyPromptFormatAdjustments(req->prompt_added_memory, req->prompt);
+    std::vector<int> added_memory_tokens; //temporary buf before copying over
+
     TokenizeString(req->prompt, req->prompt_tokens, file_format, add_bos_token);
     if(req->prompt_tokens.empty())
     {
         TokenizeString("", req->prompt_tokens, file_format, add_bos_token);
     }
+    if(req->prompt_added_memory!="")
+    {
+        TokenizeString(req->prompt_added_memory, added_memory_tokens, file_format, add_bos_token);
+    }
+
     int n_ctx = req->max_context_length > 0 ? std::min(req->max_context_length, kcpp_data->n_ctx) : kcpp_data->n_ctx;
+    AppendDedicatedMemoryAndNegativePrompt(req->prompt_tokens, added_memory_tokens, std::vector<int>(), req->max_length, n_ctx);
+
     if(req->max_length > 0 && (int) req->prompt_tokens.size() + req->max_length > n_ctx)
     {
         int keep = std::max(1, n_ctx - req->max_length);
@@ -3845,6 +3853,15 @@ static bool batch_claim_waiting_locked()
             req->prompt_tokens.erase(req->prompt_tokens.begin(), req->prompt_tokens.end() - keep);
         }
     }
+
+    if (debugmode==1 && !is_quiet)
+    {
+        std::string outstr = "";
+        printf("\n\n[Debug: Dump %zu Raw Input Tokens]\n",req->prompt_tokens.size());
+        outstr += get_tok_vec_str(req->prompt_tokens);
+        printf("%s\n", RemoveBell(outstr).c_str());
+    }
+
     req->prompt_token_count = req->prompt_tokens.size();
     req->sampler = batch_build_sampler(*req);
     for(llama_token token : req->prompt_tokens)
@@ -4024,6 +4041,7 @@ int gpttype_batch_generate_submit(const generation_inputs inputs)
     auto req = std::make_unique<BatchGenerateRequest>();
     req->id = batch_next_request_id++;
     req->prompt = inputs.prompt ? inputs.prompt : "";
+    req->prompt_added_memory = inputs.memory ? inputs.memory : "";
     req->max_context_length = inputs.max_context_length;
     req->max_length = inputs.max_length;
     req->seed = inputs.seed;
diff --git a/koboldcpp.py b/koboldcpp.py
index ea4c2821d..9accb8472 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -2290,7 +2290,7 @@ def continuous_batching_python_eligible(genparams, api_format):
         return False
     if not getattr(args, "noshift", False) or getattr(args, "smartcontext", False) or getattr(args, "draftmodel", "") or getattr(args, "mmproj", "") or getattr(args, "enableguidance", False):
         return False
-    if genparams.get("memory") or genparams.get("negative_prompt") or genparams.get("images") or genparams.get("audio"):
+    if genparams.get("negative_prompt") or genparams.get("images") or genparams.get("audio"):
         return False
     if genparams.get("ban_eos_token", False):
         return False