Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 09:04:36 +00:00)
Allocate a small amount of extra context for GGUF to deal with KV fragmentation causing issues in some scenarios.
commit ba5c33319b
parent d2ef458b02
2 changed files with 5 additions and 4 deletions
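The change itself is small: the GGUF load path now requests a slightly larger KV cache than the user-visible context window, so a few fragmented cache cells cannot make evaluation fail right at the context boundary. Below is a minimal C++ sketch of that pattern, not the actual koboldcpp code; the make_ctx_params helper and the KV_PADDING constant are illustrative names (the commit hard-codes 64 inline):

#include "llama.h"

// Illustrative constant; the commit hard-codes the value 64 inline.
static const int KV_PADDING = 64;

// Hypothetical helper showing the padding pattern applied in this commit:
// request a few more KV-cache slots than the clamped user context, so
// fragmentation in the cache still leaves room for a full context window.
static llama_context_params make_ctx_params(int clamped_max_context_length)
{
    llama_context_params p = llama_context_default_params();
    p.n_ctx = clamped_max_context_length + KV_PADDING;
    return p;
}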
@@ -883,7 +883,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 {
     llama_model_params model_params = llama_model_default_params();
     llama_context_params llama_ctx_params = llama_context_default_params();
-    llama_ctx_params.n_ctx = clamped_max_context_length;
+    llama_ctx_params.n_ctx = clamped_max_context_length + 64; //add some extra context to deal with KV fragmentation
     //llama_ctx_paran_parts = -1;
     llama_ctx_params.seed = -1;
     llama_ctx_params.f16_kv = inputs.f16_kv;
@@ -1808,7 +1808,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o

     if (!evalres)
     {
-        fprintf(stderr, "\nFailed to predict! Check your context buffer sizes!\n");
+        fprintf(stderr, "\nFailed to predict at %d! Check your context buffer sizes!\n",n_past);
         snprintf(output.text, sizeof(output.text), "%s", "");
         output.status = 0;
         generation_finished = true;
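The second hunk is purely diagnostic: on an eval failure the error message now reports n_past, the number of tokens already committed to the KV cache, which makes it easier to tell whether generation died at the context boundary or somewhere earlier. A hedged sketch of the reporting pattern in isolation (evalres and n_past stand in for the real variables inside gpttype_generate; this is not the actual koboldcpp control flow):

#include <cstdio>

// Illustrative only: mirrors the improved error path of this commit.
static bool report_eval_result(bool evalres, int n_past)
{
    if (!evalres)
    {
        // Printing n_past shows how far generation got before failing.
        fprintf(stderr, "\nFailed to predict at %d! Check your context buffer sizes!\n", n_past);
        return false;
    }
    return true;
}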