Updated docs; the extra KV space is now subtracted from the user's ctx value instead of being added on load.

2025-09-10 00:54:41 +00:00 · 2023-11-30 14:19:40 +08:00 · 2023-11-30 14:19:40 +08:00 · a012342a77
commit a012342a77
parent 66ef4a20e2
2 changed files with 26 additions and 4 deletions
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -98,6 +98,8 @@ static std::mutex concat_output_mtx;
 static std::string concat_output = "";
 static std::string concat_output_reader_copy = "";

+const size_t extra_context_handle_fragmentation = 80;
+
 inline bool IsNanCheck(float f)
 {
    const unsigned int u = *(unsigned int*)&f;
@@ -883,7 +885,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    {
        llama_model_params model_params = llama_model_default_params();
        llama_context_params llama_ctx_params = llama_context_default_params();
-        llama_ctx_params.n_ctx = clamped_max_context_length + 64; //add some extra context to deal with KV fragmentation
+        llama_ctx_params.n_ctx = clamped_max_context_length;
        //llama_ctx_paran_parts = -1;
        llama_ctx_params.seed = -1;
        llama_ctx_params.f16_kv = inputs.f16_kv;
@@ -1421,6 +1423,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
            stop_sequence.push_back(stopper);
        }
    }
+
    std::string addedmemory = inputs.memory;
    params.prompt = inputs.prompt;
    params.seed = inputs.seed;
@@ -1442,6 +1445,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
    params.n_threads_batch = n_blasthreads;
    bool stream_sse = inputs.stream_sse;

+    if(params.n_ctx >= 256 && useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON))
+    {
+        params.n_ctx -= extra_context_handle_fragmentation; //reserve part of the context as a buffer to handle KV fragmentation
+        if(debugmode==1)
+        {
+            printf("\nTrue max context permitted: %d\n",params.n_ctx);
+        }
+    }
+
    bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;

    generation_finished = false; // Set current generation status