llama : remove KV cache defragmentation logic (#15473)

ggml-ci
2025-09-14 19:09:45 +00:00 · 2025-08-22 12:22:13 +03:00 · 2025-08-22 12:22:13 +03:00 · 9ebebef62f
commit 9ebebef62f
parent ad5c975c2d
16 changed files with 32 additions and 440 deletions
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@ -245,7 +245,6 @@ struct cmd_params {
    std::vector<int>                 n_ubatch;
    std::vector<ggml_type>           type_k;
    std::vector<ggml_type>           type_v;
-    std::vector<float>               defrag_thold;
    std::vector<int>                 n_threads;
    std::vector<std::string>         cpu_mask;
    std::vector<bool>                cpu_strict;
@ -282,7 +281,6 @@ static const cmd_params cmd_params_defaults = {
    /* n_ubatch             */ { 512 },
    /* type_k               */ { GGML_TYPE_F16 },
    /* type_v               */ { GGML_TYPE_F16 },
-    /* defrag_thold         */ { -1.0f },
    /* n_threads            */ { cpu_get_num_math() },
    /* cpu_mask             */ { "0x0" },
    /* cpu_strict           */ { false },
@ -346,8 +344,6 @@ static void print_usage(int /* argc */, char ** argv) {
           join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
    printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
-    printf("  -dt, --defrag-thold <f>                   (default: %s)\n",
-           join(cmd_params_defaults.defrag_thold, ",").c_str());
    printf("  -t, --threads <n>                         (default: %s)\n",
           join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
@ -533,13 +529,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                    break;
                }
                params.type_v.insert(params.type_v.end(), types.begin(), types.end());
-            } else if (arg == "-dt" || arg == "--defrag-thold") {
-                if (++i >= argc) {
-                    invalid_param = true;
-                    break;
-                }
-                auto p = string_split<float>(argv[i], split_delim);
-                params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
            } else if (arg == "-t" || arg == "--threads") {
                if (++i >= argc) {
                    invalid_param = true;
@ -849,9 +838,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.type_v.empty()) {
        params.type_v = cmd_params_defaults.type_v;
    }
-    if (params.defrag_thold.empty()) {
-        params.defrag_thold = cmd_params_defaults.defrag_thold;
-    }
    if (params.n_gpu_layers.empty()) {
        params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
    }
@ -910,7 +896,6 @@ struct cmd_params_instance {
    int                n_ubatch;
    ggml_type          type_k;
    ggml_type          type_v;
-    float              defrag_thold;
    int                n_threads;
    std::string        cpu_mask;
    bool               cpu_strict;
@ -1007,7 +992,6 @@ struct cmd_params_instance {
        cparams.n_ubatch     = n_ubatch;
        cparams.type_k       = type_k;
        cparams.type_v       = type_v;
-        cparams.defrag_thold = defrag_thold;
        cparams.offload_kqv  = !no_kv_offload;
        cparams.flash_attn   = flash_attn;
        cparams.embeddings   = embeddings;
@ -1037,7 +1021,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & nub : params.n_ubatch)
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
-    for (const auto & defrag_thold : params.defrag_thold)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & fa : params.flash_attn)
    for (const auto & nt : params.n_threads)
@ -1058,7 +1041,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
-                /* .defrag_thold = */ defrag_thold,
                /* .n_threads    = */ nt,
                /* .cpu_mask     = */ cm,
                /* .cpu_strict   = */ cs,
@ -1091,7 +1073,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
-                /* .defrag_thold = */ defrag_thold,
                /* .n_threads    = */ nt,
                /* .cpu_mask     = */ cm,
                /* .cpu_strict   = */ cs,
@ -1124,7 +1105,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
-                /* .defrag_thold = */ defrag_thold,
                /* .n_threads    = */ nt,
                /* .cpu_mask     = */ cm,
                /* .cpu_strict   = */ cs,
@ -1166,7 +1146,6 @@ struct test {
    int                      poll;
    ggml_type                type_k;
    ggml_type                type_v;
-    float                    defrag_thold;
    int                      n_gpu_layers;
    llama_split_mode         split_mode;
    int                      main_gpu;
@ -1201,7 +1180,6 @@ struct test {
        poll           = inst.poll;
        type_k         = inst.type_k;
        type_v         = inst.type_v;
-        defrag_thold   = inst.defrag_thold;
        n_gpu_layers   = inst.n_gpu_layers;
        split_mode     = inst.split_mode;
        main_gpu       = inst.main_gpu;
@ -1257,7 +1235,6 @@ struct test {
            "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
            "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
            "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
-            "defrag_thold",
            "use_mmap",     "embeddings",   "no_op_offload",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
            "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
        };
@ -1277,7 +1254,7 @@ struct test {
            field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
-        if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
+        if (field == "avg_ts" || field == "stddev_ts") {
            return FLOAT;
        }
        return STRING;
@ -1344,7 +1321,6 @@ struct test {
                                            std::to_string(flash_attn),
                                            tensor_split_str,
                                            tensor_buft_overrides_str,
-                                            std::to_string(defrag_thold),
                                            std::to_string(use_mmap),
                                            std::to_string(embeddings),
                                            std::to_string(no_op_offload),
@ -1611,9 +1587,6 @@ struct markdown_printer : public printer {
        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
            fields.emplace_back("type_v");
        }
-        if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
-            fields.emplace_back("defrag_thold");
-        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.emplace_back("main_gpu");
        }