llama : remove KV cache defragmentation logic (#15473)

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-08-22 12:22:13 +03:00 committed by GitHub
parent ad5c975c2d
commit 9ebebef62f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 32 additions and 440 deletions

View file

@ -245,7 +245,6 @@ struct cmd_params {
std::vector<int> n_ubatch;
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<float> defrag_thold;
std::vector<int> n_threads;
std::vector<std::string> cpu_mask;
std::vector<bool> cpu_strict;
@ -282,7 +281,6 @@ static const cmd_params cmd_params_defaults = {
/* n_ubatch */ { 512 },
/* type_k */ { GGML_TYPE_F16 },
/* type_v */ { GGML_TYPE_F16 },
/* defrag_thold */ { -1.0f },
/* n_threads */ { cpu_get_num_math() },
/* cpu_mask */ { "0x0" },
/* cpu_strict */ { false },
@ -346,8 +344,6 @@ static void print_usage(int /* argc */, char ** argv) {
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -dt, --defrag-thold <f> (default: %s)\n",
join(cmd_params_defaults.defrag_thold, ",").c_str());
printf(" -t, --threads <n> (default: %s)\n",
join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
@ -533,13 +529,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
} else if (arg == "-dt" || arg == "--defrag-thold") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<float>(argv[i], split_delim);
params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
@ -849,9 +838,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.type_v.empty()) {
params.type_v = cmd_params_defaults.type_v;
}
if (params.defrag_thold.empty()) {
params.defrag_thold = cmd_params_defaults.defrag_thold;
}
if (params.n_gpu_layers.empty()) {
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
}
@ -910,7 +896,6 @@ struct cmd_params_instance {
int n_ubatch;
ggml_type type_k;
ggml_type type_v;
float defrag_thold;
int n_threads;
std::string cpu_mask;
bool cpu_strict;
@ -1007,7 +992,6 @@ struct cmd_params_instance {
cparams.n_ubatch = n_ubatch;
cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.defrag_thold = defrag_thold;
cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn = flash_attn;
cparams.embeddings = embeddings;
@ -1037,7 +1021,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & nub : params.n_ubatch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & defrag_thold : params.defrag_thold)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads)
@ -1058,7 +1041,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .defrag_thold = */ defrag_thold,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
@ -1091,7 +1073,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .defrag_thold = */ defrag_thold,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
@ -1124,7 +1105,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .defrag_thold = */ defrag_thold,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
@ -1166,7 +1146,6 @@ struct test {
int poll;
ggml_type type_k;
ggml_type type_v;
float defrag_thold;
int n_gpu_layers;
llama_split_mode split_mode;
int main_gpu;
@ -1201,7 +1180,6 @@ struct test {
poll = inst.poll;
type_k = inst.type_k;
type_v = inst.type_v;
defrag_thold = inst.defrag_thold;
n_gpu_layers = inst.n_gpu_layers;
split_mode = inst.split_mode;
main_gpu = inst.main_gpu;
@ -1257,7 +1235,6 @@ struct test {
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
"defrag_thold",
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
};
@ -1277,7 +1254,7 @@ struct test {
field == "use_mmap" || field == "embeddings") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
if (field == "avg_ts" || field == "stddev_ts") {
return FLOAT;
}
return STRING;
@ -1344,7 +1321,6 @@ struct test {
std::to_string(flash_attn),
tensor_split_str,
tensor_buft_overrides_str,
std::to_string(defrag_thold),
std::to_string(use_mmap),
std::to_string(embeddings),
std::to_string(no_op_offload),
@ -1611,9 +1587,6 @@ struct markdown_printer : public printer {
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
fields.emplace_back("type_v");
}
if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
fields.emplace_back("defrag_thold");
}
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.emplace_back("main_gpu");
}