Merge branch 'master' into concedo_experimental

# Conflicts:
#	.devops/nix/sif.nix
#	.github/workflows/build.yml
#	.github/workflows/python-check-requirements.yml
#	README-sycl.md
#	README.md
#	flake.lock
#	flake.nix
#	requirements/requirements-convert-hf-to-gguf.txt
#	scripts/compare-llama-bench.py
commit 7c64845dea
Concedo, 2024-03-04 15:33:33 +08:00
41 changed files with 3325 additions and 2053 deletions


@@ -44,6 +44,7 @@ struct server_params {
 int32_t write_timeout = 600;
 bool slots_endpoint = true;
 bool metrics_endpoint = false;
+int n_threads_http = -1;
 };
 bool server_verbose = false;
@@ -441,8 +442,8 @@ struct llama_server_context
 const int ga_w = params.grp_attn_w;
 if (ga_n != 1) {
-GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
-GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
@@ -1709,8 +1710,8 @@ struct llama_server_context
 }
 slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
-// if input prompt is too big, truncate it
-if (slot.n_prompt_tokens >= slot.n_ctx)
+// if input prompt is too big, truncate it, if group attention self-extend is disabled
+if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
 {
 const int n_left = slot.n_ctx - slot.params.n_keep;
 const int n_block_size = n_left / 2;
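
Note: the new guard skips truncation whenever group attention self-extend is active (slot.ga_n != 1); only when it is disabled and the prompt overflows the slot context does truncation kick in. The following standalone C++ sketch is not part of the patch and uses hypothetical values for ga_n, n_ctx, n_keep and the prompt length, purely to illustrate the same arithmetic:

// Standalone sketch of the truncation guard above; all values are hypothetical.
#include <cstdio>

int main() {
    const int ga_n            = 1;    // 1 = group attention self-extend disabled
    const int n_ctx           = 2048; // hypothetical slot context size
    const int n_keep          = 256;  // hypothetical number of prompt tokens to keep
    const int n_prompt_tokens = 4000; // hypothetical prompt length

    if (ga_n == 1 && n_prompt_tokens >= n_ctx) {
        const int n_left       = n_ctx - n_keep; // tokens of context left after the kept prefix
        const int n_block_size = n_left / 2;     // block size used when dropping middle tokens
        printf("truncate: n_left = %d, n_block_size = %d\n", n_left, n_block_size);
    } else {
        printf("prompt left untouched (self-extend enabled or prompt fits)\n");
    }
    return 0;
}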
@@ -1785,9 +1786,11 @@ struct llama_server_context
 }
 LOG_INFO("slot progression", {
-{ "slot_id", slot.id },
-{ "task_id", slot.task_id },
-{ "n_past", slot.n_past },
+{ "slot_id", slot.id },
+{ "task_id", slot.task_id },
+{ "n_past", slot.n_past },
+{ "n_past_se", slot.n_past_se },
+{ "ga_i", slot.ga_i },
 { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
 });
 }
@@ -2001,6 +2004,17 @@ struct llama_server_context
 LOG_VERBOSE("slots updated", {});
 return true;
 }
+
+json model_meta() {
+return json{
+{"vocab_type", llama_vocab_type(model)},
+{"n_vocab", llama_n_vocab(model)},
+{"n_ctx_train", llama_n_ctx_train(model)},
+{"n_embd", llama_n_embd(model)},
+{"n_params", llama_model_n_params(model)},
+{"size", llama_model_size(model)},
+};
+}
 };
 static void server_print_usage(const char *argv0, const gpt_params &params,
@@ -2013,6 +2027,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
 printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
 printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
+printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
 printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
 printf(" --rope-scaling {none,linear,yarn}\n");
 printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
@@ -2299,6 +2314,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 }
 params.n_threads_batch = std::stoi(argv[i]);
 }
+else if (arg == "--threads-http")
+{
+if (++i >= argc)
+{
+invalid_param = true;
+break;
+}
+sparams.n_threads_http = std::stoi(argv[i]);
+}
 else if (arg == "-b" || arg == "--batch-size")
 {
 if (++i >= argc)
@@ -2380,14 +2404,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 }
 #else
 LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
 }
-else if (arg == "--no-mul-mat-q" || arg == "-nommq")
-{
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
-params.mul_mat_q = false;
-#else
-LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
-#endif // GGML_USE_CUBLAS
-}
 else if (arg == "--main-gpu" || arg == "-mg")
@@ -2909,9 +2925,10 @@ int main(int argc, char **argv)
 for (const auto& metric_def : metrics_def) {
 std::string name = metric_def["name"];
 std::string help = metric_def["help"];
-prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
-<< "# TYPE llamacpp:" << name << " " << type << "\n"
-<< "llamacpp:" << name << " " << metric_def["value"] << "\n";
+auto value = json_value(metric_def, "value", 0);
+prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
+<< "# TYPE llamacpp:" << name << " " << type << "\n"
+<< "llamacpp:" << name << " " << value << "\n";
 }
 }
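
Note: the change reads each metric value through json_value with a default of 0 before streaming it, instead of inserting the raw JSON element. A self-contained sketch of the three Prometheus exposition lines the loop emits per metric follows; the metric name, help text, type and value below are hypothetical and not taken from the patch:

// Sketch of the Prometheus exposition lines built by the loop above;
// name, help, type and value are hypothetical placeholders.
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

int main() {
    const std::string name  = "requests_processing";             // hypothetical metric name
    const std::string help  = "Number of requests processing.";  // hypothetical help text
    const std::string type  = "gauge";                           // hypothetical metric type
    const uint64_t    value = 3;                                 // hypothetical value

    std::stringstream prometheus;
    prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
               << "# TYPE llamacpp:" << name << " " << type << "\n"
               << "llamacpp:" << name << " " << value << "\n";

    std::cout << prometheus.str();
    // Prints:
    // # HELP llamacpp:requests_processing Number of requests processing.
    // # TYPE llamacpp:requests_processing gauge
    // llamacpp:requests_processing 3
    return 0;
}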
@@ -2992,6 +3009,7 @@ int main(int argc, char **argv)
 state.store(SERVER_STATE_READY);
 LOG_INFO("model loaded", {});
 }
+const auto model_meta = llama.model_meta();
 if (sparams.chat_template.empty()) { // custom chat template is not supplied
 // check if the template comes with the model is supported by us
@@ -3141,7 +3159,7 @@ int main(int argc, char **argv)
 }
 });
-svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
+svr.Get("/v1/models", [&params, &model_meta](const httplib::Request& req, httplib::Response& res)
 {
 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
 std::time_t t = std::time(0);
@@ -3150,10 +3168,11 @@ int main(int argc, char **argv)
 {"object", "list"},
 {"data", {
 {
-{"id", params.model_alias},
-{"object", "model"},
-{"created", t},
-{"owned_by", "llamacpp"}
+{"id", params.model_alias},
+{"object", "model"},
+{"created", t},
+{"owned_by", "llamacpp"},
+{"meta", model_meta}
 },
 }}
 };
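
Note: a rough sketch of the resulting /v1/models payload with the new "meta" object, built with nlohmann::json as the server code does. Every value below is hypothetical and stands in for params.model_alias and the llama_* queries wrapped by model_meta():

// Hypothetical /v1/models payload shape; values are made up for illustration.
#include <ctime>
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    const json model_meta = {
        {"vocab_type",  1},          // hypothetical
        {"n_vocab",     32000},      // hypothetical
        {"n_ctx_train", 4096},       // hypothetical
        {"n_embd",      4096},       // hypothetical
        {"n_params",    7241732096}, // hypothetical
        {"size",        4368439808}, // hypothetical
    };
    const std::time_t t = std::time(0);
    const json models = {
        {"object", "list"},
        {"data", {
            {
                {"id",       "model-alias"}, // hypothetical --alias value
                {"object",   "model"},
                {"created",  t},
                {"owned_by", "llamacpp"},
                {"meta",     model_meta}
            },
        }},
    };
    std::cout << models.dump(2) << std::endl;
    return 0;
}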
@@ -3450,6 +3469,13 @@ int main(int argc, char **argv)
 }*/
 //);
+if (sparams.n_threads_http < 1) {
+// +2 threads for monitoring endpoints
+sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+}
+log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
+svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+
 LOG_INFO("HTTP server listening", log_data);
 // run the HTTP server in a thread - see comment below
 std::thread t([&]()
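
Note: when --threads-http is not given, n_threads_http keeps its -1 default and the HTTP pool size falls back to the larger of --parallel N + 2 and hardware concurrency minus one. A standalone sketch of just that computation follows; the --parallel value is hypothetical:

// Standalone sketch of the default n_threads_http computation shown above;
// n_parallel is a hypothetical --parallel value.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <thread>

int main() {
    int32_t n_parallel     = 4;  // hypothetical --parallel value
    int32_t n_threads_http = -1; // default when --threads-http is not passed

    if (n_threads_http < 1) {
        // +2 threads reserved for the monitoring endpoints
        n_threads_http = std::max(n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
    }
    printf("n_threads_http = %d\n", n_threads_http);
    return 0;
}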