Merge branch 'upstream' into concedo_experimental

# Conflicts: # .github/workflows/build.yml # CMakeLists.txt # README.md # ci/run.sh # llama.cpp # models/ggml-vocab-llama-bpe.gguf.inp # models/ggml-vocab-llama-bpe.gguf.out # requirements.txt # scripts/compare-llama-bench.py # scripts/sync-ggml.last # tests/CMakeLists.txt # tests/test-backend-ops.cpp # tests/test-grammar-integration.cpp # tests/test-tokenizer-1-bpe.cpp
2025-09-11 09:34:37 +00:00 · 2024-05-14 19:28:47 +08:00 · 2024-05-14 19:28:47 +08:00 · 2ee808a747
commit 2ee808a747
parent 5d15f8f76a 541600201e
66 changed files with 3034 additions and 1821 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -652,9 +652,6 @@ struct server_context {
    std::string              system_prompt;
    std::vector<llama_token> system_tokens;

-    std::string name_user;      // this should be the antiprompt
-    std::string name_assistant;
-
    // slots / clients
    std::vector<server_slot> slots;
    json default_generation_settings_for_props;
@ -674,6 +671,8 @@ struct server_context {
            llama_free_model(model);
            model = nullptr;
        }
+
+        llama_batch_free(batch);
    }

    bool load_model(const gpt_params & params_) {
@ -1099,15 +1098,11 @@ struct server_context {
        system_need_update = false;
    }

-    void system_prompt_set(const json & sys_props) {
-        system_prompt  = sys_props.value("prompt", "");
-        name_user      = sys_props.value("anti_prompt", "");
-        name_assistant = sys_props.value("assistant_name", "");
+    bool system_prompt_set(const std::string & sys_prompt) {
+        system_prompt = sys_prompt;

        LOG_VERBOSE("system prompt process", {
            {"system_prompt",  system_prompt},
-            {"name_user",      name_user},
-            {"name_assistant", name_assistant},
        });

        // release all slots
@ -1116,6 +1111,7 @@ struct server_context {
        }

        system_need_update = true;
+        return true;
    }

    bool process_token(completion_token_output & result, server_slot & slot) {
@ -1535,7 +1531,8 @@ struct server_context {
                    }

                    if (task.data.contains("system_prompt")) {
-                        system_prompt_set(task.data.at("system_prompt"));
+                        std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
+                        system_prompt_set(sys_prompt);

                        for (server_slot & slot : slots) {
                            slot.n_past    = 0;
@ -2271,10 +2268,10 @@ struct server_context {

                const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
                if (n_probs > 0) {
-                    const size_t n_considered = slot.ctx_sampling->n_considered;
+                    const size_t n_valid = slot.ctx_sampling->n_valid;

                    // Make sure at least n_probs top tokens are at the front of the vector:
-                    if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
+                    if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
                        llama_sample_top_k(ctx, &cur_p, n_probs, 0);
                    }

@ -2290,7 +2287,7 @@ struct server_context {
                        for (size_t i = 0; i < n_probs; ++i) {
                            result.probs.push_back({
                                cur_p.data[i].id,
-                                i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
+                                i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
                            });
                        }
                    }
@ -2919,7 +2916,7 @@ int main(int argc, char ** argv) {
    server_params_parse(argc, argv, sparams, params);

    if (!sparams.system_prompt.empty()) {
-        ctx_server.system_prompt_set(json::parse(sparams.system_prompt));
+        ctx_server.system_prompt_set(sparams.system_prompt);
    }

    if (params.model_alias == "unknown") {
@ -3408,8 +3405,7 @@ int main(int argc, char ** argv) {
    const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        json data = {
-            { "user_name",                   ctx_server.name_user.c_str() },
-            { "assistant_name",              ctx_server.name_assistant.c_str() },
+            { "system_prompt",               ctx_server.system_prompt.c_str() },
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params.n_parallel }
        };