Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/ISSUE_TEMPLATE/020-enhancement.yml
#	.github/ISSUE_TEMPLATE/030-research.yml
#	.github/ISSUE_TEMPLATE/040-refactor.yml
#	.github/workflows/build.yml
#	Makefile
#	common/CMakeLists.txt
#	examples/CMakeLists.txt
#	examples/infill/infill.cpp
#	examples/lookahead/lookahead.cpp
#	examples/lookup/lookup-stats.cpp
#	examples/lookup/lookup.cpp
#	examples/parallel/parallel.cpp
#	examples/retrieval/retrieval.cpp
#	examples/save-load-state/save-load-state.cpp
#	examples/speculative/speculative.cpp
#	flake.lock
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/kernels/CMakeLists.txt
#	ggml/src/ggml-cann/kernels/dup.cpp
#	ggml/src/ggml-cann/kernels/get_row_f16.cpp
#	ggml/src/ggml-cann/kernels/get_row_f32.cpp
#	ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
#	tests/test-arg-parser.cpp
#	tests/test-backend-ops.cpp
commit 83350ec314
Author: Concedo
Date:   2024-11-25 16:26:08 +08:00

21 changed files with 801 additions and 377 deletions

@@ -176,7 +176,7 @@ struct server_slot {
     // sampling
     json json_schema;

-    struct common_sampler_params sparams;
+    struct common_params_sampling sparams;
     struct common_sampler * smpl = nullptr;

     llama_token sampled;
@@ -688,7 +688,7 @@ struct server_context {
         SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);

-        slot.sparams = params.sparams;
+        slot.sparams = params.sampling;

         slot.callback_on_release = [this](int) {
             queue_tasks.pop_deferred_task();
         };
@@ -744,7 +744,7 @@ struct server_context {
            }

            // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-            int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+            int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);

            // fraction of the common subsequence length compared to the current slot's prompt length
            float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
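
For context on the rename: `common_lcs` is the upstream helper that returns the length of the longest common subsequence of two token sequences, and the fraction above uses that length to score how similar a cached slot prompt is to the incoming prompt. A minimal sketch of an equivalent computation, via the classic O(n*m) dynamic program (the `llama_tokens` alias and the function name here are illustrative, not the library's exact definitions):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

using llama_token  = std::int32_t;
using llama_tokens = std::vector<llama_token>;

// Length of the longest common subsequence of a and b, computed with a
// rolling-row DP table: O(a.size() * b.size()) time, O(b.size()) space.
static std::size_t lcs_length(const llama_tokens & a, const llama_tokens & b) {
    std::vector<std::size_t> prev(b.size() + 1, 0);  // DP row for a[0..i-1)
    std::vector<std::size_t> cur (b.size() + 1, 0);  // DP row for a[0..i)
    for (std::size_t i = 1; i <= a.size(); i++) {
        for (std::size_t j = 1; j <= b.size(); j++) {
            cur[j] = a[i - 1] == b[j - 1] ? prev[j - 1] + 1
                                          : std::max(prev[j], cur[j - 1]);
        }
        std::swap(prev, cur);
    }
    return prev[b.size()];
}
```

The slot whose cached prompt yields the highest similarity, and therefore the longest reusable overlap, is the one selected for the task.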
@@ -789,7 +789,7 @@ struct server_context {
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot_params default_params;
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sparams;
+        auto default_sparams = params.sampling;
         const auto & data = task.data;

         if (data.count("__oaicompat") != 0) {
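
As the comment in the hunk above notes, per-slot sampling parameters are seeded from the global `params.sampling` defaults and individual requests may then override them from the request JSON. A hedged sketch of that defaults-then-override pattern, assuming nlohmann::json for the request body (the `json_value` helper, the JSON keys, and the field names are illustrative assumptions, not the server's exact code):

```cpp
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Return data[key] if present, otherwise the supplied fallback; modeled on
// the kind of helper a server uses to merge per-request overrides.
template <typename T>
static T json_value(const json & data, const std::string & key, const T & fallback) {
    return data.contains(key) ? data.at(key).get<T>() : fallback;
}

// Usage, mirroring launch_slot_with_task: global defaults first, then
// per-request overrides (field names assumed from common_params_sampling):
//   auto sparams  = default_sparams;                                // globals
//   sparams.temp  = json_value(data, "temperature", sparams.temp);  // override
//   sparams.top_k = json_value(data, "top_k",       sparams.top_k);
```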
@@ -1961,7 +1961,7 @@ struct server_context {
                if (slot.params.cache_prompt) {
                    // reuse any previously computed tokens that are common with the new prompt
-                    slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+                    slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);

                    // reuse chunks from the cached prompt by shifting their KV cache in the new position
                    if (params.n_cache_reuse > 0) {
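
`common_lcp`, by contrast, measures only the longest common prefix: tokens at the front of the cached sequence that match the new prompt keep their KV-cache entries, so `slot.n_past` can skip re-evaluating them. A minimal sketch of an equivalent computation (again with an illustrative `llama_tokens` alias rather than the library's exact definition):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using llama_token  = std::int32_t;
using llama_tokens = std::vector<llama_token>;

// Length of the longest common prefix of a and b: walk both sequences
// from the front and stop at the first mismatch.
static std::size_t lcp_length(const llama_tokens & a, const llama_tokens & b) {
    std::size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}
```

Only the suffix after this common prefix has to be decoded again; the `n_cache_reuse` branch above then tries to salvage matching chunks beyond the prefix by shifting their KV-cache positions.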