Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/cuda.Dockerfile
#	CODEOWNERS
#	README.md
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/common.h
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.last
#	scripts/sync-ggml.sh
#	tests/test-chat.cpp
#	tools/batched-bench/batched-bench.cpp
#	tools/mtmd/clip.h
Concedo 2025-08-20 20:34:45 +08:00
commit 1c41c38a6a
40 changed files with 426 additions and 274 deletions

View file

@@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
 uint64_t n_tokens_predicted_total = 0;
 uint64_t t_tokens_generation_total = 0;
+uint64_t n_past_max = 0;
 uint64_t n_prompt_tokens_processed = 0;
 uint64_t t_prompt_processing = 0;
@@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
 { "n_tokens_predicted_total", n_tokens_predicted_total },
 { "t_prompt_processing_total", t_prompt_processing_total },
+{ "n_past_max", n_past_max },
 { "n_prompt_tokens_processed", n_prompt_tokens_processed },
 { "t_prompt_processing", t_prompt_processing },
 { "n_tokens_predicted", n_tokens_predicted },
@@ -1587,6 +1591,8 @@ struct server_metrics {
 uint64_t n_tokens_predicted_total = 0;
 uint64_t t_tokens_generation_total = 0;
+uint64_t n_past_max = 0;
 uint64_t n_prompt_tokens_processed = 0;
 uint64_t t_prompt_processing = 0;
@@ -1605,6 +1611,10 @@ struct server_metrics {
 n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
 t_prompt_processing += slot.t_prompt_processing;
 t_prompt_processing_total += slot.t_prompt_processing;
+if (slot.n_past > 0) {
+n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+}
 }
 void on_prediction(const server_slot & slot) {
@@ -1620,6 +1630,9 @@ struct server_metrics {
 if (slot.is_processing()) {
 n_busy_slots_total++;
 }
+if (slot.n_past > 0) {
+n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+}
 }
 }
@@ -1716,7 +1729,7 @@ struct server_queue {
 void pop_deferred_task() {
 std::unique_lock<std::mutex> lock(mutex_tasks);
 if (!queue_tasks_deferred.empty()) {
-queue_tasks.emplace_back(std::move(queue_tasks_deferred.front()));
+queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
 queue_tasks_deferred.pop_front();
 }
 condition_tasks.notify_one();
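
The `emplace_back` → `emplace_front` change means a task popped from the deferred queue is re-queued at the head of the main task queue, so it runs before tasks that arrived while it was deferred. A minimal Python sketch of that ordering (a toy model, not the server's actual queue code):

```python
from collections import deque

# Toy model: 'tasks' is the main queue, 'deferred' holds tasks that could
# not run earlier (e.g. no slot was free when they were submitted).
tasks    = deque(["new-task-1", "new-task-2"])
deferred = deque(["deferred-task"])

def pop_deferred_task() -> None:
    # Old behaviour: tasks.append(...) put the deferred task behind newer work.
    # New behaviour: appendleft(...) puts it at the front, so it runs next.
    if deferred:
        tasks.appendleft(deferred.popleft())

pop_deferred_task()
assert list(tasks) == ["deferred-task", "new-task-1", "new-task-2"]
```
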
@@ -2875,6 +2888,8 @@ struct server_context {
 res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;
 res->t_tokens_generation_total = metrics.t_tokens_generation_total;
+res->n_past_max = metrics.n_past_max;
 res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
 res->t_prompt_processing = metrics.t_prompt_processing;
 res->n_tokens_predicted = metrics.n_tokens_predicted;
@@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
 {"name", "n_decode_total"},
 {"help", "Total number of llama_decode() calls"},
 {"value", res_metrics->n_decode_total}
+}, {
+{"name", "n_past_max"},
+{"help", "Largest observed n_past."},
+{"value", res_metrics->n_past_max}
 }, {
 {"name", "n_busy_slots_per_decode"},
 {"help", "Average number of busy slots per llama_decode() call"},

View file

@@ -5,7 +5,7 @@ from utils import *
 server = ServerPreset.tinyllama2()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.tinyllama2()
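
This `scope="module"` → default (function) scope change, repeated across the test files below, means `create_server()` now runs before every test instead of once per file, so each test starts from a freshly constructed `ServerPreset`. A small self-contained pytest sketch of the difference (hypothetical names, assuming pytest's default test ordering):

```python
import pytest

setups = []

# With the default (function) scope, an autouse fixture runs before every
# test; with scope="module" it would run only once for the whole file.
@pytest.fixture(autouse=True)
def fresh_server():
    setups.append("created")
    yield

def test_first():
    assert setups.count("created") == 1

def test_second():
    # a second setup ran for this test under function scope
    assert setups.count("created") == 2
```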

View file

@@ -7,7 +7,7 @@ from utils import *
 server = ServerPreset.tinyllama2()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.tinyllama2()
@@ -229,7 +229,7 @@ def test_nocache_long_input_prompt():
 "temperature": 1.0,
 "cache_prompt": False,
 })
-assert res.status_code == 200
+assert res.status_code == 400
 def test_completion_with_tokens_input():
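
The expected status flips from 200 to 400, presumably because context shifting is now off unless explicitly enabled, so an over-long prompt with `cache_prompt: False` is rejected rather than processed. A hedged request sketch (assumes a llama-server on localhost:8080 started without `--context-shift`, with a context window smaller than the prompt below):

```python
import requests

# Assumption: llama-server on localhost:8080, context shift not enabled,
# and the prompt below deliberately exceeds its context window.
res = requests.post("http://localhost:8080/completion", json={
    "prompt": "Lorem ipsum " * 4096,
    "n_predict": 16,
    "temperature": 1.0,
    "cache_prompt": False,
})
assert res.status_code == 400
```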

View file

@@ -11,7 +11,7 @@ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
 Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
 """.strip()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.tinyllama2()
@@ -25,6 +25,7 @@ def test_ctx_shift_enabled():
 # the prompt is truncated to keep the last 109 tokens
 # 64 tokens are generated thanks to shifting the context when it gets full
 global server
+server.enable_ctx_shift = True
 server.start()
 res = server.make_request("POST", "/completion", data={
 "n_predict": 64,
@@ -42,7 +43,6 @@ def test_ctx_shift_enabled():
 ])
 def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
 global server
-server.disable_ctx_shift = True
 server.n_predict = -1
 server.start()
 res = server.make_request("POST", "/completion", data={
@@ -56,7 +56,6 @@ def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
 def test_ctx_shift_disabled_long_prompt():
 global server
-server.disable_ctx_shift = True
 server.start()
 res = server.make_request("POST", "/completion", data={
 "n_predict": 64,
@@ -68,7 +67,6 @@ def test_ctx_shift_disabled_long_prompt():
 def test_ctx_shift_disabled_stream():
 global server
-server.disable_ctx_shift = True
 server.start()
 res = server.make_stream_request("POST", "/v1/completions", data={
 "n_predict": 256,

View file

@@ -8,7 +8,7 @@ server = ServerPreset.bert_bge_small()
 EPSILON = 1e-3
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.bert_bge_small()

View file

@@ -3,7 +3,7 @@ from utils import *
 server = ServerPreset.tinyllama_infill()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.tinyllama_infill()

View file

@@ -5,7 +5,7 @@ server = ServerPreset.stories15m_moe()
 LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.stories15m_moe()

View file

@@ -4,7 +4,7 @@ from utils import *
 server = ServerPreset.jina_reranker_tiny()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.jina_reranker_tiny()

View file

@@ -6,7 +6,7 @@ server = ServerPreset.tinyllama2()
 TEST_API_KEY = "sk-this-is-the-secret-key"
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.tinyllama2()

View file

@@ -3,7 +3,7 @@ from utils import *
 server = ServerPreset.tinyllama2()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.tinyllama2()

View file

@@ -16,7 +16,7 @@ def create_server():
 server.draft_max = 8
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def fixture_create_server():
 return create_server()
@@ -91,6 +91,7 @@ def test_slot_ctx_not_exceeded():
 def test_with_ctx_shift():
 global server
 server.n_ctx = 64
+server.enable_ctx_shift = True
 server.start()
 res = server.make_request("POST", "/completion", data={
 "prompt": "Hello " * 56,

View file

@@ -4,7 +4,7 @@ from utils import *
 server = ServerPreset.tinyllama2()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
 global server
 server = ServerPreset.tinyllama2()

View file

@@ -22,6 +22,8 @@ def create_server():
 server.model_alias = "tinyllama-2-tool-call"
 server.server_port = 8081
 server.n_slots = 1
+server.n_ctx = 8192
+server.n_batch = 2048
 class CompletionMode(Enum):
 NORMAL = "normal"

View file

@@ -79,7 +79,7 @@ class ServerProcess:
 draft: int | None = None
 api_key: str | None = None
 lora_files: List[str] | None = None
-disable_ctx_shift: int | None = False
+enable_ctx_shift: int | None = False
 draft_min: int | None = None
 draft_max: int | None = None
 no_webui: bool | None = None
@@ -178,8 +178,8 @@ class ServerProcess:
 if self.lora_files:
 for lora_file in self.lora_files:
 server_args.extend(["--lora", lora_file])
-if self.disable_ctx_shift:
-server_args.extend(["--no-context-shift"])
+if self.enable_ctx_shift:
+server_args.append("--context-shift")
 if self.api_key:
 server_args.extend(["--api-key", self.api_key])
 if self.draft_max:
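
Taken together, the test harness flips from opt-out to opt-in: the `disable_ctx_shift` field and `--no-context-shift` flag are gone, and tests that need shifting set `enable_ctx_shift = True`, which `ServerProcess` translates into `--context-shift`. A standalone sketch of that flag mapping (hypothetical helper, not the repo's code):

```python
def build_server_args(enable_ctx_shift: bool = False) -> list[str]:
    # Context shifting is now opt-in: emit --context-shift only when requested.
    # Previously the harness emitted --no-context-shift to opt out instead.
    args = ["llama-server", "-m", "model.gguf"]
    if enable_ctx_shift:
        args.append("--context-shift")
    return args

assert "--context-shift" in build_server_args(enable_ctx_shift=True)
assert "--context-shift" not in build_server_args()
```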