server: support multiple generations from one prompt (OAI "n" option) (#17775)

* backend support

* server: support multiple generations from one prompt (OAI "n" option)

* fix invalid batch

* format oai

* clean up

* disable ctx shift

* add test

* update comments

* fix style

* add n_cmpl to docs [no ci]

* allow using both n_cmpl and n
This commit is contained in:
Xuan-Son Nguyen 2025-12-06 15:54:38 +01:00 committed by GitHub
parent 09c7c50e64
commit c42712b056
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 146 additions and 19 deletions

View file

@ -477,3 +477,22 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
assert last_progress["total"] > 0
assert last_progress["processed"] == last_progress["total"]
assert total_batch_count == batch_count
def test_chat_completions_multiple_choices():
    """Request two completions for one prompt (OAI "n" option) and check both choices."""
    global server
    server.start()
    response = server.make_request("POST", "/chat/completions", data={
        "max_tokens": 8,
        "n": 2,
        "messages": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
    })
    assert response.status_code == 200
    choices = response.body["choices"]
    # "n": 2 must yield exactly two independent choices.
    assert len(choices) == 2
    for choice in choices:
        message = choice["message"]
        assert message["role"] == "assistant"
        assert match_regex("Suddenly", message["content"])
        # every choice should stop at the max_tokens limit
        assert choice["finish_reason"] == "length"