server: support multiple generations from one prompt (OAI "n" option) (#17775)

* backend support

* server: support multiple generations from one prompt (OAI "n" option)

* fix invalid batch

* format oai

* clean up

* disable ctx shift

* add test

* update comments

* fix style

* add n_cmpl to docs [no ci]

* allow using both n_cmpl and n
This commit is contained in:
Xuan-Son Nguyen 2025-12-06 15:54:38 +01:00 committed by GitHub
parent 09c7c50e64
commit c42712b056
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 146 additions and 19 deletions

View file

@ -477,3 +477,22 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
assert last_progress["total"] > 0
assert last_progress["processed"] == last_progress["total"]
assert total_batch_count == batch_count
def test_chat_completions_multiple_choices():
    """Request two completions for one prompt (OAI "n" option) and check both choices."""
    global server
    server.start()
    response = server.make_request("POST", "/chat/completions", data={
        "max_tokens": 8,
        "n": 2,
        "messages": [
            {"role": "system", "content": "Book"},
            {"role": "user", "content": "What is the best book"},
        ],
    })
    assert response.status_code == 200
    choices = response.body["choices"]
    # "n": 2 must yield exactly two independent choices.
    assert len(choices) == 2
    for choice in choices:
        message = choice["message"]
        assert message["role"] == "assistant"
        assert match_regex("Suddenly", message["content"])
        # every choice should stop at the max_tokens limit
        assert choice["finish_reason"] == "length"