Roll back the ktransformers backend; add `max_tokens` and `max_completion_tokens` parameters

This commit is contained in:
qiyuxinlin 2025-04-21 12:55:37 +00:00
parent a1162eea01
commit 03a65d6bea
10 changed files with 144 additions and 161 deletions

View file

@ -207,7 +207,7 @@ async def chat_completion(request: Request, create: ChatCompletionCreate):
"<tools▁end>":"<tool▁calls▁end>"
}
# Use check_client_connected for early stopping
async for res in interface.inference(input_message, id, create.temperature, create.top_p):
async for res in interface.inference(input_message, id, create.temperature, create.top_p, create.max_tokens, create.max_completion_tokens):
if isinstance(res, RawUsage):
# Final return on utilization
raw_usage = res
@ -371,7 +371,7 @@ async def chat_completion(request: Request, create: ChatCompletionCreate):
"<tool▁end>":"<tool▁call▁end>",
"<tools▁end>":"<tool▁calls▁end>"
}
async for res in interface.inference(input_message, id, create.temperature, create.top_p):
async for res in interface.inference(input_message, id, create.temperature, create.top_p, create.max_tokens, create.max_completion_tokens):
if isinstance(res, RawUsage):
raw_usage = res
usage = CompletionUsage(