From 84164f584ce4420ee14749fa11f7d429d2cb7195 Mon Sep 17 00:00:00 2001 From: Yuhao Tsui Date: Wed, 26 Mar 2025 15:39:46 +0800 Subject: [PATCH] ollama completions: handle RawUsage items in inference stream `interface.inference` now yields either a RawUsage object or a `(token, finish_reason)` tuple; unpack accordingly in generate() and chat() instead of treating every item as a token string. --- ktransformers/server/api/ollama/completions.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ktransformers/server/api/ollama/completions.py b/ktransformers/server/api/ollama/completions.py index 91bd886..7fc758b 100644 --- a/ktransformers/server/api/ollama/completions.py +++ b/ktransformers/server/api/ollama/completions.py @@ -85,8 +85,12 @@ async def generate(request: Request, input: OllamaGenerateCompletionRequest): return check_link_response(request, inner()) else: complete_response = "" - async for token in interface.inference(input.prompt, id): - complete_response += token + async for res in interface.inference(input.prompt, id): + if isinstance(res, RawUsage): + raw_usage = res + else: + token, finish_reason = res + complete_response += token response = OllamaGenerationResponse( model=config.model_name, created_at=str(datetime.now()), @@ -187,8 +191,12 @@ async def chat(request: Request, input: OllamaChatCompletionRequest): complete_response = "" eval_count = 0 - async for token in interface.inference(prompt, id): - complete_response += token + async for res in interface.inference(prompt, id): + if isinstance(res, RawUsage): + raw_usage = res + else: + token, finish_reason = res + complete_response += token eval_count += 1 end_time = time()