mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2025-09-09 13:55:27 +00:00
[update] support the OpenAI chat completion API
This commit is contained in:
parent
63b1c8525b
commit
299c4dca64
8 changed files with 166 additions and 83 deletions
|
@ -16,6 +16,7 @@ from ktransformers.local_chat import custom_models, default_optimize_rules
|
|||
from ktransformers.util.utils import get_device
|
||||
from typing import Optional
|
||||
from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled, MLAWrapperSingleton
|
||||
from ktransformers.server.schemas.endpoints.chat import RawUsage
|
||||
|
||||
warm_uped = False
|
||||
|
||||
|
@ -231,3 +232,12 @@ class KTransformersInterface(TransformersInterface):
|
|||
async with self._infer_lock:
|
||||
async for v in super().inference(local_messages, thread_id, temperature, top_p):
|
||||
yield v
|
||||
|
||||
# yield the raw usage statistics (timer and counter values) for this inference
|
||||
yield RawUsage(
|
||||
tokenize_time = self.profiler.get_timer_sec('tokenize'),
|
||||
prefill_time = self.profiler.get_timer_sec('prefill'),
|
||||
decode_time = self.profiler.get_timer_sec('decode'),
|
||||
prefill_count = self.profiler.get_counter('prefill'),
|
||||
decode_count = self.profiler.get_counter('decode'),
|
||||
)
|
Loading…
Add table
Add a link
Reference in a new issue