Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-14 17:19:42 +00:00)
Merge branch 'kvcache-ai:main' into main
Commit e5694f91c0
17 changed files with 356 additions and 163 deletions
@@ -13,6 +13,8 @@ from ktransformers.server.utils.create_interface import get_interface
 from ktransformers.server.schemas.assistants.streaming import check_link_response
 from ktransformers.server.backend.base import BackendInterfaceBase
 
+from ktransformers.server.schemas.endpoints.chat import RawUsage
+
 router = APIRouter(prefix='/api')
 
 # https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion
@@ -61,14 +63,18 @@ async def generate(request: Request, input: OllamaGenerateCompletionRequest):
 
     if input.stream:
         async def inner():
-            async for token in interface.inference(input.prompt, id):
-                d = OllamaGenerationStreamResponse(
-                    model=config.model_name,
-                    created_at=str(datetime.now()),
-                    response=token,
-                    done=False
-                )
-                yield d.model_dump_json() + '\n'
+            async for res in interface.inference(input.prompt, id):
+                if isinstance(res, RawUsage):
+                    raw_usage = res
+                else:
+                    token, finish_reason = res
+                    d = OllamaGenerationStreamResponse(
+                        model=config.model_name,
+                        created_at=str(datetime.now()),
+                        response=token,
+                        done=False
+                    )
+                    yield d.model_dump_json() + '\n'
             d = OllamaGenerationStreamResponse(
                 model=config.model_name,
                 created_at=str(datetime.now()),
@@ -142,14 +148,18 @@ async def chat(request: Request, input: OllamaChatCompletionRequest):
             eval_count = 0  # count of generated tokens
             tokens = []
 
-            async for token in interface.inference(prompt, id):
-                d = OllamaChatCompletionStreamResponse(
-                    model=config.model_name,
-                    created_at=str(datetime.now()),
-                    message={"role": "assistant", "content": token},
-                    done=False
-                )
-                yield d.model_dump_json() + '\n'
+            async for res in interface.inference(prompt, id):
+                if isinstance(res, RawUsage):
+                    raw_usage = res
+                else:
+                    token, finish_reason = res
+                    d = OllamaChatCompletionStreamResponse(
+                        model=config.model_name,
+                        created_at=str(datetime.now()),
+                        message={"role": "assistant", "content": token},
+                        done=False
+                    )
+                    yield d.model_dump_json() + '\n'
             # compute performance statistics
             end_time = time()
             total_duration = int((end_time - start_time) * 1_000_000_000)  # convert to nanoseconds
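The hunks above change interface.inference() from yielding bare token strings to yielding either a (token, finish_reason) tuple or, at the end of the stream, a RawUsage object carrying token counts. The RawUsage schema itself is not part of this excerpt; the following is a minimal sketch of what it presumably looks like, inferred only from the fields referenced in these hunks (prefill_count and decode_count). The authoritative definition lives in ktransformers.server.schemas.endpoints.chat and may carry additional fields (for example timing data); the use of pydantic here is an assumption.

    # Hypothetical reconstruction for illustration; not the repository's actual definition.
    from pydantic import BaseModel

    class RawUsage(BaseModel):
        prefill_count: int  # tokens consumed while prefilling the prompt
        decode_count: int   # tokens generated during decoding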
@@ -5,10 +5,16 @@ from fastapi import APIRouter
 from fastapi.requests import Request
 from ktransformers.server.utils.create_interface import get_interface
 from ktransformers.server.schemas.assistants.streaming import chat_stream_response
-from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate,ChatCompletionChunk,ChatCompletionObject, Usage
+from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate
+from ktransformers.server.schemas.endpoints.chat import RawUsage
+from ktransformers.server.backend.base import BackendInterfaceBase
 from ktransformers.server.config.config import Config
 
+from ktransformers.server.schemas.endpoints.chat import ChatCompletionChunk
+from openai.types.chat import ChatCompletion
+from openai.types.completion_usage import CompletionUsage
+
 
 router = APIRouter()
 
 @router.get('/models', tags=['openai'])
@@ -29,15 +35,76 @@ async def chat_completion(request:Request,create:ChatCompletionCreate):
         assert request.headers.get('Authorization', '').split()[-1] == Config().api_key
 
     if create.stream:
+        from openai.types.chat.chat_completion_chunk import Choice, ChoiceDelta
+
         async def inner():
-            chunk = ChatCompletionChunk(id=id,object='chat.completion.chunk',created=int(time()))
-            async for token in interface.inference(input_message,id,create.temperature,create.top_p):
-                chunk.set_token(token)
-                yield chunk
-        return chat_stream_response(request,inner())
+            chunk = ChatCompletionChunk(
+                id = id,
+                choices = [],
+                object = 'chat.completion.chunk',
+                created = int(time()),
+                model = Config().model_name,
+            )
+
+            async for res in interface.inference(input_message,id, create.temperature, create.top_p):
+                if isinstance(res, RawUsage):
+                    # at the end of inference, interface.inference() will return the usage of inference
+                    raw_usage = res
+                    chunk.choices = []
+                    chunk.usage = CompletionUsage(
+                        prompt_tokens = raw_usage.prefill_count,
+                        completion_tokens = raw_usage.decode_count,
+                        total_tokens = raw_usage.prefill_count + raw_usage.decode_count
+                    )
+
+                    yield chunk
+
+                else:
+                    token, finish_reason = res
+                    choice = Choice(
+                        index = 0,
+                        delta = ChoiceDelta(content=token, role=None, tool_calls=None),
+                        finish_reason = finish_reason,
+                        logprobs = None,
+                    )
+                    chunk.choices = [choice]
+                    yield chunk
+
+        return chat_stream_response(request, inner())
     else:
-        comp = ChatCompletionObject(id=id,object='chat.completion',created=int(time()))
-        comp.usage = Usage(completion_tokens=1, prompt_tokens=1, total_tokens=2)
-        async for token in interface.inference(input_message,id,create.temperature,create.top_p):
-            comp.append_token(token)
-        return comp
+        from openai.types.chat.chat_completion import Choice
+        from openai.types.chat.chat_completion_message import ChatCompletionMessage
+
+        content = ""
+        finish_reason = None
+        async for res in interface.inference(input_message,id,create.temperature,create.top_p):
+            if isinstance(res, RawUsage):
+                raw_usage = res
+                usage = CompletionUsage(
+                    prompt_tokens = raw_usage.prefill_count,
+                    completion_tokens = raw_usage.decode_count,
+                    total_tokens = raw_usage.prefill_count + raw_usage.decode_count
+                )
+            else:
+                token, finish_reason = res
+                content = content + token
+                finish_reason = finish_reason
+
+        choice = Choice(
+            index = 0,
+            finish_reason = finish_reason,
+            message = ChatCompletionMessage(
+                content=content,
+                role="assistant"
+            ))
+
+        chat_completion = ChatCompletion(
+            id = id,
+            choices = [choice],
+            created = int(time()),
+            model = Config().model_name,
+            object = 'chat.completion',
+            usage = usage
+        )
+
+        return chat_completion
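On the client side, the streaming change above means the final chunk now arrives with an empty choices list and a populated usage field. Below is a minimal sketch of how a caller might read it with the openai Python SDK; the base URL, port, API key, and model name are assumptions (the router's mount path is not visible in this diff), so adjust them to the actual deployment.

    # Assumed endpoint and model; only the chunk.choices / chunk.usage handling reflects the diff above.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:10002/v1", api_key="your-api-key")

    stream = client.chat.completions.create(
        model="your-model-name",  # assumed; the server fills in Config().model_name
        messages=[{"role": "user", "content": "hello"}],
        stream=True,
    )
    for chunk in stream:
        if chunk.choices:      # normal token chunks carry a delta
            print(chunk.choices[0].delta.content or "", end="")
        elif chunk.usage:      # final chunk: empty choices, usage attached
            print(f"\nprompt={chunk.usage.prompt_tokens}, "
                  f"completion={chunk.usage.completion_tokens}, "
                  f"total={chunk.usage.total_tokens}")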
@@ -6,6 +6,7 @@ from fastapi.requests import Request
 from ktransformers.server.utils.create_interface import get_interface
 from ktransformers.server.schemas.assistants.streaming import stream_response
 from ktransformers.server.schemas.legacy.completions import CompletionCreate,CompletionObject
+from ktransformers.server.schemas.endpoints.chat import RawUsage
 
 router = APIRouter()
 
@@ -17,17 +18,24 @@ async def create_completion(request:Request,create:CompletionCreate):
     print(f'COMPLETION INPUT:----\n{create.prompt}\n----')
 
 
-
     if create.stream:
         async def inner():
-            async for token in interface.inference(create.prompt,id,create.temperature,create.top_p):
-                d = {'choices':[{'delta':{'content':token}}]}
-                yield f"data:{json.dumps(d)}\n\n"
+            async for res in interface.inference(create.prompt,id,create.temperature,create.top_p):
+                if isinstance(res, RawUsage):
+                    raw_usage = res
+                else:
+                    token, finish_reason = res
+                    d = {'choices':[{'delta':{'content':token}}]}
+                    yield f"data:{json.dumps(d)}\n\n"
             d = {'choices':[{'delta':{'content':''},'finish_reason':''}]}
             yield f"data:{json.dumps(d)}\n\n"
         return stream_response(request,inner())
     else:
         comp = CompletionObject(id=id,object='text_completion',created=int(time()))
-        async for token in interface.inference(create.prompt,id,create.temperature,create.top_p):
-            comp.append_token(token)
+        async for res in interface.inference(create.prompt,id,create.temperature,create.top_p):
+            if isinstance(res, RawUsage):
+                raw_usage = res
+            else:
+                token, finish_reason = res
+                comp.append_token(token)
         return comp
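The legacy completions endpoint keeps emitting plain SSE lines of the form data:{json}. Below is a minimal sketch of reading that stream with requests; the host, port, and mount path are assumptions, while the payload fields mirror the CompletionCreate attributes referenced above (prompt, stream, temperature, top_p) and the choices/delta/content shape comes from the hunk itself.

    # Assumed URL; the "data:{json}" framing and payload shape come from the hunk above.
    import json
    import requests

    resp = requests.post(
        "http://localhost:10002/v1/completions",  # assumed mount point
        json={"prompt": "Hello", "stream": True, "temperature": 0.7, "top_p": 0.9},
        stream=True,
    )
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data:"):
            continue  # skip blank separators between events
        payload = json.loads(line[len("data:"):])
        delta = payload["choices"][0]["delta"].get("content", "")
        print(delta, end="")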