kvcache-ai-ktransformers/ktransformers/server/api/openai/endpoints/chat.py

42 lines
1.5 KiB
Python

import json
from time import time
from uuid import uuid4
from fastapi import APIRouter
from fastapi.requests import Request
from ktransformers.server.utils.create_interface import get_interface
from ktransformers.server.schemas.assistants.streaming import chat_stream_response
from ktransformers.server.schemas.endpoints.chat import ChatCompletionCreate,ChatCompletionChunk,ChatCompletionObject
from ktransformers.server.backend.base import BackendInterfaceBase
router = APIRouter()
models = [
{"id": "0", "name": "ktranformers-model"},
]
@router.get('/models', tags=['openai'])
async def list_models():
return models
@router.post('/chat/completions', tags=['openai'])
async def chat_completion(request:Request,create:ChatCompletionCreate):
id = str(uuid4())
interface: BackendInterfaceBase = get_interface()
# input_ids = interface.format_and_tokenize_input_ids(id,messages=create.get_tokenizer_messages())
input_message = [json.loads(m.model_dump_json()) for m in create.messages]
if create.stream:
async def inner():
chunk = ChatCompletionChunk(id=id,object='chat.completion.chunk',created=int(time()))
async for token in interface.inference(input_message,id):
chunk.set_token(token)
yield chunk
return chat_stream_response(request,inner())
else:
comp = ChatCompletionObject(id=id,object='chat.completion.chunk',created=int(time()))
async for token in interface.inference(input_message,id):
comp.append_token(token)
return comp