add additional openai chat completions parameters

-support the openai stop parameter, mapped to the koboldai stop_sequence parameter

-make the default max_length / max_tokens value consistent with the default 80-token length used in the generate function

-add support for reporting the name of the local model in openai responses
This commit is contained in:
teddybear082 2023-10-03 21:15:45 -04:00
parent 42c2f2c90a
commit 443a6f7ff6
3 changed files with 76 additions and 8 deletions

View file

@ -372,6 +372,7 @@ totalgens = 0
currentusergenkey = "" #store a special key so polled streaming works even in multiuser
args = None #global args
openaistreaming = False #store if using openai endpoint in streaming mode
local_model_name = "koboldcpp" #store actual local model name for openai endpoint if it can be found, otherwise default to 'koboldcpp'
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
sys_version = ""
@ -397,14 +398,18 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
if api_format==1:
genparams["prompt"] = genparams.get('text', "")
genparams["top_k"] = int(genparams.get('top_k', 120))
genparams["max_length"]=genparams.get('max', 50)
genparams["max_length"]=genparams.get('max', 80)
elif api_format==3:
frqp = genparams.get('frequency_penalty', 0.1)
scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
genparams["max_length"] = genparams.get('max_tokens', 50)
genparams["max_length"] = genparams.get('max_tokens', 80)
genparams["rep_pen"] = scaled_rep_pen
# openai allows either a string or a list as a stop sequence
if isinstance(genparams.get('stop',[]), list):
genparams["stop_sequence"] = genparams.get('stop', [])
else:
genparams["stop_sequence"] = [genparams.get('stop')]
elif api_format==4:
# TODO: translate other openai unique chat completion parameters to kobold parameters
# translate openai chat completion messages format into one big string.
messages_array = genparams.get('messages', [])
messages_string = ""
@ -421,8 +426,13 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
genparams["prompt"] = messages_string
frqp = genparams.get('frequency_penalty', 0.1)
scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
genparams["max_length"] = genparams.get('max_tokens', 50)
genparams["max_length"] = genparams.get('max_tokens', 80)
genparams["rep_pen"] = scaled_rep_pen
# openai allows either a string or a list as a stop sequence
if isinstance(genparams.get('stop',[]), list):
genparams["stop_sequence"] = genparams.get('stop', [])
else:
genparams["stop_sequence"] = [genparams.get('stop')]
return generate(
prompt=genparams.get('prompt', ""),
@ -462,10 +472,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
if api_format==1:
res = {"data": {"seqs":[recvtxt]}}
elif api_format==3:
res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": "koboldcpp",
res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": local_model_name,
"choices": [{"text": recvtxt, "index": 0, "finish_reason": "length"}]}
elif api_format==4:
res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": "koboldcpp",
res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": local_model_name,
"choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]}
else:
res = {"results": [{"text": recvtxt}]}
@ -517,7 +527,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
event_str = ""
# if openaistreaming endpoint, set format to expected openai streaming response
if openaistreaming == True:
event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":"koboldcpp","choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":local_model_name,"choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
event_str = json.dumps(event_data)
else:
event_str = json.dumps(event_data)
@ -611,7 +621,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
response_body = (json.dumps({"results": [{"text": pendtxtStr}]}).encode())
elif self.path.endswith('/v1/models') or self.path.endswith('/models'):
response_body = (json.dumps({"object":"list","data":[{"id":"koboldcpp","object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
response_body = (json.dumps({"object":"list","data":[{"id":local_model_name,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
force_json = True
elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
@ -1732,6 +1742,12 @@ def main(launch_args,start_server=True):
print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}]")
loadok = load_model(modelname)
print("Load Model OK: " + str(loadok))
# set local_model_name variable to model for use by openai api endpoints if possible, otherwise default to 'koboldcpp'
global local_model_name
full_model_path = os.path.abspath(args.model_param)
index_of_last_backslash = full_model_path.rfind('\\')
if index_of_last_backslash != -1:
local_model_name = full_model_path[index_of_last_backslash + 1:]
if not loadok:
print("Could not load model: " + modelname)

16
openaichattest.py Normal file
View file

@ -0,0 +1,16 @@
import openai
# Manual smoke test: send one non-streaming chat completion request through
# the openai client to the server expected at localhost:5001 and print the
# reply.
openai.api_base = "http://localhost:5001/api/extra/oai/v1"
openai.api_key = "sk-test"  # presumably a dummy key; the client needs one set

# Conversation sent to the endpoint: a system instruction plus one user turn.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
request_options = {
    "model": "gpt-3.5-turbo",
    "messages": chat_messages,
    "max_tokens": 250,
}
completion = openai.ChatCompletion.create(**request_options)

# Dump the raw response object first, then the first choice's message.
print("whole response: \n")
print(completion)
print("\nMessage content:\n")
first_choice = completion.choices[0]
print(first_choice.message)

36
openaistreamtest.py Normal file
View file

@ -0,0 +1,36 @@
import openai
import time
# Example of an OpenAI ChatCompletion request with stream=True
# https://platform.openai.com/docs/guides/chat
#
# Manual smoke test: stream a chat completion from the server expected at
# localhost:5001, print every chunk as it arrives together with its delay,
# then print the reassembled reply.
openai.api_key = "sk-test"
openai.api_base = "http://localhost:5001/api/extra/oai/v1"

# record the time before the request is sent
start_time = time.time()

# send a ChatCompletion request to count to 100
response = openai.ChatCompletion.create(
    model='gpt-3.5-turbo',
    messages=[
        {'role': 'user', 'content': 'Count to 100, with a comma between each number and no newlines. E.g., 1, 2, 3, ...'}
    ],
    temperature=0,
    max_tokens=150,
    stream=True,  # ask the server to send the reply incrementally
)

# create variables to collect the stream of chunks
collected_chunks = []
collected_messages = []

# iterate through the stream of events
for chunk in response:
    print(chunk)
    chunk_time = time.time() - start_time  # calculate the time delay of the chunk
    collected_chunks.append(chunk)  # save the event response
    chunk_message = chunk['choices'][0]['delta']  # extract the message
    collected_messages.append(chunk_message)  # save the message
    print(f"Message received {chunk_time:.2f} seconds after request: {chunk_message}")  # print the delay and text

# Measure the total after the loop instead of reusing the loop variable:
# chunk_time is never assigned when the stream yields no chunks, which would
# raise NameError here, and it only reflects the last chunk's arrival anyway.
total_time = time.time() - start_time
print(f"Full response received {total_time:.2f} seconds after request")

# A delta may omit 'content' or carry it as null; treat both as an empty
# string so the join cannot fail on a non-string item.
full_reply_content = ''.join([m.get('content') or '' for m in collected_messages])
print(f"Full conversation received: {full_reply_content}")