From 443a6f7ff6346f41c78b0a6ff59c063999542327 Mon Sep 17 00:00:00 2001
From: teddybear082 <87204721+teddybear082@users.noreply.github.com>
Date: Tue, 3 Oct 2023 21:15:45 -0400
Subject: [PATCH] add additional openai chat completions parameters

-support stop parameter mapped to koboldai stop_sequence parameter
-make default max_length / max_tokens parameter consistent with default 80 token length in generate function
-add support for providing name of local model in openai responses
---
Review notes (ignored by git am):
- This patch was reconstructed from a whitespace-collapsed copy; indentation and
  blank context lines are re-inferred and should be checked against the target tree.
- A null or missing "stop" now maps to an empty stop_sequence instead of [None].
- The local model name is derived with os.path.basename so it also resolves on
  POSIX paths (the original only split on a backslash).
- openaistreamtest.py no longer reads chunk_time after the loop, which raised
  NameError when the stream yielded no chunks.
- Hunk line counts are unchanged; the index blob ids are kept from the original
  commit and no longer match the revised content.

 koboldcpp.py        | 32 ++++++++++++++++++++++++--------
 openaichattest.py   | 16 ++++++++++++++++
 openaistreamtest.py | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 8 deletions(-)
 create mode 100644 openaichattest.py
 create mode 100644 openaistreamtest.py

diff --git a/koboldcpp.py b/koboldcpp.py
index 4d60e2665..6a41c98b7 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -372,6 +372,7 @@ totalgens = 0
 currentusergenkey = "" #store a special key so polled streaming works even in multiuser
 args = None #global args
 openaistreaming = False #store if using openai endpoint in streaming mode
+local_model_name = "koboldcpp" #store actual local model name for openai endpoint if it can be found, otherwise default to 'koboldcpp'
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
@@ -397,14 +398,18 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             if api_format==1:
                 genparams["prompt"] = genparams.get('text', "")
                 genparams["top_k"] = int(genparams.get('top_k', 120))
-                genparams["max_length"]=genparams.get('max', 50)
+                genparams["max_length"]=genparams.get('max', 80)
             elif api_format==3:
                 frqp = genparams.get('frequency_penalty', 0.1)
                 scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
-                genparams["max_length"] = genparams.get('max_tokens', 50)
+                genparams["max_length"] = genparams.get('max_tokens', 80)
                 genparams["rep_pen"] = scaled_rep_pen
+                # openai allows a string, a list, or null as a stop sequence; always hand kobold a list
+                stop_param = genparams.get('stop') or []
+                if isinstance(stop_param, str):
+                    stop_param = [stop_param]
+                genparams["stop_sequence"] = stop_param
             elif api_format==4:
-                # TODO: translate other openai unique chat completion parameters to kobold parameters
                 # translate openai chat completion messages format into one big string.
                 messages_array = genparams.get('messages', [])
                 messages_string = ""
@@ -421,8 +426,13 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                 genparams["prompt"] = messages_string
                 frqp = genparams.get('frequency_penalty', 0.1)
                 scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
-                genparams["max_length"] = genparams.get('max_tokens', 50)
+                genparams["max_length"] = genparams.get('max_tokens', 80)
                 genparams["rep_pen"] = scaled_rep_pen
+                # openai allows a string, a list, or null as a stop sequence; always hand kobold a list
+                stop_param = genparams.get('stop') or []
+                if isinstance(stop_param, str):
+                    stop_param = [stop_param]
+                genparams["stop_sequence"] = stop_param
 
             return generate(
                 prompt=genparams.get('prompt', ""),
@@ -462,10 +472,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         if api_format==1:
             res = {"data": {"seqs":[recvtxt]}}
         elif api_format==3:
-            res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": "koboldcpp",
+            res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": local_model_name,
             "choices": [{"text": recvtxt, "index": 0, "finish_reason": "length"}]}
         elif api_format==4:
-            res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": "koboldcpp",
+            res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": local_model_name,
             "choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]}
         else:
             res = {"results": [{"text": recvtxt}]}
@@ -517,7 +527,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                 event_str = ""
                 # if openaistreaming endpoint, set format to expected openai streaming response
                 if openaistreaming == True:
-                    event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":"koboldcpp","choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
+                    event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":local_model_name,"choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
                     event_str = json.dumps(event_data)
                 else:
                     event_str = json.dumps(event_data)
@@ -611,7 +621,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             response_body = (json.dumps({"results": [{"text": pendtxtStr}]}).encode())
 
         elif self.path.endswith('/v1/models') or self.path.endswith('/models'):
-            response_body = (json.dumps({"object":"list","data":[{"id":"koboldcpp","object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
+            response_body = (json.dumps({"object":"list","data":[{"id":local_model_name,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
             force_json = True
 
         elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
@@ -1732,6 +1742,12 @@ def main(launch_args,start_server=True):
     print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}]")
     loadok = load_model(modelname)
     print("Load Model OK: " + str(loadok))
+    # set local_model_name variable to the model file name for use by openai api endpoints if possible, otherwise default to 'koboldcpp'
+    global local_model_name
+    # os.path.basename splits on the platform's own separator, so this works for '/' paths as well as Windows '\\' paths
+    model_file_name = os.path.basename(os.path.abspath(args.model_param))
+    if model_file_name:
+        local_model_name = model_file_name
 
     if not loadok:
         print("Could not load model: " + modelname)
diff --git a/openaichattest.py b/openaichattest.py
new file mode 100644
index 000000000..6635b8701
--- /dev/null
+++ b/openaichattest.py
@@ -0,0 +1,16 @@
+import openai
+openai.api_key = "sk-test"
+openai.api_base = "http://localhost:5001/api/extra/oai/v1"
+
+completion = openai.ChatCompletion.create(
+    model="gpt-3.5-turbo",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello!"},
+    ],
+    max_tokens=250,
+)
+print("whole response: \n")
+print(completion)
+print("\nMessage content:\n")
+print(completion.choices[0].message)
\ No newline at end of file
diff --git a/openaistreamtest.py b/openaistreamtest.py
new file mode 100644
index 000000000..8a9c4a521
--- /dev/null
+++ b/openaistreamtest.py
@@ -0,0 +1,36 @@
+import openai
+import time
+# Example of an OpenAI ChatCompletion request with stream=True
+# https://platform.openai.com/docs/guides/chat
+openai.api_key = "sk-test"
+openai.api_base = "http://localhost:5001/api/extra/oai/v1"
+# record the time before the request is sent
+start_time = time.time()
+
+# send a ChatCompletion request to count to 100
+response = openai.ChatCompletion.create(
+    model='gpt-3.5-turbo',
+    messages=[
+        {'role': 'user', 'content': 'Count to 100, with a comma between each number and no newlines. E.g., 1, 2, 3, ...'}
+    ],
+    temperature=0,
+    max_tokens=150,
+    stream=True  # again, we set stream=True
+)
+
+# create variables to collect the stream of chunks
+collected_chunks = []
+collected_messages = []
+# iterate through the stream of events
+for chunk in response:
+    print(chunk)
+    chunk_time = time.time() - start_time  # calculate the time delay of the chunk
+    collected_chunks.append(chunk)  # save the event response
+    chunk_message = chunk['choices'][0]['delta']  # extract the message
+    collected_messages.append(chunk_message)  # save the message
+    print(f"Message received {chunk_time:.2f} seconds after request: {chunk_message}")  # print the delay and text
+
+# print the time delay and text received
+print(f"Full response received {time.time() - start_time:.2f} seconds after request")
+full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
+print(f"Full conversation received: {full_reply_content}")
\ No newline at end of file