From 443a6f7ff6346f41c78b0a6ff59c063999542327 Mon Sep 17 00:00:00 2001
From: teddybear082 <87204721+teddybear082@users.noreply.github.com>
Date: Tue, 3 Oct 2023 21:15:45 -0400
Subject: [PATCH] add additional openai chat completions parameters

- support the OpenAI `stop` parameter by mapping it to the KoboldAI `stop_sequence` parameter

- make the default max_length / max_tokens value consistent with the default 80-token length in the generate function

- add support for reporting the name of the local model in OpenAI-style responses
---
 koboldcpp.py        | 32 ++++++++++++++++++++++++--------
 openaichattest.py   | 16 ++++++++++++++++
 openaistreamtest.py | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 8 deletions(-)
 create mode 100644 openaichattest.py
 create mode 100644 openaistreamtest.py

diff --git a/koboldcpp.py b/koboldcpp.py
index 4d60e2665..6a41c98b7 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -372,6 +372,7 @@ totalgens = 0
 currentusergenkey = "" #store a special key so polled streaming works even in multiuser
 args = None #global args
 openaistreaming = False #store if using openai endpoint in streaming mode
+local_model_name = "koboldcpp" #store actual local model name for openai endpoint if it can be found, otherwise default to 'koboldcpp'
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
@@ -397,14 +398,18 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             if api_format==1:
                 genparams["prompt"] = genparams.get('text', "")
                 genparams["top_k"] = int(genparams.get('top_k', 120))
-                genparams["max_length"]=genparams.get('max', 50)
+                genparams["max_length"]=genparams.get('max', 80)
             elif api_format==3:
                 frqp = genparams.get('frequency_penalty', 0.1)
                 scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
-                genparams["max_length"] = genparams.get('max_tokens', 50)
+                genparams["max_length"] = genparams.get('max_tokens', 80)
                 genparams["rep_pen"] = scaled_rep_pen
+                # openai allows a string, a list, or null as a stop sequence; normalize it to a list
+                oai_stop = genparams.get('stop')
+                if oai_stop is None: # absent or explicit null means "no stop sequences" (not [None])
+                    oai_stop = []
+                genparams["stop_sequence"] = oai_stop if isinstance(oai_stop, list) else [oai_stop]
             elif api_format==4:
-                # TODO: translate other openai unique chat completion parameters to kobold parameters
                 # translate openai chat completion messages format into one big string.
                 messages_array = genparams.get('messages', [])
                 messages_string = ""
@@ -421,8 +426,13 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                 genparams["prompt"] = messages_string
                 frqp = genparams.get('frequency_penalty', 0.1)
                 scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
-                genparams["max_length"] = genparams.get('max_tokens', 50)
+                genparams["max_length"] = genparams.get('max_tokens', 80)
                 genparams["rep_pen"] = scaled_rep_pen
+                # openai allows a string, a list, or null as a stop sequence; normalize it to a list
+                oai_stop = genparams.get('stop')
+                if oai_stop is None: # absent or explicit null means "no stop sequences" (not [None])
+                    oai_stop = []
+                genparams["stop_sequence"] = oai_stop if isinstance(oai_stop, list) else [oai_stop]
 
             return generate(
                 prompt=genparams.get('prompt', ""),
@@ -462,10 +472,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         if api_format==1:
             res = {"data": {"seqs":[recvtxt]}}
         elif api_format==3:
-            res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": "koboldcpp",
+            res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": local_model_name,
             "choices": [{"text": recvtxt, "index": 0, "finish_reason": "length"}]}
         elif api_format==4:
-            res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": "koboldcpp",
+            res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": local_model_name,
             "choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]}
         else:
             res = {"results": [{"text": recvtxt}]}
@@ -517,7 +527,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                 event_str = ""
                 # if openaistreaming endpoint, set format to expected openai streaming response
                 if openaistreaming == True:
-                    event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":"koboldcpp","choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
+                    event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":local_model_name,"choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
                     event_str = json.dumps(event_data)
                 else:
                     event_str = json.dumps(event_data)
@@ -611,7 +621,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             response_body = (json.dumps({"results": [{"text": pendtxtStr}]}).encode())
 
         elif self.path.endswith('/v1/models') or self.path.endswith('/models'):
-            response_body = (json.dumps({"object":"list","data":[{"id":"koboldcpp","object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
+            response_body = (json.dumps({"object":"list","data":[{"id":local_model_name,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
             force_json = True
 
         elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
@@ -1732,6 +1742,12 @@ def main(launch_args,start_server=True):
     print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}]")
     loadok = load_model(modelname)
     print("Load Model OK: " + str(loadok))
+    # set local_model_name variable to model for use by openai api endpoints if possible, otherwise default to 'koboldcpp'
+    global local_model_name
+    # use os.path.basename instead of searching for '\\' so the file name is also found on '/'-separated (non-Windows) paths
+    model_file_name = os.path.basename(os.path.abspath(args.model_param))
+    if model_file_name: # keep the 'koboldcpp' default when no file name can be extracted
+        local_model_name = model_file_name
 
     if not loadok:
         print("Could not load model: " + modelname)
diff --git a/openaichattest.py b/openaichattest.py
new file mode 100644
index 000000000..6635b8701
--- /dev/null
+++ b/openaichattest.py
@@ -0,0 +1,16 @@
+import openai
+openai.api_key = "sk-test"  # placeholder key; presumably not validated by the local server -- TODO confirm
+openai.api_base = "http://localhost:5001/api/extra/oai/v1"  # talk to a server on localhost:5001 (presumably koboldcpp) instead of the default OpenAI host
+# Send a single (non-streaming) chat completion request and print the reply.
+completion = openai.ChatCompletion.create(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Hello!"},
+  ],
+  max_tokens=250,
+)
+print("whole response: \n")
+print(completion)
+print("\nMessage content:\n")
+print(completion.choices[0].message)  # prints the message object of the first choice
\ No newline at end of file
diff --git a/openaistreamtest.py b/openaistreamtest.py
new file mode 100644
index 000000000..8a9c4a521
--- /dev/null
+++ b/openaistreamtest.py
@@ -0,0 +1,36 @@
+import openai
+import time
+# Example of an OpenAI ChatCompletion request with stream=True
+# https://platform.openai.com/docs/guides/chat
+openai.api_key = "sk-test"
+openai.api_base = "http://localhost:5001/api/extra/oai/v1"
+# record the time before the request is sent
+start_time = time.time()
+
+# send a ChatCompletion request to count to 100
+response = openai.ChatCompletion.create(
+    model='gpt-3.5-turbo',
+    messages=[
+        {'role': 'user', 'content': 'Count to 100, with a comma between each number and no newlines. E.g., 1, 2, 3, ...'}
+    ],
+    temperature=0,
+    max_tokens=150,
+    stream=True  # ask for the reply as a stream of chunks instead of one complete response
+)
+
+# create variables to collect the stream of chunks
+collected_chunks = []
+collected_messages = []
+# iterate through the stream of events
+for chunk in response:
+    print(chunk)
+    chunk_time = time.time() - start_time  # calculate the time delay of the chunk
+    collected_chunks.append(chunk)  # save the event response
+    chunk_message = chunk['choices'][0]['delta']  # extract the message
+    collected_messages.append(chunk_message)  # save the message
+    print(f"Message received {chunk_time:.2f} seconds after request: {chunk_message}")  # print the delay and text
+
+# print the total elapsed time and the text received; measured here (not via chunk_time) so an empty stream cannot raise NameError
+print(f"Full response received {time.time() - start_time:.2f} seconds after request")
+full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
+print(f"Full conversation received: {full_reply_content}")
\ No newline at end of file