add additional openai chat completions parameters

- support the `stop` parameter, mapped to the koboldai `stop_sequence` parameter
- make the default `max_length` / `max_tokens` parameter consistent with the default 80 token length in the generate function
- add support for providing the name of the local model in openai responses
2026-05-22 03:10:03 +00:00 · 2023-10-03 21:15:45 -04:00 · 2023-10-03 21:15:45 -04:00 · 443a6f7ff6
commit 443a6f7ff6
parent 42c2f2c90a
3 changed files with 76 additions and 8 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -372,6 +372,7 @@ totalgens = 0
 currentusergenkey = "" #store a special key so polled streaming works even in multiuser
 args = None #global args
 openaistreaming = False #store if using openai endpoint in streaming mode
+local_model_name = "koboldcpp" #store actual local model name for openai endpoint if it can be found, otherwise default to 'koboldcpp'

 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
    sys_version = ""
@ -397,14 +398,18 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
            if api_format==1:
                genparams["prompt"] = genparams.get('text', "")
                genparams["top_k"] = int(genparams.get('top_k', 120))
-                genparams["max_length"]=genparams.get('max', 50)
+                genparams["max_length"]=genparams.get('max', 80)
            elif api_format==3:
                frqp = genparams.get('frequency_penalty', 0.1)
                scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
-                genparams["max_length"] = genparams.get('max_tokens', 50)
+                genparams["max_length"] = genparams.get('max_tokens', 80)
                genparams["rep_pen"] = scaled_rep_pen
+                # openai allows either a string or a list as a stop sequence
+                if isinstance(genparams.get('stop',[]), list):
+                    genparams["stop_sequence"] = genparams.get('stop', [])
+                else:
+                    genparams["stop_sequence"] = [genparams.get('stop')]
            elif api_format==4:
-                # TODO: translate other openai unique chat completion parameters to kobold parameters
                # translate openai chat completion messages format into one big string.
                messages_array = genparams.get('messages', [])
                messages_string = ""
@ -421,8 +426,13 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                genparams["prompt"] = messages_string
                frqp = genparams.get('frequency_penalty', 0.1)
                scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
-                genparams["max_length"] = genparams.get('max_tokens', 50)
+                genparams["max_length"] = genparams.get('max_tokens', 80)
                genparams["rep_pen"] = scaled_rep_pen
+                # openai allows either a string or a list as a stop sequence
+                if isinstance(genparams.get('stop',[]), list):
+                    genparams["stop_sequence"] = genparams.get('stop', [])
+                else:
+                    genparams["stop_sequence"] = [genparams.get('stop')]

            return generate(
                prompt=genparams.get('prompt', ""),
@ -462,10 +472,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
        if api_format==1:
            res = {"data": {"seqs":[recvtxt]}}
        elif api_format==3:
-            res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": "koboldcpp",
+            res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": local_model_name,
            "choices": [{"text": recvtxt, "index": 0, "finish_reason": "length"}]}
        elif api_format==4:
-            res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": "koboldcpp",
+            res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": local_model_name,
            "choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]}
        else:
            res = {"results": [{"text": recvtxt}]}
@ -517,7 +527,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                event_str = ""
                # if openaistreaming endpoint, set format to expected openai streaming response
                if openaistreaming == True:
-                    event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":"koboldcpp","choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
+                    event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":local_model_name,"choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
                    event_str = json.dumps(event_data)
                else:
                    event_str = json.dumps(event_data)
@ -611,7 +621,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
            response_body = (json.dumps({"results": [{"text": pendtxtStr}]}).encode())

        elif self.path.endswith('/v1/models') or self.path.endswith('/models'):
-            response_body = (json.dumps({"object":"list","data":[{"id":"koboldcpp","object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
+            response_body = (json.dumps({"object":"list","data":[{"id":local_model_name,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
            force_json = True

        elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
@ -1732,6 +1742,12 @@ def main(launch_args,start_server=True):
    print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}]")
    loadok = load_model(modelname)
    print("Load Model OK: " + str(loadok))
+    # set local_model_name to the model's file name (the part of the absolute path after the last backslash) for use by the openai api endpoints; if the path contains no backslash, it keeps its default of 'koboldcpp'
+    global local_model_name
+    full_model_path = os.path.abspath(args.model_param)
+    index_of_last_backslash = full_model_path.rfind('\\')
+    if index_of_last_backslash != -1:
+        local_model_name = full_model_path[index_of_last_backslash + 1:]

    if not loadok:
        print("Could not load model: " + modelname)
--- a/openaichattest.py
+++ b/openaichattest.py
@ -0,0 +1,16 @@
+import openai
+openai.api_key = "sk-test"
+openai.api_base = "http://localhost:5001/api/extra/oai/v1"
+
+completion = openai.ChatCompletion.create(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Hello!"},
+  ],
+  max_tokens=250,
+)
+print("whole response: \n")
+print(completion)
+print("\nMessage content:\n")
+print(completion.choices[0].message)
--- a/openaistreamtest.py
+++ b/openaistreamtest.py
@ -0,0 +1,36 @@
+import openai
+import time
+# Example of an OpenAI ChatCompletion request with stream=True
+# https://platform.openai.com/docs/guides/chat
+openai.api_key = "sk-test"
+openai.api_base = "http://localhost:5001/api/extra/oai/v1"
+# record the time before the request is sent
+start_time = time.time()
+
+# send a ChatCompletion request to count to 100
+response = openai.ChatCompletion.create(
+    model='gpt-3.5-turbo',
+    messages=[
+        {'role': 'user', 'content': 'Count to 100, with a comma between each number and no newlines. E.g., 1, 2, 3, ...'}
+    ],
+    temperature=0,
+    max_tokens=150,
+    stream=True  # again, we set stream=True
+)
+
+# create variables to collect the stream of chunks
+collected_chunks = []
+collected_messages = []
+# iterate through the stream of events
+for chunk in response:
+    print(chunk)
+    chunk_time = time.time() - start_time  # calculate the time delay of the chunk
+    collected_chunks.append(chunk)  # save the event response
+    chunk_message = chunk['choices'][0]['delta']  # extract the message
+    collected_messages.append(chunk_message)  # save the message
+    print(f"Message received {chunk_time:.2f} seconds after request: {chunk_message}")  # print the delay and text
+
+# print the time delay and text received
+print(f"Full response received {chunk_time:.2f} seconds after request")
+full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
+print(f"Full conversation received: {full_reply_content}")