mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 03:10:03 +00:00
add additional openai chat completions parameters
- support the stop parameter, mapped to the KoboldAI stop_sequence parameter
- make the default max_length / max_tokens parameter consistent with the default 80-token length in the generate function
- add support for providing the name of the local model in OpenAI responses
This commit is contained in:
parent
42c2f2c90a
commit
443a6f7ff6
3 changed files with 76 additions and 8 deletions
32
koboldcpp.py
32
koboldcpp.py
|
|
@ -372,6 +372,7 @@ totalgens = 0
|
|||
currentusergenkey = "" #store a special key so polled streaming works even in multiuser
|
||||
args = None #global args
|
||||
openaistreaming = False #store if using openai endpoint in streaming mode
|
||||
local_model_name = "koboldcpp" #store actual local model name for openai endpoint if it can be found, otherwise default to 'koboldcpp'
|
||||
|
||||
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||
sys_version = ""
|
||||
|
|
@ -397,14 +398,18 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
if api_format==1:
|
||||
genparams["prompt"] = genparams.get('text', "")
|
||||
genparams["top_k"] = int(genparams.get('top_k', 120))
|
||||
genparams["max_length"]=genparams.get('max', 50)
|
||||
genparams["max_length"]=genparams.get('max', 80)
|
||||
elif api_format==3:
|
||||
frqp = genparams.get('frequency_penalty', 0.1)
|
||||
scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
|
||||
genparams["max_length"] = genparams.get('max_tokens', 50)
|
||||
genparams["max_length"] = genparams.get('max_tokens', 80)
|
||||
genparams["rep_pen"] = scaled_rep_pen
|
||||
# openai allows either a string or a list as a stop sequence
|
||||
if isinstance(genparams.get('stop',[]), list):
|
||||
genparams["stop_sequence"] = genparams.get('stop', [])
|
||||
else:
|
||||
genparams["stop_sequence"] = [genparams.get('stop')]
|
||||
elif api_format==4:
|
||||
# TODO: translate other openai unique chat completion parameters to kobold parameters
|
||||
# translate openai chat completion messages format into one big string.
|
||||
messages_array = genparams.get('messages', [])
|
||||
messages_string = ""
|
||||
|
|
@ -421,8 +426,13 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
genparams["prompt"] = messages_string
|
||||
frqp = genparams.get('frequency_penalty', 0.1)
|
||||
scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
|
||||
genparams["max_length"] = genparams.get('max_tokens', 50)
|
||||
genparams["max_length"] = genparams.get('max_tokens', 80)
|
||||
genparams["rep_pen"] = scaled_rep_pen
|
||||
# openai allows either a string or a list as a stop sequence
|
||||
if isinstance(genparams.get('stop',[]), list):
|
||||
genparams["stop_sequence"] = genparams.get('stop', [])
|
||||
else:
|
||||
genparams["stop_sequence"] = [genparams.get('stop')]
|
||||
|
||||
return generate(
|
||||
prompt=genparams.get('prompt', ""),
|
||||
|
|
@ -462,10 +472,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
if api_format==1:
|
||||
res = {"data": {"seqs":[recvtxt]}}
|
||||
elif api_format==3:
|
||||
res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": "koboldcpp",
|
||||
res = {"id": "cmpl-1", "object": "text_completion", "created": 1, "model": local_model_name,
|
||||
"choices": [{"text": recvtxt, "index": 0, "finish_reason": "length"}]}
|
||||
elif api_format==4:
|
||||
res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": "koboldcpp",
|
||||
res = {"id": "cmpl-1", "object": "chat.completion", "created": 1, "model": local_model_name,
|
||||
"choices": [{"index": 0, "message":{"role": "assistant", "content": recvtxt,}, "finish_reason": "length"}]}
|
||||
else:
|
||||
res = {"results": [{"text": recvtxt}]}
|
||||
|
|
@ -517,7 +527,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
event_str = ""
|
||||
# if openaistreaming endpoint, set format to expected openai streaming response
|
||||
if openaistreaming == True:
|
||||
event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":"koboldcpp","choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
|
||||
event_data = {"id":"koboldcpp","object":"chat.completion.chunk","created":1,"model":local_model_name,"choices":[{"index":0,"finish_reason":"length","delta":{'role':'assistant','content':tokenStr},}],}
|
||||
event_str = json.dumps(event_data)
|
||||
else:
|
||||
event_str = json.dumps(event_data)
|
||||
|
|
@ -611,7 +621,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
response_body = (json.dumps({"results": [{"text": pendtxtStr}]}).encode())
|
||||
|
||||
elif self.path.endswith('/v1/models') or self.path.endswith('/models'):
|
||||
response_body = (json.dumps({"object":"list","data":[{"id":"koboldcpp","object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
|
||||
response_body = (json.dumps({"object":"list","data":[{"id":local_model_name,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
|
||||
force_json = True
|
||||
|
||||
elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
|
||||
|
|
@ -1732,6 +1742,12 @@ def main(launch_args,start_server=True):
|
|||
print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}]")
|
||||
loadok = load_model(modelname)
|
||||
print("Load Model OK: " + str(loadok))
|
||||
# set local_model_name variable to model for use by openai api endpoints if possible, otherwise default to 'koboldcpp'
|
||||
global local_model_name
|
||||
full_model_path = os.path.abspath(args.model_param)
|
||||
index_of_last_backslash = full_model_path.rfind('\\')
|
||||
if index_of_last_backslash != -1:
|
||||
local_model_name = full_model_path[index_of_last_backslash + 1:]
|
||||
|
||||
if not loadok:
|
||||
print("Could not load model: " + modelname)
|
||||
|
|
|
|||
16
openaichattest.py
Normal file
16
openaichattest.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
"""Manual check: send one non-streaming chat completion to a local endpoint.

Points the ``openai`` client at http://localhost:5001/api/extra/oai/v1,
issues a single ChatCompletion request, and prints both the whole response
object and the first choice's message.
"""
import openai

# Client configuration: API key string and the local base URL to talk to.
openai.api_key = "sk-test"
openai.api_base = "http://localhost:5001/api/extra/oai/v1"

# The conversation to send: one system prompt followed by one user turn.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# Keyword arguments for the request, gathered in one place.
request_options = {
    "model": "gpt-3.5-turbo",
    "messages": conversation,
    "max_tokens": 250,
}

reply = openai.ChatCompletion.create(**request_options)

print("whole response: \n")
print(reply)
print("\nMessage content:\n")
print(reply.choices[0].message)
|
||||
36
openaistreamtest.py
Normal file
36
openaistreamtest.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
"""Manual check: stream a chat completion from a local endpoint.

Points the ``openai`` client at http://localhost:5001/api/extra/oai/v1,
sends one ChatCompletion request with ``stream=True``, prints every chunk
together with its arrival delay, and finally prints the concatenated reply.
"""
import openai
import time

# Example of an OpenAI ChatCompletion request with stream=True
# https://platform.openai.com/docs/guides/chat
openai.api_key = "sk-test"
openai.api_base = "http://localhost:5001/api/extra/oai/v1"

# record the time before the request is sent
start_time = time.time()

# send a ChatCompletion request to count to 100
response = openai.ChatCompletion.create(
    model='gpt-3.5-turbo',
    messages=[
        {'role': 'user', 'content': 'Count to 100, with a comma between each number and no newlines. E.g., 1, 2, 3, ...'}
    ],
    temperature=0,
    max_tokens=150,
    stream=True  # again, we set stream=True
)

# create variables to collect the stream of chunks
collected_chunks = []
collected_messages = []

# Seed the delay so the summary line below still works when the stream
# yields no chunks at all (otherwise chunk_time would be unbound).
chunk_time = 0.0

# iterate through the stream of events
for chunk in response:
    print(chunk)
    chunk_time = time.time() - start_time  # calculate the time delay of the chunk
    collected_chunks.append(chunk)  # save the event response
    chunk_message = chunk['choices'][0]['delta']  # extract the message
    collected_messages.append(chunk_message)  # save the message
    print(f"Message received {chunk_time:.2f} seconds after request: {chunk_message}")  # print the delay and text

# print the time delay and text received
print(f"Full response received {chunk_time:.2f} seconds after request")

# A delta may lack 'content' or carry it as None; treat both as empty text
# so the join never sees a non-string.
full_reply_content = ''.join(m.get('content') or '' for m in collected_messages)
print(f"Full conversation received: {full_reply_content}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue