mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 09:04:36 +00:00
ollama sync completions mostly working. stupid api.
This commit is contained in:
parent
2c1a06a07d
commit
62dde8cfb2
2 changed files with 45 additions and 16 deletions
|
@ -2533,6 +2533,10 @@ std::string gpttype_detokenize(const std::vector<int> & inputids, bool render_sp
|
||||||
std::string output = "";
|
std::string output = "";
|
||||||
for (auto eid : inputids)
|
for (auto eid : inputids)
|
||||||
{
|
{
|
||||||
|
if(eid<0 || eid>=n_vocab)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
std::string tokenizedstr = FileFormatTokenizeID(eid, file_format, render_special);
|
std::string tokenizedstr = FileFormatTokenizeID(eid, file_format, render_special);
|
||||||
output += tokenizedstr;
|
output += tokenizedstr;
|
||||||
}
|
}
|
||||||
|
|
57
koboldcpp.py
57
koboldcpp.py
|
@ -1243,6 +1243,26 @@ def whisper_generate(genparams):
|
||||||
outstr = ret.data.decode("UTF-8","ignore")
|
outstr = ret.data.decode("UTF-8","ignore")
|
||||||
return outstr
|
return outstr
|
||||||
|
|
||||||
|
def tokenize_ids(countprompt,tcaddspecial):
|
||||||
|
rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
|
||||||
|
countlimit = rawcountdata.count if (rawcountdata.count>=0 and rawcountdata.count<50000) else 0
|
||||||
|
# the above protects the server in case the count limit got corrupted
|
||||||
|
countdata = [rawcountdata.ids[i] for i in range(countlimit)]
|
||||||
|
return countdata
|
||||||
|
|
||||||
|
def detokenize_ids(tokids):
|
||||||
|
tokidslen = len(tokids)
|
||||||
|
detokstr = ""
|
||||||
|
if tokidslen > 0 and tokidslen < 65536:
|
||||||
|
inputs = token_count_outputs()
|
||||||
|
inputs.count = tokidslen
|
||||||
|
inputs.ids = (ctypes.c_int * tokidslen)()
|
||||||
|
for i, cid in enumerate(tokids):
|
||||||
|
inputs.ids[i] = cid
|
||||||
|
detok = handle.detokenize(inputs)
|
||||||
|
detokstr = ctypes.string_at(detok).decode("UTF-8","ignore")
|
||||||
|
return detokstr
|
||||||
|
|
||||||
#################################################################
|
#################################################################
|
||||||
### A hacky simple HTTP server simulating a kobold api by Concedo
|
### A hacky simple HTTP server simulating a kobold api by Concedo
|
||||||
### we are intentionally NOT using flask, because we want MINIMAL dependencies
|
### we are intentionally NOT using flask, because we want MINIMAL dependencies
|
||||||
|
@ -1463,7 +1483,22 @@ ws ::= | " " | "\n" [ \t]{0,20}
|
||||||
assistant_message_start = adapter_obj.get("assistant_start", "### Response:")
|
assistant_message_start = adapter_obj.get("assistant_start", "### Response:")
|
||||||
genparams["prompt"] = f"{user_message_start} In one sentence, write a descriptive caption for this image.\n{assistant_message_start}"
|
genparams["prompt"] = f"{user_message_start} In one sentence, write a descriptive caption for this image.\n{assistant_message_start}"
|
||||||
elif api_format==6:
|
elif api_format==6:
|
||||||
genparams["prompt"] = genparams.get('system', "") + genparams.get('prompt', "")
|
detokstr = ""
|
||||||
|
tokids = genparams.get('context', [])
|
||||||
|
try:
|
||||||
|
detokstr = detokenize_ids(tokids)
|
||||||
|
except Exception as e:
|
||||||
|
utfprint("Ollama Context Error: " + str(e))
|
||||||
|
ollamasysprompt = genparams.get('system', "")
|
||||||
|
ollamabodyprompt = detokstr + "\n\n### Instruction:\n" + genparams.get('prompt', "") + "\n\n### Response:\n"
|
||||||
|
genparams["stop_sequence"] = genparams.get('stop', [])
|
||||||
|
genparams["stop_sequence"].append("\n### Instruction:")
|
||||||
|
genparams["stop_sequence"].append("\n### Response:")
|
||||||
|
genparams["trim_stop"] = True
|
||||||
|
genparams["ollamasysprompt"] = ollamasysprompt
|
||||||
|
genparams["ollamabodyprompt"] = ollamabodyprompt
|
||||||
|
genparams["prompt"] = ollamasysprompt + ollamabodyprompt
|
||||||
|
utfprint(genparams["prompt"])
|
||||||
|
|
||||||
return genparams
|
return genparams
|
||||||
|
|
||||||
|
@ -1568,7 +1603,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
elif api_format == 5:
|
elif api_format == 5:
|
||||||
res = {"caption": end_trim_to_sentence(recvtxt)}
|
res = {"caption": end_trim_to_sentence(recvtxt)}
|
||||||
elif api_format == 6:
|
elif api_format == 6:
|
||||||
res = {"model": friendlymodelname,"created_at": str(datetime.now(timezone.utc).isoformat()),"response":recvtxt,"done": True,"context": [1,2,3],"total_duration": 1,"load_duration": 1,"prompt_eval_count": prompttokens,"prompt_eval_duration": 1,"eval_count": comptokens,"eval_duration": 1}
|
oldprompt = genparams.get('ollamabodyprompt', "")
|
||||||
|
tokarr = tokenize_ids(oldprompt+recvtxt,False)
|
||||||
|
res = {"model": friendlymodelname,"created_at": str(datetime.now(timezone.utc).isoformat()),"response":recvtxt,"done": True,"context": tokarr,"total_duration": 1,"load_duration": 1,"prompt_eval_count": prompttokens,"prompt_eval_duration": 1,"eval_count": comptokens,"eval_duration": 1}
|
||||||
else:
|
else:
|
||||||
res = {"results": [{"text": recvtxt, "finish_reason": currfinishreason, "logprobs":logprobsdict, "prompt_tokens": prompttokens, "completion_tokens": comptokens}]}
|
res = {"results": [{"text": recvtxt, "finish_reason": currfinishreason, "logprobs":logprobsdict, "prompt_tokens": prompttokens, "completion_tokens": comptokens}]}
|
||||||
|
|
||||||
|
@ -2038,10 +2075,7 @@ Enter Prompt:<br>
|
||||||
genparams = json.loads(body)
|
genparams = json.loads(body)
|
||||||
countprompt = genparams.get('prompt', "")
|
countprompt = genparams.get('prompt', "")
|
||||||
tcaddspecial = genparams.get('special', True)
|
tcaddspecial = genparams.get('special', True)
|
||||||
rawcountdata = handle.token_count(countprompt.encode("UTF-8"),tcaddspecial)
|
countdata = tokenize_ids(countprompt,tcaddspecial)
|
||||||
countlimit = rawcountdata.count if (rawcountdata.count>=0 and rawcountdata.count<50000) else 0
|
|
||||||
# the above protects the server in case the count limit got corrupted
|
|
||||||
countdata = [rawcountdata.ids[i] for i in range(countlimit)]
|
|
||||||
response_body = (json.dumps({"value": len(countdata),"ids": countdata}).encode())
|
response_body = (json.dumps({"value": len(countdata),"ids": countdata}).encode())
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -2055,16 +2089,7 @@ Enter Prompt:<br>
|
||||||
try:
|
try:
|
||||||
genparams = json.loads(body)
|
genparams = json.loads(body)
|
||||||
tokids = genparams.get('ids', [])
|
tokids = genparams.get('ids', [])
|
||||||
tokidslen = len(tokids)
|
detokstr = detokenize_ids(tokids)
|
||||||
detokstr = ""
|
|
||||||
if tokidslen > 0 and tokidslen < 65536:
|
|
||||||
inputs = token_count_outputs()
|
|
||||||
inputs.count = tokidslen
|
|
||||||
inputs.ids = (ctypes.c_int * tokidslen)()
|
|
||||||
for i, cid in enumerate(tokids):
|
|
||||||
inputs.ids[i] = cid
|
|
||||||
detok = handle.detokenize(inputs)
|
|
||||||
detokstr = ctypes.string_at(detok).decode("UTF-8","ignore")
|
|
||||||
response_body = (json.dumps({"result": detokstr,"success":True}).encode())
|
response_body = (json.dumps({"result": detokstr,"success":True}).encode())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utfprint("Detokenize Error: " + str(e))
|
utfprint("Detokenize Error: " + str(e))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue