Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)

commit c2e497ccfb
parent 61ca3a0d30

    deferred aborting for queued generations

2 changed files with 26 additions and 14 deletions
@@ -110,6 +110,7 @@ You can then run koboldcpp anywhere from the terminal by running `koboldcpp` to
 - Start the python server `python koboldcpp.py --model phi-2.Q2_K.gguf`
 - Connect to `http://localhost:5001` on your mobile browser
 - If you encounter any errors, make sure your packages are up-to-date with `pkg up`
+- GPU acceleration for Termux may be possible but I have not explored it. If you find a good cross-device solution, do share or PR it.
 
 ## AMD
 - Please check out https://github.com/YellowRoseCx/koboldcpp-rocm
koboldcpp.py (39 lines changed)
@@ -329,7 +329,7 @@ def load_model(model_filename):
     return ret
 
 def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, dynatemp_exponent=1.0, logit_biases={}):
-    global maxctx, args, currentusergenkey, totalgens
+    global maxctx, args, currentusergenkey, totalgens, pendingabortkey
     inputs = generation_inputs()
     outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
     inputs.prompt = prompt.encode("UTF-8")
@@ -408,16 +408,23 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu
 
     currentusergenkey = genkey
     totalgens += 1
-    ret = handle.generate(inputs,outputs)
-    outstr = ""
-    if ret.status==1:
-        outstr = ret.text.decode("UTF-8","ignore")
-    if trimstop:
-        for trim_str in stop_sequence:
-            sindex = outstr.find(trim_str)
-            if sindex != -1 and trim_str!="":
-                outstr = outstr[:sindex]
-    return outstr
+    #early exit if aborted
+
+    if pendingabortkey!="" and pendingabortkey==genkey:
+        print(f"\nDeferred Abort for GenKey: {pendingabortkey}")
+        pendingabortkey = ""
+        return ""
+    else:
+        ret = handle.generate(inputs,outputs)
+        outstr = ""
+        if ret.status==1:
+            outstr = ret.text.decode("UTF-8","ignore")
+        if trimstop:
+            for trim_str in stop_sequence:
+                sindex = outstr.find(trim_str)
+                if sindex != -1 and trim_str!="":
+                    outstr = outstr[:sindex]
+        return outstr
 
 def utfprint(str):
     try:
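The effect of this hunk: before handing the prompt to the native backend, `generate` now checks whether an abort was recorded for this request's genkey while it sat in the queue, and if so returns an empty result without generating at all. Below is a minimal sketch of the same consume-before-work pattern, using a toy worker queue rather than koboldcpp's real globals and ctypes bindings; all names in it are illustrative.

```python
# Sketch of the deferred-abort pattern: an abort key recorded while a job is
# queued gets consumed just before the expensive work would start.
# Toy stand-ins only -- not koboldcpp's actual objects.
import queue
import threading
import time

pending_abort_key = ""   # set by an "abort" request for a still-queued job
jobs = queue.Queue()     # (genkey, prompt) pairs awaiting the model

def worker():
    global pending_abort_key
    while True:
        genkey, prompt = jobs.get()
        # Early exit if an abort arrived while this job was waiting.
        if pending_abort_key != "" and pending_abort_key == genkey:
            print(f"Deferred abort for {genkey}")
            pending_abort_key = ""   # single slot: consume it
            continue                 # skip generation entirely
        print(f"Generating for {genkey}: {prompt!r}")

threading.Thread(target=worker, daemon=True).start()
jobs.put(("key1", "hello"))
pending_abort_key = "key2"   # abort lands while key2 is still queued
jobs.put(("key2", "world"))  # consumed by the abort, never generated
time.sleep(0.5)              # give the worker time to drain the queue
```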
@@ -457,6 +464,7 @@ punishcounter = 0 #causes a timeout if too many errors
 rewardcounter = 0 #reduces error counts for successful jobs
 totalgens = 0
 currentusergenkey = "" #store a special key so polled streaming works even in multiuser
+pendingabortkey = "" #if an abort is received for the non-active request, remember it (at least 1) to cancel later
 args = None #global args
 gui_layers_untouched = True
 runmode_untouched = True
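A note on the design choice here: `pendingabortkey` is a single string rather than a set, so only one deferred abort is remembered at a time, which is what the "(at least 1)" in the comment hints at. A small illustration of the consequence, with made-up keys:

```python
# Illustration only (hypothetical keys): because pendingabortkey is one slot,
# a second deferred abort overwrites the first before either is consumed.
pendingabortkey = ""

def remember_abort(genkey):
    global pendingabortkey
    pendingabortkey = genkey   # overwrites any previously remembered key

remember_abort("genkey-A")     # abort for queued request A
remember_abort("genkey-B")     # abort for queued request B
print(pendingabortkey)         # "genkey-B" -- A's deferred abort was dropped
```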
@@ -879,7 +887,7 @@ Enter Prompt:<br>
         return
 
     def do_POST(self):
-        global modelbusy, requestsinqueue, currentusergenkey, totalgens
+        global modelbusy, requestsinqueue, currentusergenkey, totalgens, pendingabortkey
         content_length = int(self.headers['content-length'])
         body = self.rfile.read(content_length)
         self.path = self.path.rstrip('/')
@@ -914,10 +922,13 @@ Enter Prompt:<br>
             if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
                 ag = handle.abort_generate()
                 time.sleep(0.1) #short delay before replying
-                response_body = (json.dumps({"success": ("true" if ag else "false")}).encode())
+                response_body = (json.dumps({"success": ("true" if ag else "false"), "done":"true"}).encode())
                 print("\nGeneration Aborted")
+            elif (multiuserkey!="" and requestsinqueue>0):
+                pendingabortkey = multiuserkey
+                response_body = (json.dumps({"success": "true", "done":"false"}).encode())
             else:
-                response_body = (json.dumps({"success": "false"}).encode())
+                response_body = (json.dumps({"success": "false", "done":"false"}).encode())
 
         elif self.path.endswith('/api/extra/generate/check'):
             pendtxtStr = ""
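For API consumers, the abort endpoint's JSON response now carries a `done` field alongside `success`: `done` of "true" means the active generation was aborted on the spot, while `success` of "true" with `done` of "false" means the abort was deferred for a still-queued request. A sketch of a client that reads this, assuming the default localhost:5001 server and a genkey previously sent with the generate request; the helper name and setup are assumptions, only the endpoint path and response fields come from this commit:

```python
# Sketch of a client-side abort that distinguishes immediate vs. deferred
# aborts via the new "done" field. Host, port, and key are assumed examples.
import json
import urllib.request

def abort_generation(genkey, base="http://localhost:5001"):
    req = urllib.request.Request(
        f"{base}/api/extra/abort",
        data=json.dumps({"genkey": genkey}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        result = json.loads(resp.read())
    if result.get("done") == "true":
        print("Aborted immediately: this genkey was the active generation.")
    elif result.get("success") == "true":
        print("Abort deferred: the request is still queued and will be cancelled.")
    else:
        print("Abort failed: genkey matched neither the active nor a queued request.")
    return result

# Example (hypothetical key, sent earlier as "genkey" in the generate payload):
# abort_generation("KCPP1234")
```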