From 77fa2cd3484e38f4973491c9066b1ad227b2b37b Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 16 May 2026 00:30:23 +0800 Subject: [PATCH] batching horde worker adjustments --- koboldcpp.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 46b03c97c..d39cba4df 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -6397,6 +6397,8 @@ Change Mode
muint = int(args.multiuser) if muint<=0 and ((args.whispermodel and args.whispermodel!="") or (args.sdmodel and args.sdmodel!="") or (args.ttsmodel and args.ttsmodel!="") or (args.embeddingsmodel and args.embeddingsmodel!="")): muint = 2 # this prevents errors when using voice/img together with text + if getattr(args, "continuous_batching", 0) > 1 and muint <=0: + muint = multiuser_concurrent_limit # multiuser required for batching multiuserlimit = ((muint-1) if muint > 1 else multiuser_concurrent_limit) #backwards compatibility for up to X concurrent requests, use default limit of X if multiuser set to 1 if muint > 0 and requestsinqueue < multiuserlimit: @@ -8638,7 +8640,7 @@ def show_gui(): qkvopt = quantkv_text[quantkv_var.get()].lower() if (quantkv_var.get()>=0 and quantkv_var.get() < len(quantkv_text)) else "f16" args.quantkv = qkvopt args.lowvram = lowvram_var.get()==1 - args.nommq = mmq_var.get()==1 + args.nommq = mmq_var.get()==0 args.splitmode = splitmode_var.get() if splitmode_var.get() in splitmode_choices else splitmode_choices[0] gpuchoiceidx = 0 @@ -9358,7 +9360,7 @@ def make_url_request(url, data, method='POST', headers={}, timeout=300): return None #A very simple and stripped down embedded horde worker with no dependencies -def run_horde_worker(args, api_key, worker_name, worker_id, threads_to_show): +def run_horde_worker(args, api_key, worker_name, worker_id, parallel_batching_threads): global friendlymodelname, maxhordectx, maxhordelen, exitcounter, punishcounter, modelbusy, session_starttime, sslvalid epurl = get_my_epurl() @@ -9440,7 +9442,7 @@ def run_horde_worker(args, api_key, worker_name, worker_id, threads_to_show): continue #first, make sure we are not generating - if modelbusy.locked(): + if parallel_batching_threads<=1 and modelbusy.locked(): time.sleep(0.2) continue @@ -9454,8 +9456,8 @@ def run_horde_worker(args, api_key, worker_name, worker_id, threads_to_show): "softprompts": [], "bridge_agent": BRIDGE_AGENT, } - if threads_to_show>1: - gen_dict["threads"] = threads_to_show + if parallel_batching_threads>1: + gen_dict["threads"] = parallel_batching_threads pop = make_url_request_horde(f'{cluster}/api/v2/generate/text/pop',gen_dict) if not pop: punishcounter += 1 @@ -9478,7 +9480,7 @@ def run_horde_worker(args, api_key, worker_name, worker_id, threads_to_show): #do gen while exitcounter < 10: - if not modelbusy.locked(): + if parallel_batching_threads>1 or not modelbusy.locked(): #horde gets a genkey to avoid KCPP overlap current_payload['genkey'] = f"HORDEREQ_{random.randint(100, 999)}" current_generation = make_url_request_horde(f'{epurl}/api/v1/generate', current_payload, method='POST',addmykey=True)