+ '''
+ finalhtml = finalhtml.encode('utf-8')
+ self.send_response(200)
+ self.send_header('content-length', str(len(finalhtml)))
+ self.end_headers(content_type='text/html')
+ self.wfile.write(finalhtml)
+
+    def do_GET(self):
+        """Handle GET: render the hypervisor page via hypervisor_ui() for the root URL, otherwise reply 404."""
+        self.path = self.path.rstrip('/')
+        if self.path == "" or self.path.startswith(('/?', '?')): #the root url may carry ?params with or without a leading /
+            self.hypervisor_ui()
+            return
+        self.send_response(404)
+        self.end_headers(content_type='text/html')
+        not_found_msg = 'Error: KoboldCpp Hypervisor is running, but this endpoint does not exist. Please check the URL.'
+        self.wfile.write(not_found_msg.encode())
+
#################################################################
### A hacky simple HTTP server simulating a kobold api by Concedo
### we are intentionally NOT using flask, because we want MINIMAL dependencies
#################################################################
-class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
+class KcppServerRequestHandler(http.server.SimpleHTTPRequestHandler):
sys_version = ""
server_version = "ConcedoLlamaForKoboldServer"
@@ -2349,6 +2466,7 @@ Enter Prompt:
def do_GET(self):
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
+ global last_req_time, start_time
global has_multiplayer, multiplayer_turn_major, multiplayer_turn_minor, multiplayer_story_data_compressed, multiplayer_dataformat, multiplayer_lastactive, maxctx, maxhordelen, friendlymodelname, lastgeneratedcomfyimg, KcppVersion, totalgens, preloaded_story, exitcounter, currentusergenkey, friendlysdmodelname, fullsdmodelpath, mmprojpath, password
self.path = self.path.rstrip('/')
response_body = None
@@ -2405,8 +2523,11 @@ Enter Prompt:
caps = get_capabilities()
response_body = (json.dumps(caps).encode())
+ elif self.path.endswith(('/api/extra/health')): #used by hypervisor to get info about a kcpp instance
+ uptime = time.time() - start_time
+ response_body = (json.dumps({"model":friendlymodelname,"url":endpoint_url, "uptime":uptime}).encode())
+
elif self.path.endswith(('/api/extra/perf')):
- global last_req_time, start_time
lastp = handle.get_last_process_time()
laste = handle.get_last_eval_time()
lastc = handle.get_last_token_count()
@@ -2531,7 +2652,7 @@ Enter Prompt:
if response_body is None:
self.send_response(404)
self.end_headers(content_type='text/html')
- rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
+ rp = 'Error: KoboldCpp HTTP Server is running, but this endpoint does not exist. Please check the URL.'
self.wfile.write(rp.encode())
else:
self.send_response(200)
@@ -2982,9 +3103,9 @@ Enter Prompt:
self.send_header("cache-control", "no-store")
if content_type is not None:
self.send_header('content-type', content_type)
- return super(ServerRequestHandler, self).end_headers()
+ return super(KcppServerRequestHandler, self).end_headers()
-def RunServerMultiThreaded(addr, port):
+def RunServerMultiThreaded(addr, port, server_handler):
global exitcounter, sslvalid
global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
if is_port_in_use(port):
@@ -3028,7 +3149,7 @@ def RunServerMultiThreaded(addr, port):
def run(self):
global exitcounter
- handler = ServerRequestHandler(addr, port)
+ handler = server_handler(addr, port)
with http.server.HTTPServer((addr, port), handler, False) as self.httpd:
try:
if ipv6_sock:
@@ -3772,7 +3893,7 @@ def show_gui():
network_tab = tabcontent["Network"]
# interfaces
- makelabelentry(network_tab, "Port: ", port_var, 1, 150,tooltip="Select the port to host the KoboldCPP webserver.\n(Defaults to 5001)")
+ makelabelentry(network_tab, "Port: ", port_var, 1, 150,tooltip=f"Select the port to host the KoboldCPP webserver.\n(Defaults to {defaultport})")
makelabelentry(network_tab, "Host: ", host_var, 2, 150,tooltip="Select a specific host interface to bind to.\n(Defaults to all)")
makecheckbox(network_tab, "Multiuser Mode", multiuser_var, 3,tooltiptxt="Allows requests by multiple different clients to be queued and handled in sequence.")
@@ -4386,7 +4507,7 @@ def show_gui_yesnobox(title,message):
def print_with_time(txt):
print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt, flush=True)
-def make_url_request(url, data, method='POST', headers={}):
+def make_url_request(url, data, method='POST', headers={}, timeout=300):
import urllib.request
global nocertify
try:
@@ -4401,7 +4522,7 @@ def make_url_request(url, data, method='POST', headers={}):
else:
request = urllib.request.Request(url, headers=headers, method=method)
response_data = ""
- with urllib.request.urlopen(request,timeout=300) as response:
+ with urllib.request.urlopen(request,timeout=timeout) as response:
response_data = response.read().decode('utf-8',"ignore")
json_response = json.loads(response_data)
return json_response
@@ -4880,18 +5001,17 @@ def analyze_gguf_model_wrapper(filename=""):
dumpthread.start()
def main(launch_args,start_server=True):
- global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
- global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath
+ global args, showdebug, kcpp_instance
+ args = launch_args #note: these are NOT shared with the child processes!
- args = launch_args
if (args.version) and len(sys.argv) <= 2:
print(f"{KcppVersion}") # just print version and exit
return
+
if (args.model_param or args.model) and args.prompt and not args.benchmark and not (args.debugmode >= 1):
suppress_stdout()
print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}") # just update version manually
- # print("Python version: " + sys.version)
#perform some basic cleanup of old temporary directories
try:
@@ -4907,6 +5027,44 @@ def main(launch_args,start_server=True):
analyze_gguf_model_wrapper(args.analyze)
return
+ if args.debugmode != 1:
+ showdebug = False #not shared with child process!
+
+ multiprocessing.freeze_support()
+ kcpp_instance = multiprocessing.Process(target=kcpp_main_process,kwargs={"launch_args": args, "start_server": start_server})
+ kcpp_instance.daemon = True
+ kcpp_instance.start()
+
+ # start the server for the hypervisor thread
+ if args.hypervisor and args.hypervisorport:
+ if args.ssl:
+ global sslvalid
+ if len(args.ssl)==2 and isinstance(args.ssl[0], str) and os.path.exists(args.ssl[0]) and isinstance(args.ssl[1], str) and os.path.exists(args.ssl[1]):
+ sslvalid = True
+ print("SSL configuration is valid and will be used.")
+ else:
+ print("Your SSL configuration is INVALID. SSL will not be used.")
+ epurl = ""
+ httpsaffix = ("https" if sslvalid else "http")
+ if args.host=="":
+ epurl = f"{httpsaffix}://localhost:{args.hypervisorport}"
+ else:
+ epurl = f"{httpsaffix}://{args.host}:{args.hypervisorport}"
+ print(f"======\nStarting Persistent KoboldCpp Hypervisor at {epurl}", flush=True)
+ asyncio.run(RunServerMultiThreaded(args.host, args.hypervisorport, HypervisorServerRequestHandler))
+ else:
+ while True: # non-hypervisor single instance mode
+ time.sleep(1)
+ if not kcpp_instance.is_alive():
+ break
+
+def kcpp_main_process(launch_args,start_server=True):
+ global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, start_time
+ global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath, ttsmodelpath
+
+ args = launch_args
+ start_time = time.time()
+
if args.config and len(args.config)==1:
cfgname = args.config[0]
if isinstance(cfgname, str):
@@ -5404,27 +5562,27 @@ def main(launch_args,start_server=True):
if args.port_param!=defaultport:
args.port = args.port_param
- global sslvalid
+ global sslvalid, endpoint_url
if args.ssl:
if len(args.ssl)==2 and isinstance(args.ssl[0], str) and os.path.exists(args.ssl[0]) and isinstance(args.ssl[1], str) and os.path.exists(args.ssl[1]):
sslvalid = True
print("SSL configuration is valid and will be used.")
else:
print("Your SSL configuration is INVALID. SSL will not be used.")
- epurl = ""
+ endpoint_url = ""
httpsaffix = ("https" if sslvalid else "http")
if args.host=="":
- epurl = f"{httpsaffix}://localhost:{args.port}"
+ endpoint_url = f"{httpsaffix}://localhost:{args.port}"
else:
- epurl = f"{httpsaffix}://{args.host}:{args.port}"
+ endpoint_url = f"{httpsaffix}://{args.host}:{args.port}"
if not args.remotetunnel:
- print(f"Starting Kobold API on port {args.port} at {epurl}/api/")
- print(f"Starting OpenAI Compatible API on port {args.port} at {epurl}/v1/")
+ print(f"Starting Kobold API on port {args.port} at {endpoint_url}/api/")
+ print(f"Starting OpenAI Compatible API on port {args.port} at {endpoint_url}/v1/")
if args.sdmodel:
- print(f"StableUI is available at {epurl}/sdui/")
+ print(f"StableUI is available at {endpoint_url}/sdui/")
if args.launch:
- LaunchWebbrowser(epurl,"--launch was set, but could not launch web browser automatically.")
+ LaunchWebbrowser(endpoint_url,"--launch was set, but could not launch web browser automatically.")
if args.hordekey and args.hordekey!="":
if args.hordeworkername and args.hordeworkername!="":
@@ -5530,35 +5688,13 @@ def main(launch_args,start_server=True):
setuptunnel(True if args.sdmodel else False)
else:
# Flush stdout for previous win32 issue so the client can see output.
- print(f"======\nPlease connect to custom endpoint at {epurl}", flush=True)
- asyncio.run(RunServerMultiThreaded(args.host, args.port))
+ print(f"======\nPlease connect to custom endpoint at {endpoint_url}", flush=True)
+ asyncio.run(RunServerMultiThreaded(args.host, args.port, KcppServerRequestHandler))
else:
# Flush stdout for previous win32 issue so the client can see output.
if not args.prompt or args.benchmark:
print("Server was not started, main function complete. Idling.", flush=True)
-def run_in_queue(launch_args, input_queue, output_queue):
- main(launch_args, start_server=False)
- output_queue.put({'command': 'complete'})
- while True:
- if not input_queue.empty():
- while not input_queue.empty():
- data = input_queue.get()
- if data['command'] == 'generate':
- pl = data['data']
- genout = generate(genparams=pl)
- result = genout['text']
- output_queue.put({'command': 'generated text', 'data': result})
- time.sleep(0.2)
-
-def start_in_seperate_process(launch_args):
- import multiprocessing
- input_queue = multiprocessing.Queue()
- output_queue = multiprocessing.Queue()
- p = multiprocessing.Process(target=run_in_queue, args=(launch_args, input_queue, output_queue))
- p.start()
- return (output_queue, input_queue, p)
-
if __name__ == '__main__':
def check_range(value_type, min_value, max_value):
@@ -5577,7 +5713,7 @@ if __name__ == '__main__':
modelgroup.add_argument("--model", metavar=('[filename]'), help="Model file to load", type=str, default="")
modelgroup.add_argument("model_param", help="Model file to load (positional)", nargs="?")
portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
- portgroup.add_argument("--port", metavar=('[portnumber]'), help="Port to listen on", default=defaultport, type=int, action='store')
+ portgroup.add_argument("--port", metavar=('[portnumber]'), help=f"Port to listen on. (Defaults to {defaultport})", default=defaultport, type=int, action='store')
portgroup.add_argument("port_param", help="Port to listen on (positional)", default=defaultport, nargs="?", type=int, action='store')
parser.add_argument("--host", metavar=('[ipaddr]'), help="Host IP to listen on. If this flag is not set, all routable interfaces are accepted.", default="")
parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true')
@@ -5676,6 +5812,12 @@ if __name__ == '__main__':
ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=default_ttsmaxlen)
ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0)
+    hypervisorgroup = parser.add_argument_group('Hypervisor Commands') #options for the hypervisor server that main() starts when --hypervisor is set
+    hypervisorgroup.add_argument("--hypervisor", help="Enables hypervisor mode, allowing you to unload and reload different configurations or models.", action='store_true')
+    hypervisorgroup.add_argument("--hypervisorport", metavar=('[portnumber]'), help=f"Port for the hypervisor to listen on. (Defaults to {defaulthypervisorport})", default=defaulthypervisorport, type=int, action='store')
+    hypervisorgroup.add_argument("--hypervisorpassword", metavar=('[password]'), help="Require a password to access the hypervisor. Note that passwords are sent in plaintext as part of the URL, and only provide rudimentary security!", default=None)
+    hypervisorgroup.add_argument("--hypervisordir", metavar=('[directory]'), help="Specify a directory to look for .kcpps configs in, which can be used to swap models.", default="")
+
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')
deprecatedgroup.add_argument("--sdconfig", help=argparse.SUPPRESS, nargs='+')
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index 4e7d5a2d6..79e5ce5f6 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -1276,7 +1276,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str());
int64_t t0 = ggml_time_ms();
- sd_ctx->sd->apply_loras(lora_f2m);
+ // sd_ctx->sd->apply_loras(lora_f2m); //only use hardcoded lora for kcpp
if(pending_apply_lora_fname!="" && pending_apply_lora_power>0)
{
printf("\nApplying LoRA now...\n");
diff --git a/version.txt b/version.txt
index c9bb105bf..58dc838c1 100644
--- a/version.txt
+++ b/version.txt
@@ -15,7 +15,7 @@ VSVersionInfo(
StringTable(
u'040904b0',
[
- StringStruct(u'CompanyName', u'KoboldCpp'),
+ StringStruct(u'CompanyName', u'KoboldAI'),
StringStruct(u'FileDescription', u'KoboldCpp'),
StringStruct(u'InternalName', u'KoboldCpp'),
StringStruct(u'LegalCopyright', u'AGPLv3'),
diff --git a/version_template.txt b/version_template.txt
index 117cbe45d..4b8bf3ec7 100644
--- a/version_template.txt
+++ b/version_template.txt
@@ -15,7 +15,7 @@ VSVersionInfo(
StringTable(
u'040904b0',
[
- StringStruct(u'CompanyName', u'KoboldCpp'),
+ StringStruct(u'CompanyName', u'KoboldAI'),
StringStruct(u'FileDescription', u'KoboldCpp'),
StringStruct(u'InternalName', u'KoboldCpp'),
StringStruct(u'LegalCopyright', u'AGPLv3'),