From 4cd571db89cac0d42feb18faf4d6f8d2e2a4d6e9 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 8 Feb 2024 16:54:38 +0800
Subject: [PATCH] vulkan multigpu, show uptime

---
 class.py            |  4 ++--
 expose.cpp          | 16 ++++++++++++++--
 expose.h            |  2 +-
 gpttype_adapter.cpp |  4 ++--
 kcpp_docs.embd      | 12 ++++++++++++
 koboldcpp.py        | 26 +++++++++++++++++++-------
 6 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/class.py b/class.py
index 4a2a32240..fc90d35fc 100644
--- a/class.py
+++ b/class.py
@@ -254,9 +254,9 @@ class model_backend(InferenceModel):
             self.kcpp_noblas = True
             self.kcpp_nommap = True
         elif accel==8:
-            self.kcpp_usevulkan = 0
+            self.kcpp_usevulkan = [0]
         elif accel==9:
-            self.kcpp_usevulkan = 1
+            self.kcpp_usevulkan = [1]
         pass

     def unload(self):
diff --git a/expose.cpp b/expose.cpp
index a9a24f7ea..584d9841d 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -59,8 +59,20 @@ extern "C"
         putenv((char*)platformenv.c_str());
         putenv((char*)deviceenv.c_str());

-        int vulkan_info = inputs.vulkan_info;
-        vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+std::to_string(vulkan_info);
+        std::string vulkan_info_raw = inputs.vulkan_info;
+        std::string vulkan_info_str = "";
+        for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
+            vulkan_info_str += vulkan_info_raw[i];
+            if (i < vulkan_info_raw.length() - 1) {
+                vulkan_info_str += ",";
+            }
+        }
+        if(vulkan_info_str=="")
+        {
+            vulkan_info_str = "0";
+        }
+
+        vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
         putenv((char*)vulkandeviceenv.c_str());

         executable_path = inputs.executable_path;
diff --git a/expose.h b/expose.h
index 053cad927..bc475a2a1 100644
--- a/expose.h
+++ b/expose.h
@@ -44,7 +44,7 @@ struct load_model_inputs
     const bool use_contextshift;
     const int clblast_info = 0;
     const int cublas_info = 0;
-    const int vulkan_info = 0;
+    const char * vulkan_info;
     const int blasbatchsize = 512;
     const int debugmode = 0;
     const int forceversion = 0;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ef6035401..3362683ce 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -865,7 +865,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_ctx_params.rope_freq_scale = rope_freq_scale;
         llama_ctx_params.n_batch = kcpp_params->n_batch;

-        #if defined(GGML_USE_CUBLAS)
+        #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
         bool ts_all_zero = true;
         for (int i = 0; i < tensor_split_max; ++i) {
             if (inputs.tensor_split[i] != 0.0f) {
@@ -966,7 +966,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_ctx_params.n_threads = kcpp_params->n_threads;
         llama_ctx_params.n_threads_batch = kcpp_params->n_threads_batch;

-        #if defined(GGML_USE_CUBLAS)
+        #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
         bool ts_all_zero = true;
         for (int i = 0; i < tensor_split_max; ++i) {
             if (inputs.tensor_split[i] != 0.0f) {
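
Note (expose.cpp / expose.h): vulkan_info changes from a single int to a string of
concatenated single-digit device IDs; expose.cpp inserts a comma between each character
to build GGML_VK_VISIBLE_DEVICES, falling back to "0" when the string is empty. A
minimal Python sketch of that mapping, for illustration only (it is not the shipped C++
and assumes each device ID is one character, which is what koboldcpp.py currently sends):

    def to_visible_devices(vulkan_info: str) -> str:
        # "02" -> "0,2"; an empty string falls back to device 0
        ids = ",".join(vulkan_info) if vulkan_info else "0"
        return "GGML_VK_VISIBLE_DEVICES=" + ids

    assert to_visible_devices("02") == "GGML_VK_VISIBLE_DEVICES=0,2"
    assert to_visible_devices("") == "GGML_VK_VISIBLE_DEVICES=0"

The gpttype_adapter.cpp hunks above extend the existing CuBLAS tensor_split handling to
builds compiled with GGML_USE_VULKAN, so --tensor_split ratios now also apply to
multi-device Vulkan runs.
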
+ }, + "uptime": { + "type": "integer", + "description": "Seconds that the server has been running for." } }, "required": [ diff --git a/koboldcpp.py b/koboldcpp.py index 1762c467e..4b912420e 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -38,7 +38,7 @@ class load_model_inputs(ctypes.Structure): ("use_contextshift", ctypes.c_bool), ("clblast_info", ctypes.c_int), ("cublas_info", ctypes.c_int), - ("vulkan_info", ctypes.c_int), + ("vulkan_info", ctypes.c_char_p), ("blasbatchsize", ctypes.c_int), ("debugmode", ctypes.c_int), ("forceversion", ctypes.c_int), @@ -256,6 +256,7 @@ def load_model(model_filename): inputs.threads = args.threads inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False) inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False) + inputs.vulkan_info = "0".encode("UTF-8") inputs.blasthreads = args.blasthreads inputs.use_mmap = (not args.nommap) inputs.use_mlock = args.usemlock @@ -315,9 +316,14 @@ def load_model(model_filename): inputs.cublas_info = 3 if args.usevulkan: - inputs.vulkan_info = int(args.usevulkan) + s = "" + for l in range(0,len(args.usevulkan)): + s += str(args.usevulkan[l]) + if s=="": + s = "0" + inputs.vulkan_info = s.encode("UTF-8") else: - inputs.vulkan_info = 0 + inputs.vulkan_info = "0".encode("UTF-8") inputs.executable_path = (getdirpath()+"/").encode("UTF-8") inputs.debugmode = args.debugmode @@ -476,6 +482,7 @@ gui_layers_untouched = True runmode_untouched = True preloaded_story = None sslvalid = False +start_time = time.time() class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sys_version = "" @@ -846,7 +853,8 @@ Enter Prompt:
diff --git a/koboldcpp.py b/koboldcpp.py
index 1762c467e..4b912420e 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -38,7 +38,7 @@ class load_model_inputs(ctypes.Structure):
                 ("use_contextshift", ctypes.c_bool),
                 ("clblast_info", ctypes.c_int),
                 ("cublas_info", ctypes.c_int),
-                ("vulkan_info", ctypes.c_int),
+                ("vulkan_info", ctypes.c_char_p),
                 ("blasbatchsize", ctypes.c_int),
                 ("debugmode", ctypes.c_int),
                 ("forceversion", ctypes.c_int),
@@ -256,6 +256,7 @@ def load_model(model_filename):
     inputs.threads = args.threads
     inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
     inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False)
+    inputs.vulkan_info = "0".encode("UTF-8")
     inputs.blasthreads = args.blasthreads
     inputs.use_mmap = (not args.nommap)
     inputs.use_mlock = args.usemlock
@@ -315,9 +316,14 @@ def load_model(model_filename):
             inputs.cublas_info = 3

     if args.usevulkan:
-        inputs.vulkan_info = int(args.usevulkan)
+        s = ""
+        for l in range(0,len(args.usevulkan)):
+            s += str(args.usevulkan[l])
+        if s=="":
+            s = "0"
+        inputs.vulkan_info = s.encode("UTF-8")
     else:
-        inputs.vulkan_info = 0
+        inputs.vulkan_info = "0".encode("UTF-8")

     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
@@ -476,6 +482,7 @@
 gui_layers_untouched = True
 runmode_untouched = True
 preloaded_story = None
 sslvalid = False
+start_time = time.time()
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
@@ -846,7 +853,8 @@ Enter Prompt:
             totalgens = handle.get_total_gens()
             stopreason = handle.get_last_stop_reason()
             lastseed = handle.get_last_seed()
-            response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "last_seed":lastseed, "total_gens":totalgens, "stop_reason":stopreason, "queue":requestsinqueue, "idle":(0 if modelbusy.locked() else 1), "hordeexitcounter":exitcounter}).encode())
+            uptime = time.time() - start_time
+            response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "last_seed":lastseed, "total_gens":totalgens, "stop_reason":stopreason, "queue":requestsinqueue, "idle":(0 if modelbusy.locked() else 1), "hordeexitcounter":exitcounter, "uptime":uptime}).encode())

         elif self.path.endswith('/api/extra/generate/check'):
             pendtxtStr = ""
@@ -1765,7 +1773,7 @@ def show_new_gui():
             if mmq_var.get()==1:
                 args.usecublas.append("mmq")
         if runopts_var.get() == "Use Vulkan":
-            args.usevulkan = int(gpuchoiceidx)
+            args.usevulkan = [int(gpuchoiceidx)]
         if gpulayers_var.get():
             args.gpulayers = int(gpulayers_var.get())
         if runopts_var.get()=="Use No BLAS":
@@ -1848,7 +1856,11 @@ def show_new_gui():
         elif "usevulkan" in dict:
             if vulkan_option is not None:
                 runopts_var.set(vulkan_option)
-                gpu_choice_var.set(str(int(dict["usevulkan"])+1))
+                gpu_choice_var.set("1")
+                for opt in range(0,4):
+                    if opt in dict["usevulkan"]:
+                        gpu_choice_var.set(str(opt+1))
+                        break

         elif "noavx2" in dict and "noblas" in dict and dict["noblas"] and dict["noavx2"]:
             if failsafe_option is not None:
@@ -2630,7 +2642,7 @@ if __name__ == '__main__':
     compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq'])
-    compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='?', const=0, type=int, default=None)
+    compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None)
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0)
     parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
     parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
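
Note (koboldcpp.py): --usevulkan now accepts zero or more device IDs (nargs='*'), e.g.
"--usevulkan 0 2" to split across Vulkan devices 0 and 2, while a bare "--usevulkan"
produces an empty list and therefore falls back to device 0 in load_model. A small
sketch of how the new flag parses, assuming only the argparse definition above (it is
not a copy of koboldcpp.py's full parser setup):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--usevulkan", metavar="[Device ID]", nargs="*", type=int, default=None)

    print(parser.parse_args(["--usevulkan", "0", "2"]).usevulkan)  # [0, 2]
    print(parser.parse_args(["--usevulkan"]).usevulkan)            # []   -> falls back to device "0"
    print(parser.parse_args([]).usevulkan)                         # None -> Vulkan not used

The GUI and class.py now store the selection as a list as well, which keeps the
import/export of saved configs consistent with the command line.
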