vulkan multigpu, show uptime
commit 4cd571db89 (parent ec2dbd99a3)

6 changed files with 50 additions and 14 deletions
class.py (4 changes)

@@ -254,9 +254,9 @@ class model_backend(InferenceModel):
             self.kcpp_noblas = True
             self.kcpp_nommap = True
         elif accel==8:
-            self.kcpp_usevulkan = 0
+            self.kcpp_usevulkan = [0]
         elif accel==9:
-            self.kcpp_usevulkan = 1
+            self.kcpp_usevulkan = [1]
         pass

     def unload(self):
expose.cpp (16 changes)

@@ -59,8 +59,20 @@ extern "C"
     putenv((char*)platformenv.c_str());
     putenv((char*)deviceenv.c_str());

-    int vulkan_info = inputs.vulkan_info;
-    vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+std::to_string(vulkan_info);
+    std::string vulkan_info_raw = inputs.vulkan_info;
+    std::string vulkan_info_str = "";
+    for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
+        vulkan_info_str += vulkan_info_raw[i];
+        if (i < vulkan_info_raw.length() - 1) {
+            vulkan_info_str += ",";
+        }
+    }
+    if(vulkan_info_str=="")
+    {
+        vulkan_info_str = "0";
+    }
+
+    vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
     putenv((char*)vulkandeviceenv.c_str());

     executable_path = inputs.executable_path;
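For reference, the new loop turns the incoming device string into a comma-separated list for GGML_VK_VISIBLE_DEVICES and falls back to device 0 when nothing was selected. A rough Python rendering of that logic, purely for illustration (the function name and sample values below are not part of the patch):

    def build_vulkan_visible_devices(raw: str) -> str:
        # Insert a comma between every character of the incoming device string,
        # mirroring the character-by-character loop in expose.cpp.
        joined = ",".join(raw)
        # Default to the first device when no ID was passed through.
        return joined if joined else "0"

    assert build_vulkan_visible_devices("02") == "0,2"  # two GPUs selected
    assert build_vulkan_visible_devices("1") == "1"     # single GPU
    assert build_vulkan_visible_devices("") == "0"      # nothing selected -> device 0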
expose.h (2 changes)

@@ -44,7 +44,7 @@ struct load_model_inputs
     const bool use_contextshift;
     const int clblast_info = 0;
     const int cublas_info = 0;
-    const int vulkan_info = 0;
+    const char * vulkan_info;
     const int blasbatchsize = 512;
     const int debugmode = 0;
     const int forceversion = 0;
gpttype_adapter.cpp (4 changes)

@@ -865,7 +865,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     llama_ctx_params.rope_freq_scale = rope_freq_scale;
     llama_ctx_params.n_batch = kcpp_params->n_batch;

-    #if defined(GGML_USE_CUBLAS)
+    #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
     bool ts_all_zero = true;
     for (int i = 0; i < tensor_split_max; ++i) {
         if (inputs.tensor_split[i] != 0.0f) {

@@ -966,7 +966,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     llama_ctx_params.n_threads = kcpp_params->n_threads;
     llama_ctx_params.n_threads_batch = kcpp_params->n_threads_batch;

-    #if defined(GGML_USE_CUBLAS)
+    #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
     bool ts_all_zero = true;
     for (int i = 0; i < tensor_split_max; ++i) {
         if (inputs.tensor_split[i] != 0.0f) {
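Both guards previously compiled the tensor-split handling only for CUDA builds; adding GGML_USE_VULKAN lets the same --tensor_split ratios reach multi-GPU Vulkan configurations. A simplified Python sketch of the all-zero check visible in the hunk context, for illustration only (the ratio values and the maximum of 16 devices are assumptions, not taken from the patch):

    # Ratios as they might arrive from "--tensor_split 7 3", padded with zeros
    # up to the backend's maximum device count (16 is an assumed value here).
    tensor_split_max = 16
    tensor_split = [7.0, 3.0] + [0.0] * (tensor_split_max - 2)

    # Mirror of the ts_all_zero check from the hunk context: ratios are only
    # forwarded to the context params when at least one entry is non-zero.
    ts_all_zero = all(r == 0.0 for r in tensor_split)
    if not ts_all_zero:
        print("forwarding tensor split ratios:", [r for r in tensor_split if r != 0.0])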
kcpp_docs.embd (12 changes)

@@ -306,6 +306,10 @@
           "type": "integer",
           "description": "Last token count."
         },
+        "last_seed": {
+          "type": "integer",
+          "description": "Last generation seed used."
+        },
         "total_gens": {
           "type": "integer",
           "description": "Total requests generated since startup."

@@ -321,6 +325,14 @@
         "idle": {
           "type": "integer",
           "description": "Status of backend, busy or idle."
         },
+        "hordeexitcounter": {
+          "type": "integer",
+          "description": "Status of embedded horde worker. If it's too high, may have crashed."
+        },
+        "uptime": {
+          "type": "integer",
+          "description": "Seconds that the server has been running for."
+        }
         },
         "required": [
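These schema additions document fields returned by the performance endpoint that koboldcpp.py builds further down. A minimal client sketch, assuming the endpoint lives at /api/extra/perf on koboldcpp's usual default address http://localhost:5001 (both assumptions come from general koboldcpp usage, not from this diff):

    import json
    import urllib.request

    # Query the perf endpoint and print the newly documented fields.
    with urllib.request.urlopen("http://localhost:5001/api/extra/perf") as resp:
        perf = json.load(resp)

    print("uptime (s):", perf.get("uptime"))
    print("last seed:", perf.get("last_seed"))
    print("horde exit counter:", perf.get("hordeexitcounter"))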
koboldcpp.py (26 changes)

@@ -38,7 +38,7 @@ class load_model_inputs(ctypes.Structure):
                 ("use_contextshift", ctypes.c_bool),
                 ("clblast_info", ctypes.c_int),
                 ("cublas_info", ctypes.c_int),
-                ("vulkan_info", ctypes.c_int),
+                ("vulkan_info", ctypes.c_char_p),
                 ("blasbatchsize", ctypes.c_int),
                 ("debugmode", ctypes.c_int),
                 ("forceversion", ctypes.c_int),
@@ -256,6 +256,7 @@ def load_model(model_filename):
     inputs.threads = args.threads
     inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
     inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False)
+    inputs.vulkan_info = "0".encode("UTF-8")
     inputs.blasthreads = args.blasthreads
     inputs.use_mmap = (not args.nommap)
     inputs.use_mlock = args.usemlock
@@ -315,9 +316,14 @@ def load_model(model_filename):
             inputs.cublas_info = 3

     if args.usevulkan:
-        inputs.vulkan_info = int(args.usevulkan)
+        s = ""
+        for l in range(0,len(args.usevulkan)):
+            s += str(args.usevulkan[l])
+        if s=="":
+            s = "0"
+        inputs.vulkan_info = s.encode("UTF-8")
     else:
-        inputs.vulkan_info = 0
+        inputs.vulkan_info = "0".encode("UTF-8")

     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
@@ -476,6 +482,7 @@ gui_layers_untouched = True
 runmode_untouched = True
 preloaded_story = None
 sslvalid = False
+start_time = time.time()

 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
@@ -846,7 +853,8 @@ Enter Prompt:<br>
             totalgens = handle.get_total_gens()
             stopreason = handle.get_last_stop_reason()
             lastseed = handle.get_last_seed()
-            response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "last_seed":lastseed, "total_gens":totalgens, "stop_reason":stopreason, "queue":requestsinqueue, "idle":(0 if modelbusy.locked() else 1), "hordeexitcounter":exitcounter}).encode())
+            uptime = time.time() - start_time
+            response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "last_seed":lastseed, "total_gens":totalgens, "stop_reason":stopreason, "queue":requestsinqueue, "idle":(0 if modelbusy.locked() else 1), "hordeexitcounter":exitcounter, "uptime":uptime}).encode())

         elif self.path.endswith('/api/extra/generate/check'):
             pendtxtStr = ""
@@ -1765,7 +1773,7 @@ def show_new_gui():
            if mmq_var.get()==1:
                args.usecublas.append("mmq")
        if runopts_var.get() == "Use Vulkan":
-            args.usevulkan = int(gpuchoiceidx)
+            args.usevulkan = [int(gpuchoiceidx)]
        if gpulayers_var.get():
            args.gpulayers = int(gpulayers_var.get())
        if runopts_var.get()=="Use No BLAS":
@@ -1848,7 +1856,11 @@ def show_new_gui():
        elif "usevulkan" in dict:
            if vulkan_option is not None:
                runopts_var.set(vulkan_option)
-                gpu_choice_var.set(str(int(dict["usevulkan"])+1))
+                gpu_choice_var.set("1")
+                for opt in range(0,4):
+                    if opt in dict["usevulkan"]:
+                        gpu_choice_var.set(str(opt+1))
+                        break

        elif "noavx2" in dict and "noblas" in dict and dict["noblas"] and dict["noavx2"]:
            if failsafe_option is not None:
@@ -2630,7 +2642,7 @@ if __name__ == '__main__':
     compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq'])
-    compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='?', const=0, type=int, default=None)
+    compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None)
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0)
     parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
     parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
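With nargs='*', --usevulkan now accepts any number of device IDs instead of at most one. A standalone sketch of how the flag parses and how load_model then flattens the list into the vulkan_info string (the parser is rebuilt in isolation for illustration; only the --usevulkan definition is taken from the patch):

    import argparse

    parser = argparse.ArgumentParser()
    # Same definition as the new --usevulkan argument above.
    parser.add_argument("--usevulkan", metavar=('[Device ID]'), nargs='*', type=int, default=None)

    print(parser.parse_args([]).usevulkan)                         # None -> Vulkan stays off
    print(parser.parse_args(["--usevulkan"]).usevulkan)            # []   -> vulkan_info falls back to "0"
    print(parser.parse_args(["--usevulkan", "0", "2"]).usevulkan)  # [0, 2]

    # load_model concatenates the chosen IDs, so [0, 2] becomes the string "02",
    # which expose.cpp then expands to GGML_VK_VISIBLE_DEVICES=0,2.
    args = parser.parse_args(["--usevulkan", "0", "2"])
    s = "".join(str(d) for d in args.usevulkan)
    print(s)  # "02"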