increase default ctx size to 8k, rename usecublas to usecuda

Author: Concedo
Date: 2025-07-13 18:27:42 +08:00
Parent: 811463a704
Commit: e7eb6d3200
3 changed files with 56 additions and 54 deletions

@@ -70,7 +70,7 @@ KoboldCpp can now also be run on Novita AI, a newer alternative GPU cloud provid
 - Other models for Whisper (speech recognition), Image Generation, Text to Speech or Image Recognition [can be found on the Wiki](https://github.com/LostRuins/koboldcpp/wiki#what-models-does-koboldcpp-support-what-architectures-are-supported)
 ## Improving Performance
-- **GPU Acceleration**: If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag (Nvidia Only), or `--usevulkan` (Any GPU), make sure you select the correct .exe with CUDA support.
+- **GPU Acceleration**: If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecuda` flag (Nvidia Only), or `--usevulkan` (Any GPU), make sure you select the correct .exe with CUDA support.
 - **GPU Layer Offloading**: Add `--gpulayers` to offload model layers to the GPU. The more layers you offload to VRAM, the faster generation speed will become. Experiment to determine number of layers to offload, and reduce by a few if you run out of memory.
 - **Increasing Context Size**: Use `--contextsize (number)` to increase context size, allowing the model to read more text. Note that you may also need to increase the max context in the KoboldAI Lite UI as well (click and edit the number text field).
 - **Old CPU Compatibility**: If you are having crashes or issues, you can try running in a non-avx2 compatibility mode by adding the `--noavx2` flag. You can also try reducing your `--blasbatchsize` (set -1 to avoid batching)
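Putting the README flags above together, here is a minimal launch sketch. The file name, GPU index, and layer count are placeholders rather than part of this commit, and the old `--usecublas` spelling continues to work as an alias for `--usecuda`.

```python
# Minimal launch sketch using the flags documented above. Paths and numbers are
# placeholders; adjust --gpulayers and --contextsize for your hardware and model.
import subprocess

subprocess.run([
    "python", "koboldcpp.py",
    "--model", "model.gguf",            # placeholder GGUF model path
    "--usecuda", "normal", "0", "mmq",  # CUDA backend on GPU 0 with quantized matmul
    "--gpulayers", "35",                # reduce if you run out of VRAM
    "--contextsize", "8192",            # matches the new default context size
], check=True)
```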

@@ -175,7 +175,7 @@
 " print(f\"Please open the above link, and input the password '{ltpw}'\\nYour KoboldCpp will start shortly...\")\n",
 " print(\"=================\")\n",
 " !sleep 10\n",
-"!./koboldcpp_linux $ModelCommand --usecublas 0 mmq --chatcompletionsadapter AutoGuess --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand $TTSCommand $ECommand $SavGdriveCommand\n"
+"!./koboldcpp_linux $ModelCommand --usecuda 0 mmq --chatcompletionsadapter AutoGuess --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand $TTSCommand $ECommand $SavGdriveCommand\n"
 ]
 }
 ],

@@ -81,7 +81,7 @@ password = "" #if empty, no auth key required
 fullwhispermodelpath = "" #if empty, it's not initialized
 ttsmodelpath = "" #if empty, not initialized
 embeddingsmodelpath = "" #if empty, not initialized
-maxctx = 4096
+maxctx = 8192
 maxhordectx = 0 #set to whatever maxctx is if 0
 maxhordelen = 512
 modelbusy = threading.Lock()
@@ -459,7 +459,7 @@ lib_vulkan_noavx2 = pick_existant_file("koboldcpp_vulkan_noavx2.dll","koboldcpp_
 libname = ""
 lib_option_pairs = [
 (lib_default, "Use CPU"),
-(lib_cublas, "Use CuBLAS"),
+(lib_cublas, "Use CUDA"),
 (lib_hipblas, "Use hipBLAS (ROCm)"),
 (lib_vulkan, "Use Vulkan"),
 (lib_clblast, "Use CLBlast"),
@@ -490,7 +490,7 @@ def init_library():
 libname = lib_failsafe
 elif file_exists(lib_noavx2):
 libname = lib_noavx2
-elif (args.usecublas is not None):
+elif (args.usecuda is not None):
 if file_exists(lib_cublas):
 libname = lib_cublas
 elif file_exists(lib_hipblas):
@@ -598,34 +598,34 @@ def set_backend_props(inputs):
 if(args.maingpu is not None and args.maingpu>=0):
 inputs.kcpp_main_gpu = args.maingpu
-if args.usecublas:
+if args.usecuda:
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 if not args.tensor_split:
-if (args.usecublas and "0" in args.usecublas):
+if (args.usecuda and "0" in args.usecuda):
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["HIP_VISIBLE_DEVICES"] = "0"
 inputs.kcpp_main_gpu = 0
-elif (args.usecublas and "1" in args.usecublas):
+elif (args.usecuda and "1" in args.usecuda):
 os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 os.environ["HIP_VISIBLE_DEVICES"] = "1"
 inputs.kcpp_main_gpu = 0
-elif (args.usecublas and "2" in args.usecublas):
+elif (args.usecuda and "2" in args.usecuda):
 os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 os.environ["HIP_VISIBLE_DEVICES"] = "2"
 inputs.kcpp_main_gpu = 0
-elif (args.usecublas and "3" in args.usecublas):
+elif (args.usecuda and "3" in args.usecuda):
 os.environ["CUDA_VISIBLE_DEVICES"] = "3"
 os.environ["HIP_VISIBLE_DEVICES"] = "3"
 inputs.kcpp_main_gpu = 0
 else:
 if(args.maingpu is None or args.maingpu<0):
-if (args.usecublas and "0" in args.usecublas):
+if (args.usecuda and "0" in args.usecuda):
 inputs.kcpp_main_gpu = 0
-elif (args.usecublas and "1" in args.usecublas):
+elif (args.usecuda and "1" in args.usecuda):
 inputs.kcpp_main_gpu = 1
-elif (args.usecublas and "2" in args.usecublas):
+elif (args.usecuda and "2" in args.usecuda):
 inputs.kcpp_main_gpu = 2
-elif (args.usecublas and "3" in args.usecublas):
+elif (args.usecuda and "3" in args.usecuda):
 inputs.kcpp_main_gpu = 3
 if args.usevulkan: #is an empty array if using vulkan without defined gpu
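For readers skimming the hunk above: when a single GPU index is passed alongside the flag and no tensor split is configured, the process is pinned to that device via the visibility environment variables. A condensed, standalone sketch of that mapping (not the full function):

```python
# Condensed sketch of the device-pinning logic above: a GPU index in the
# --usecuda list (e.g. ["normal", "2"]) selects one visible CUDA/HIP device.
import os

def pin_gpu(usecuda, tensor_split=None):
    if not usecuda or tensor_split:
        return  # tensor-split / multi-GPU setups leave device visibility alone
    for idx in ("0", "1", "2", "3"):
        if idx in usecuda:
            os.environ["CUDA_VISIBLE_DEVICES"] = idx
            os.environ["HIP_VISIBLE_DEVICES"] = idx
            break

pin_gpu(["normal", "2", "mmq"])
print(os.environ.get("CUDA_VISIBLE_DEVICES"))  # 2
```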
@@ -1312,9 +1312,9 @@ def auto_set_backend_cli():
 args.noavx2 = True
 args.failsafe = True
-if eligible_cuda and exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CuBLAS" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and any(CUDevicesNames):
-if "Use CuBLAS" in runopts or "Use hipBLAS (ROCm)" in runopts:
-args.usecublas = ["normal","mmq"]
+if eligible_cuda and exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CUDA" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and any(CUDevicesNames):
+if "Use CUDA" in runopts or "Use hipBLAS (ROCm)" in runopts:
+args.usecuda = ["normal","mmq"]
 print(f"Auto Selected CUDA Backend (flag={cpusupport})\n")
 found_new_backend = True
 elif exitcounter < 100 and (1 in VKIsDGPU) and ("Use Vulkan" in runopts or "Use Vulkan (Old CPU)" in runopts):
@@ -1333,9 +1333,9 @@ def load_model(model_filename):
 inputs.model_filename = model_filename.encode("UTF-8")
 inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
 inputs.threads = args.threads
-inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
-inputs.use_mmq = (True if (args.usecublas and "nommq" not in args.usecublas) else False)
-inputs.use_rowsplit = (True if (args.usecublas and "rowsplit" in args.usecublas) else False)
+inputs.low_vram = (True if (args.usecuda and "lowvram" in args.usecuda) else False)
+inputs.use_mmq = (True if (args.usecuda and "nommq" not in args.usecuda) else False)
+inputs.use_rowsplit = (True if (args.usecuda and "rowsplit" in args.usecuda) else False)
 inputs.vulkan_info = "0".encode("UTF-8")
 inputs.blasthreads = args.blasthreads
 inputs.use_mmap = args.usemmap
@@ -4375,7 +4375,7 @@ def show_gui():
 if not any(runopts):
 exitcounter = 999
-exit_with_error(2,"KoboldCPP couldn't locate any backends to use (i.e Default, Vulkan, CLBlast, CuBLAS).\n\nTo use the program, please run the 'make' command from the directory.","No Backends Available!")
+exit_with_error(2,"KoboldCPP couldn't locate any backends to use (i.e Default, Vulkan, CLBlast, CUDA).\n\nTo use the program, please run the 'make' command from the directory.","No Backends Available!")
 # Vars - should be in scope to be used by multiple widgets
 gpulayers_var = ctk.StringVar(value="-1")
@@ -4731,9 +4731,9 @@ def show_gui():
 #autopick cublas if suitable, requires at least 3.5GB VRAM to auto pick
 #we do not want to autoselect hip/cublas if the user has already changed their desired backend!
-if eligible_cuda and exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CuBLAS" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and (any(CUDevicesNames) or any(CLDevicesNames)) and runmode_untouched:
-if "Use CuBLAS" in runopts:
-runopts_var.set("Use CuBLAS")
+if eligible_cuda and exitcounter < 100 and MaxMemory[0]>3500000000 and (("Use CUDA" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts) and (any(CUDevicesNames) or any(CLDevicesNames)) and runmode_untouched:
+if "Use CUDA" in runopts:
+runopts_var.set("Use CUDA")
 gpu_choice_var.set("1")
 print(f"Auto Selected CUDA Backend (flag={cpusupport})\n")
 found_new_backend = True
@@ -4798,7 +4798,7 @@ def show_gui():
 predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
 max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "")
 index = runopts_var.get()
-gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)")
+gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)")
 layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
 quick_layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
 if sys.platform=="darwin" and gpulayers_var.get()=="-1":
@@ -4885,7 +4885,7 @@ def show_gui():
 global runmode_untouched
 runmode_untouched = False
 index = runopts_var.get()
-if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
+if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)":
 quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
 gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
 gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
@@ -4899,7 +4899,7 @@ def show_gui():
 maingpu_entry.grid_remove()
 if gpu_choice_var.get()=="All":
 gpu_choice_var.set("1")
-elif index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
+elif index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)":
 gpu_selector_box.grid_remove()
 quick_gpu_selector_box.grid_remove()
 CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
@@ -4918,7 +4918,7 @@ def show_gui():
 maingpu_label.grid_remove()
 maingpu_entry.grid_remove()
-if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
+if index == "Use CUDA" or index == "Use hipBLAS (ROCm)":
 lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
 mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
 quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
@@ -4937,7 +4937,7 @@ def show_gui():
 tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
 tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
-if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
+if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)":
 gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
 gpu_layers_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
 quick_gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
@@ -4956,7 +4956,7 @@ def show_gui():
 changed_gpu_choice_var()
 # presets selector
-makelabel(quick_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
+makelabel(quick_tab, "Backend:", 1,0,"Select a backend to use.\nCUDA runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
 runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=190,variable=runopts_var, state="readonly")
 runoptbox.grid(row=1, column=1,padx=8, stick="nw")
@@ -4993,7 +4993,7 @@ def show_gui():
 makecheckbox(quick_tab, "Use FlashAttention", flashattention_var, 22, 1, tooltiptxt="Enable flash attention for GGUF models.")
 # context size
-makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, width=280, set=5,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
+makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
 # load model
 makefileentry(quick_tab, "GGUF Text Model:", "Select GGUF or GGML Model File", model_var, 40, 280, onchoosefile=on_picked_model_file,tooltiptxt="Select a GGUF or GGML model file on disk to be loaded.")
@@ -5004,7 +5004,7 @@ def show_gui():
 hardware_tab = tabcontent["Hardware"]
 # presets selector
-makelabel(hardware_tab, "Backend:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
+makelabel(hardware_tab, "Backend:", 1,0,"Select a backend to use.\nCUDA runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
 runoptbox = ctk.CTkComboBox(hardware_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
 runoptbox.grid(row=1, column=1,padx=8, stick="nw")
 runoptbox.set(runopts[0]) # Set to first available option
@@ -5067,7 +5067,7 @@ def show_gui():
 makecheckbox(tokens_tab, "Use Sliding Window Attention (SWA)", swa_var, 4,tooltiptxt="Allows Sliding Window Attention (SWA) KV Cache, which saves memory but cannot be used with context shifting.", command=toggleswa)
 # context size
-makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 18, width=280, set=5,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
+makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 18, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
 context_var.trace_add("write", changed_gpulayers_estimate)
 makelabelentry(tokens_tab, "Default Gen Amt:", defaultgenamt_var, row=20, padx=120, singleline=True, tooltip="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.")
@@ -5316,7 +5316,7 @@ def show_gui():
 gpuchoiceidx = 0
 args.usecpu = False
 args.usevulkan = None
-args.usecublas = None
+args.usecuda = None
 args.useclblast = None
 args.noavx2 = False
 if gpu_choice_var.get()!="All":
@@ -5328,17 +5328,17 @@ def show_gui():
 elif runopts_var.get() == "Use CLBlast (Older CPU)":
 args.noavx2 = True
 args.failsafe = True
-if runopts_var.get() == "Use CuBLAS" or runopts_var.get() == "Use hipBLAS (ROCm)":
+if runopts_var.get() == "Use CUDA" or runopts_var.get() == "Use hipBLAS (ROCm)":
 if gpu_choice_var.get()=="All":
-args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"]
+args.usecuda = ["lowvram"] if lowvram_var.get() == 1 else ["normal"]
 else:
-args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
+args.usecuda = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
 if mmq_var.get()==1:
-args.usecublas.append("mmq")
+args.usecuda.append("mmq")
 else:
-args.usecublas.append("nommq")
+args.usecuda.append("nommq")
 if rowsplit_var.get()==1:
-args.usecublas.append("rowsplit")
+args.usecuda.append("rowsplit")
 if runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)":
 if gpu_choice_var.get()=="All":
 args.usevulkan = []
@@ -5519,18 +5519,18 @@ def show_gui():
 if clblast_option is not None:
 runopts_var.set(clblast_option)
 gpu_choice_var.set(str(["0 0", "1 0", "0 1", "1 1"].index(str(dict["useclblast"][0]) + " " + str(dict["useclblast"][1])) + 1))
-elif "usecublas" in dict and dict["usecublas"]:
+elif "usecuda" in dict and dict["usecuda"]:
 if cublas_option is not None or hipblas_option is not None:
 if cublas_option:
 runopts_var.set(cublas_option)
 elif hipblas_option:
 runopts_var.set(hipblas_option)
-lowvram_var.set(1 if "lowvram" in dict["usecublas"] else 0)
-mmq_var.set(1 if "mmq" in dict["usecublas"] else 0)
-rowsplit_var.set(1 if "rowsplit" in dict["usecublas"] else 0)
+lowvram_var.set(1 if "lowvram" in dict["usecuda"] else 0)
+mmq_var.set(1 if "mmq" in dict["usecuda"] else 0)
+rowsplit_var.set(1 if "rowsplit" in dict["usecuda"] else 0)
 gpu_choice_var.set("All")
 for g in range(4):
-if str(g) in dict["usecublas"]:
+if str(g) in dict["usecuda"]:
 gpu_choice_var.set(str(g+1))
 break
 elif "usevulkan" in dict and dict['usevulkan'] is not None:
@@ -5997,6 +5997,8 @@ def convert_invalid_args(args):
 dict = args
 if isinstance(args, argparse.Namespace):
 dict = vars(args)
+if "usecuda" not in dict and "usecublas" in dict and dict["usecublas"]:
+dict["usecuda"] = dict["usecublas"]
 if "sdconfig" in dict and dict["sdconfig"] and len(dict["sdconfig"])>0:
 dict["sdmodel"] = dict["sdconfig"][0]
 if dict["sdconfig"] and len(dict["sdconfig"]) > 1:
@@ -6149,7 +6151,7 @@ def load_config_cli(filename):
 setattr(args, key, value)
 if args.istemplate:
 print("\nA .kcppt template was selected from CLI...")
-if (args.usecublas is None) and (args.usevulkan is None) and (args.useclblast is None):
+if (args.usecuda is None) and (args.usevulkan is None) and (args.useclblast is None):
 print("Automatically selecting your backend...")
 auto_set_backend_cli()
@@ -6166,7 +6168,7 @@ def convert_args_to_template(savdict):
 savdict["debugmode"] = 0
 savdict["ssl"] = None
 savdict["useclblast"] = None
-savdict["usecublas"] = None
+savdict["usecuda"] = None
 savdict["usevulkan"] = None
 savdict["usecpu"] = None
 savdict["tensor_split"] = None
@@ -6812,7 +6814,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
 global maxctx
 maxctx = args.contextsize
-args.defaultgenamt = max(128, min(args.defaultgenamt, 4096))
+args.defaultgenamt = max(128, min(args.defaultgenamt, 8192))
 args.defaultgenamt = min(args.defaultgenamt, maxctx / 2)
 if args.port_param!=defaultport:
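The clamp above first bounds the default generation amount to the 128..8192 range (the ceiling was previously 4096), then caps it at half the configured context. A quick worked check of that arithmetic with illustrative values:

```python
# Worked check of the defaultgenamt clamp (values are illustrative).
defaultgenamt, maxctx = 10000, 8192
defaultgenamt = max(128, min(defaultgenamt, 8192))  # -> 8192
defaultgenamt = min(defaultgenamt, maxctx / 2)      # -> 4096.0 (true division, as in the source)
print(defaultgenamt)
```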
@@ -6847,14 +6849,14 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
 print("MacOS detected: Auto GPU layers set to maximum")
 args.gpulayers = 200
 elif not shouldavoidgpu and args.model_param and os.path.exists(args.model_param):
-if (args.usecublas is None) and (args.usevulkan is None) and (args.useclblast is None):
+if (args.usecuda is None) and (args.usevulkan is None) and (args.useclblast is None):
 print("No GPU or CPU backend was selected. Trying to assign one for you automatically...")
 auto_set_backend_cli()
 if MaxMemory[0] == 0: #try to get gpu vram for cuda if not picked yet
 fetch_gpu_properties(False,True,True)
 pass
 if args.gpulayers==-1:
-if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecublas is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
+if MaxMemory[0] > 0 and (not args.usecpu) and ((args.usecuda is not None) or (args.usevulkan is not None) or (args.useclblast is not None) or sys.platform=="darwin"):
 extract_modelfile_params(args.model_param,args.sdmodel,args.whispermodel,args.mmproj,args.draftmodel,args.ttsmodel if args.ttsgpu else "",args.embeddingsmodel if args.embeddingsgpu else "")
 layeramt = autoset_gpu_layers(args.contextsize,args.sdquant,args.blasbatchsize,(args.quantkv if args.flashattention else 0))
 print(f"Auto Recommended GPU Layers: {layeramt}")
@@ -7279,7 +7281,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
 s_pp = float(benchmaxctx-benchlen)/t_pp
 s_gen = float(benchlen)/t_gen
 datetimestamp = datetime.now(timezone.utc)
-benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
+benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cuda_Args={args.usecuda} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
 print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
 print(f"Flags: {benchflagstr}")
 print(f"Timestamp: {datetimestamp}")
@@ -7347,11 +7349,11 @@ if __name__ == '__main__':
 parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
 compatgroup = parser.add_mutually_exclusive_group()
-compatgroup.add_argument("--usecublas", "--usehipblas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
+compatgroup.add_argument("--usecuda", "--usecublas", "--usehipblas", help="Use CUDA for GPU Acceleration. Requires CUDA. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
 compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify one or more GPU Device ID (e.g. --usevulkan 0), leave blank to autodetect.", metavar=('[Device IDs]'), nargs='*', type=int, default=None)
 compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
 compatgroup.add_argument("--usecpu", help="Do not use any GPU acceleration (CPU Only)", action='store_true')
-parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 4096).",metavar=('[256 to 262144]'), type=check_range(int,256,262144), default=4096)
+parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 8192).",metavar=('[256 to 262144]'), type=check_range(int,256,262144), default=8192)
 parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU. Set to -1 to try autodetect, set to 0 to disable GPU offload.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=-1)
 parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
@@ -7409,7 +7411,7 @@ if __name__ == '__main__':
 advparser.add_argument("--exporttemplate", help="Exports the current selected arguments as a .kcppt template file", metavar=('[filename]'), type=str, default="")
 advparser.add_argument("--nomodel", help="Allows you to launch the GUI alone, without selecting any model.", action='store_true')
 advparser.add_argument("--moeexperts", metavar=('[num of experts]'), help="How many experts to use for MoE models (default=follow gguf)", type=int, default=-1)
-advparser.add_argument("--defaultgenamt", help="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.", type=check_range(int,64,4096), default=512)
+advparser.add_argument("--defaultgenamt", help="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.", type=check_range(int,64,8192), default=512)
 advparser.add_argument("--nobostoken", help="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.", action='store_true')
 advparser.add_argument("--enableguidance", help="Enables the use of Classifier-Free-Guidance, which allows the use of negative prompts. Has performance and memory impact.", action='store_true')
 advparser.add_argument("--maxrequestsize", metavar=('[size in MB]'), help="Specify a max request payload size. Any requests to the server larger than this size will be dropped. Do not change if unsure.", type=int, default=32)