sse3 mode for noavx2 clblast, fixed metadata, added version command

This commit is contained in:
Concedo 2025-01-06 21:59:05 +08:00
parent 7b25b6171c
commit 58791612d2
5 changed files with 36 additions and 20 deletions

View file

@ -75,6 +75,7 @@ FASTCXXFLAGS = $(subst -O3,-Ofast,$(CXXFLAGS))
# these are used on windows, to build some libraries with extra old device compatibility
SIMPLECFLAGS =
SIMPLERCFLAGS =
FULLCFLAGS =
NONECFLAGS =
@ -91,6 +92,7 @@ CUBLAS_OBJS =
OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm.o common.o sampling.o
OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm_noavx2.o common.o sampling.o
OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm_noavx1.o common.o sampling.o
OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm_failsafe.o common.o sampling.o
# OS specific
@ -148,6 +150,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
CFLAGS +=
NONECFLAGS +=
SIMPLECFLAGS += -mavx -msse3
SIMPLERCFLAGS += -mavx
ifdef LLAMA_NOAVX2
FULLCFLAGS += -msse3 -mavx
else
@ -161,6 +164,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
CFLAGS +=
NONECFLAGS +=
SIMPLECFLAGS += -mavx -msse3
SIMPLERCFLAGS += -mavx
ifdef LLAMA_NOAVX2
FULLCFLAGS += -msse3 -mavx
else
@ -462,7 +466,7 @@ ggml_v4_clblast.o: ggml/src/ggml.c ggml/include/ggml.h
ggml_v4_cublas.o: ggml/src/ggml.c ggml/include/ggml.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
ggml_v4_clblast_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
$(CC) $(FASTCFLAGS) $(SIMPLERCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
ggml_v4_vulkan.o: ggml/src/ggml.c ggml/include/ggml.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@
ggml_v4_vulkan_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h
@ -478,19 +482,23 @@ ggml-cpu_v4_noavx2.o: ggml/src/ggml-cpu/ggml-cpu.c ggml/include/ggml-cpu.h
ggml-cpu_v4_clblast.o: ggml/src/ggml-cpu/ggml-cpu.c ggml/include/ggml-cpu.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
ggml-cpu_v4_clblast_noavx2.o: ggml/src/ggml-cpu/ggml-cpu.c ggml/include/ggml-cpu.h
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
$(CC) $(FASTCFLAGS) $(SIMPLERCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
#quants
ggml-quants.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
ggml-quants_noavx2.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
ggml-quants_noavx1.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(SIMPLERCFLAGS) -c $< -o $@
ggml-quants_failsafe.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
ggml-cpu-quants.o: ggml/src/ggml-cpu/ggml-cpu-quants.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
ggml-cpu-quants_noavx2.o: ggml/src/ggml-cpu/ggml-cpu-quants.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
ggml-cpu-quants_noavx1.o: ggml/src/ggml-cpu/ggml-cpu-quants.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(SIMPLERCFLAGS) -c $< -o $@
ggml-cpu-quants_failsafe.o: ggml/src/ggml-cpu/ggml-cpu-quants.c ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
@ -499,6 +507,8 @@ ggml-cpu-aarch64.o: ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp ggml/include/ggml.h g
$(CXX) $(CXXFLAGS) $(FULLCFLAGS) -c $< -o $@
ggml-cpu-aarch64_noavx2.o: ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-aarch64.h
$(CXX) $(CXXFLAGS) $(SIMPLECFLAGS) -c $< -o $@
ggml-cpu-aarch64_noavx1.o: ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-aarch64.h
$(CXX) $(CXXFLAGS) $(SIMPLERCFLAGS) -c $< -o $@
ggml-cpu-aarch64_failsafe.o: ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp ggml/include/ggml.h ggml/src/ggml-cpu/ggml-cpu-aarch64.h
$(CXX) $(CXXFLAGS) $(NONECFLAGS) -c $< -o $@
@ -507,6 +517,8 @@ sgemm.o: ggml/src/ggml-cpu/llamafile/sgemm.cpp ggml/src/ggml-cpu/llamafile/sgemm
$(CXX) $(CXXFLAGS) $(FULLCFLAGS) -c $< -o $@
sgemm_noavx2.o: ggml/src/ggml-cpu/llamafile/sgemm.cpp ggml/src/ggml-cpu/llamafile/sgemm.h ggml/include/ggml.h
$(CXX) $(CXXFLAGS) $(SIMPLECFLAGS) -c $< -o $@
sgemm_noavx1.o: ggml/src/ggml-cpu/llamafile/sgemm.cpp ggml/src/ggml-cpu/llamafile/sgemm.h ggml/include/ggml.h
$(CXX) $(CXXFLAGS) $(SIMPLERCFLAGS) -c $< -o $@
sgemm_failsafe.o: ggml/src/ggml-cpu/llamafile/sgemm.cpp ggml/src/ggml-cpu/llamafile/sgemm.h ggml/include/ggml.h
$(CXX) $(CXXFLAGS) $(NONECFLAGS) -c $< -o $@
@ -562,7 +574,7 @@ ggml_v3_clblast.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
ggml_v3_cublas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
ggml_v3_clblast_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
$(CC) $(FASTCFLAGS) $(SIMPLERCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
#version 2 libs
ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@ -576,7 +588,7 @@ ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
$(CC) $(FASTCFLAGS) $(SIMPLERCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
#extreme old version compat
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
@ -692,7 +704,7 @@ ifdef CLBLAST_BUILD
koboldcpp_clblast: ggml_v4_clblast.o ggml-cpu_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
$(CLBLAST_BUILD)
ifdef NOAVX2_BUILD
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLE) $(OBJS)
koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml-cpu_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_SIMPLER) $(OBJS)
$(CLBLAST_BUILD)
else
koboldcpp_clblast_noavx2:

View file

@ -99,7 +99,7 @@ class model_backend(InferenceModel):
"extra_classes": "",
'children': [{'text': 'Use No BLAS', 'value': 0}, {'text': 'Use CuBLAS', 'value': 1},
{'text': 'Use CLBLast GPU #1', 'value': 2},{'text': 'Use CLBLast GPU #2', 'value': 3},{'text': 'Use CLBLast GPU #3', 'value': 4}
,{'text': 'NoAVX2 Mode (Old CPU)', 'value': 5},{'text': 'Failsafe Mode (Old CPU)', 'value': 6},{'text': 'Use Vulkan GPU #1', 'value': 7},{'text': 'Use Vulkan GPU #2', 'value': 8}],
,{'text': 'NoAVX2 Mode (Old CPU)', 'value': 5},{'text': 'Failsafe Mode (Older CPU)', 'value': 6},{'text': 'Use Vulkan GPU #1', 'value': 7},{'text': 'Use Vulkan GPU #2', 'value': 8}],
})
requested_parameters.append({
"uitype": "text",

View file

@ -359,9 +359,9 @@ lib_option_pairs = [
(lib_hipblas, "Use hipBLAS (ROCm)"),
(lib_vulkan, "Use Vulkan"),
(lib_noavx2, "Use CPU (Old CPU)"),
(lib_clblast_noavx2, "Use CLBlast (Old CPU)"),
(lib_vulkan_noavx2, "Use Vulkan (Old CPU)"),
(lib_failsafe, "Failsafe Mode (Old CPU)")]
(lib_clblast_noavx2, "Use CLBlast (Older CPU)"),
(lib_failsafe, "Failsafe Mode (Older CPU)")]
default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)]
@ -3124,7 +3124,7 @@ def show_gui():
nl = '\n'
tooltxt = "Number of backends you have built and available." + (f"\n\nMissing Backends: \n\n{nl.join(antirunopts)}" if len(runopts) < 8 else "")
num_backends_built = makelabel(parent, str(len(runopts)) + "/8", 5, 2,tooltxt)
num_backends_built.grid(row=1, column=1, padx=195, pady=0)
num_backends_built.grid(row=1, column=1, padx=205, pady=0)
num_backends_built.configure(text_color="#00ff00")
def gui_changed_modelfile(*args):
@ -3143,7 +3143,7 @@ def show_gui():
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]))
max_gpu_layers = (f"/{modelfile_extracted_meta[0][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[0] and modelfile_extracted_meta[0][0]!=0) else "")
index = runopts_var.get()
gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)")
gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)")
layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
quick_layercounter_label.grid(row=6, column=1, padx=75, sticky="W")
if sys.platform=="darwin" and gpulayers_var.get()=="-1":
@ -3174,7 +3174,7 @@ def show_gui():
if v == "Use Vulkan" or v == "Use Vulkan (Old CPU)":
quick_gpuname_label.configure(text=VKDevicesNames[s])
gpuname_label.configure(text=VKDevicesNames[s])
elif v == "Use CLBlast" or v == "Use CLBlast (Old CPU)":
elif v == "Use CLBlast" or v == "Use CLBlast (Older CPU)":
quick_gpuname_label.configure(text=CLDevicesNames[s])
gpuname_label.configure(text=CLDevicesNames[s])
else:
@ -3231,12 +3231,12 @@ def show_gui():
global runmode_untouched
runmode_untouched = False
index = runopts_var.get()
if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
quick_gpu_selector_label.grid(row=3, column=0, padx = 8, pady=1, stick="nw")
if index == "Use CLBlast" or index == "Use CLBlast (Old CPU)":
if index == "Use CLBlast" or index == "Use CLBlast (Older CPU)":
gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
CUDA_gpu_selector_box.grid_remove()
@ -3280,7 +3280,7 @@ def show_gui():
else:
quick_use_flashattn.grid(row=22, column=1, padx=8, pady=1, stick="nw")
if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
if index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Older CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
gpu_layers_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
quick_gpu_layers_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
@ -3302,7 +3302,7 @@ def show_gui():
# presets selector
makelabel(quick_tab, "Presets:", 1,0,"Select a backend to use.\nCuBLAS runs on Nvidia GPUs, and is much faster.\nVulkan and CLBlast works on all GPUs but is somewhat slower.\nOtherwise, runs on CPU only.\nNoAVX2 and Failsafe modes support older PCs.")
runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=180,variable=runopts_var, state="readonly")
runoptbox = ctk.CTkComboBox(quick_tab, values=runopts, width=190,variable=runopts_var, state="readonly")
runoptbox.grid(row=1, column=1,padx=8, stick="nw")
runoptbox.set(runopts[0]) # Set to first available option
@ -3636,9 +3636,9 @@ def show_gui():
args.noavx2 = False
if gpu_choice_var.get()!="All":
gpuchoiceidx = int(gpu_choice_var.get())-1
if runopts_var.get() == "Use CLBlast" or runopts_var.get() == "Use CLBlast (Old CPU)":
if runopts_var.get() == "Use CLBlast" or runopts_var.get() == "Use CLBlast (Older CPU)":
args.useclblast = [[0,0], [1,0], [0,1], [1,1]][gpuchoiceidx]
if runopts_var.get() == "Use CLBlast (Old CPU)":
if runopts_var.get() == "Use CLBlast (Older CPU)":
args.noavx2 = True
if runopts_var.get() == "Use CuBLAS" or runopts_var.get() == "Use hipBLAS (ROCm)":
if gpu_choice_var.get()=="All":
@ -3664,7 +3664,7 @@ def show_gui():
args.usecpu = True
if runopts_var.get()=="Use CPU (Old CPU)":
args.noavx2 = True
if runopts_var.get()=="Failsafe Mode (Old CPU)":
if runopts_var.get()=="Failsafe Mode (Older CPU)":
args.noavx2 = True
args.usecpu = True
args.nommap = True
@ -4517,6 +4517,9 @@ def main(launch_args,start_server=True):
global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
args = launch_args
if (args.version) and len(sys.argv) <= 2:
print(f"{KcppVersion}") # just print version and exit
return
if (args.model_param or args.model) and args.prompt and not args.benchmark and not (args.debugmode >= 1):
suppress_stdout()
@ -5164,6 +5167,7 @@ if __name__ == '__main__':
#more advanced params
advparser = parser.add_argument_group('Advanced Commands')
advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512)
advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)

View file

@ -15,7 +15,7 @@ VSVersionInfo(
StringTable(
u'040904b0',
[
StringStruct(u'CompanyName', u'Your Company Name'),
StringStruct(u'CompanyName', u'KoboldCpp'),
StringStruct(u'FileDescription', u'KoboldCpp'),
StringStruct(u'InternalName', u'KoboldCpp'),
StringStruct(u'LegalCopyright', u'AGPLv3'),

View file

@ -15,7 +15,7 @@ VSVersionInfo(
StringTable(
u'040904b0',
[
StringStruct(u'CompanyName', u'Your Company Name'),
StringStruct(u'CompanyName', u'KoboldCpp'),
StringStruct(u'FileDescription', u'KoboldCpp'),
StringStruct(u'InternalName', u'KoboldCpp'),
StringStruct(u'LegalCopyright', u'AGPLv3'),