diff --git a/expose.cpp b/expose.cpp
index 703b15872..a1b1612fd 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -36,16 +36,8 @@ extern "C"
         mmproj_filename = inputs.mmproj_filename;
         draftmodel_filename = inputs.draftmodel_filename;
 
-        int forceversion = inputs.forceversion;
-
         file_format = check_file_format(model.c_str(),&file_format_meta);
-        if(forceversion!=0)
-        {
-            printf("\nWARNING: FILE FORMAT FORCED TO VER %d\nIf incorrect, loading may fail or crash.\n",forceversion);
-            file_format = (FileFormat)forceversion;
-        }
-
         //first digit is whether configured, second is platform, third is devices
         int cl_parseinfo = inputs.clblast_info;
diff --git a/expose.h b/expose.h
index 6d616639a..e6a9cc4dd 100644
--- a/expose.h
+++ b/expose.h
@@ -57,7 +57,7 @@ struct load_model_inputs
     const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int batchsize = 512;
-    const int forceversion = 0;
+    const bool autofit = false;
     const int gpulayers = 0;
     const float rope_freq_scale = 1.0f;
     const float rope_freq_base = 10000.0f;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index f08447719..5dc4062c2 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -391,6 +391,38 @@ bool allExtendedUnicode(const std::string& str) {
     return true;
 }
 
+void print_fitted_params(const llama_model_params & mparams, const llama_context_params & cparams)
+{
+    std::cout << "-c " << cparams.n_ctx;
+    std::cout << " -ngl " << mparams.n_gpu_layers;
+    size_t nd = llama_max_devices();
+    while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
+        nd--;
+    }
+    if (nd > 1) {
+        for (size_t id = 0; id < nd; id++) {
+            if (id == 0) {
+                std::cout << " -ts ";
+            }
+            if (id > 0) {
+                std::cout << ",";
+            }
+            std::cout << mparams.tensor_split[id];
+        }
+    }
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
+        if (itbo == 0) {
+            std::cout << " -ot ";
+        }
+        if (itbo > 0) {
+            std::cout << ",";
+        }
+        std::cout << mparams.tensor_buft_overrides[itbo].pattern << "=" << ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft);
+    }
+    std::cout << "\n";
+}
+
 // Find tokens that completely contain `str`, either as a single token, or as a sequence of tokens.
 // It's important to use a hash map for head tokens because some models have many of them.
 // For example, the Llama 3 tokenizer has 6570 tokens containing the period ('.') character.
@@ -2297,8 +2329,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     std::vector<llama_model_kv_override> kvos; //ensure it keeps in scope until model is created
     std::vector<llama_model_tensor_buft_override> tenos; //ensure it keeps in scope until model is created
     std::vector<std::string> temp_tensor_names; //store temp tensor names to have mem references.
-    temp_tensor_names.reserve(32); //very important, prevents vector from reallocating
-    tenos.reserve(32);
+    temp_tensor_names.reserve(llama_max_tensor_buft_overrides()); //very important, prevents vector from reallocating
+    tenos.reserve(llama_max_tensor_buft_overrides());
     if(inputs.moe_experts>0)
     {
         printf("\nOverriding number of experts to %d\n",inputs.moe_experts);
@@ -2401,6 +2433,25 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         model_params.tensor_buft_overrides = tenos.data();
     }
 
+    //apply overrides from autofit
+    float tensor_split_temp[128] = {0}; //temp buffer for autofit
+    if(inputs.autofit)
+    {
+        common_params temp_params;
+        printf("\nAttempting to use llama.cpp's automatic fitting code. This will override all your layer configs, may or may not work!\n");
+        //zero out any customizations made
+        tenos.clear();
+        tenos.push_back({nullptr, nullptr});
+        model_params.tensor_buft_overrides = tenos.data();
+        model_params.tensor_split = tensor_split_temp;
+        model_params.n_gpu_layers = 999; //must be this value to be considered default
+        llama_params_fit(kcpp_data->model_filename.c_str(), &model_params, &llama_ctx_params,
+            tensor_split_temp, tenos.data(), 1024*1024*1024, kcpp_data->n_ctx,
+            GGML_LOG_LEVEL_DEBUG);
+        printf("Autofit Result: ");
+        print_fitted_params(model_params,llama_ctx_params);
+    }
+
     llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
     if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_IMROPE)
     {
@@ -2452,6 +2503,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_ctx_params.swa_full = kcpp_data->swa_full;
         llama_ctx_params.type_k = (inputs.quant_k>1?GGML_TYPE_Q4_0:(inputs.quant_k==1?GGML_TYPE_Q8_0:GGML_TYPE_F16));
         llama_ctx_params.type_v = (inputs.quant_v>1?GGML_TYPE_Q4_0:(inputs.quant_v==1?GGML_TYPE_Q8_0:GGML_TYPE_F16));
+
         llama_ctx_v4 = llama_init_from_model(llamamodel, llama_ctx_params);
         if(load_guidance)
         {
diff --git a/koboldcpp.py b/koboldcpp.py
index b9db0e197..7c5727dba 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -199,7 +199,7 @@ class load_model_inputs(ctypes.Structure):
                 ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("batchsize", ctypes.c_int),
-                ("forceversion", ctypes.c_int),
+                ("autofit", ctypes.c_bool),
                 ("gpulayers", ctypes.c_int),
                 ("rope_freq_scale", ctypes.c_float),
                 ("rope_freq_base", ctypes.c_float),
@@ -1489,7 +1489,7 @@ def load_model(model_filename):
     else:
         inputs.quant_k = inputs.quant_v = 0
     inputs.batchsize = args.batchsize
-    inputs.forceversion = args.forceversion
+    inputs.autofit = args.autofit
     inputs.gpulayers = args.gpulayers
     if args.overridenativecontext and args.overridenativecontext>0:
         inputs.overridenativecontext = args.overridenativecontext
@@ -5166,7 +5166,7 @@ def show_gui():
     quantkv_var = ctk.IntVar(value=0)
     blas_threads_var = ctk.StringVar()
     blas_size_var = ctk.IntVar()
-    version_var = ctk.StringVar(value="0")
+    autofit_var = ctk.IntVar()
     tensor_split_str_vars = ctk.StringVar(value="")
     rowsplit_var = ctk.IntVar()
     maingpu_var = ctk.StringVar(value="")
@@ -5861,8 +5861,7 @@ def show_gui():
     makeslider(hardware_tab, "Batch Size:", batchsize_text, blas_size_var, 0, len(batchsize_values)-1, 16,width=200, set=6,tooltip="How many tokens to process at once per batch.\nLarger values use more memory.")
     blas_size_var.trace_add("write", changed_gpulayers_estimate)
 
-    # force version
-    makelabelentry(hardware_tab, "Force Version:" , version_var, 100, 50,padx=160,singleline=True,tooltip="If the autodetected version is wrong, you can change it here.\nLeave as 0 for default.")
+    makecheckbox(hardware_tab, "AutoFit (llama.cpp mode)", autofit_var, 100,0, tooltiptxt="Automatically attempt to fit the model in the best possible way. Overrides everything else. Not recommended for multi model setups. Experimental.")
 
     ctk.CTkButton(hardware_tab , text = "Run Benchmark", command = guibench ).grid(row=110,column=0, stick="nw", padx= 8, pady=2)
@@ -6243,7 +6242,7 @@ def show_gui():
         args.maingpu = -1 if maingpu_var.get()=="" else int(maingpu_var.get())
         args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
         args.batchsize = int(batchsize_values[int(blas_size_var.get())])
-        args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
+        args.autofit = autofit_var.get() == 1
         args.contextsize = int(contextsize_text[context_var.get()])
         if customrope_var.get()==1:
             if manualrope_var.get()==1:
@@ -6515,7 +6514,7 @@ def show_gui():
         if "batchsize" in dict and dict["batchsize"]:
             blas_size_var.set(batchsize_values.index(str(dict["batchsize"])))
 
-        version_var.set(str(dict["forceversion"]) if ("forceversion" in dict and dict["forceversion"]) else "0")
+        autofit_var.set(1 if "autofit" in dict and dict["autofit"] else 0)
 
         model_var.set(dict["model_param"] if ("model_param" in dict and dict["model_param"]) else "")
         lora_var.set("")
@@ -8355,6 +8354,7 @@ if __name__ == '__main__':
    parser.add_argument("--contextsize","--ctx-size", "-c", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 8192).",metavar=('[256 to 262144]'), type=check_range(int,256,262144), default=8192)
    parser.add_argument("--gpulayers","--gpu-layers","--n-gpu-layers","-ngl", help="Set number of layers to offload to GPU when using GPU. Requires GPU. Set to -1 to try autodetect, set to 0 to disable GPU offload.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=-1)
    parser.add_argument("--tensor_split","--tensorsplit","--tensor-split","-ts", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
+    parser.add_argument("--autofit","--fit","-fit", help="Automatically attempt to fit the model in the best possible way. Overrides everything else. Experimental.", action='store_true')
 
    #more advanced params
    advparser = parser.add_argument_group('Advanced Commands')
@@ -8409,7 +8409,6 @@ if __name__ == '__main__':
    advparser.add_argument("--flashattention","--flash-attn","-fa", help="Enables flash attention.", action='store_true')
    advparser.add_argument("--lowvram","-nkvo","--no-kv-offload", help="If supported by the backend, do not offload KV to GPU (lowvram mode). Not recommended, will be slow.", action='store_true')
    advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4. Requires Flash Attention for full effect, otherwise only K cache is quantized.",metavar=('[quantization level 0/1/2]'), type=int, choices=[0,1,2], default=0)
-    advparser.add_argument("--forceversion", help="If the model file format detection fails (e.g. rogue modified model) you can set this to override the detected format (enter desired version, e.g. 401 for GPTNeoX-Type2).",metavar=('[version]'), type=int, default=0)
    advparser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently. Outdated. Not recommended.", action='store_true')
    advparser.add_argument("--unpack", help="Extracts the file contents of the KoboldCpp binary into a target directory.", metavar=('destination'), type=str, default="")
    advparser.add_argument("--exportconfig", help="Exports the current selected arguments as a .kcpps settings file", metavar=('[filename]'), type=str, default="")
@@ -8485,6 +8484,7 @@ if __name__ == '__main__':
    compatgroup.add_argument("--noblas", help=argparse.SUPPRESS, action='store_true')
    compatgroup3.add_argument("--nommap","--no-mmap", help=argparse.SUPPRESS, action='store_true')
    deprecatedgroup.add_argument("--sdnotile", help=argparse.SUPPRESS, action='store_true') # legacy option, see sdtiledvae
+    deprecatedgroup.add_argument("--forceversion", help=argparse.SUPPRESS, action='store_true') #no longer used
 
    debuggroup = parser.add_argument_group('Debug Commands')
    debuggroup.add_argument("--testmemory", help=argparse.SUPPRESS, action='store_true')