diff --git a/expose.h b/expose.h
index 52c860b4e..29e2a141e 100644
--- a/expose.h
+++ b/expose.h
@@ -62,6 +62,7 @@ struct load_model_inputs
     const int moe_experts = -1;
     const bool no_bos_token = false;
     const char * override_kv = nullptr;
+    const char * override_tensors = nullptr;
     const bool flash_attention = false;
     const float tensor_split[tensor_split_max] = {};
     const int quant_k = 0;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 27cd25b3a..5efc73193 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2172,6 +2172,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     std::vector<llama_model_kv_override> kvos; //ensure it keeps in scope until model is created
+    std::vector<llama_model_tensor_buft_override> tenos; //must stay in scope until the model is created
+    std::vector<std::string> temp_tensor_names; //keeps the pattern strings alive; nto.pattern points into these
 
     if(inputs.moe_experts>0)
     {
         printf("\nOverriding number of experts to %d\n",inputs.moe_experts);
@@ -2195,13 +2197,60 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         printf("\nAttempting to apply KV override: %s...\n",override_kv.c_str());
         bool kvo_ok = string_parse_kv_override(override_kv.c_str(),kvos);
-        LLAMA_LOG_INFO("\nKV override result: %s\n",(kvo_ok?"success":"failed"));
+        LLAMA_LOG_INFO("\nKV override parse: %s\n",(kvo_ok?"success":"failed"));
         fflush(stdout);
     }
 
     if(kvos.size()>0)
     {
+        kvos.emplace_back();
+        kvos.back().key[0] = 0; //empty key acts as the terminator llama.cpp expects
         model_params.kv_overrides = kvos.data();
     }
+    //handle override tensors
+    std::string tensoroverrides = inputs.override_tensors;
+    if(tensoroverrides!="" && ggml_backend_dev_count()>1)
+    {
+        printf("Handling Override Tensors for backends: ");
+        std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            auto * dev = ggml_backend_dev_get(i);
+            auto * buft = ggml_backend_dev_buffer_type(dev);
+            if (buft) {
+                std::string name = ggml_backend_buft_name(buft);
+                printf("%s ", name.c_str());
+                buft_list[name] = buft;
+            }
+        }
+        printf("\n\n");
+        auto toverrides = string_split(tensoroverrides, ',');
+        temp_tensor_names.reserve(toverrides.size()); //prevent reallocation from invalidating c_str() pointers stored below
+        for (const auto & override : toverrides) {
+            std::string::size_type pos = override.find('=');
+            if (pos == std::string::npos) {
+                printf("\nInvalid Override Tensor: %s\n",override.c_str());
+                continue;
+            }
+            std::string tensor_name = override.substr(0, pos);
+            std::string buffer_type = override.substr(pos + 1);
+
+            if (buft_list.find(buffer_type) == buft_list.end()) {
+                printf("\nUnknown Buffer Type: %s\n",buffer_type.c_str());
+                continue;
+            }
+            llama_model_tensor_buft_override nto;
+            temp_tensor_names.push_back(tensor_name);
+            nto.pattern = temp_tensor_names.back().c_str();
+            nto.buft = buft_list.at(buffer_type);
+            tenos.push_back(nto);
+            printf("Override Tensor: %s to %s\n",tensor_name.c_str(),buffer_type.c_str());
+        }
+    }
+    if(tenos.size()>0)
+    {
+        tenos.push_back({nullptr, nullptr}); //null sentinel terminates the override list
+        model_params.tensor_buft_overrides = tenos.data();
+    }
+
     llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
     if(overwriteRope)
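Note on semantics: each llama_model_tensor_buft_override.pattern built above is handed to llama.cpp, which treats it as a regex matched against tensor names when buffers are assigned, with the {nullptr, nullptr} sentinel terminating the list. A minimal sketch of that matching rule for illustration only; pick_buft is a hypothetical name, not the actual loader function:

    #include <regex>
    #include <string>

    // Illustration only: approximates how a buffer type is chosen for a
    // tensor given the sentinel-terminated override list built above.
    static ggml_backend_buffer_type_t pick_buft(
        const llama_model_tensor_buft_override * overrides,
        const std::string & tensor_name,
        ggml_backend_buffer_type_t fallback)
    {
        for (const auto * o = overrides; o && o->pattern; ++o) {
            if (std::regex_search(tensor_name, std::regex(o->pattern))) {
                return o->buft; //first matching pattern appears to win
            }
        }
        return fallback; //no override matched, keep the default placement
    }

Since the first match takes effect, more specific patterns should come earlier in the comma-separated list.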
diff --git a/koboldcpp.py b/koboldcpp.py
index 61ce280e8..ba5377a86 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -181,6 +181,7 @@ class load_model_inputs(ctypes.Structure):
                 ("moe_experts", ctypes.c_int),
                 ("no_bos_token", ctypes.c_bool),
                 ("override_kv", ctypes.c_char_p),
+                ("override_tensors", ctypes.c_char_p),
                 ("flash_attention", ctypes.c_bool),
                 ("tensor_split", ctypes.c_float * tensor_split_max),
                 ("quant_k", ctypes.c_int),
@@ -1214,6 +1215,7 @@ def load_model(model_filename):
    inputs.moe_experts = args.moeexperts
    inputs.no_bos_token = args.nobostoken
    inputs.override_kv = args.overridekv.encode("UTF-8") if args.overridekv else "".encode("UTF-8")
+    inputs.override_tensors = args.overridetensors.encode("UTF-8") if args.overridetensors else "".encode("UTF-8")
    inputs = set_backend_props(inputs)
    ret = handle.load_model(inputs)
    return ret
@@ -3868,6 +3870,7 @@ def show_gui():
    defaultgenamt_var = ctk.StringVar(value=str(512))
    nobostoken_var = ctk.IntVar(value=0)
    override_kv_var = ctk.StringVar(value="")
+    override_tensors_var = ctk.StringVar(value="")
 
    model_var = ctk.StringVar()
    lora_var = ctk.StringVar()
@@ -4393,8 +4396,9 @@ def show_gui():
    qkvslider,qkvlabel,qkvtitle = makeslider(tokens_tab, "Quantize KV Cache:", quantkv_text, quantkv_var, 0, 2, 30, set=0,tooltip="Enable quantization of KV cache.\nRequires FlashAttention for full effect, otherwise only K cache is quantized.")
    quantkv_var.trace("w", toggleflashattn)
    makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
-    makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=45, padx=100, singleline=True, tooltip="Override number of MoE experts.")
-    makelabelentry(tokens_tab, "Override KV:", override_kv_var, row=47, padx=100, singleline=True, width=150, tooltip="Advanced option to override model metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str")
+    makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=45, padx=120, singleline=True, tooltip="Override number of MoE experts.")
+    makelabelentry(tokens_tab, "Override KV:", override_kv_var, row=47, padx=120, singleline=True, width=150, tooltip="Advanced option to override model metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str")
+    makelabelentry(tokens_tab, "Override Tensors:", override_tensors_var, row=49, padx=120, singleline=True, width=150, tooltip="Advanced option to override tensor backend selection, same as in llama.cpp.")
 
    # Model Tab
    model_tab = tabcontent["Loaded Files"]
@@ -4667,6 +4671,7 @@ def show_gui():
        args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 512
        args.nobostoken = (nobostoken_var.get()==1)
        args.overridekv = None if override_kv_var.get() == "" else override_kv_var.get()
+        args.overridetensors = None if override_tensors_var.get() == "" else override_tensors_var.get()
        args.chatcompletionsadapter = None if chatcompletionsadapter_var.get() == "" else chatcompletionsadapter_var.get()
        try:
            if kcpp_exporting_template and isinstance(args.chatcompletionsadapter, str) and args.chatcompletionsadapter!="" and os.path.exists(args.chatcompletionsadapter):
@@ -4861,6 +4866,8 @@ def show_gui():
        nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0)
        if "overridekv" in dict and dict["overridekv"]:
            override_kv_var.set(dict["overridekv"])
+        if "overridetensors" in dict and dict["overridetensors"]:
+            override_tensors_var.set(dict["overridetensors"])
 
        if "blasbatchsize" in dict and dict["blasbatchsize"]:
            blas_size_var.set(blasbatchsize_values.index(str(dict["blasbatchsize"])))
@@ -6588,6 +6595,7 @@ if __name__ == '__main__':
    advparser.add_argument("--nobostoken", help="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.", action='store_true')
    advparser.add_argument("--maxrequestsize", metavar=('[size in MB]'), help="Specify a max request payload size. Any requests to the server larger than this size will be dropped. Do not change if unsure.", type=int, default=32)
    advparser.add_argument("--overridekv", metavar=('[name=type:value]'), help="Advanced option to override a metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str", default="")
+    advparser.add_argument("--overridetensors", metavar=('[tensor name pattern=buffer type]'), help="Advanced option to override tensor backend selection, same as in llama.cpp.", default="")
    compatgroup2 = parser.add_mutually_exclusive_group()
    compatgroup2.add_argument("--showgui", help="Always show the GUI instead of launching the model right away when loading settings from a .kcpps file.", action='store_true')
    compatgroup2.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true')
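Example invocation with the new flag (hypothetical model path and pattern; the pattern is a regex, the buffer type must match one of the names printed at load such as "CPU" or "CUDA0" depending on the compiled backends, and multiple overrides can be comma-separated):

    python koboldcpp.py --model model.gguf --gpulayers 99 --overridetensors "blk\.\d+\.ffn_(up|down|gate)_exps\.weight=CPU"

This keeps the MoE expert FFN weights in host memory while the remaining layers stay offloaded, mirroring llama.cpp's --override-tensor option.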