mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
kv override option (+1 squashed commits)
Squashed commits: [e615fc01] kv override option
This commit is contained in:
parent
cbdee99354
commit
c67510718e
3 changed files with 21 additions and 1 deletion
1
expose.h
1
expose.h
|
@ -61,6 +61,7 @@ struct load_model_inputs
|
||||||
const float rope_freq_base = 10000.0f;
|
const float rope_freq_base = 10000.0f;
|
||||||
const int moe_experts = -1;
|
const int moe_experts = -1;
|
||||||
const bool no_bos_token = false;
|
const bool no_bos_token = false;
|
||||||
|
const char * override_kv = nullptr;
|
||||||
const bool flash_attention = false;
|
const bool flash_attention = false;
|
||||||
const float tensor_split[tensor_split_max] = {};
|
const float tensor_split[tensor_split_max] = {};
|
||||||
const int quant_k = 0;
|
const int quant_k = 0;
|
||||||
|
|
|
@ -2189,6 +2189,17 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||||
kvo.val_i64 = inputs.moe_experts;
|
kvo.val_i64 = inputs.moe_experts;
|
||||||
kvos.push_back(kvo);
|
kvos.push_back(kvo);
|
||||||
|
}
|
||||||
|
std::string override_kv = inputs.override_kv;
|
||||||
|
if(override_kv != "" && file_format==FileFormat::GGUF_GENERIC)
|
||||||
|
{
|
||||||
|
printf("\nAttempting to apply KV override: %s...\n",override_kv.c_str());
|
||||||
|
bool kvo_ok = string_parse_kv_override(override_kv.c_str(),kvos);
|
||||||
|
LLAMA_LOG_INFO("\nKV override result: %s\n",(kvo_ok?"success":"failed"));
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
if(kvos.size()>0)
|
||||||
|
{
|
||||||
model_params.kv_overrides = kvos.data();
|
model_params.kv_overrides = kvos.data();
|
||||||
}
|
}
|
||||||
llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
|
llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
|
||||||
|
|
10
koboldcpp.py
10
koboldcpp.py
|
@ -180,6 +180,7 @@ class load_model_inputs(ctypes.Structure):
|
||||||
("rope_freq_base", ctypes.c_float),
|
("rope_freq_base", ctypes.c_float),
|
||||||
("moe_experts", ctypes.c_int),
|
("moe_experts", ctypes.c_int),
|
||||||
("no_bos_token", ctypes.c_bool),
|
("no_bos_token", ctypes.c_bool),
|
||||||
|
("override_kv", ctypes.c_char_p),
|
||||||
("flash_attention", ctypes.c_bool),
|
("flash_attention", ctypes.c_bool),
|
||||||
("tensor_split", ctypes.c_float * tensor_split_max),
|
("tensor_split", ctypes.c_float * tensor_split_max),
|
||||||
("quant_k", ctypes.c_int),
|
("quant_k", ctypes.c_int),
|
||||||
|
@ -1128,7 +1129,7 @@ def fetch_gpu_properties(testCL,testCU,testVK):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
if MaxMemory[0]>0:
|
if MaxMemory[0]>0:
|
||||||
print(f"Auto Detected Free GPU Memory: {int(MaxMemory[0]/1024/1024)} MB (Set GPU layers manually if incorrect)")
|
print(f"Detected Free GPU Memory: {int(MaxMemory[0]/1024/1024)} MB (Set GPU layers manually if incorrect)")
|
||||||
else:
|
else:
|
||||||
print("Unable to determine GPU Memory")
|
print("Unable to determine GPU Memory")
|
||||||
return
|
return
|
||||||
|
@ -1212,6 +1213,7 @@ def load_model(model_filename):
|
||||||
|
|
||||||
inputs.moe_experts = args.moeexperts
|
inputs.moe_experts = args.moeexperts
|
||||||
inputs.no_bos_token = args.nobostoken
|
inputs.no_bos_token = args.nobostoken
|
||||||
|
inputs.override_kv = args.overridekv.encode("UTF-8") if args.overridekv else "".encode("UTF-8")
|
||||||
inputs = set_backend_props(inputs)
|
inputs = set_backend_props(inputs)
|
||||||
ret = handle.load_model(inputs)
|
ret = handle.load_model(inputs)
|
||||||
return ret
|
return ret
|
||||||
|
@ -3788,6 +3790,7 @@ def show_gui():
|
||||||
moeexperts_var = ctk.StringVar(value=str(-1))
|
moeexperts_var = ctk.StringVar(value=str(-1))
|
||||||
defaultgenamt_var = ctk.StringVar(value=str(512))
|
defaultgenamt_var = ctk.StringVar(value=str(512))
|
||||||
nobostoken_var = ctk.IntVar(value=0)
|
nobostoken_var = ctk.IntVar(value=0)
|
||||||
|
override_kv_var = ctk.StringVar(value="")
|
||||||
|
|
||||||
model_var = ctk.StringVar()
|
model_var = ctk.StringVar()
|
||||||
lora_var = ctk.StringVar()
|
lora_var = ctk.StringVar()
|
||||||
|
@ -4314,6 +4317,7 @@ def show_gui():
|
||||||
quantkv_var.trace("w", toggleflashattn)
|
quantkv_var.trace("w", toggleflashattn)
|
||||||
makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
|
makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
|
||||||
makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=45, padx=100, singleline=True, tooltip="Override number of MoE experts.")
|
makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=45, padx=100, singleline=True, tooltip="Override number of MoE experts.")
|
||||||
|
makelabelentry(tokens_tab, "Override KV:", override_kv_var, row=47, padx=100, singleline=True, width=150, tooltip="Advanced option to override model metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str")
|
||||||
|
|
||||||
# Model Tab
|
# Model Tab
|
||||||
model_tab = tabcontent["Loaded Files"]
|
model_tab = tabcontent["Loaded Files"]
|
||||||
|
@ -4585,6 +4589,7 @@ def show_gui():
|
||||||
args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
|
args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
|
||||||
args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 512
|
args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 512
|
||||||
args.nobostoken = (nobostoken_var.get()==1)
|
args.nobostoken = (nobostoken_var.get()==1)
|
||||||
|
args.overridekv = None if override_kv_var.get() == "" else override_kv_var.get()
|
||||||
args.chatcompletionsadapter = None if chatcompletionsadapter_var.get() == "" else chatcompletionsadapter_var.get()
|
args.chatcompletionsadapter = None if chatcompletionsadapter_var.get() == "" else chatcompletionsadapter_var.get()
|
||||||
try:
|
try:
|
||||||
if kcpp_exporting_template and isinstance(args.chatcompletionsadapter, str) and args.chatcompletionsadapter!="" and os.path.exists(args.chatcompletionsadapter):
|
if kcpp_exporting_template and isinstance(args.chatcompletionsadapter, str) and args.chatcompletionsadapter!="" and os.path.exists(args.chatcompletionsadapter):
|
||||||
|
@ -4777,6 +4782,8 @@ def show_gui():
|
||||||
if "defaultgenamt" in dict and dict["defaultgenamt"]:
|
if "defaultgenamt" in dict and dict["defaultgenamt"]:
|
||||||
defaultgenamt_var.set(dict["defaultgenamt"])
|
defaultgenamt_var.set(dict["defaultgenamt"])
|
||||||
nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0)
|
nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0)
|
||||||
|
if "overridekv" in dict and dict["overridekv"]:
|
||||||
|
override_kv_var.set(dict["overridekv"])
|
||||||
|
|
||||||
if "blasbatchsize" in dict and dict["blasbatchsize"]:
|
if "blasbatchsize" in dict and dict["blasbatchsize"]:
|
||||||
blas_size_var.set(blasbatchsize_values.index(str(dict["blasbatchsize"])))
|
blas_size_var.set(blasbatchsize_values.index(str(dict["blasbatchsize"])))
|
||||||
|
@ -6496,6 +6503,7 @@ if __name__ == '__main__':
|
||||||
advparser.add_argument("--defaultgenamt", help="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.", type=check_range(int,128,2048), default=512)
|
advparser.add_argument("--defaultgenamt", help="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.", type=check_range(int,128,2048), default=512)
|
||||||
advparser.add_argument("--nobostoken", help="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.", action='store_true')
|
advparser.add_argument("--nobostoken", help="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.", action='store_true')
|
||||||
advparser.add_argument("--maxrequestsize", metavar=('[size in MB]'), help="Specify a max request payload size. Any requests to the server larger than this size will be dropped. Do not change if unsure.", type=int, default=32)
|
advparser.add_argument("--maxrequestsize", metavar=('[size in MB]'), help="Specify a max request payload size. Any requests to the server larger than this size will be dropped. Do not change if unsure.", type=int, default=32)
|
||||||
|
advparser.add_argument("--overridekv", metavar=('[name=type:value]'), help="Advanced option to override a metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str", default="")
|
||||||
compatgroup2 = parser.add_mutually_exclusive_group()
|
compatgroup2 = parser.add_mutually_exclusive_group()
|
||||||
compatgroup2.add_argument("--showgui", help="Always show the GUI instead of launching the model right away when loading settings from a .kcpps file.", action='store_true')
|
compatgroup2.add_argument("--showgui", help="Always show the GUI instead of launching the model right away when loading settings from a .kcpps file.", action='store_true')
|
||||||
compatgroup2.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true')
|
compatgroup2.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue