Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)

Commit d775a419b2 (parent a441c27cb5)
updated lite with chat inject, added layer detect, added more console logging

4 changed files with 79 additions and 13 deletions
@@ -498,7 +498,7 @@ void sample_top_a(llama_token_data_array * candidates, float a, size_t min_keep)
 }

 void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float penalty_base, int allowed_length, const std::unordered_multimap<gpt_vocab::id, std::vector<gpt_vocab::id>>& restart_sequences, llama_token_data_array * candidates) {
-    if (penalty_multiplier == 0.0f || penalty_base == 0.0f) {
+    if (penalty_multiplier <= 0.0f || penalty_base <= 0.0f) {
         return;
     }
     if (penalty_range <= 0) {
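The tightened guard treats negative multiplier or base values the same as zero, so a malformed client value disables DRY instead of applying a negative penalty. A minimal sketch of the equivalent gating (Python for brevity; the names mirror the C++ parameters and the function itself is hypothetical):

def dry_is_active(penalty_multiplier, penalty_base, penalty_range):
    # Mirrors the C++ guard: a non-positive multiplier or base disables DRY,
    # and a non-positive range leaves no window of tokens to penalize.
    if penalty_multiplier <= 0.0 or penalty_base <= 0.0:
        return False
    return penalty_range > 0

assert not dry_is_active(-1.0, 1.75, 128)  # negative multiplier now disables DRY
assert dry_is_active(0.8, 1.75, 128)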
@@ -1352,7 +1352,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
     }
     ggml_cuda_set_mul_mat_q(inputs.use_mmq);
-    if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 && kcpp_params->flash_attn)
+    if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 && !kcpp_params->flash_attn)
     {
         printf("CUBLAS: Warning, you are running Qwen2 without Flash Attention and may observe incoherent output.\n");
     }
@@ -2837,7 +2837,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
    float ts2 = (1000.0/pt2);
    float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
-   printf("\nCtxLimit: %d/%d, Process:%.2fs (%.1fms/T = %.2fT/s), Generate:%.2fs (%.1fms/T = %.2fT/s), Total:%.2fs (%.2fT/s)",(int)current_context_tokens.size(),(int)nctx, time1, pt1, ts1, time2, pt2, ts2, (time1 + time2), tokens_per_second);
+   printf("\nCtxLimit:%d/%d, Amt:%d/%d, Process:%.2fs (%.1fms/T = %.2fT/s), Generate:%.2fs (%.1fms/T = %.2fT/s), Total:%.2fs (%.2fT/s)",(int)current_context_tokens.size(),(int)nctx, realnpredict, kcpp_params->n_predict, time1, pt1, ts1, time2, pt2, ts2, (time1 + time2), tokens_per_second);
    fflush(stdout);
    output.status = 1;
    output.stopreason = last_stop_reason;
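The expanded console line derives per-token latency and throughput from the two timing phases. A worked sketch of the same arithmetic (Python; the example durations and token count are illustrative):

time1, time2 = 1.50, 4.20   # process / generate durations in seconds (illustrative)
realnpredict = 120          # tokens actually generated (illustrative)

pt2 = time2 * 1000.0 / (1 if realnpredict == 0 else realnpredict)  # ms per generated token
ts2 = 1000.0 / pt2                                                  # generated tokens per second
tokens_per_second = 0 if realnpredict == 0 else realnpredict / (time1 + time2)  # overall T/s

print(f"Generate: {pt2:.1f}ms/T = {ts2:.2f}T/s, Total: {tokens_per_second:.2f}T/s")
# Generate: 35.0ms/T = 28.57T/s, Total: 21.05T/s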
@@ -225,6 +225,28 @@
                 "3105": 3.2
             },
         },
+        "dry_multiplier": {
+            "description": "KoboldCpp ONLY. DRY multiplier value, 0 to disable.",
+            "exclusiveMinimum": 0,
+            "type": "number"
+        },
+        "dry_base": {
+            "description": "KoboldCpp ONLY. DRY base value.",
+            "exclusiveMinimum": 0,
+            "type": "number"
+        },
+        "dry_allowed_length": {
+            "description": "KoboldCpp ONLY. DRY allowed length value.",
+            "exclusiveMinimum": 0,
+            "type": "number"
+        },
+        "dry_sequence_breakers": {
+            "description": "An array of string sequence breakers for DRY.",
+            "items": {
+                "type": "string"
+            },
+            "type": "array"
+        },
     },
     "required": [
         "prompt"
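Because the numeric DRY fields use `exclusiveMinimum: 0`, a payload that wants DRY disabled should omit the field rather than send 0. A quick check with the `jsonschema` package (a sketch; the schema fragment is copied from the additions above):

import jsonschema

dry_multiplier_schema = {
    "description": "KoboldCpp ONLY. DRY multiplier value, 0 to disable.",
    "exclusiveMinimum": 0,
    "type": "number",
}

jsonschema.validate(0.8, dry_multiplier_schema)    # passes
try:
    jsonschema.validate(0, dry_multiplier_schema)  # 0 violates exclusiveMinimum
except jsonschema.exceptions.ValidationError as e:
    print("rejected:", e.message)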
klite.embd (18 changes)
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->

 <script>
-const LITEVER = 156;
+const LITEVER = 157;
 const urlParams = new URLSearchParams(window.location.search);
 const localflag = true;
 const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -11635,7 +11635,7 @@ Current version indicated by LITEVER below.
    console.log(submit_payload);

    //preprocess to add extra fields
-   if((custom_kobold_endpoint != "" && is_using_kcpp_with_mirostat()))
+   if(custom_kobold_endpoint != "" && is_using_kcpp_with_mirostat())
    {
        if(localsettings.miro_type>0)
        {
@@ -11652,7 +11652,7 @@ Current version indicated by LITEVER below.
        submit_payload.params.banned_tokens = get_token_bans();
        submit_payload.params.render_special = localsettings.render_special_tags;
    }
-   if((custom_kobold_endpoint != "" && is_using_kcpp_with_dry()))
+   if(custom_kobold_endpoint != "" && is_using_kcpp_with_dry() && localsettings.dry_multiplier > 0)
    {
        submit_payload.params.dry_multiplier = localsettings.dry_multiplier;
        submit_payload.params.dry_base = localsettings.dry_base;
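Lite now attaches the DRY fields only when the endpoint supports them and the multiplier is positive, so a disabled DRY never trips the schema's exclusiveMinimum check. A Python sketch of the same client-side gating (hypothetical helper; `supports_dry` stands in for is_using_kcpp_with_dry()):

def build_dry_params(supports_dry, settings):
    # Attach DRY fields only when the backend supports them and DRY is enabled;
    # omitting them entirely avoids sending dry_multiplier=0, which the API
    # schema above rejects via exclusiveMinimum.
    if not (supports_dry and settings.get("dry_multiplier", 0) > 0):
        return {}
    return {
        "dry_multiplier": settings["dry_multiplier"],
        "dry_base": settings["dry_base"],
        # ...remaining DRY fields follow the same pattern
    }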
@@ -14914,6 +14914,18 @@ Current version indicated by LITEVER below.
    }

    let namepart = (curr.myturn ? "User" : "KoboldAI");
+   //advanced name replacement
+   if(localsettings.inject_chatnames_instruct && localsettings.instruct_has_markdown)
+   {
+       let person = (curr.myturn ? localsettings.chatname : localsettings.chatopponent);
+       let prefix = person + ": ";
+       if(processed_msg.startsWith(prefix))
+       {
+           namepart = person;
+           processed_msg = processed_msg.slice(prefix.length);
+       }
+   }
+
    let bodypart = (corpo_editing_turn == i ?
        `<div class="corpo_edit_outer">
        <div class="corpo_edit_inner" id="corpo_edit_inp_lengthtester" style="white-space: nowrap; visibility: hidden; height: 0px; position:absolute; width: auto;"></div>
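The injected chat-name handling strips a leading "Name: " prefix from the message body and promotes the name to the speaker label. The same transformation as a small Python sketch (hypothetical function name):

def split_chat_prefix(message, person, fallback):
    # If the message starts with "Name: ", use Name as the speaker label
    # and drop the prefix from the body; otherwise keep the fallback label.
    prefix = person + ": "
    if message.startswith(prefix):
        return person, message[len(prefix):]
    return fallback, message

print(split_chat_prefix("Alice: hello there", "Alice", "User"))
# ('Alice', 'hello there')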
koboldcpp.py (42 changes)
@@ -523,6 +523,34 @@ def string_contains_sequence_substring(inputstr,sequences):
         return True
     return False

+import struct
+
+def read_gguf_layer_count(file_path):
+    fsize = os.path.getsize(file_path)
+    if fsize < 10000: #ignore files under 10kb
+        return 0
+    with open(file_path, 'rb') as f:
+        file_header = f.read(4)
+        if file_header != b'GGUF': #file is not GGUF
+            return 0
+        magic_key = b'.block_count'
+        magic_length = len(magic_key)
+        chunk_size = 4096 # read only the first 4kb of the file
+        data = f.read(chunk_size)
+        index = data.find(magic_key) # search for the key, then read two 4-byte values after it
+        if index != -1 and index + magic_length + 8 <= chunk_size:
+            start_index = index + magic_length
+            first_value_bytes = data[start_index:start_index + 4]
+            second_value_bytes = data[start_index + 4:start_index + 8]
+            # Unpack each 4 bytes as an unsigned int32 in little-endian format
+            value1 = struct.unpack('<I', first_value_bytes)[0]
+            value2 = struct.unpack('<I', second_value_bytes)[0]
+            if value1 == 4 and value2 > 0 and value2 <= 300:
+                return value2 #contains layer count
+            return 0
+        else:
+            return 0 #not found
+
 def load_model(model_filename):
     global args
     inputs = load_model_inputs()
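The scan exploits the fact that GGUF stores metadata as key / value-type / value triples: the four bytes after the `.block_count` key name are the value-type tag (4 is GGUF's UINT32 tag, hence the `value1 == 4` check) and the next four are the layer count itself, both little-endian. A hedged usage sketch, assuming the function above is in scope and "model.gguf" is a placeholder path:

layers = read_gguf_layer_count("model.gguf")
if layers == 0:
    print("layer count not found; fall back to the size-based estimate")
else:
    print(f"model reports {layers} layers (block_count)")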
@@ -576,7 +604,7 @@ def load_model(model_filename):
     ret = handle.load_model(inputs)
     return ret

-def generate(prompt, memory="", images=[], max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, rep_pen_slope=1.0, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, dry_multiplier=0.0, dry_base=1.75, dry_allowed_length=2, dry_penalty_last_n=0, dry_sequence_breakers=['\n', ':', '"', '*'], sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, dynatemp_exponent=1.0, smoothing_factor=0.0, logit_biases={}, render_special=False, banned_tokens=[], bypass_eos_token=False):
+def generate(prompt, memory="", images=[], max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, rep_pen_slope=1.0, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, dry_multiplier=0.0, dry_base=1.75, dry_allowed_length=2, dry_penalty_last_n=0, dry_sequence_breakers=[], sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, dynatemp_exponent=1.0, smoothing_factor=0.0, logit_biases={}, render_special=False, banned_tokens=[], bypass_eos_token=False):
     global maxctx, args, currentusergenkey, totalgens, pendingabortkey
     inputs = generation_inputs()
     inputs.prompt = prompt.encode("UTF-8")
@@ -631,17 +659,18 @@ def generate(prompt, memory="", images=[], max_length=32, max_context_length=512
     # Handle dry_sequence_breakers being passed as a json-encoded array of
     # strings, rather than as an array of strings itself. This is to support
     # SillyTavern, which passes sequence breakers to Oobabooga that way.
-    if isinstance(dry_sequence_breakers, str):
+    if dry_multiplier > 0 and isinstance(dry_sequence_breakers, str):
         try:
             dry_sequence_breakers = json.loads(dry_sequence_breakers)
         except ValueError as e:
             print(f"ERROR: dry_sequence_breakers must be an array of strings or a json encoded array of strings. Could not parse '{dry_sequence_breakers}': " + str(e))
             dry_sequence_breakers = []
     for n in range(dry_seq_break_max):
-        if n < len(dry_sequence_breakers):
+        if dry_multiplier > 0 and n < len(dry_sequence_breakers):
             inputs.dry_sequence_breakers[n] = dry_sequence_breakers[n].encode("UTF-8")
         else:
             inputs.dry_sequence_breakers[n] = "".encode("UTF-8")

     if sampler_order and 0 < len(sampler_order) <= sampler_order_max:
         try:
             for i, sampler in enumerate(sampler_order):
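Both accepted shapes for `dry_sequence_breakers` normalize to the same list; a JSON-encoded string (the form SillyTavern sends) is decoded first. A quick sketch of the normalization (hypothetical helper name):

import json

def normalize_breakers(breakers):
    # Accept either a list of strings or a JSON-encoded array of strings.
    if isinstance(breakers, str):
        try:
            breakers = json.loads(breakers)
        except ValueError:
            breakers = []
    return breakers

assert normalize_breakers(['\n', ':']) == ['\n', ':']
assert normalize_breakers('["\\n", ":"]') == ['\n', ':']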
@@ -2266,15 +2295,18 @@ def show_gui():
         cs = int(contextsize_text[context_var.get()])
         mem = MaxMemory[0]
         layerlimit = 0

         if cs and cs > 4096:
             fsize *= 1.2
         elif cs and cs > 2048:
             fsize *= 1.1

         if mem < fsize*1.6:
-            sizeperlayer = fsize*0.052
-            layerlimit = int(min(200,mem/sizeperlayer))
+            layers = read_gguf_layer_count(filepath)
+            if layers == 0: #fail to read
+                sizeperlayer = fsize*0.052
+                layerlimit = int(min(200,mem/sizeperlayer))
+            else:
+                ratio = mem/(fsize*1.5)
+                layerlimit = int(ratio*layers)
         else:
             layerlimit = 200 #assume full offload
         old_gui_layers_untouched = gui_layers_untouched
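With a readable layer count, the GUI's GPU-offload estimate scales the model's true layer total by the memory-to-size ratio instead of guessing roughly 5.2% of file size per layer. A worked example of both branches (all numbers illustrative):

mem = 8.0      # GB of GPU memory (illustrative)
fsize = 13.0   # GB model file size after context scaling (illustrative)

# Fallback branch: layer count unreadable, assume each layer costs ~5.2% of file size.
sizeperlayer = fsize * 0.052              # ~0.676 GB per layer
print(int(min(200, mem / sizeperlayer)))  # 11 layers

# Detected branch: scale the real layer count by the memory-to-size ratio.
layers = 40
ratio = mem / (fsize * 1.5)               # ~0.41
print(int(ratio * layers))                # 16 layers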