Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
default kv_unified to true, handle LLAMA_SET_ROWS.

commit 6d50def409 (parent 30675b0798)
5 changed files with 18 additions and 1 deletion
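For context: kv_unified is a bool field on llama_context_params in upstream llama.cpp that requests a single unified KV-cache buffer shared across sequences, and the hunks below simply set it wherever a context is created. A minimal, hedged sketch of that call pattern, not taken from this commit (the model path and the n_ctx/flash_attn values are placeholders):

#include "llama.h"

// Sketch only: create a llama.cpp context with a unified KV cache, mirroring
// the ctx_params.kv_unified = true assignments added in this commit.
// "model.gguf", n_ctx and flash_attn are placeholder values.
int main() {
    llama_backend_init();

    llama_model_params model_params = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", model_params);
    if (model == NULL) {
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx      = 4096;  // placeholder
    ctx_params.flash_attn = false; // placeholder
    ctx_params.kv_unified = true;  // the default this commit switches to

    llama_context * ctx = llama_init_from_model(model, ctx_params);
    if (ctx == NULL) {
        llama_model_free(model);
        return 1;
    }

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}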
@@ -585,6 +585,7 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
     draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv;
     draft_model_params.main_gpu = base_model_params.main_gpu;
     draft_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+    draft_ctx_params.kv_unified = base_ctx_params.kv_unified;
     #if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
     bool ts_all_zero = true;
     for (int i = 0; i < tensor_split_max; ++i) {
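The hunk above makes the speculative-decoding draft context inherit the base context's kv_unified setting, so both KV caches use the same layout. A small sketch of that mirroring pattern, with illustrative names; only offload_kqv and kv_unified are certain from the hunk, the n_ctx copy is an illustrative addition:

// Sketch: build draft context params that mirror the base context, as the
// speculative_decoding_setup hunk above does. base_ctx_params is assumed to
// be configured elsewhere.
static llama_context_params make_draft_ctx_params(const llama_context_params & base_ctx_params) {
    llama_context_params draft_ctx_params = llama_context_default_params();
    draft_ctx_params.n_ctx       = base_ctx_params.n_ctx;       // illustrative
    draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv; // as in the hunk
    draft_ctx_params.kv_unified  = base_ctx_params.kv_unified;  // the new line
    return draft_ctx_params;
}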
@@ -2183,6 +2184,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }

     llama_ctx_params.offload_kqv = !inputs.low_vram;
+    llama_ctx_params.kv_unified = true;
     model_params.use_mmap = inputs.use_mmap;
     model_params.use_mlock = inputs.use_mlock;
     model_params.n_gpu_layers = inputs.gpulayers;
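The commit title also mentions handling LLAMA_SET_ROWS, an environment variable consulted by upstream llama.cpp for its set-rows KV-cache path; that handling is not visible in the hunks shown on this page. Purely as an assumption about what such handling could look like (the variable name comes from the commit title, the chosen default value of "1" is a guess):

#include <cstdlib>

// Assumption, not this commit's code: pin LLAMA_SET_ROWS to a known value
// before any llama.cpp context is created, unless the user already set it.
static void ensure_llama_set_rows_default() {
    if (std::getenv("LLAMA_SET_ROWS") == nullptr) {
#if defined(_WIN32)
        _putenv_s("LLAMA_SET_ROWS", "1"); // guessed default
#else
        setenv("LLAMA_SET_ROWS", "1", 1); // guessed default
#endif
    }
}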
klite.embd (14 changes)
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->

 <script id="init-config">
-const LITEVER = 264;
+const LITEVER = 265;
 const urlParams = new URLSearchParams(window.location.search);
 var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
 const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -5996,16 +5996,28 @@ Current version indicated by LITEVER below.
         const matchedlw = match.match(/^[ \t]*/);
         const leadingWhitespace = matchedlw ? matchedlw[0] : '';
         content = unescape_html(content);
+        if(content.match(/^\${1,}$/)) //only dollar signs, just return
+        {
+            return match;
+        }
         return leadingWhitespace + temml.renderToString(content); // render LaTeX content
     });
     input = input.replace(/(?:^|[^\\])\$(\S[^$\n]*?\S)\$(?!\d)/g, (match, p1) => {
         let content = p1;
         content = unescape_html(content);
+        if(content.match(/^\${1,}$/)) //only dollar signs, just return
+        {
+            return match;
+        }
         return " "+temml.renderToString(content); // render LaTeX content
     });
     input = input.replace(/(^\\begin\{math\}\n([\s\S]*?)\n\\end\{math\}$|^\\begin\{equation\}\n([\s\S]*?)\n\\end\{equation\}$)/gm, (match, p1, p2, p3) => { //match math eqns
         let content = p2 || p3;
         content = unescape_html(content);
+        if(content.match(/^\${1,}$/)) //only dollar signs, just return
+        {
+            return match;
+        }
         return temml.renderToString(content); // render LaTeX content
     });
     return input;
@@ -135,6 +135,7 @@ bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
     ctx_params.n_threads = nthreads;
     ctx_params.n_threads_batch = nthreads;
     ctx_params.flash_attn = inputs.flash_attention;
+    ctx_params.kv_unified = true;

     embeddings_ctx = llama_init_from_model(embeddingsmodel, ctx_params);
@@ -288,6 +288,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {

     if (sd_ctx == NULL) {
         printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n");
+        printf("Otherwise, if you are using GGUF format, you can try the original .safetensors instead (Comfy GGUF not supported)\n");
         return false;
     }
@@ -539,6 +539,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
     tts_ctx_params.n_threads = nthreads;
     tts_ctx_params.n_threads_batch = nthreads;
     tts_ctx_params.flash_attn = inputs.flash_attention;
+    tts_ctx_params.kv_unified = true;

     llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params);
     ttc_ctx = llama_init_from_model(ttcmodel, tts_ctx_params);