diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 9b8a61ffc..cac42a268 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -149,6 +149,7 @@ static ggml_cuda_device_info ggml_cuda_init() { //#else // GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__); //#endif // GGML_CUDA_FORCE_CUBLAS + GGML_LOG_INFO("Initializing CUDA, please wait, this might take a while for first run...\n"); GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index 2ed6509ac..ade6bbe78 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -315,7 +315,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml const ggml_tensor * V = dst->src[2]; const int32_t precision = KQV->op_params[3]; - GGML_ASSERT(precision == GGML_PREC_DEFAULT); + GGML_ASSERT_CONTINUE(precision == GGML_PREC_DEFAULT); GGML_ASSERT(K->type == type_K); GGML_ASSERT(V->type == type_V); diff --git a/klite.embd b/klite.embd index c79e084d1..14526f300 100644 --- a/klite.embd +++ b/klite.embd @@ -12648,14 +12648,14 @@ Current version indicated by LITEVER below. 
//memory is allowed to be up to 0.8 times of ctx allowance, anote up to 0.6 times let max_mem_len = Math.floor(max_allowed_characters*0.8); let max_anote_len = Math.floor(max_allowed_characters*0.6); - let max_wi_len = Math.floor(max_allowed_characters*0.7); + let max_wi_len = Math.floor(max_allowed_characters*0.5); let appendedsysprompt = ""; if(localsettings.opmode==4 && localsettings.instruct_sysprompt!="") { max_mem_len = Math.floor(max_allowed_characters*0.7); appendedsysprompt = get_instruct_systag(false) + localsettings.instruct_sysprompt + "\n"; } - let truncated_memory = appendedsysprompt + substring_to_boundary(current_memory, max_mem_len); + let truncated_memory = substring_to_boundary(current_memory, max_mem_len); if (truncated_memory != null && truncated_memory != "") { if(newlineaftermemory) { @@ -12786,12 +12786,15 @@ Current version indicated by LITEVER below. if(wi_insertlocation>0) { truncated_anote = wistr + truncated_anote; + truncated_anote = substring_to_boundary(truncated_anote, max_anote_len); } else { truncated_memory += wistr; } + truncated_memory = appendedsysprompt + substring_to_boundary(truncated_memory, max_mem_len); + //now we resize the context such that the memory and authors note can fit inside truncated_context = substring_to_boundary(truncated_context, max_allowed_characters);