diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 9b8a61ffc..cac42a268 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -149,6 +149,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 //#else
 //    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
 //#endif // GGML_CUDA_FORCE_CUBLAS
+    GGML_LOG_INFO("Initializing CUDA, please wait, this might take a while for first run...\n"); // plain notice: no conversion specifiers, so no extra arguments
     GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
index 2ed6509ac..ade6bbe78 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -315,7 +315,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
     const ggml_tensor * V   = dst->src[2];
 
     const int32_t precision = KQV->op_params[3];
-    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
+    GGML_ASSERT_CONTINUE(precision == GGML_PREC_DEFAULT);
 
     GGML_ASSERT(K->type == type_K);
     GGML_ASSERT(V->type == type_V);
diff --git a/klite.embd b/klite.embd
index c79e084d1..14526f300 100644
--- a/klite.embd
+++ b/klite.embd
@@ -12648,14 +12648,14 @@ Current version indicated by LITEVER below.
 			//memory is allowed to be up to 0.8 times of ctx allowance, anote up to 0.6 times
 			let max_mem_len = Math.floor(max_allowed_characters*0.8);
 			let max_anote_len = Math.floor(max_allowed_characters*0.6);
-			let max_wi_len = Math.floor(max_allowed_characters*0.7);
+			let max_wi_len = Math.floor(max_allowed_characters*0.5);
 			let appendedsysprompt = "";
 			if(localsettings.opmode==4 && localsettings.instruct_sysprompt!="")
 			{
 				max_mem_len = Math.floor(max_allowed_characters*0.7);
 				appendedsysprompt = get_instruct_systag(false) + localsettings.instruct_sysprompt + "\n";
 			}
-			let truncated_memory = appendedsysprompt + substring_to_boundary(current_memory, max_mem_len);
+			let truncated_memory = substring_to_boundary(current_memory, max_mem_len);
 			if (truncated_memory != null && truncated_memory != "") {
 				if(newlineaftermemory)
 				{
@@ -12786,12 +12786,15 @@ Current version indicated by LITEVER below.
 			if(wi_insertlocation>0)
 			{
 				truncated_anote = wistr + truncated_anote;
+				truncated_anote = substring_to_boundary(truncated_anote, max_anote_len);
 			}
 			else
 			{
 				truncated_memory += wistr;
 			}
 
+			truncated_memory = appendedsysprompt + substring_to_boundary(truncated_memory, max_mem_len);
+
 			//now we resize the context such that the memory and authors note can fit inside
 			truncated_context = substring_to_boundary(truncated_context, max_allowed_characters);