diff --git a/src/llama.cpp b/src/llama.cpp
index d199eb380..8ab9b14a3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -153,6 +153,7 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
+static bool phi3swa_warning_showed = false; //to warn when old phi3 model has no SWA
 static bool clblast_offload_fallback_mode = false; //used when regular offload will segfault
 static int clblast_offload_fallback_layers = 0;
 static int layer_name_to_number(std::string inputString)
@@ -4911,7 +4912,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PHI3:
             {
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
@@ -10807,7 +10808,21 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask_swa;
+        if (hparams.n_swa == 0)
+        {
+            if (!phi3swa_warning_showed)
+            {
+                phi3swa_warning_showed = true;
+                printf("\nWarning: PHI3 model did not contain sliding window!!!\nSWA is disabled. Model may need a new quant.\n");
+            }
+            KQ_mask_swa = build_inp_KQ_mask();
+        }
+        else
+        {
+            KQ_mask_swa = build_inp_KQ_mask_swa();
+        }
+
 
         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
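
For context, a minimal standalone sketch of the fallback pattern the patch uses: read the sliding-window hyperparameter as an optional key, warn once if it is absent, and build the regular attention mask instead of the SWA mask. The names get_optional_u32, build_full_mask, build_swa_mask, and the key string "phi3.attention.sliding_window" are illustrative stand-ins, not the actual llama.cpp API (which uses ml.get_key, build_inp_KQ_mask, and build_inp_KQ_mask_swa as in the diff above); this only demonstrates the control flow under those assumptions.

// Sketch (hypothetical names): optional metadata lookup plus one-time warning
// and graceful fallback, mirroring the logic added in the diff above.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

// Stand-in for ml.get_key(key, value, /*required=*/false): fills 'out' only
// when the key is present and never aborts on a missing key.
static bool get_optional_u32(const std::map<std::string, uint32_t> & meta,
                             const std::string & key, uint32_t & out) {
    const auto it = meta.find(key);
    if (it == meta.end()) {
        return false;           // key absent: leave 'out' at its default
    }
    out = it->second;
    return true;
}

// Stand-ins for build_inp_KQ_mask() / build_inp_KQ_mask_swa().
static const char * build_full_mask() { return "full-context KQ mask"; }
static const char * build_swa_mask()  { return "sliding-window KQ mask"; }

int main() {
    // Simulated metadata from an older Phi-3 quant: no sliding-window key.
    const std::map<std::string, uint32_t> meta;

    uint32_t n_swa = 0;                                  // default: SWA off
    get_optional_u32(meta, "phi3.attention.sliding_window", n_swa);

    static bool warned = false;
    const char * mask = nullptr;
    if (n_swa == 0) {
        if (!warned) {
            warned = true;
            std::printf("Warning: model has no sliding window, SWA disabled\n");
        }
        mask = build_full_mask();                        // graceful fallback
    } else {
        mask = build_swa_mask();
    }
    std::printf("building graph with: %s\n", mask);
    return 0;
}

The effect of passing false as the "required" flag is that loading no longer aborts on older Phi-3 GGUF files that lack the sliding-window key: n_swa keeps its zero default, the graph build prints the one-time warning, and attention falls back to the full-context mask until the model is requantized with the missing metadata.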