Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)
Merge commit 'ab86335760' into concedo_experimental

# Conflicts:
#	.github/workflows/release.yml
#	examples/retrieval/retrieval.cpp
#	examples/simple-chat/simple-chat.cpp
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	requirements/requirements-convert_hf_to_gguf.txt
#	requirements/requirements-convert_hf_to_gguf_update.txt
#	requirements/requirements-convert_lora_to_gguf.txt
#	tools/run/run.cpp
Commit 22ef97d7d3
23 changed files with 495 additions and 231 deletions

@@ -858,43 +858,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }

-                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
-                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
-                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
-                    LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
-                    hparams.n_swa = 2047;
-                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
-                    // default value for Phi-3-mini-128k-instruct
-                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
-                    // default value for Phi-3-medium-128k-instruct
-                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                }
-
-                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (!found_swa && hparams.n_swa == 0) {
-                    throw std::runtime_error("invalid value for sliding_window");
-                }
-
-                if (hparams.n_swa > hparams.n_ctx_train) {
-                    LLAMA_LOG_WARN("%s: unexpected n_swa: %d >= %d, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+                if (found_swa && hparams.n_swa > 0) {
+                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa = 0;
+                    hparams.n_swa_pattern = 1;
                 }
             } break;
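
Note: the net effect of this hunk is that the old per-variant heuristics (guessing n_swa = 2047 for the 4k Phi-3 models, no SWA for the 128k ones) are gone, and any sliding_window value written by the converter is overridden so SWA is switched off entirely. A minimal standalone sketch of the new policy, using simplified stand-in types rather than llama.cpp's real hparams/loader API (hparams_t and apply_phi3_swa_policy are illustrative names only):

// Standalone sketch of the new Phi-3 SWA policy, assuming stand-in types.
#include <cstdint>
#include <cstdio>
#include <optional>

enum class swa_type_t { none, standard };   // stands in for LLAMA_SWA_TYPE_*

struct hparams_t {                          // hypothetical, mirrors only the fields touched above
    uint32_t   n_swa         = 0;
    uint32_t   n_swa_pattern = 1;
    swa_type_t swa_type      = swa_type_t::none;
};

// 'sliding_window' is whatever the GGUF metadata provided, if anything.
void apply_phi3_swa_policy(hparams_t & hp, std::optional<uint32_t> sliding_window) {
    if (sliding_window && *sliding_window > 0) {
        std::fprintf(stderr, "Phi SWA is currently disabled - results might be suboptimal for some models\n");
        hp.swa_type      = swa_type_t::none;   // force the non-SWA code path
        hp.n_swa         = 0;
        hp.n_swa_pattern = 1;
    }
}

With swa_type left at (or forced to) the NONE value, the create_memory and build_graph hunks further down select the plain unified KV cache and the non-iswa graph builder.
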
@@ -7468,8 +7441,9 @@ struct llm_build_phi2 : public llm_graph_context {
     }
 };

-struct llm_build_phi3_iswa : public llm_graph_context {
-    llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+template<bool iswa>
+struct llm_build_phi3 : public llm_graph_context {
+    llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

@@ -7483,7 +7457,14 @@ struct llm_build_phi3_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_unified_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv_unified();
+        }

         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;
@@ -13322,7 +13303,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             GGML_TYPE_F32,
                             GGML_TYPE_F32,
                             cparams.offload_kqv,
-                            std::max((uint32_t) 1, cparams.n_seq_max));
+                            std::max((uint32_t) 1, cparams.n_seq_max),
+                            cparams.n_seq_max);
                     } break;
                 default:
                     {
@@ -13332,19 +13314,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

                 LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

-                if (hparams.n_swa > 0) {
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    GGML_ASSERT(hparams.n_swa_pattern != 1);
+
                     res = new llama_kv_cache_unified_iswa(
                             *this,
                             params.type_k,
                             params.type_v,
                             !cparams.flash_attn,
                             cparams.offload_kqv,
-                            cparams.n_ctx,
                             params.swa_full,
+                            cparams.n_ctx,
                             cparams.n_seq_max,
                             cparams.n_batch,
                             padding);
                 } else {
+                    GGML_ASSERT(hparams.n_swa_pattern == 1);
+
                     res = new llama_kv_cache_unified(
                             *this,
                             nullptr,
@@ -13353,6 +13339,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             !cparams.flash_attn,
                             cparams.offload_kqv,
                             cparams.n_ctx,
+                            cparams.n_seq_max,
                             padding,
                             hparams.n_swa,
                             hparams.swa_type);
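
Taken together, the create_memory hunks do two things: they branch on swa_type rather than on n_swa (asserting the expected n_swa_pattern in each branch), and they pass n_seq_max through to both cache constructors. A rough sketch of that selection with placeholder cache classes (not the real llama_kv_cache_* signatures):

// Rough sketch of the cache selection, assuming placeholder cache classes.
#include <cassert>
#include <cstdint>
#include <memory>

enum class swa_type_t { none, standard };

struct kv_cache_base  { virtual ~kv_cache_base() = default; };
struct kv_cache_iswa  : kv_cache_base { kv_cache_iswa (uint32_t /*n_ctx*/, uint32_t /*n_seq_max*/) {} };
struct kv_cache_plain : kv_cache_base { kv_cache_plain(uint32_t /*n_ctx*/, uint32_t /*n_seq_max*/) {} };

std::unique_ptr<kv_cache_base> make_cache(swa_type_t swa_type, uint32_t n_swa_pattern,
                                          uint32_t n_ctx, uint32_t n_seq_max) {
    if (swa_type != swa_type_t::none) {
        assert(n_swa_pattern != 1);   // SWA models interleave sliding-window and full-attention layers
        return std::make_unique<kv_cache_iswa>(n_ctx, n_seq_max);
    }
    assert(n_swa_pattern == 1);       // every layer uses full attention
    return std::make_unique<kv_cache_plain>(n_ctx, n_seq_max);
}
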
@@ -13453,7 +13440,11 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
             {
-                llm = std::make_unique<llm_build_phi3_iswa>(*this, params, gf);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+                } else {
+                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
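
Dispatching from a runtime value (hparams.swa_type) onto a template parameter has to go through an explicit branch, since both specializations must exist at compile time; the chosen one is then used through the common base class. A compressed illustration, with placeholder names rather than the llama.cpp classes:

// Sketch: dispatch a runtime flag onto a template<bool> type.
#include <cstdio>
#include <memory>

struct graph_builder {                        // stands in for llm_graph_context
    virtual ~graph_builder() = default;
    virtual void build() const = 0;
};

template <bool iswa>
struct phi3_builder : graph_builder {         // stands in for llm_build_phi3<iswa>
    void build() const override {
        std::puts(iswa ? "building SWA graph" : "building full-attention graph");
    }
};

std::unique_ptr<graph_builder> make_phi3_builder(bool use_swa) {
    // Both phi3_builder<true> and phi3_builder<false> are compiled;
    // the runtime flag only chooses which one gets constructed.
    if (use_swa) {
        return std::make_unique<phi3_builder<true>>();
    }
    return std::make_unique<phi3_builder<false>>();
}

Given the load_hparams change above, Phi-3 effectively always ends up on the <false> branch here, since swa_type is forced to LLAMA_SWA_TYPE_NONE whenever the GGUF supplies a sliding window.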