diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ba4a33563..4d472da7f 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -7,6 +7,7 @@
 //No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields
 //Python will ALWAYS provide the memory, we just write to it.
 
+#include <cmath>
 #include <time.h>
 #include <mutex>
 #include "model_adapter.h"
@@ -787,6 +788,19 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
     return desiredBlasBatchSize;
 }
 
+//this function applies automatic scaling to the rope freq base when the desired context exceeds the trained context
+static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_train, int n_ctx_desired, bool is_solar)
+{
+    if(n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
+    {
+        return original_rope_base;
+    }
+    float ctx_multiplier = (is_solar?8.0f:1.0f);
+    float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318;
+    float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
+    return powf(original_rope_base, logf(chi_ctx_value) / logf(chi_ctx_train_value));
+}
+
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
 {
     ggml_time_init();
@@ -835,28 +849,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
-        rope_freq_scale = 1.0f;
-        if (kcpp_params->n_ctx <= 2048) //normie mode
+        //Set freq base for all, including non-GGUF. If we are using GGUF, this will be overwritten with more accurate values later.
+        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_params->n_ctx,false);
+        if(file_format==FileFormat::GGUF_GENERIC)
         {
-            rope_freq_base = 10000.0f;
+            printf("Using automatic RoPE scaling. If the model has customized RoPE settings, they will be used directly instead!\n");
         }
         else
         {
-            //approximate NTK aware ctx
-            auto effectivenctx = kcpp_params->n_ctx;
-            if((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048)
-            {
-                float factor = file_format_meta.n_ctx_train/2048;
-                effectivenctx = effectivenctx/factor;
-            }
-            float magic_multiplier = 8.0f;
-            float base_multiplier = effectivenctx*magic_multiplier;
-            float base_raw = 10000.0f;
-            rope_freq_base = (effectivenctx <= 2048 ? base_raw : base_multiplier);
-
+            printf("Using Automatic RoPE scaling, Pre-GGUF (scale:%.3f, base:%.1f).\n",rope_freq_scale, rope_freq_base);
         }
-
-        printf("Using automatic RoPE scaling. If the model has customized RoPE settings, they will be used directly instead!\n");
     }
     gptj_ctx_v3.hparams.rope_freq_scale = neox_ctx_v3.hparams.rope_freq_scale = rope_freq_scale;
     gptj_ctx_v3.hparams.rope_freq_base = neox_ctx_v3.hparams.rope_freq_base = rope_freq_base;
@@ -1085,7 +1087,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         else
         {
-            //if the model modifes rope in any way, use the model values. Otherwise, use our automatic ones
+            //if the model modifies rope in any way, or uses YaRN, use the model values. Otherwise, use our automatic ones
             //special exception for llama, which uses auto scale
             if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
             llamamodel->hparams.rope_freq_scale_train!=1.0f ||
@@ -1095,8 +1097,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             }
             else
             {
-                float multiplier_rope_base = llamamodel->hparams.rope_freq_base_train/10000.0f;
-                rope_freq_base *= multiplier_rope_base;
+                //Calculate rope_freq_base using the GradientAI formula; SOLAR requires ctx*8 for correct scaling
+                rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_params->n_ctx, file_format_meta.model_architecture==GGUFArch::ARCH_SOLAR);
                 llama_ctx_params.rope_freq_base = rope_freq_base;
                 llama_ctx_params.rope_freq_scale = rope_freq_scale;
                 printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
@@ -2467,4 +2469,4 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     concat_output_mtx.unlock();
     output.text = concat_output_reader_copy_res.c_str();
     return output;
-}
\ No newline at end of file
+}
diff --git a/model_adapter.cpp b/model_adapter.cpp
index c5ea34093..85048c0f3 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -271,6 +271,9 @@ void print_tok_vec(std::vector<int> &embd)
 
     if(modelarch!="" && fileformatmeta!=nullptr)
     {
+        int n_tensors = gguf_get_n_tensors(ctx);
+        float freq_base_train = 0;
+
         std::string fkey = modelarch+".context_length";
         int keyidx = gguf_find_key(ctx, fkey.c_str());
         if (keyidx != -1) {
@@ -281,8 +284,14 @@ void print_tok_vec(std::vector<int> &embd)
         if (keyidx != -1) {
             fileformatmeta->n_expert_count = gguf_get_val_u32(ctx, keyidx);
         }
+        fkey = modelarch+".rope.freq_base";
+        keyidx = gguf_find_key(ctx, fkey.c_str());
+        if (keyidx != -1) {
+            freq_base_train = gguf_get_val_f32(ctx, keyidx);
+        }
 
         int filever = gguf_get_version(ctx);
+        fileformatmeta->fileversion = filever;
 
         fileformatmeta->model_architecture = GGUFArch::ARCH_DEFAULT;
         if(modelarch=="phi2")
@@ -297,7 +306,12 @@ void print_tok_vec(std::vector<int> &embd)
         {
             fileformatmeta->model_architecture = GGUFArch::ARCH_MAMBA;
         }
+        else if(modelarch=="llama" && freq_base_train==10000.0f && n_tensors==435)
+        {
+            fileformatmeta->model_architecture = GGUFArch::ARCH_SOLAR;
+        }
     }
+
 
     gguf_free(ctx);
 }
@@ -531,4 +545,4 @@ void print_tok_vec(std::vector<int> &embd)
     //remove all tokens between start part and start of LCS in new prompt, thus avoiding shift
     //if LCS not found or mismatched, regenerate. chop new prompt and repeat from step B
     }
-}
\ No newline at end of file
+}
diff --git a/model_adapter.h b/model_adapter.h
index 0ba7582bb..c5fe6e99c 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -56,6 +56,7 @@ enum GGUFArch
     ARCH_FALCON = 1,
     ARCH_PHI = 2,
     ARCH_MAMBA = 3,
+    ARCH_SOLAR = 4,
 };
 
 struct FileFormatExtraMeta
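
For reference, the new CalcGradientAIRopeFreqBase applies the GradientAI-style NTK-aware scaling b' = b^(ln(chi_desired)/ln(chi_train)), where chi = (n_ctx * m) / 2*pi and m is 8 for SOLAR models and 1 otherwise. Below is a minimal standalone sketch of the same formula; the function name and the context sizes in main() are illustrative only and not part of the patch:

// Standalone sketch of the GradientAI-style rope base scaling added in this patch.
// Mirrors CalcGradientAIRopeFreqBase above; the numbers in main() are illustrative.
#include <cmath>
#include <cstdio>

static float calc_rope_freq_base(float original_rope_base, int n_ctx_train, int n_ctx_desired, bool is_solar)
{
    // No scaling needed when the request fits within the trained (or minimum 2048) context.
    if (n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
    {
        return original_rope_base;
    }
    float ctx_multiplier = (is_solar ? 8.0f : 1.0f);                       // SOLAR's effective rope context is treated as 8x
    float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318f; // trained ctx / 2*pi
    float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318f;     // desired ctx / 2*pi
    // Scale the base exponentially: base^(log(chi_desired) / log(chi_train))
    return powf(original_rope_base, logf(chi_ctx_value) / logf(chi_ctx_train_value));
}

int main()
{
    // A llama-style model trained at 4096 ctx, extended to 16384: base goes 10000 -> roughly 71700
    printf("%.1f\n", calc_rope_freq_base(10000.0f, 4096, 16384, false));
    // Within the trained context, the base is returned unchanged
    printf("%.1f\n", calc_rope_freq_base(10000.0f, 4096, 2048, false));
    return 0;
}

With these inputs, a model trained at 4096 context but run at 16384 has its base scaled from 10000 to roughly 71700, while requests within the trained context leave the base untouched. The SOLAR fingerprint added in model_adapter.cpp (llama architecture, freq_base_train of 10000, exactly 435 tensors) is needed because SOLAR GGUFs report themselves as llama, so there is no dedicated architecture key for the loader to read.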