Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-10 09:04:36 +00:00
GradientAI Auto ROPE Base calculation (#910)
* GradientAI Auto ROPE Base calculation

https://gradient.ai/blog/scaling-rotational-embeddings-for-long-context-language-models has a formula that better fits the ideal RoPE scaling. Tested with Llama 3, and the calculation was checked to be correct for Llama 2 as well. Retains the logic of not scaling RoPE if the requested context is under the trained context.

* add in solar scaling logic

Solar-based models require the context values to be multiplied by 8. This is (I'm guessing) because the positions are based on a 32k context, but with a sliding window of 4k.

* Update model_adapter.h

adding in tensor count to identify solar models based on a tensor count of 435.

* Update model_adapter.cpp

add in n_tensor count for solar identification

* refactor and cleanup GradientAI rope scaling

---------

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
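The formula, as implemented in this commit, measures each context length in rotary periods, chi = n_ctx / (2*pi), and raises the trained base to the ratio of their logarithms: new_base = base^(log(chi_desired) / log(chi_train)). As a worked example with illustrative numbers not taken from this commit: a base of 10000 trained at a 4096 context and stretched to 8192 gives an exponent of log(8192/6.28318) / log(4096/6.28318) ≈ 7.173 / 6.480 ≈ 1.107, so the new base is 10000^1.107 ≈ 26.8k.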
parent 49e4c3fd7b
commit 1e72b65c38

3 changed files with 39 additions and 22 deletions
gpttype_adapter.cpp
@@ -7,6 +7,7 @@
 //No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields
 //Python will ALWAYS provide the memory, we just write to it.

+#include <cmath>
 #include <time.h>
 #include <mutex>
 #include "model_adapter.h"
@@ -787,6 +788,19 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
     return desiredBlasBatchSize;
 }

+//this function applies automatic scaling to rope freq base when the desired context exceeds trained context
+static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_train, int n_ctx_desired, bool is_solar)
+{
+    if(n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
+    {
+        return original_rope_base;
+    }
+    float ctx_multiplier = (is_solar?8.0f:1.0f);
+    float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318;
+    float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
+    return powf(original_rope_base, logf(chi_ctx_value) / logf(chi_ctx_train_value));
+}
+
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
 {
     ggml_time_init();
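To see what the new helper produces in practice, here is a minimal standalone sketch that copies it verbatim and prints two results; the sample base and context values are illustrative assumptions, not values taken from this commit:

#include <cmath>
#include <cstdio>

//verbatim copy of the helper added above
static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_train, int n_ctx_desired, bool is_solar)
{
    if(n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
    {
        return original_rope_base; //under-trained or small ctx: keep the trained base
    }
    float ctx_multiplier = (is_solar?8.0f:1.0f);
    float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318;
    float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
    return powf(original_rope_base, logf(chi_ctx_value) / logf(chi_ctx_train_value));
}

int main()
{
    //assumed Llama-2-style values: base 10000 trained at 4096, run at 8192 (prints roughly 26.8k)
    printf("%.1f\n", CalcGradientAIRopeFreqBase(10000.0f, 4096, 8192, false));
    //assumed Llama-3-style values: base 500000 trained at 8192, run at 16384
    printf("%.1f\n", CalcGradientAIRopeFreqBase(500000.0f, 8192, 16384, false));
    return 0;
}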
@@ -835,28 +849,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
-        rope_freq_scale = 1.0f;
-        if (kcpp_params->n_ctx <= 2048) //normie mode
-        {
-            rope_freq_base = 10000.0f;
-        }
-        else
-        {
-            //approximate NTK aware ctx
-            auto effectivenctx = kcpp_params->n_ctx;
-            if((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048)
-            {
-                float factor = file_format_meta.n_ctx_train/2048;
-                effectivenctx = effectivenctx/factor;
-            }
-            float magic_multiplier = 8.0f;
-            float base_multiplier = effectivenctx*magic_multiplier;
-            float base_raw = 10000.0f;
-            rope_freq_base = (effectivenctx <= 2048 ? base_raw : base_multiplier);
-
-        }
-
-        printf("Using automatic RoPE scaling. If the model has customized RoPE settings, they will be used directly instead!\n");
+        //Set freq base for all, including non GGUF. If we are using GGUF, this will be overwritten with more accurate values later.
+        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_params->n_ctx,false);
+        if(file_format==FileFormat::GGUF_GENERIC)
+        {
+            printf("Using automatic RoPE scaling. If the model has customized RoPE settings, they will be used directly instead!\n");
+        }
+        else
+        {
+            printf("Using Automatic RoPE scaling, Pre-GGUF (scale:%.3f, base:%.1f).\n",rope_freq_scale, rope_freq_base);
+        }
     }
     gptj_ctx_v3.hparams.rope_freq_scale = neox_ctx_v3.hparams.rope_freq_scale = rope_freq_scale;
     gptj_ctx_v3.hparams.rope_freq_base = neox_ctx_v3.hparams.rope_freq_base = rope_freq_base;
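(Design note: the old pre-GGUF path derived the base from an effective context times a magic multiplier of 8; every format now goes through the same CalcGradientAIRopeFreqBase helper with a default base of 10000 and an assumed trained context of 2048, and GGUF models overwrite this later with metadata-derived values, as the new comment states.)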
@@ -1085,7 +1087,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
-        //if the model modifes rope in any way, use the model values. Otherwise, use our automatic ones
+        //if the model modifes rope in any way, or uses yarn, use the model values. Otherwise, use our automatic ones
         //special exception for llama, which uses auto scale
         if((llamamodel->hparams.rope_freq_base_train!=10000.0f && llamamodel->hparams.rope_freq_base_train!=500000.0f) ||
         llamamodel->hparams.rope_freq_scale_train!=1.0f ||
@@ -1095,8 +1097,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
-        float multiplier_rope_base = llamamodel->hparams.rope_freq_base_train/10000.0f;
-        rope_freq_base *= multiplier_rope_base;
+        //Calculate rope_freq_base using the gradientAI formula, solar requires ctx *8 for correct scaling
+        rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_params->n_ctx, file_format_meta.model_architecture==GGUFArch::ARCH_SOLAR);
         llama_ctx_params.rope_freq_base = rope_freq_base;
         llama_ctx_params.rope_freq_scale = rope_freq_scale;
         printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
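(Worth noting: because the is_solar flag multiplies both the trained and the desired context by 8 before the logs are taken, and log(8x)/log(8y) differs from log(x)/log(y), the multiplier genuinely softens the exponent rather than cancelling out. With illustrative values not taken from this commit, base 10000 trained at 4096 and stretched to 8192, the solar path yields a base of roughly 21.1k versus roughly 26.8k without the flag.)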
model_adapter.cpp
@@ -271,6 +271,9 @@ void print_tok_vec(std::vector<float> &embd)

     if(modelarch!="" && fileformatmeta!=nullptr)
     {
+        int n_tensors = gguf_get_n_tensors(ctx);
+        float freq_base_train = 0;
+
         std::string fkey = modelarch+".context_length";
         int keyidx = gguf_find_key(ctx, fkey.c_str());
         if (keyidx != -1) {
@@ -281,8 +284,14 @@ void print_tok_vec(std::vector<float> &embd)
         if (keyidx != -1) {
             fileformatmeta->n_expert_count = gguf_get_val_u32(ctx, keyidx);
         }
+        fkey = modelarch+".rope.freq_base";
+        keyidx = gguf_find_key(ctx, fkey.c_str());
+        if (keyidx != -1) {
+            freq_base_train = gguf_get_val_f32(ctx, keyidx);
+        }
+
         int filever = gguf_get_version(ctx);

         fileformatmeta->fileversion = filever;
         fileformatmeta->model_architecture = GGUFArch::ARCH_DEFAULT;
         if(modelarch=="phi2")
@@ -297,7 +306,12 @@ void print_tok_vec(std::vector<float> &embd)
         {
             fileformatmeta->model_architecture = GGUFArch::ARCH_MAMBA;
         }
+        else if(modelarch=="llama" && freq_base_train==10000.0f && n_tensors==435)
+        {
+            fileformatmeta->model_architecture = GGUFArch::ARCH_SOLAR;
+        }
+
     }

     gguf_free(ctx);
 }
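The gguf_* calls used above are the GGUF metadata API that ships with ggml. For context, here is a minimal standalone sketch of the same Solar-detection heuristic reading only the metadata; the file name and header path are assumptions for illustration:

#include <cstdio>
#include "ggml.h" //gguf_* API ships with ggml; exact header location is an assumption

int main()
{
    gguf_init_params params;
    params.no_alloc = true;   //read metadata only, do not allocate tensor data
    params.ctx = nullptr;
    gguf_context * ctx = gguf_init_from_file("model.gguf", params); //assumed file name
    if (!ctx) { return 1; }

    int n_tensors = gguf_get_n_tensors(ctx);
    float freq_base_train = 0;
    int keyidx = gguf_find_key(ctx, "llama.rope.freq_base"); //llama-arch key, as in the commit
    if (keyidx != -1) {
        freq_base_train = gguf_get_val_f32(ctx, keyidx);
    }
    //same heuristic as the commit: llama arch, default freq base, and exactly 435 tensors
    if (freq_base_train == 10000.0f && n_tensors == 435) {
        printf("Solar-style model detected\n");
    }
    gguf_free(ctx);
    return 0;
}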
model_adapter.h
@@ -56,6 +56,7 @@ enum GGUFArch
     ARCH_FALCON = 1,
     ARCH_PHI = 2,
     ARCH_MAMBA = 3,
+    ARCH_SOLAR = 4,
 };

 struct FileFormatExtraMeta