mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
Gradient rope formula with offsets (#938)
* Gradient rope formula with offsets Positive for Solar models Negative for Llama 1 and 2 models * Update gpttype_adapter.cpp Remove L1/L2 * cleanup PR, skip llama models, keep prints behind debug mode --------- Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
This commit is contained in:
parent
dd5cda06b7
commit
cb2336f5d9
3 changed files with 61 additions and 10 deletions
|
@ -789,16 +789,59 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
|
||||||
}
|
}
|
||||||
|
|
||||||
//this function applies automatic scaling to rope freq base when the desired context exceeds trained context
|
//this function applies automatic scaling to rope freq base when the desired context exceeds trained context
|
||||||
static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_train, int n_ctx_desired, bool is_solar)
|
static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_train, int n_ctx_desired, GGUFArch model_arch)
|
||||||
{
|
{
|
||||||
if(n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
|
if(n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
|
||||||
{
|
{
|
||||||
return original_rope_base;
|
return original_rope_base;
|
||||||
}
|
}
|
||||||
float ctx_multiplier = (is_solar?8.0f:1.0f);
|
else
|
||||||
|
{
|
||||||
|
float ctx_multiplier = (model_arch==GGUFArch::ARCH_SOLAR?8.0f:1.0f);
|
||||||
float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318;
|
float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318;
|
||||||
float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
|
float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
|
||||||
return powf(original_rope_base, logf(chi_ctx_value) / logf(chi_ctx_train_value));
|
float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));
|
||||||
|
|
||||||
|
if(debugmode==1)
|
||||||
|
{
|
||||||
|
printf("Trained max context length (value:%.d).\n", n_ctx_train);
|
||||||
|
printf("Desired context length (value:%.d).\n", n_ctx_desired);
|
||||||
|
printf("Solar context multiplier (value:%.3f).\n", ctx_multiplier);
|
||||||
|
printf("Chi context train (value:%.3f).\n", chi_ctx_train_value);
|
||||||
|
printf("Chi chosen context (value:%.3f).\n", chi_ctx_value);
|
||||||
|
printf("Log Chi context train (value:%.3f).\n", log10f(chi_ctx_train_value));
|
||||||
|
printf("Log Chi chosen context (value:%.3f).\n", log10f(chi_ctx_value));
|
||||||
|
printf("RoPE Frequency Base value (value:%.3f).\n", original_rope_base);
|
||||||
|
printf("RoPE base calculated via Gradient AI formula. (value:%.1f).\n", gradient_ai_rope_freq_base_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(model_arch==GGUFArch::ARCH_SOLAR)
|
||||||
|
{
|
||||||
|
float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
|
||||||
|
float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
|
||||||
|
if(debugmode==1)
|
||||||
|
{
|
||||||
|
printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
|
||||||
|
printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
|
||||||
|
}
|
||||||
|
return rope_freq_base_with_positive_offset;
|
||||||
|
}
|
||||||
|
// else if(model_arch==GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2)
|
||||||
|
// {
|
||||||
|
// float extended_rope_negative_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / (3.14159265358979323846 * 3.14159265358979323846));
|
||||||
|
// float rope_freq_base_with_negative_offset = gradient_ai_rope_freq_base_value / extended_rope_negative_offset_value;
|
||||||
|
// if(debugmode==1)
|
||||||
|
// {
|
||||||
|
// printf("Extended RoPE Negative Offset (divisor) for Llama 1 and 2 based models. (value:%.3f).\n", extended_rope_negative_offset_value);
|
||||||
|
// printf("RoPE base calculated via Gradient AI formula for Llama 1 and 2 based models. (value:%.1f).\n", rope_freq_base_with_negative_offset);
|
||||||
|
// }
|
||||||
|
// return rope_freq_base_with_negative_offset;
|
||||||
|
// }
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return gradient_ai_rope_freq_base_value;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
|
ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
|
||||||
|
@ -850,10 +893,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
//Set freq base for all, including non GGUF. If we are using GGUF, this will be overwritten with more accurate values later.
|
//Set freq base for all, including non GGUF. If we are using GGUF, this will be overwritten with more accurate values later.
|
||||||
rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_params->n_ctx,false);
|
rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_params->n_ctx, GGUFArch::ARCH_DEFAULT);
|
||||||
if(file_format==FileFormat::GGUF_GENERIC)
|
if(file_format==FileFormat::GGUF_GENERIC)
|
||||||
{
|
{
|
||||||
printf("Using automatic RoPE scaling. If the model has customized RoPE settings, they will be used directly instead!\n");
|
printf("Using automatic RoPE scaling for GGUF. If the model has custom RoPE settings, they'll be used directly instead!\n");
|
||||||
|
printf("It means that the RoPE values written above will be replaced by the RoPE values indicated after loading.\n");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -1099,7 +1143,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
//Calculate rope_freq_base using the gradientAI formula, solar requires ctx *8 for correct scaling
|
//Calculate rope_freq_base using the gradientAI formula, solar requires ctx *8 for correct scaling
|
||||||
rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_params->n_ctx, file_format_meta.model_architecture==GGUFArch::ARCH_SOLAR);
|
rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_params->n_ctx, file_format_meta.model_architecture);
|
||||||
llama_ctx_params.rope_freq_base = rope_freq_base;
|
llama_ctx_params.rope_freq_base = rope_freq_base;
|
||||||
llama_ctx_params.rope_freq_scale = rope_freq_scale;
|
llama_ctx_params.rope_freq_scale = rope_freq_scale;
|
||||||
printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
|
printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
|
||||||
|
|
|
@ -306,10 +306,16 @@ void print_tok_vec(std::vector<float> &embd)
|
||||||
{
|
{
|
||||||
fileformatmeta->model_architecture = GGUFArch::ARCH_MAMBA;
|
fileformatmeta->model_architecture = GGUFArch::ARCH_MAMBA;
|
||||||
}
|
}
|
||||||
else if(modelarch=="llama" && freq_base_train==10000.0f && n_tensors==435)
|
else if(modelarch=="llama" && freq_base_train==10000.0f && (n_tensors==435 || n_tensors==611))
|
||||||
{
|
{
|
||||||
fileformatmeta->model_architecture = GGUFArch::ARCH_SOLAR;
|
fileformatmeta->model_architecture = GGUFArch::ARCH_SOLAR;
|
||||||
}
|
}
|
||||||
|
else if(modelarch=="llama" && freq_base_train==10000.0f)
|
||||||
|
{
|
||||||
|
fileformatmeta->model_architecture = GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2;
|
||||||
|
}
|
||||||
|
printf("Arch Category: %d\n",fileformatmeta->model_architecture);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
|
|
|
@ -52,11 +52,12 @@ enum FileFormat
|
||||||
|
|
||||||
enum GGUFArch
|
enum GGUFArch
|
||||||
{
|
{
|
||||||
ARCH_DEFAULT = 0, //used for llama and other generic gguf
|
ARCH_DEFAULT = 0, //used for llama3 and other generic gguf
|
||||||
ARCH_FALCON = 1,
|
ARCH_FALCON = 1,
|
||||||
ARCH_PHI = 2,
|
ARCH_PHI = 2,
|
||||||
ARCH_MAMBA = 3,
|
ARCH_MAMBA = 3,
|
||||||
ARCH_SOLAR = 4,
|
ARCH_SOLAR = 4,
|
||||||
|
ARCH_MISTRAL_LLAMA_1_AND_2 = 5,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct FileFormatExtraMeta
|
struct FileFormatExtraMeta
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue