mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Quadratic Sampling UI (#652)
* Quadratic Sampling UI: Kalomaze's Quadratic Sampling now has a UI within KCPP. * Remove debug prints. * Clean up and add the smooth sampler to dynatemp. --------- Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
This commit is contained in:
parent
504300784f
commit
4cb956c7db
8 changed files with 57 additions and 38 deletions
|
@ -79,6 +79,7 @@ struct gpt_params {
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
float temp = 0.80f; // 1.0 = disabled
|
float temp = 0.80f; // 1.0 = disabled
|
||||||
|
float smoothing_factor = 0.00f; // 0.00 = disabled
|
||||||
float repeat_penalty = 1.10f; // 1.0 = disabled
|
float repeat_penalty = 1.10f; // 1.0 = disabled
|
||||||
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
float frequency_penalty = 0.00f; // 0.0 = disabled
|
float frequency_penalty = 0.00f; // 0.0 = disabled
|
||||||
|
|
|
@ -150,9 +150,9 @@ static void sampler_queue(
|
||||||
if (dynatemp_range > 0) {
|
if (dynatemp_range > 0) {
|
||||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||||
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent, 0);
|
||||||
} else {
|
} else {
|
||||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
llama_sample_temp(ctx_main, &cur_p, temp, 0);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default : break;
|
default : break;
|
||||||
|
|
|
@ -18,6 +18,7 @@ typedef struct llama_sampling_params {
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
|
float smoothing_factor = 0.00f; // 0.00 = disabled
|
||||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
|
|
1
expose.h
1
expose.h
|
@ -84,6 +84,7 @@ struct generation_inputs
|
||||||
const bool quiet = false;
|
const bool quiet = false;
|
||||||
const float dynatemp_range = 0.0f;
|
const float dynatemp_range = 0.0f;
|
||||||
const float dynatemp_exponent = 1.0f;
|
const float dynatemp_exponent = 1.0f;
|
||||||
|
const float smoothing_factor = 0.0f;
|
||||||
const logit_bias logit_biases[logit_bias_max];
|
const logit_bias logit_biases[logit_bias_max];
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
|
@ -83,9 +83,7 @@ static int n_batch = 8;
|
||||||
static bool useSmartContext = false;
|
static bool useSmartContext = false;
|
||||||
static bool useContextShift = false;
|
static bool useContextShift = false;
|
||||||
static int blasbatchsize = 512;
|
static int blasbatchsize = 512;
|
||||||
static int dontblasbatchsize = 16;
|
static int smallbatchsize = 16;
|
||||||
static int normalbatchsize = 32;
|
|
||||||
static int smallbatchsize = 8;
|
|
||||||
static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
|
static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
|
||||||
static std::string modelname;
|
static std::string modelname;
|
||||||
static std::vector<gpt_vocab::id> last_n_tokens;
|
static std::vector<gpt_vocab::id> last_n_tokens;
|
||||||
|
@ -427,18 +425,18 @@ void sample_rep_pen(int n_ctx, int rep_pen_range, float rep_pen, float presence_
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void sample_temperature(llama_token_data_array * candidates_p, float temp)
|
void sample_temperature(llama_token_data_array * candidates_p, float temp, float smoothing_factor)
|
||||||
{
|
{
|
||||||
if (temp <= 0)
|
if (temp <= 0)
|
||||||
{
|
{
|
||||||
// Imitate greedy sampling
|
// Imitate greedy sampling
|
||||||
temp = 0.00390625f; //cannot be zero else div0, this is 1/256
|
temp = 0.00390625f; //cannot be zero else div0, this is 1/256
|
||||||
llama_sample_temperature(nullptr, candidates_p, temp);
|
llama_sample_temperature(nullptr, candidates_p, temp, 0);
|
||||||
llama_sample_top_k(nullptr, candidates_p, 1, 1); //only want first candidate
|
llama_sample_top_k(nullptr, candidates_p, 1, 1); //only want first candidate
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
llama_sample_temperature(nullptr, candidates_p, temp);
|
llama_sample_temperature(nullptr, candidates_p, temp, smoothing_factor);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -482,7 +480,7 @@ void sample_grammar(FileFormat file_format, int32_t n_vocab, llama_token_data_ar
|
||||||
}
|
}
|
||||||
|
|
||||||
int SampleLogits(const float * logits, int n_ctx, int n_vocab, int rep_pen_range, float rep_pen, float presence_penalty, float top_k, float top_a, float top_p, float min_p, float typical_p, float tfs, float temp, std::mt19937 & rng,
|
int SampleLogits(const float * logits, int n_ctx, int n_vocab, int rep_pen_range, float rep_pen, float presence_penalty, float top_k, float top_a, float top_p, float min_p, float typical_p, float tfs, float temp, std::mt19937 & rng,
|
||||||
int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dynatemp_range, float dynatemp_exponent)
|
int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dynatemp_range, float dynatemp_exponent, float smoothing_factor)
|
||||||
{
|
{
|
||||||
int id = 0;
|
int id = 0;
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
|
@ -508,7 +506,7 @@ int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers
|
||||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||||
const int mirostat_m = 100;
|
const int mirostat_m = 100;
|
||||||
sample_rep_pen(n_ctx, rep_pen_range, rep_pen, presence_penalty, &candidates_p);
|
sample_rep_pen(n_ctx, rep_pen_range, rep_pen, presence_penalty, &candidates_p);
|
||||||
sample_temperature(&candidates_p, temp);
|
sample_temperature(&candidates_p, temp, smoothing_factor);
|
||||||
if (mirostat == 1)
|
if (mirostat == 1)
|
||||||
{
|
{
|
||||||
id = sample_token_mirostat(n_vocab, &candidates_p, rng, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
id = sample_token_mirostat(n_vocab, &candidates_p, rng, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||||
|
@ -549,11 +547,11 @@ int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers
|
||||||
dynatemp_min = dynatemp_min<0?0:dynatemp_min;
|
dynatemp_min = dynatemp_min<0?0:dynatemp_min;
|
||||||
dynatemp_max = dynatemp_max<0?0:dynatemp_max;
|
dynatemp_max = dynatemp_max<0?0:dynatemp_max;
|
||||||
dynatemp_exponent = dynatemp_exponent<0?0:dynatemp_exponent;
|
dynatemp_exponent = dynatemp_exponent<0?0:dynatemp_exponent;
|
||||||
llama_sample_entropy(nullptr, &candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
llama_sample_entropy(nullptr, &candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
sample_temperature(&candidates_p, temp);
|
sample_temperature(&candidates_p, temp, smoothing_factor);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case KCPP_SAMPLER_REP_PEN:
|
case KCPP_SAMPLER_REP_PEN:
|
||||||
|
@ -698,7 +696,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
n_blasthreads = kcpp_params->n_threads_batch = inputs.blasthreads;
|
n_blasthreads = kcpp_params->n_threads_batch = inputs.blasthreads;
|
||||||
bool isGguf = (file_format == FileFormat::GGUF_GENERIC);
|
bool isGguf = (file_format == FileFormat::GGUF_GENERIC);
|
||||||
|
|
||||||
n_batch = kcpp_params->n_batch = (isGguf?normalbatchsize:smallbatchsize);
|
n_batch = kcpp_params->n_batch = smallbatchsize;
|
||||||
modelname = kcpp_params->model = inputs.model_filename;
|
modelname = kcpp_params->model = inputs.model_filename;
|
||||||
useSmartContext = inputs.use_smartcontext;
|
useSmartContext = inputs.use_smartcontext;
|
||||||
useContextShift = inputs.use_contextshift;
|
useContextShift = inputs.use_contextshift;
|
||||||
|
@ -706,7 +704,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
blasbatchsize = inputs.blasbatchsize;
|
blasbatchsize = inputs.blasbatchsize;
|
||||||
if(blasbatchsize<=0)
|
if(blasbatchsize<=0)
|
||||||
{
|
{
|
||||||
blasbatchsize = (isGguf?dontblasbatchsize:smallbatchsize);
|
blasbatchsize = smallbatchsize;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto clamped_max_context_length = inputs.max_context_length;
|
auto clamped_max_context_length = inputs.max_context_length;
|
||||||
|
@ -1533,6 +1531,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
kcpp_params->n_batch = n_batch;
|
kcpp_params->n_batch = n_batch;
|
||||||
kcpp_params->n_threads = n_threads;
|
kcpp_params->n_threads = n_threads;
|
||||||
kcpp_params->n_threads_batch = n_blasthreads;
|
kcpp_params->n_threads_batch = n_blasthreads;
|
||||||
|
kcpp_params->smoothing_factor = inputs.smoothing_factor;
|
||||||
|
|
||||||
bool stream_sse = inputs.stream_sse;
|
bool stream_sse = inputs.stream_sse;
|
||||||
|
|
||||||
|
@ -1675,7 +1674,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
file_format == FileFormat::GPTJ_2 ||
|
file_format == FileFormat::GPTJ_2 ||
|
||||||
file_format == FileFormat::RWKV_1 ||
|
file_format == FileFormat::RWKV_1 ||
|
||||||
file_format==FileFormat::RWKV_2);
|
file_format==FileFormat::RWKV_2);
|
||||||
bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && blasbatchsize!=-1);
|
bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && blasbatchsize>=32);
|
||||||
// bool blasmode = false;
|
// bool blasmode = false;
|
||||||
int original_batch = kcpp_params->n_batch;
|
int original_batch = kcpp_params->n_batch;
|
||||||
int original_threads = kcpp_params->n_threads;
|
int original_threads = kcpp_params->n_threads;
|
||||||
|
@ -1930,6 +1929,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
const float tfs_z = kcpp_params->tfs_z;
|
const float tfs_z = kcpp_params->tfs_z;
|
||||||
const float dynatemp_range = kcpp_params->dynatemp_range;
|
const float dynatemp_range = kcpp_params->dynatemp_range;
|
||||||
const float dynatemp_exponent = kcpp_params->dynatemp_exponent;
|
const float dynatemp_exponent = kcpp_params->dynatemp_exponent;
|
||||||
|
const float smoothing_factor = kcpp_params->smoothing_factor;
|
||||||
|
|
||||||
if (!startedsampling)
|
if (!startedsampling)
|
||||||
{
|
{
|
||||||
|
@ -1985,7 +1985,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
||||||
|
|
||||||
id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty, presence_penalty,
|
id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty, presence_penalty,
|
||||||
top_k, top_a, top_p, min_p, typical_p, tfs_z, temp, rng,
|
top_k, top_a, top_p, min_p, typical_p, tfs_z, temp, rng,
|
||||||
kcpp_params->mirostat, kcpp_params->mirostat_tau, kcpp_params->mirostat_eta, sampler_order, grammar, dynatemp_range, dynatemp_exponent);
|
kcpp_params->mirostat, kcpp_params->mirostat_tau, kcpp_params->mirostat_eta, sampler_order, grammar, dynatemp_range, dynatemp_exponent, smoothing_factor);
|
||||||
|
|
||||||
if (grammar != nullptr) {
|
if (grammar != nullptr) {
|
||||||
grammar_accept_token(file_format, n_vocab, grammar, id);
|
grammar_accept_token(file_format, n_vocab, grammar, id);
|
||||||
|
|
5
koboldcpp.py
Executable file → Normal file
5
koboldcpp.py
Executable file → Normal file
|
@ -81,6 +81,7 @@ class generation_inputs(ctypes.Structure):
|
||||||
("quiet", ctypes.c_bool),
|
("quiet", ctypes.c_bool),
|
||||||
("dynatemp_range", ctypes.c_float),
|
("dynatemp_range", ctypes.c_float),
|
||||||
("dynatemp_exponent", ctypes.c_float),
|
("dynatemp_exponent", ctypes.c_float),
|
||||||
|
("smoothing_factor", ctypes.c_float),
|
||||||
("logit_biases", logit_bias * logit_bias_max)]
|
("logit_biases", logit_bias * logit_bias_max)]
|
||||||
|
|
||||||
class generation_outputs(ctypes.Structure):
|
class generation_outputs(ctypes.Structure):
|
||||||
|
@ -328,7 +329,7 @@ def load_model(model_filename):
|
||||||
ret = handle.load_model(inputs)
|
ret = handle.load_model(inputs)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, dynatemp_exponent=1.0, logit_biases={}):
|
def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, dynatemp_exponent=1.0, smoothing_factor=0.0, logit_biases={}):
|
||||||
global maxctx, args, currentusergenkey, totalgens, pendingabortkey
|
global maxctx, args, currentusergenkey, totalgens, pendingabortkey
|
||||||
inputs = generation_inputs()
|
inputs = generation_inputs()
|
||||||
outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
|
outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
|
||||||
|
@ -359,6 +360,7 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu
|
||||||
inputs.quiet = quiet
|
inputs.quiet = quiet
|
||||||
inputs.dynatemp_range = dynatemp_range
|
inputs.dynatemp_range = dynatemp_range
|
||||||
inputs.dynatemp_exponent = dynatemp_exponent
|
inputs.dynatemp_exponent = dynatemp_exponent
|
||||||
|
inputs.smoothing_factor = smoothing_factor
|
||||||
inputs.grammar = grammar.encode("UTF-8")
|
inputs.grammar = grammar.encode("UTF-8")
|
||||||
inputs.grammar_retain_state = grammar_retain_state
|
inputs.grammar_retain_state = grammar_retain_state
|
||||||
inputs.unban_tokens_rt = not use_default_badwordsids
|
inputs.unban_tokens_rt = not use_default_badwordsids
|
||||||
|
@ -588,6 +590,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
quiet=is_quiet,
|
quiet=is_quiet,
|
||||||
dynatemp_range=genparams.get('dynatemp_range', 0.0),
|
dynatemp_range=genparams.get('dynatemp_range', 0.0),
|
||||||
dynatemp_exponent=genparams.get('dynatemp_exponent', 1.0),
|
dynatemp_exponent=genparams.get('dynatemp_exponent', 1.0),
|
||||||
|
smoothing_factor=genparams.get('smoothing_factor', 0.0),
|
||||||
logit_biases=genparams.get('logit_bias', {})
|
logit_biases=genparams.get('logit_bias', {})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
49
llama.cpp
49
llama.cpp
|
@ -8769,7 +8769,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
|
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) {
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
// no need to do anything if there is only one (or zero) candidates
|
// no need to do anything if there is only one (or zero) candidates
|
||||||
|
@ -8797,15 +8797,6 @@ void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * c
|
||||||
// Map the normalized entropy to the desired temperature range using the power function
|
// Map the normalized entropy to the desired temperature range using the power function
|
||||||
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
|
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
|
||||||
|
|
||||||
#ifdef DEBUG
|
|
||||||
LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
|
|
||||||
LLAMA_LOG_INFO("Entropy: %f\n", entropy);
|
|
||||||
LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
|
|
||||||
LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
|
|
||||||
LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
|
|
||||||
LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Apply the dynamically calculated temperature scaling
|
// Apply the dynamically calculated temperature scaling
|
||||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||||
candidates_p->data[i].logit /= dyn_temp;
|
candidates_p->data[i].logit /= dyn_temp;
|
||||||
|
@ -8823,34 +8814,54 @@ void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * c
|
||||||
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
|
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBUG
|
// Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
|
||||||
// Print the updated top 25 probabilities after temperature scaling
|
if (smoothing_factor > 0 && candidates_p->size > 1) {
|
||||||
LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
|
|
||||||
for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
|
llama_sample_softmax(ctx, candidates_p);
|
||||||
LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
|
float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
|
||||||
|
// Apply quadratic transformation using the smoothing_factor
|
||||||
|
for (size_t i = 0; i < candidates_p->size; ++i)
|
||||||
|
{
|
||||||
|
float logit_shifted = candidates_p->data[i].logit - h;
|
||||||
|
candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
|
||||||
|
}
|
||||||
|
llama_sample_softmax(ctx, candidates_p);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
if (ctx) {
|
if (ctx) {
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float smoothing_factor) {
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||||
candidates_p->data[i].logit /= temp;
|
candidates_p->data[i].logit /= temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
|
||||||
|
if (smoothing_factor > 0 && candidates_p->size > 1) {
|
||||||
|
|
||||||
|
llama_sample_softmax(ctx, candidates_p);
|
||||||
|
float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
|
||||||
|
// Apply quadratic transformation using the smoothing_factor
|
||||||
|
for (size_t i = 0; i < candidates_p->size; ++i)
|
||||||
|
{
|
||||||
|
float logit_shifted = candidates_p->data[i].logit - h;
|
||||||
|
candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
|
||||||
|
}
|
||||||
|
llama_sample_softmax(ctx, candidates_p);
|
||||||
|
}
|
||||||
|
|
||||||
if (ctx) {
|
if (ctx) {
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float smoothing_factor) {
|
||||||
llama_sample_temp(ctx, candidates_p, temp);
|
llama_sample_temp(ctx, candidates_p, temp, smoothing_factor);
|
||||||
}
|
}
|
||||||
|
|
||||||
// The llama.cpp repetition penalty code goes unused in kobold's API
|
// The llama.cpp repetition penalty code goes unused in kobold's API
|
||||||
|
|
6
llama.h
6
llama.h
|
@ -789,12 +789,14 @@ extern "C" {
|
||||||
llama_token_data_array * candidates_p,
|
llama_token_data_array * candidates_p,
|
||||||
float min_temp,
|
float min_temp,
|
||||||
float max_temp,
|
float max_temp,
|
||||||
float exponent_val);
|
float exponent_val,
|
||||||
|
float smoothing_factor);
|
||||||
|
|
||||||
LLAMA_API void llama_sample_temp(
|
LLAMA_API void llama_sample_temp(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
llama_token_data_array * candidates,
|
llama_token_data_array * candidates,
|
||||||
float temp);
|
float temp,
|
||||||
|
float smoothing_factor);
|
||||||
|
|
||||||
LLAMA_API DEPRECATED(void llama_sample_temperature(
|
LLAMA_API DEPRECATED(void llama_sample_temperature(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue