diff --git a/common/common.h b/common/common.h
index 5df3a04e9..308f16930 100644
--- a/common/common.h
+++ b/common/common.h
@@ -79,6 +79,7 @@ struct gpt_params {
     float tfs_z = 1.00f; // 1.0 = disabled
     float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // 1.0 = disabled
+    float smoothing_factor = 0.00f; // 0.00 = disabled
     float repeat_penalty = 1.10f; // 1.0 = disabled
     int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float frequency_penalty = 0.00f; // 0.0 = disabled
diff --git a/common/sampling.cpp b/common/sampling.cpp
index e8675a8c0..8f1a7b31f 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -150,9 +150,9 @@ static void sampler_queue(
                 if (dynatemp_range > 0) {
                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
-                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent, 0);
                 } else {
-                    llama_sample_temp(ctx_main, &cur_p, temp);
+                    llama_sample_temp(ctx_main, &cur_p, temp, 0);
                 }
                 break;
             default : break;
diff --git a/common/sampling.h b/common/sampling.h
index 88899c094..aea14b99b 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -18,6 +18,7 @@ typedef struct llama_sampling_params {
     float tfs_z = 1.00f; // 1.0 = disabled
     float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float smoothing_factor = 0.00f; // 0.00 = disabled
     float dynatemp_range = 0.00f; // 0.0 = disabled
     float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
diff --git a/expose.h b/expose.h
index 36d3d971a..053cad927 100644
--- a/expose.h
+++ b/expose.h
@@ -84,6 +84,7 @@ struct generation_inputs
     const bool quiet = false;
     const float dynatemp_range = 0.0f;
     const float dynatemp_exponent = 1.0f;
+    const float smoothing_factor = 0.0f;
     const logit_bias logit_biases[logit_bias_max];
 };
 
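A note on what the new field means before the call sites below: smoothing_factor drives the "quadratic sampling" transform added to llama.cpp at the end of this patch. After temperature scaling, each logit x is remapped to f(x) = -smoothing_factor * (x - h)^2 + h, where h is the largest logit, so a token sitting d below the top ends up smoothing_factor * d^2 below it: candidates closer than 1/smoothing_factor to the top are pulled together, the long tail is pushed further away, and 0.0 leaves the sampler untouched. A toy Python sketch of the effect (illustrative values, not code from this patch):

import math

def softmax(xs):
    m = max(xs)
    es = [math.exp(x - m) for x in xs]
    s = sum(es)
    return [e / s for e in es]

def smooth(logits, k):
    h = max(logits)  # llama.cpp takes data[0].logit, which llama_sample_softmax has sorted to the top
    return [-k * (x - h) ** 2 + h for x in logits]

logits = [2.0, 1.0, -1.0, -3.0]
for k in (0.0, 0.25, 1.0):
    xs = smooth(logits, k) if k > 0 else logits  # k == 0: base sampler untouched
    print(k, [round(p, 3) for p in softmax(xs)])
# 0.0  -> [0.702, 0.258, 0.035, 0.005]  (plain softmax over the logits)
# 0.25 -> [0.53, 0.413, 0.056, 0.001]   (near-top alternatives gain mass)
# 1.0  -> [0.731, 0.269, 0.0, 0.0]      (the tail is effectively cut off)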
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 8db770377..9162c7f7c 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -83,9 +83,7 @@ static int n_batch = 8;
 static bool useSmartContext = false;
 static bool useContextShift = false;
 static int blasbatchsize = 512;
-static int dontblasbatchsize = 16;
-static int normalbatchsize = 32;
-static int smallbatchsize = 8;
+static int smallbatchsize = 16;
 static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
 static std::string modelname;
 static std::vector<int> last_n_tokens;
@@ -427,18 +425,18 @@ void sample_rep_pen(int n_ctx, int rep_pen_range, float rep_pen, float presence_
 
 }
 
-void sample_temperature(llama_token_data_array * candidates_p, float temp)
+void sample_temperature(llama_token_data_array * candidates_p, float temp, float smoothing_factor)
 {
     if (temp <= 0)
     {
         // Imitate greedy sampling
         temp = 0.00390625f; //cannot be zero else div0, this is 1/256
-        llama_sample_temperature(nullptr, candidates_p, temp);
+        llama_sample_temperature(nullptr, candidates_p, temp, 0);
         llama_sample_top_k(nullptr, candidates_p, 1, 1); //only want first candidate
     }
     else
     {
-        llama_sample_temperature(nullptr, candidates_p, temp);
+        llama_sample_temperature(nullptr, candidates_p, temp, smoothing_factor);
     }
 }
 
@@ -482,7 +480,7 @@ void sample_grammar(FileFormat file_format, int32_t n_vocab, llama_token_data_ar
 }
 
 int SampleLogits(const float * logits, int n_ctx, int n_vocab, int rep_pen_range, float rep_pen, float presence_penalty, float top_k, float top_a, float top_p, float min_p, float typical_p, float tfs, float temp, std::mt19937 & rng,
-int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dynatemp_range, float dynatemp_exponent)
+int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dynatemp_range, float dynatemp_exponent, float smoothing_factor)
 {
     int id = 0;
     std::vector<llama_token_data> candidates;
@@ -508,7 +506,7 @@ int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<sampler
     {
         static float mirostat_mu = 2.0f * mirostat_tau;
         const int mirostat_m = 100;
-        sample_temperature(&candidates_p, temp);
+        sample_temperature(&candidates_p, temp, smoothing_factor);
         if (mirostat == 1)
         {
             id = sample_token_mirostat(n_vocab, &candidates_p, rng, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
@@ -686,7 +684,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     kcpp_params->n_threads = inputs.threads;
     kcpp_params->n_threads_batch = inputs.blasthreads;
     bool isGguf = (file_format == FileFormat::GGUF_GENERIC);
-    n_batch = kcpp_params->n_batch = (isGguf?normalbatchsize:smallbatchsize);
+    n_batch = kcpp_params->n_batch = smallbatchsize;
     modelname = kcpp_params->model = inputs.model_filename;
     useSmartContext = inputs.use_smartcontext;
     useContextShift = inputs.use_contextshift;
@@ -706,7 +704,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     blasbatchsize = inputs.blasbatchsize;
     if(blasbatchsize<=0)
     {
-        blasbatchsize = (isGguf?dontblasbatchsize:smallbatchsize);
+        blasbatchsize = smallbatchsize;
     }
 
     auto clamped_max_context_length = inputs.max_context_length;
@@ -1533,6 +1531,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     kcpp_params->n_batch = n_batch;
     kcpp_params->n_threads = n_threads;
     kcpp_params->n_threads_batch = n_blasthreads;
+    kcpp_params->smoothing_factor = inputs.smoothing_factor;
 
     bool stream_sse = inputs.stream_sse;
 
@@ -1675,7 +1674,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     file_format == FileFormat::GPTJ_2 ||
     file_format == FileFormat::RWKV_1 ||
     file_format==FileFormat::RWKV_2);
-    bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && blasbatchsize!=-1);
+    bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && blasbatchsize>=32);
     // bool blasmode = false;
     int original_batch = kcpp_params->n_batch;
     int original_threads = kcpp_params->n_threads;
@@ -1930,6 +1929,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             const float tfs_z = kcpp_params->tfs_z;
             const float dynatemp_range = kcpp_params->dynatemp_range;
             const float dynatemp_exponent = kcpp_params->dynatemp_exponent;
+            const float smoothing_factor = kcpp_params->smoothing_factor;
 
             if (!startedsampling)
             {
@@ -1985,7 +1985,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
             id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty, presence_penalty,
             top_k, top_a, top_p, min_p, typical_p, tfs_z, temp, rng,
-            kcpp_params->mirostat, kcpp_params->mirostat_tau, kcpp_params->mirostat_eta, sampler_order, grammar, dynatemp_range, dynatemp_exponent);
+            kcpp_params->mirostat, kcpp_params->mirostat_tau, kcpp_params->mirostat_eta, sampler_order, grammar, dynatemp_range, dynatemp_exponent, smoothing_factor);
 
             if (grammar != nullptr) {
                 grammar_accept_token(file_format, n_vocab, grammar, id);
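The temp <= 0 branch of sample_temperature above gets greedy decoding without a separate argmax path: dividing by a temperature of 1/256 multiplies every logit by 256, which pushes essentially all of the softmax mass onto the best token, and llama_sample_top_k with k = 1 then keeps only that candidate. Note that this branch also passes 0 for smoothing_factor, so smoothing never applies to greedy sampling. A quick illustration with made-up logits:

import math

def softmax(xs):
    m = max(xs)
    es = [math.exp(x - m) for x in xs]
    s = sum(es)
    return [e / s for e in es]

logits = [2.0, 1.5, 0.5]
print([round(p, 6) for p in softmax([x / 0.00390625 for x in logits])])
# -> [1.0, 0.0, 0.0]: a 0.5 logit gap becomes 128 after dividing by 1/256,
#    leaving the runner-up at roughly e**-128 of the winner's probability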
diff --git a/koboldcpp.py b/koboldcpp.py
old mode 100755
new mode 100644
index 0f4d1cea6..05358f59a
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -81,6 +81,7 @@ class generation_inputs(ctypes.Structure):
                 ("quiet", ctypes.c_bool),
                 ("dynatemp_range", ctypes.c_float),
                 ("dynatemp_exponent", ctypes.c_float),
+                ("smoothing_factor", ctypes.c_float),
                 ("logit_biases", logit_bias * logit_bias_max)]
 
 class generation_outputs(ctypes.Structure):
@@ -328,7 +329,7 @@ def load_model(model_filename):
     ret = handle.load_model(inputs)
     return ret
 
-def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, dynatemp_exponent=1.0, logit_biases={}):
+def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.0, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, dynatemp_exponent=1.0, smoothing_factor=0.0, logit_biases={}):
     global maxctx, args, currentusergenkey, totalgens, pendingabortkey
     inputs = generation_inputs()
     outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
@@ -359,6 +360,7 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu
     inputs.quiet = quiet
     inputs.dynatemp_range = dynatemp_range
     inputs.dynatemp_exponent = dynatemp_exponent
+    inputs.smoothing_factor = smoothing_factor
     inputs.grammar = grammar.encode("UTF-8")
     inputs.grammar_retain_state = grammar_retain_state
     inputs.unban_tokens_rt = not use_default_badwordsids
@@ -588,6 +590,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                 quiet=is_quiet,
                 dynatemp_range=genparams.get('dynatemp_range', 0.0),
                 dynatemp_exponent=genparams.get('dynatemp_exponent', 1.0),
+                smoothing_factor=genparams.get('smoothing_factor', 0.0),
                 logit_biases=genparams.get('logit_bias', {})
                 )
 
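On the Python side, generation_inputs is a ctypes mirror of the C struct in expose.h, so the new ("smoothing_factor", ctypes.c_float) entry has to occupy the same position as its C counterpart (after dynatemp_exponent, before logit_biases); otherwise every later field would be read at the wrong offset. With the server running, the new knob can then be exercised end to end. A minimal client sketch, assuming koboldcpp's stock /api/v1/generate endpoint on the default port 5001:

import json
import urllib.request

payload = {
    "prompt": "Once upon a time",
    "max_length": 32,
    "temperature": 0.8,
    "smoothing_factor": 0.3,  # read by genparams.get('smoothing_factor', 0.0) above
}
req = urllib.request.Request(
    "http://localhost:5001/api/v1/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode("utf-8"))

Omitting the field keeps the previous behaviour, since the handler defaults it to 0.0.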
diff --git a/llama.cpp b/llama.cpp
index 24a8b40a2..f5d59951c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8769,7 +8769,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     }
 }
 
-void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
+void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) {
     const int64_t t_start_sample_us = ggml_time_us();
 
     // no need to do anything if there is only one (or zero) candidates
@@ -8797,15 +8797,6 @@ void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * c
     // Map the normalized entropy to the desired temperature range using the power function
     float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
 
-#ifdef DEBUG
-    LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
-    LLAMA_LOG_INFO("Entropy: %f\n", entropy);
-    LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
-    LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
-    LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
-    LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
-#endif
-
     // Apply the dynamically calculated temperature scaling
     for (size_t i = 0; i < candidates_p->size; ++i) {
         candidates_p->data[i].logit /= dyn_temp;
@@ -8823,34 +8814,54 @@ void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * c
         candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
     }
 
-#ifdef DEBUG
-    // Print the updated top 25 probabilities after temperature scaling
-    LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
-    for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
-        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
+    // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
+    if (smoothing_factor > 0 && candidates_p->size > 1) {
+
+        llama_sample_softmax(ctx, candidates_p);
+        float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
+        // Apply quadratic transformation using the smoothing_factor
+        for (size_t i = 0; i < candidates_p->size; ++i)
+        {
+            float logit_shifted = candidates_p->data[i].logit - h;
+            candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
+        }
+        llama_sample_softmax(ctx, candidates_p);
     }
-#endif
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
 }
 
-void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float smoothing_factor) {
     const int64_t t_start_sample_us = ggml_time_us();
 
     for (size_t i = 0; i < candidates_p->size; ++i) {
         candidates_p->data[i].logit /= temp;
     }
 
+    // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
+    if (smoothing_factor > 0 && candidates_p->size > 1) {
+
+        llama_sample_softmax(ctx, candidates_p);
+        float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
+        // Apply quadratic transformation using the smoothing_factor
+        for (size_t i = 0; i < candidates_p->size; ++i)
+        {
+            float logit_shifted = candidates_p->data[i].logit - h;
+            candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
+        }
+        llama_sample_softmax(ctx, candidates_p);
+    }
+
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
 }
 
-void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp(ctx, candidates_p, temp);
+void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float smoothing_factor) {
+    llama_sample_temp(ctx, candidates_p, temp, smoothing_factor);
 }
 
 // The llama.cpp repetition penalty code goes unused in kobold's API
diff --git a/llama.h b/llama.h
index e55c9826b..8c01f5bf4 100644
--- a/llama.h
+++ b/llama.h
@@ -789,12 +789,14 @@ extern "C" {
             llama_token_data_array * candidates_p,
             float   min_temp,
             float   max_temp,
-            float   exponent_val);
+            float   exponent_val,
+            float   smoothing_factor);
 
     LLAMA_API void llama_sample_temp(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
-            float   temp);
+            float   temp,
+            float   smoothing_factor);
 
     LLAMA_API DEPRECATED(void llama_sample_temperature(
             struct llama_context * ctx,
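One property of the transform above worth calling out: f(x) = -smoothing_factor * (x - h)^2 + h is monotonically increasing for x <= h, so it never reorders candidates. The descending order established by the first llama_sample_softmax call therefore stays valid (data[0] keeps the maximum logit, and greedy decoding is unaffected since f(h) = h), and the second llama_sample_softmax call can safely recompute probabilities over the already-sorted array. A quick property check, again as an illustrative Python sketch:

import random

def smooth(logits, k):
    h = max(logits)
    return [-k * (x - h) ** 2 + h for x in logits]

random.seed(0)
xs = sorted((random.uniform(-10.0, 5.0) for _ in range(8)), reverse=True)
ys = smooth(xs, 0.5)
print(ys == sorted(ys, reverse=True))  # True: the ranking survives the transform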