mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 17:44:38 +00:00
* Add the DRY dynamic N-gram anti-repetition sampler The DRY (Do not Repeat Yourself) sampler is a dynamic N-gram repetition penalty that negatively scores tokens that would extend sequences that already appear in the context. See this discussion for a motivation and explanation of the sampler: https://github.com/oobabooga/text-generation-webui/pull/5677 This implementation of DRY mostly aligns with the oobabooga version with a few modifications. It uses a more efficient linear scanning algorithm to identify repetitions. It also supports multi-token sequence breakers. As a limitation, this implementation reuses the rep pen range parameter, rather than introducing a new range just for the DRY sampler. There is a separate change to lite.koboldai.net that exposes the DRY sampler parameters to KoboldAI Lite, so none of the embed files have been changed as part of this commit. * Update default DRY parameters to match lite * Improve DRY token debug logging * Replace `and` with `&&` to fix MSVC compile error Little known fact: The C++98 standard defines `and` as an alternative token for the `&&` operator (along with a bunch of other digraphs). MSVC does not allow these without using the /Za option or including the <iso646.h> header. Change to the more standard operator to make this code more portable. * Fix MSVC compile error because log is not constexpr Replace the compile-time computation with a floating-point approximation of log(std::numeric_limits<float>::max()). * Remove unused llama sampler variables and clean up sequence breakers. * Remove KCPP_SAMPLER_DRY as a separate enum entry The DRY sampler is effectively a repetition penalty and there are very few reasons to apply it at a different place in sampler order than the standard single-token penalty. There are also multiple projects that have dependencies on the existing sampler IDs, including KoboldAI, KoboldAI Lite, and Silly Tavern. 
In order to minimize the impact of the dependencies of adding the DRY sampler to koboldcpp, it makes the most sense to not add a new ID for now, and instead to piggyback on KCPP_SAMPLER_REP_PEN. In the future if we find a use case for splitting the application of rep pen and DRY we can introduce a new enum entry then. * Add the dry_penalty_last_n to independently control DRY penalty range This parameter follows the oobabooga semantics: it's optional, with a default value of zero. Zero means that DRY should sample the entire context. Otherwise, it's the number of tokens from the end of the context that are scanned for repetitions. * Limit sequence breaker lengths in tokens and characters The core DRY sampler algorithm is linear in the context length, but there are several parts of the sampler related to multi-token sequence breakers that are potentially quadratic. Without any restrictions, a suitably crafted context and sequence breaker could result in a denial-of-service attack on a server running koboldcpp. This change limits the maximum number of characters and the maximum token length of a sequence breaker in order to limit the maximum overhead associated with the sampler. This change also improves some comments, adding more detail and changing the wording to increase clarity.
187 lines
5.3 KiB
C++
187 lines
5.3 KiB
C++
#pragma once

#include <cstdint>
#include <string>
#include <vector>
// Hard limits for the fixed-size arrays exchanged across the C API boundary.
// constexpr (C++11) so they are guaranteed compile-time constants usable as
// array bounds; values must stay in sync with the host-side (Python) bindings.
constexpr int stop_token_max = 16;    // max custom stop sequences per request
constexpr int ban_token_max = 16;     // max banned token strings per request
constexpr int tensor_split_max = 16;  // max devices in a tensor split
constexpr int logit_bias_max = 16;    // max logit bias entries per request
constexpr int dry_seq_break_max = 16; // max DRY sequence breakers per request
constexpr int images_max = 4;         // max images per request
// match kobold's sampler list and order
// NOTE: external clients (KoboldAI, KoboldAI Lite, SillyTavern) depend on
// these numeric IDs, so existing values must never be renumbered.
enum samplers
{
    KCPP_SAMPLER_TOP_K=0,   // top-k filtering
    KCPP_SAMPLER_TOP_A=1,   // top-a filtering
    KCPP_SAMPLER_TOP_P=2,   // nucleus (top-p) sampling
    KCPP_SAMPLER_TFS=3,     // tail-free sampling
    KCPP_SAMPLER_TYP=4,     // typical sampling
    KCPP_SAMPLER_TEMP=5,    // temperature scaling
    KCPP_SAMPLER_REP_PEN=6, // repetition penalty (the DRY sampler piggybacks on this slot)
    KCPP_SAMPLER_MAX        // sentinel: number of sampler slots
};
// Why a generation run ended.
enum stop_reason
{
    INVALID=-1,       // no completed generation yet / error state
    OUT_OF_TOKENS=0,  // produced the requested number of tokens
    EOS_TOKEN_HIT=1,  // model emitted an end-of-sequence token
    CUSTOM_STOPPER=2, // matched a caller-supplied stop sequence
};
// One (token id, bias) entry; presumably added to that token's logit before
// sampling — confirm against the sampler implementation.
// Default member initializers added for consistency with every other struct
// in this header, so a default-constructed entry is a well-defined no-op.
struct logit_bias {
    int32_t token_id = 0; // vocabulary id of the token to bias
    float bias = 0.0f;    // additive bias; 0 means no change
};
// Parameters for loading a text model.
// NOTE(review): this struct appears to be populated byte-for-byte across a
// C ABI boundary (ctypes-style) — field order and types must not change.
struct load_model_inputs
{
    const int threads = 0;                  // worker threads for generation
    const int blasthreads = 0;              // threads used during BLAS/prompt processing
    const int max_context_length = 0;       // context window size in tokens
    const bool low_vram = 0;                // reduced-VRAM mode
    const bool use_mmq = 0;                 // presumably quantized matmul kernels — confirm
    const bool use_rowsplit = 0;            // presumably row-wise multi-GPU split — confirm
    const char * executable_path = nullptr; // directory of the running binary
    const char * model_filename = nullptr;  // path to the main model file
    const char * lora_filename = nullptr;   // optional LoRA adapter path
    const char * lora_base = nullptr;       // optional base model used when applying the LoRA
    const char * mmproj_filename = nullptr; // optional multimodal projector path
    const bool use_mmap = false;            // memory-map the model file
    const bool use_mlock = false;           // lock model memory to avoid swapping
    const bool use_smartcontext = false;    // smart-context trick (older alternative to contextshift)
    const bool use_contextshift = false;    // KV-cache context shifting
    const int clblast_info = 0;             // CLBlast device selector (presumably platform/device id)
    const int cublas_info = 0;              // CUDA device selector
    const char * vulkan_info = nullptr;     // Vulkan device selector string
    const int blasbatchsize = 512;          // batch size for prompt processing
    const int debugmode = 0;                // verbosity / debug level
    const int forceversion = 0;             // force a specific file-format version; 0 = auto-detect
    const int gpulayers = 0;                // number of layers to offload to GPU
    const float rope_freq_scale = 1.0f;     // RoPE frequency scale
    const float rope_freq_base = 10000.0f;  // RoPE frequency base
    const bool flash_attention = false;     // enable flash attention
    const float tensor_split[tensor_split_max] = {}; // per-device split ratios
    const int quant_k = 0;                  // KV cache quantization selector for K
    const int quant_v = 0;                  // KV cache quantization selector for V
};
// Parameters for a single text generation request.
// NOTE(review): mirrored across a C ABI boundary — field order/types are fixed.
struct generation_inputs
{
    const int seed = 0;                   // RNG seed for sampling
    const char * prompt = nullptr;        // prompt text
    const char * memory = nullptr;        // text kept pinned at the start of context
    const char * images[images_max] = {}; // image payloads for multimodal models
    const int max_context_length = 0;     // context window for this request
    const int max_length = 0;             // maximum number of tokens to generate
    const float temperature = 0.0f;       // sampling temperature
    const int top_k = 0;                  // top-k cutoff
    const float top_a = 0.0f;             // top-a cutoff
    const float top_p = 0.0f;             // nucleus sampling cutoff
    const float min_p = 0.0f;             // min-p cutoff
    const float typical_p = 0;            // typical sampling cutoff
    const float tfs = 0;                  // tail-free sampling cutoff
    const float rep_pen = 0;              // repetition penalty multiplier
    const int rep_pen_range = 0;          // tokens of recent context scanned by rep pen
    const float rep_pen_slope = 1.0f;     // rep pen falloff slope
    const float presence_penalty = 0.0f;  // flat penalty for already-seen tokens
    const int mirostat = 0;               // 0 = off, otherwise mirostat mode
    const float mirostat_eta = 0.0f;      // mirostat learning rate
    const float mirostat_tau = 0.0f;      // mirostat target entropy
    // DRY dynamic N-gram anti-repetition sampler (applied in the rep pen slot).
    const float dry_multiplier = 0.0f;    // penalty scale; 0 presumably disables DRY
    const float dry_base = 0.0f;          // exponential base for penalty growth
    const int dry_allowed_length = 0;     // repeat length allowed before penalizing
    const int dry_penalty_last_n = 0;     // tokens from the end of context to scan; 0 = entire context
    const char * dry_sequence_breakers[dry_seq_break_max] = {}; // strings that reset DRY matching
    const samplers sampler_order[KCPP_SAMPLER_MAX] = {}; // order in which samplers are applied
    const int sampler_len = 0;            // number of valid entries in sampler_order
    const bool allow_eos_token = false;   // permit the EOS token to be sampled
    const bool bypass_eos_token = false;  // presumably ignore EOS when hit — confirm
    const bool render_special = false;    // render special tokens into the output text
    const char * stop_sequence[stop_token_max] = {}; // custom stop strings
    const bool stream_sse = false;        // stream tokens via server-sent events
    const char * grammar = nullptr;       // grammar constraining the output (presumably GBNF)
    const bool grammar_retain_state = false; // keep grammar state across requests
    const bool quiet = false;             // suppress console output for this request
    const float dynatemp_range = 0.0f;    // dynamic temperature range; 0 = off
    const float dynatemp_exponent = 1.0f; // dynamic temperature exponent
    const float smoothing_factor = 0.0f;  // quadratic sampling smoothing factor
    const logit_bias logit_biases[logit_bias_max] = {}; // per-token logit adjustments
    const char * banned_tokens[ban_token_max] = {};     // strings whose tokens are banned
};
struct generation_outputs
|
|
{
|
|
int status = -1;
|
|
int stopreason = stop_reason::INVALID;
|
|
const char * text; //response will now be stored in c++ allocated memory
|
|
};
|
|
// Result of a token-counting call.
struct token_count_outputs
{
    int count = 0;       // number of tokens counted
    // Initialized to nullptr so a default-constructed result never exposes
    // an indeterminate pointer to the caller.
    int * ids = nullptr; //we'll just use shared memory for this one, bit of a hack
};
// Parameters for loading a Stable Diffusion image model.
struct sd_load_model_inputs
{
    const char * model_filename = nullptr;  // path to the image model
    const char * executable_path = nullptr; // directory of the running binary
    const int clblast_info = 0;             // CLBlast device selector
    const int cublas_info = 0;              // CUDA device selector
    const char * vulkan_info = nullptr;     // Vulkan device selector string
    const int threads = 0;                  // worker threads
    const int quant = 0;                    // quantization mode selector
    const bool taesd = false;               // presumably use the TAESD tiny autoencoder — confirm
    const char * vae_filename = nullptr;    // optional external VAE path
    const char * lora_filename = nullptr;   // optional LoRA path
    const float lora_multiplier = 1.0f;     // LoRA strength
    const int debugmode = 0;                // verbosity / debug level
};
// Parameters for a single image generation request.
struct sd_generation_inputs
{
    const char * prompt = nullptr;          // positive prompt
    const char * negative_prompt = nullptr; // negative prompt
    const char * init_images = "";          // init image data for img2img; empty = txt2img (presumably)
    const float denoising_strength = 0.0f;  // img2img denoising strength
    const float cfg_scale = 0.0f;           // classifier-free guidance scale
    const int sample_steps = 0;             // diffusion sampling steps
    const int width = 0;                    // output width in pixels
    const int height = 0;                   // output height in pixels
    const int seed = 0;                     // RNG seed
    const char * sample_method = nullptr;   // sampler name; valid values — TODO confirm against impl
    const int clip_skip = -1;               // CLIP layers to skip; -1 presumably means default
    const bool quiet = false;               // suppress console output
};
// Result of an image generation call.
struct sd_generation_outputs
{
    int status = -1;        // negative indicates failure
    const char * data = ""; // generated image payload (presumably base64-encoded — confirm)
};
// Parameters for loading a Whisper speech-to-text model.
struct whisper_load_model_inputs
{
    const char * model_filename = nullptr;  // path to the whisper model
    const char * executable_path = nullptr; // directory of the running binary
    const int clblast_info = 0;             // CLBlast device selector
    const int cublas_info = 0;              // CUDA device selector
    const char * vulkan_info = nullptr;     // Vulkan device selector string
    const int debugmode = 0;                // verbosity / debug level
};
// Parameters for a single transcription request.
struct whisper_generation_inputs
{
    const char * prompt = nullptr;     // optional text prompt to bias transcription
    const char * audio_data = nullptr; // audio payload (encoding — TODO confirm against caller)
    const bool quiet = false;          // suppress console output
};
// Result of a transcription call.
struct whisper_generation_outputs
{
    int status = -1;        // negative indicates failure
    const char * text = ""; // transcribed text
};
|
|
extern std::string executable_path;
|
|
extern std::string lora_filename;
|
|
extern std::string lora_base;
|
|
extern std::string mmproj_filename;
|
|
extern std::vector<std::string> generated_tokens;
|
|
extern bool generation_finished;
|
|
extern float last_eval_time;
|
|
extern float last_process_time;
|
|
extern int last_token_count;
|
|
extern int last_seed;
|
|
extern int total_gens;
|
|
extern int total_img_gens;
|
|
extern stop_reason last_stop_reason;
|