mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-04-28 03:30:20 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # examples/batched/batched.cpp # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # src/llama-context.cpp # tools/cli/README.md # tools/completion/README.md # tools/server/README.md
This commit is contained in:
commit
0d43bdc46d
26 changed files with 906 additions and 421 deletions
|
|
@ -21,7 +21,9 @@ struct llama_sampler_deleter {
|
|||
};
|
||||
|
||||
struct llama_adapter_lora_deleter {
|
||||
void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
|
||||
void operator()(llama_adapter_lora *) {
|
||||
// llama_adapter_lora_free is deprecated
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
|
||||
|
|
|
|||
|
|
@ -649,7 +649,8 @@ extern "C" {
|
|||
|
||||
// Manually free a LoRA adapter
|
||||
// NOTE: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
|
||||
"adapters are now freed together with the associated model");
|
||||
|
||||
// Get the invocation tokens if the current lora is an alora
|
||||
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
|
||||
|
|
@ -1258,7 +1259,6 @@ extern "C" {
|
|||
// [EXPERIMENTAL]
|
||||
// attach a sampler to the context
|
||||
// note: prefer initializing the context with llama_context_params.samplers when possible
|
||||
// note: changing the samplers of a context can cause graph reallocations and degraded performance
|
||||
LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
|
||||
|
||||
// mirror of llama_sampler_i:
|
||||
|
|
@ -1398,6 +1398,33 @@ extern "C" {
|
|||
const char ** seq_breakers,
|
||||
size_t num_breakers);
|
||||
|
||||
/// adaptive-p: select tokens near a configurable target probability over time.
|
||||
///
|
||||
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
|
||||
/// that fall near a user-configurable probability target.
|
||||
///
|
||||
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
|
||||
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
|
||||
/// adapted target probability at each sampling step, thus maintaining the desired target
|
||||
/// probability over time.
|
||||
///
|
||||
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
|
||||
/// in the sampler chain (like mirostat, dist, greedy).
|
||||
///
|
||||
/// only mild truncation before this sampler is recommended. we suggest applying min-p
|
||||
/// before adaptive-p as the only other active sampler in the chain.
|
||||
///
|
||||
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
|
||||
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
|
||||
/// @param seed RNG seed
|
||||
///
|
||||
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
|
||||
///
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
|
||||
float target,
|
||||
float decay,
|
||||
uint32_t seed);
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
|
||||
int32_t n_vocab,
|
||||
int32_t n_logit_bias,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue