Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	README.md
#	examples/CMakeLists.txt
#	examples/batched/batched.cpp
#	examples/gritlm/gritlm.cpp
#	examples/llama.android/llama/build.gradle.kts
#	examples/main/README.md
#	examples/retrieval/retrieval.cpp
#	examples/server/CMakeLists.txt
#	examples/server/README.md
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml.c
#	scripts/compare-commits.sh
#	scripts/sync-ggml.last
#	tests/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
#	tests/test-sampling.cpp
This commit is contained in:
Concedo 2024-12-19 11:57:43 +08:00
commit ee486bad3e
59 changed files with 20531 additions and 13185 deletions

View file

@ -482,9 +482,6 @@ extern "C" {
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
// Get a llama model tensor
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
// Returns true if the model contains an encoder that requires llama_encode() call
LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
@ -1141,16 +1138,12 @@ extern "C" {
const char * grammar_str,
const char * grammar_root);
/// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
int32_t n_vocab, // llama_n_vocab()
llama_token special_eos_id, // llama_token_eos()
llama_token linefeed_id, // llama_token_nl()
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat, // 1.0 = disabled
float penalty_freq, // 0.0 = disabled
float penalty_present, // 0.0 = disabled
bool penalize_nl, // consider newlines as a repeatable token
bool ignore_eos); // ignore the end-of-sequence token
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat, // 1.0 = disabled
float penalty_freq, // 0.0 = disabled
float penalty_present); // 0.0 = disabled
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
LLAMA_API struct llama_sampler * llama_sampler_init_dry(