Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-10 17:14:36 +00:00.
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/ISSUE_TEMPLATE/config.yml
#	.gitignore
#	CMakeLists.txt
#	CONTRIBUTING.md
#	Makefile
#	README.md
#	ci/run.sh
#	common/common.h
#	examples/main-cmake-pkg/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	models/ggml-vocab-bert-bge.gguf.inp
#	models/ggml-vocab-bert-bge.gguf.out
#	models/ggml-vocab-deepseek-coder.gguf.inp
#	models/ggml-vocab-deepseek-coder.gguf.out
#	models/ggml-vocab-deepseek-llm.gguf.inp
#	models/ggml-vocab-deepseek-llm.gguf.out
#	models/ggml-vocab-falcon.gguf.inp
#	models/ggml-vocab-falcon.gguf.out
#	models/ggml-vocab-gpt-2.gguf.inp
#	models/ggml-vocab-gpt-2.gguf.out
#	models/ggml-vocab-llama-bpe.gguf.inp
#	models/ggml-vocab-llama-bpe.gguf.out
#	models/ggml-vocab-llama-spm.gguf.inp
#	models/ggml-vocab-llama-spm.gguf.out
#	models/ggml-vocab-mpt.gguf.inp
#	models/ggml-vocab-mpt.gguf.out
#	models/ggml-vocab-phi-3.gguf.inp
#	models/ggml-vocab-phi-3.gguf.out
#	models/ggml-vocab-starcoder.gguf.inp
#	models/ggml-vocab-starcoder.gguf.out
#	requirements.txt
#	requirements/requirements-convert_legacy_llama.txt
#	scripts/check-requirements.sh
#	scripts/pod-llama.sh
#	src/CMakeLists.txt
#	src/llama.cpp
#	tests/test-rope.cpp
Commit 5b605d03ea. 85 changed files with 4095 additions and 941 deletions.
@@ -473,6 +473,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--attention") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "causal")     { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+        else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+        else { invalid_param = true; }
+        return true;
+    }
     if (arg == "--defrag-thold" || arg == "-dt") {
         CHECK_ARG
         params.defrag_thold = std::stof(argv[i]);
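Each option handler above follows the same pattern: consume the flag's value, set the parameter, and return true so the caller stops matching. A minimal standalone sketch of that pattern, with hypothetical names throughout; the inline bounds check stands in for the real CHECK_ARG macro, which is not shown in this diff:

    #include <string>

    // Hypothetical, simplified version of the dispatch pattern used by
    // gpt_params_find_arg; the attention_type encoding is illustrative only.
    struct params_sketch {
        int attention_type = -1; // -1 = unspecified
    };

    static bool find_arg_sketch(int argc, char ** argv, int & i, const std::string & arg,
                                params_sketch & params, bool & invalid_param) {
        if (arg == "--attention") {
            if (++i >= argc) { invalid_param = true; return true; } // roughly what CHECK_ARG guards
            const std::string value = argv[i];
            /**/ if (value == "causal")     { params.attention_type = 0; }
            else if (value == "non-causal") { params.attention_type = 1; }
            else { invalid_param = true; }
            return true;
        }
        return false; // not a recognized flag; let the caller report it
    }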
@@ -758,7 +766,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cache_type_v = argv[++i];
         return true;
     }
-    if (arg == "--multiline-input") {
+    if (arg == "-mli" || arg == "--multiline-input") {
         params.multiline_input = true;
         return true;
     }
@@ -1395,7 +1403,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
     options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
     options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
-    options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+    options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
+                                                    "in conversation mode, this will be used as system prompt\n"
+                                                    "(default: '%s')", params.prompt.c_str() });
     options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
     options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
     options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
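With this change, the prompt given via -p doubles as the system prompt whenever conversation mode is active. A hypothetical invocation (the binary name depends on the build):

    llama-cli -m model.gguf -cnv -p "You are a concise assistant."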
@@ -1410,7 +1420,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                        "halt generation at PROMPT, return control in interactive mode\n"
                                                        "can be specified more than once for multiple prompts" });
     options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
-    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
+                                                        "if suffix/prefix are not specified, default chat template will be used\n"
+                                                        "(default: %s)", params.conversation ? "true" : "false" });
     options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
     options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
     options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
@@ -1454,6 +1466,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
     options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
         "set custom jinja chat template (default: template taken from model's metadata)\n"
+        "if suffix/prefix are specified, template will be disabled\n"
         "only commonly used templates are accepted:\n"
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "grammar" });
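The added line documents the precedence rule: an explicit suffix/prefix disables the template. Templates are selected by name from the linked wiki list; a hypothetical invocation using one of the commonly supported names:

    llama-cli -m model.gguf -cnv --chat-template chatml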
@@ -1464,8 +1477,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });

     options.push_back({ "embedding" });
-    options.push_back({ "embedding", " --pooling {none,mean,cls}",
+    options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
         "pooling type for embeddings, use model default if unspecified" });
+    options.push_back({ "embedding", " --attention {causal,non-causal}",
+        "attention type for embeddings, use model default if unspecified" });

     options.push_back({ "context hacking" });
     options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
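The new "last" pooling takes the final token's embedding instead of aggregating over the whole sequence, which suits causal embedding models. An illustrative sketch (not llama.cpp code) of what the two reductions compute over per-token embeddings:

    #include <vector>

    // "mean" averages the per-token vectors; "last" returns the final one.
    // Assumes tok_emb is non-empty and all rows have the same dimension.
    static std::vector<float> pool_sketch(const std::vector<std::vector<float>> & tok_emb,
                                          bool use_last) {
        if (use_last) {
            return tok_emb.back();          // "last" pooling
        }
        std::vector<float> out(tok_emb[0].size(), 0.0f);
        for (const auto & e : tok_emb) {    // "mean" pooling
            for (size_t d = 0; d < e.size(); ++d) { out[d] += e[d]; }
        }
        for (float & v : out) { v /= (float) tok_emb.size(); }
        return out;
    }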
@@ -2071,7 +2086,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (params.warmup) {
         LOG("warming up the model with an empty run\n");

-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_token_bos(model);
+        llama_token eos = llama_token_eos(model);
+        // some models (e.g. T5) don't have a BOS token
+        if (bos != -1) {
+            tmp.push_back(bos);
+        }
+        tmp.push_back(eos);
+
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == -1) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear();
+            tmp.push_back(decoder_start_token_id);
+        }
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
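The warmup now covers two new cases: models without a BOS token (llama_token_bos returns -1 for e.g. T5), and encoder-decoder models, where the encoder is run first and decoding starts from the model's decoder start token, falling back to BOS when the metadata doesn't define one. A sketch of the same split applied when priming a real prompt, using only the API calls that appear in this hunk (signatures as of this commit; the batch helpers changed in later llama.cpp versions):

    #include <vector>
    #include "llama.h"

    // Prime the context with already-tokenized input; mirrors the warmup above.
    static void prime_sketch(llama_model * model, llama_context * lctx,
                             std::vector<llama_token> tokens) {
        if (llama_model_has_encoder(model)) {
            // encoder-decoder (e.g. T5): encode the prompt once ...
            llama_encode(lctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0));
            // ... then decode from the decoder start token, falling back to BOS
            llama_token start = llama_model_decoder_start_token(model);
            if (start == -1) {
                start = llama_token_bos(model);
            }
            tokens.assign(1, start);
        }
        llama_decode(lctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0));
    }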
@@ -2154,6 +2186,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
+    cparams.attention_type = params.attention_type;
     cparams.defrag_thold = params.defrag_thold;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
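This one-liner is what actually plumbs --attention through to the context. A hypothetical use of the new field, building an embedding context with API names from this era (the embeddings flag and the default-params helper are assumptions about the surrounding API, not shown in this diff):

    #include "llama.h"

    // Hypothetical: an embedding context that overrides the model defaults.
    static llama_context * make_embedding_ctx_sketch(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings     = true;                            // assumption: field name of this era
        cparams.pooling_type   = LLAMA_POOLING_TYPE_MEAN;         // what --pooling mean sets
        cparams.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; // what --attention non-causal sets
        return llama_new_context_with_model(model, cparams);
    }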