mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/llama-server-cuda.Dockerfile # .devops/llama-server-rocm.Dockerfile # .devops/llama-server-vulkan.Dockerfile # .devops/llama-server.Dockerfile # .github/workflows/docker.yml # README.md # llama.cpp # tests/test-chat-template.cpp # tests/test-grammar-integration.cpp # tests/test-json-schema-to-grammar.cpp # tests/test-llama-grammar.cpp
This commit is contained in:
commit
f3dfa96dbc
29 changed files with 2097 additions and 431 deletions
|
@ -1264,11 +1264,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
return true;
|
||||
}
|
||||
// cvector params
|
||||
if (arg == "--completions-file") {
|
||||
CHECK_ARG
|
||||
params.cvector_completions_file = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--positive-file") {
|
||||
CHECK_ARG
|
||||
params.cvector_positive_file = argv[i];
|
||||
|
@ -1279,11 +1274,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
params.cvector_negative_file = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--completions") {
|
||||
CHECK_ARG
|
||||
params.n_completions = std::stoi(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--pca-batch") {
|
||||
CHECK_ARG
|
||||
params.n_pca_batch = std::stoi(argv[i]);
|
||||
|
@ -1294,6 +1284,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
params.n_pca_iterations = std::stoi(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--method") {
|
||||
CHECK_ARG
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
|
||||
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
|
||||
else { invalid_param = true; }
|
||||
return true;
|
||||
}
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
// Parse args for logging parameters
|
||||
if (log_param_single_parse(argv[i])) {
|
||||
|
@ -1445,7 +1443,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
|
||||
"negative prompt file to use for guidance" });
|
||||
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
|
||||
|
||||
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
|
||||
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
||||
"only commonly used templates are accepted:\n"
|
||||
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
||||
options.push_back({ "grammar" });
|
||||
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
|
||||
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
|
||||
|
@ -1624,11 +1625,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
|
||||
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
|
||||
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
|
||||
options.push_back({ "cvector", " --completions-file FNAME",
|
||||
"completions file (default: '%s')", params.cvector_completions_file.c_str() });
|
||||
options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
|
||||
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
|
||||
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
|
||||
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
|
||||
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
|
||||
|
@ -2605,12 +2604,67 @@ bool llama_should_add_bos_token(const llama_model * model) {
|
|||
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
|
||||
}
|
||||
|
||||
//
|
||||
// Chat template utils
|
||||
//
|
||||
|
||||
bool llama_chat_verify_template(const std::string & tmpl) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
return res >= 0;
|
||||
}
|
||||
|
||||
std::string llama_chat_apply_template(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<llama_chat_msg> & msgs,
|
||||
bool add_ass) {
|
||||
int alloc_size = 0;
|
||||
std::vector<llama_chat_message> chat;
|
||||
for (auto & msg : msgs) {
|
||||
chat.push_back({msg.role.c_str(), msg.content.c_str()});
|
||||
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
|
||||
}
|
||||
|
||||
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
||||
std::vector<char> buf(alloc_size);
|
||||
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
|
||||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
}
|
||||
|
||||
std::string formatted_chat(buf.data(), res);
|
||||
return formatted_chat;
|
||||
}
|
||||
|
||||
std::string llama_chat_format_single(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<llama_chat_msg> & past_msg,
|
||||
const llama_chat_msg & new_msg,
|
||||
bool add_ass) {
|
||||
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
|
||||
std::vector<llama_chat_msg> chat_new(past_msg);
|
||||
chat_new.push_back(new_msg);
|
||||
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
|
||||
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
|
||||
return formatted;
|
||||
}
|
||||
|
||||
std::string llama_chat_format_example(const struct llama_model * model,
|
||||
const std::string & tmpl) {
|
||||
std::vector<llama_chat_msg> msgs = {
|
||||
{"system", "You are a helpful assistant"},
|
||||
{"user", "Hello"},
|
||||
{"assistant", "Hi there"},
|
||||
{"user", "How are you?"},
|
||||
};
|
||||
return llama_chat_apply_template(model, tmpl, msgs, true);
|
||||
}
|
||||
|
||||
//
|
||||
// KV cache utils
|
||||
//
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue