mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/llama-server-cuda.Dockerfile # .devops/llama-server-rocm.Dockerfile # .devops/llama-server-vulkan.Dockerfile # .devops/llama-server.Dockerfile # .github/workflows/docker.yml # README.md # llama.cpp # tests/test-chat-template.cpp # tests/test-grammar-integration.cpp # tests/test-json-schema-to-grammar.cpp # tests/test-llama-grammar.cpp
This commit is contained in:
commit
f3dfa96dbc
29 changed files with 2097 additions and 431 deletions
|
@ -20,10 +20,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
|||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev
|
||||
apt-get install -y libcurl4-openssl-dev curl
|
||||
|
||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/llama-server" ]
|
||||
|
|
|
@ -1264,11 +1264,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
return true;
|
||||
}
|
||||
// cvector params
|
||||
if (arg == "--completions-file") {
|
||||
CHECK_ARG
|
||||
params.cvector_completions_file = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--positive-file") {
|
||||
CHECK_ARG
|
||||
params.cvector_positive_file = argv[i];
|
||||
|
@ -1279,11 +1274,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
params.cvector_negative_file = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--completions") {
|
||||
CHECK_ARG
|
||||
params.n_completions = std::stoi(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--pca-batch") {
|
||||
CHECK_ARG
|
||||
params.n_pca_batch = std::stoi(argv[i]);
|
||||
|
@ -1294,6 +1284,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
params.n_pca_iterations = std::stoi(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--method") {
|
||||
CHECK_ARG
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
|
||||
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
|
||||
else { invalid_param = true; }
|
||||
return true;
|
||||
}
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
// Parse args for logging parameters
|
||||
if (log_param_single_parse(argv[i])) {
|
||||
|
@ -1445,7 +1443,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
|
||||
"negative prompt file to use for guidance" });
|
||||
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
|
||||
|
||||
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
|
||||
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
||||
"only commonly used templates are accepted:\n"
|
||||
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
|
||||
options.push_back({ "grammar" });
|
||||
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
|
||||
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
|
||||
|
@ -1624,11 +1625,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
|
||||
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
|
||||
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
|
||||
options.push_back({ "cvector", " --completions-file FNAME",
|
||||
"completions file (default: '%s')", params.cvector_completions_file.c_str() });
|
||||
options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
|
||||
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
|
||||
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
|
||||
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
|
||||
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
|
||||
|
@ -2605,12 +2604,67 @@ bool llama_should_add_bos_token(const llama_model * model) {
|
|||
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
|
||||
}
|
||||
|
||||
//
|
||||
// Chat template utils
|
||||
//
|
||||
|
||||
bool llama_chat_verify_template(const std::string & tmpl) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
return res >= 0;
|
||||
}
|
||||
|
||||
std::string llama_chat_apply_template(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<llama_chat_msg> & msgs,
|
||||
bool add_ass) {
|
||||
int alloc_size = 0;
|
||||
std::vector<llama_chat_message> chat;
|
||||
for (auto & msg : msgs) {
|
||||
chat.push_back({msg.role.c_str(), msg.content.c_str()});
|
||||
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
|
||||
}
|
||||
|
||||
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
||||
std::vector<char> buf(alloc_size);
|
||||
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
|
||||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
}
|
||||
|
||||
std::string formatted_chat(buf.data(), res);
|
||||
return formatted_chat;
|
||||
}
|
||||
|
||||
std::string llama_chat_format_single(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<llama_chat_msg> & past_msg,
|
||||
const llama_chat_msg & new_msg,
|
||||
bool add_ass) {
|
||||
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
|
||||
std::vector<llama_chat_msg> chat_new(past_msg);
|
||||
chat_new.push_back(new_msg);
|
||||
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
|
||||
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
|
||||
return formatted;
|
||||
}
|
||||
|
||||
std::string llama_chat_format_example(const struct llama_model * model,
|
||||
const std::string & tmpl) {
|
||||
std::vector<llama_chat_msg> msgs = {
|
||||
{"system", "You are a helpful assistant"},
|
||||
{"user", "Hello"},
|
||||
{"assistant", "Hi there"},
|
||||
{"user", "How are you?"},
|
||||
};
|
||||
return llama_chat_apply_template(model, tmpl, msgs, true);
|
||||
}
|
||||
|
||||
//
|
||||
// KV cache utils
|
||||
//
|
||||
|
|
|
@ -48,6 +48,12 @@ int32_t cpu_get_num_math();
|
|||
// CLI argument parsing
|
||||
//
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
enum dimre_method {
|
||||
DIMRE_METHOD_PCA,
|
||||
DIMRE_METHOD_MEAN,
|
||||
};
|
||||
|
||||
struct gpt_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||
|
||||
|
@ -255,13 +261,12 @@ struct gpt_params {
|
|||
bool compute_ppl = true; // whether to compute perplexity
|
||||
|
||||
// cvector-generator params
|
||||
int n_completions = 64;
|
||||
int n_pca_batch = 20;
|
||||
int n_pca_batch = 100;
|
||||
int n_pca_iterations = 1000;
|
||||
std::string cvector_outfile = "control_vector.gguf";
|
||||
std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
|
||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||
std::string cvector_outfile = "control_vector.gguf";
|
||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
};
|
||||
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
|
@ -382,9 +387,32 @@ bool llama_should_add_bos_token(const llama_model * model);
|
|||
// Chat template utils
|
||||
//
|
||||
|
||||
// same with llama_chat_message, but uses std::string
|
||||
struct llama_chat_msg {
|
||||
std::string role;
|
||||
std::string content;
|
||||
};
|
||||
|
||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
bool llama_chat_verify_template(const std::string & tmpl);
|
||||
|
||||
// CPP wrapper for llama_chat_apply_template
|
||||
std::string llama_chat_apply_template(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<llama_chat_msg> & chat,
|
||||
bool add_ass);
|
||||
|
||||
// Format single message, while taking into account the position of that message in chat history
|
||||
std::string llama_chat_format_single(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<llama_chat_msg> & past_msg,
|
||||
const llama_chat_msg & new_msg,
|
||||
bool add_ass);
|
||||
|
||||
// Returns an example of formatted chat
|
||||
std::string llama_chat_format_example(const struct llama_model * model,
|
||||
const std::string & tmpl);
|
||||
|
||||
//
|
||||
// KV cache utils
|
||||
//
|
||||
|
|
|
@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items
|
|||
return result;
|
||||
}
|
||||
|
||||
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
|
||||
class string_view {
|
||||
const std::string & _str;
|
||||
const size_t _start;
|
||||
const size_t _end;
|
||||
public:
|
||||
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
|
||||
|
||||
size_t size() const {
|
||||
return _end - _start;
|
||||
}
|
||||
|
||||
size_t length() const {
|
||||
return size();
|
||||
}
|
||||
|
||||
operator std::string() const {
|
||||
return str();
|
||||
}
|
||||
|
||||
std::string str() const {
|
||||
return _str.substr(_start, _end - _start);
|
||||
}
|
||||
|
||||
string_view substr(size_t pos, size_t len = std::string::npos) const {
|
||||
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
|
||||
}
|
||||
|
||||
char operator[](size_t pos) const {
|
||||
auto index = _start + pos;
|
||||
if (index >= _end) {
|
||||
throw std::out_of_range("string_view index out of range");
|
||||
}
|
||||
return _str[_start + pos];
|
||||
}
|
||||
|
||||
bool operator==(const string_view & other) const {
|
||||
std::string this_str = *this;
|
||||
std::string other_str = other;
|
||||
return this_str == other_str;
|
||||
}
|
||||
};
|
||||
|
||||
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
|
||||
auto has_min = min_value != std::numeric_limits<int>::min();
|
||||
auto has_max = max_value != std::numeric_limits<int>::max();
|
||||
|
||||
auto digit_range = [&](char from, char to) {
|
||||
out << "[";
|
||||
if (from == to) {
|
||||
out << from;
|
||||
} else {
|
||||
out << from << "-" << to;
|
||||
}
|
||||
out << "]";
|
||||
};
|
||||
auto more_digits = [&](int min_digits, int max_digits) {
|
||||
out << "[0-9]";
|
||||
if (min_digits == max_digits && min_digits == 1) {
|
||||
return;
|
||||
}
|
||||
out << "{";
|
||||
out << min_digits;
|
||||
if (max_digits != min_digits) {
|
||||
out << ",";
|
||||
if (max_digits != std::numeric_limits<int>::max()) {
|
||||
out << max_digits;
|
||||
}
|
||||
}
|
||||
out << "}";
|
||||
};
|
||||
std::function<void(const string_view &, const string_view &)> uniform_range =
|
||||
[&](const string_view & from, const string_view & to) {
|
||||
size_t i = 0;
|
||||
while (i < from.length() && i < to.length() && from[i] == to[i]) {
|
||||
i++;
|
||||
}
|
||||
if (i > 0) {
|
||||
out << "\"" << from.substr(0, i).str() << "\"";
|
||||
}
|
||||
if (i < from.length() && i < to.length()) {
|
||||
if (i > 0) {
|
||||
out << " ";
|
||||
}
|
||||
auto sub_len = from.length() - i - 1;
|
||||
if (sub_len > 0) {
|
||||
auto from_sub = from.substr(i + 1);
|
||||
auto to_sub = to.substr(i + 1);
|
||||
auto sub_zeros = repeat("0", sub_len);
|
||||
auto sub_nines = repeat("9", sub_len);
|
||||
|
||||
auto to_reached = false;
|
||||
out << "(";
|
||||
if (from_sub == sub_zeros) {
|
||||
digit_range(from[i], to[i] - 1);
|
||||
out << " ";
|
||||
more_digits(sub_len, sub_len);
|
||||
} else {
|
||||
out << "[" << from[i] << "] ";
|
||||
out << "(";
|
||||
uniform_range(from_sub, sub_nines);
|
||||
out << ")";
|
||||
if (from[i] < to[i] - 1) {
|
||||
out << " | ";
|
||||
if (to_sub == sub_nines) {
|
||||
digit_range(from[i] + 1, to[i]);
|
||||
to_reached = true;
|
||||
} else {
|
||||
digit_range(from[i] + 1, to[i] - 1);
|
||||
}
|
||||
out << " ";
|
||||
more_digits(sub_len, sub_len);
|
||||
}
|
||||
}
|
||||
if (!to_reached) {
|
||||
out << " | ";
|
||||
digit_range(to[i], to[i]);
|
||||
out << " ";
|
||||
uniform_range(sub_zeros, to_sub);
|
||||
}
|
||||
out << ")";
|
||||
} else {
|
||||
out << "[" << from[i] << "-" << to[i] << "]";
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if (has_min && has_max) {
|
||||
if (min_value < 0 && max_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
|
||||
out << ")";
|
||||
return;
|
||||
}
|
||||
|
||||
if (min_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
|
||||
out << ") | ";
|
||||
min_value = 0;
|
||||
}
|
||||
|
||||
auto min_s = std::to_string(min_value);
|
||||
auto max_s = std::to_string(max_value);
|
||||
auto min_digits = min_s.length();
|
||||
auto max_digits = max_s.length();
|
||||
|
||||
for (auto digits = min_digits; digits < max_digits; digits++) {
|
||||
uniform_range(min_s, repeat("9", digits));
|
||||
min_s = "1" + repeat("0", digits);
|
||||
out << " | ";
|
||||
}
|
||||
uniform_range(min_s, max_s);
|
||||
return;
|
||||
}
|
||||
|
||||
auto less_decimals = std::max(decimals_left - 1, 1);
|
||||
|
||||
if (has_min) {
|
||||
if (min_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
|
||||
out << ") | [0] | [1-9] ";
|
||||
more_digits(0, decimals_left - 1);
|
||||
} else if (min_value == 0) {
|
||||
if (top_level) {
|
||||
out << "[0] | [1-9] ";
|
||||
more_digits(0, less_decimals);
|
||||
} else {
|
||||
more_digits(1, decimals_left);
|
||||
}
|
||||
} else if (min_value <= 9) {
|
||||
char c = '0' + min_value;
|
||||
auto range_start = top_level ? '1' : '0';
|
||||
if (c > range_start) {
|
||||
digit_range(range_start, c - 1);
|
||||
out << " ";
|
||||
more_digits(1, less_decimals);
|
||||
out << " | ";
|
||||
}
|
||||
digit_range(c, '9');
|
||||
out << " ";
|
||||
more_digits(0, less_decimals);
|
||||
} else {
|
||||
auto min_s = std::to_string(min_value);
|
||||
auto len = min_s.length();
|
||||
auto c = min_s[0];
|
||||
|
||||
if (c > '1') {
|
||||
digit_range(top_level ? '1' : '0', c - 1);
|
||||
out << " ";
|
||||
more_digits(len, less_decimals);
|
||||
out << " | ";
|
||||
}
|
||||
digit_range(c, c);
|
||||
out << " (";
|
||||
_build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
|
||||
out << ")";
|
||||
if (c < '9') {
|
||||
out << " | ";
|
||||
digit_range(c + 1, '9');
|
||||
out << " ";
|
||||
more_digits(len - 1, less_decimals);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (has_max) {
|
||||
if (max_value >= 0) {
|
||||
if (top_level) {
|
||||
out << "\"-\" [1-9] ";
|
||||
more_digits(0, less_decimals);
|
||||
out << " | ";
|
||||
}
|
||||
_build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
|
||||
} else {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
|
||||
out << ")";
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
throw std::runtime_error("At least one of min_value or max_value must be set");
|
||||
}
|
||||
|
||||
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
|
||||
|
||||
struct BuiltinRule {
|
||||
|
@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
|
|||
return "\"" + escaped + "\"";
|
||||
}
|
||||
|
||||
|
||||
class SchemaConverter {
|
||||
private:
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
|
@ -388,6 +614,75 @@ private:
|
|||
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
|
||||
}
|
||||
|
||||
/*
|
||||
Returns a rule that matches a JSON string that is none of the provided strings
|
||||
|
||||
not_strings({"a"})
|
||||
-> ["] ( [a] char+ | [^"a] char* )? ["] space
|
||||
not_strings({"and", "also"})
|
||||
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
|
||||
*/
|
||||
std::string _not_strings(const std::vector<std::string> & strings) {
|
||||
|
||||
struct TrieNode {
|
||||
std::map<char, TrieNode> children;
|
||||
bool is_end_of_string;
|
||||
|
||||
TrieNode() : is_end_of_string(false) {}
|
||||
|
||||
void insert(const std::string & string) {
|
||||
auto node = this;
|
||||
for (char c : string) {
|
||||
node = &node->children[c];
|
||||
}
|
||||
node->is_end_of_string = true;
|
||||
}
|
||||
};
|
||||
|
||||
TrieNode trie;
|
||||
for (const auto & s : strings) {
|
||||
trie.insert(s);
|
||||
}
|
||||
|
||||
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
|
||||
std::ostringstream out;
|
||||
out << "[\"] ( ";
|
||||
std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
|
||||
std::ostringstream rejects;
|
||||
auto first = true;
|
||||
for (const auto & kv : node.children) {
|
||||
rejects << kv.first;
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
out << " | ";
|
||||
}
|
||||
out << "[" << kv.first << "]";
|
||||
if (!kv.second.children.empty()) {
|
||||
out << " (";
|
||||
visit(kv.second);
|
||||
out << ")";
|
||||
} else if (kv.second.is_end_of_string) {
|
||||
out << " " << char_rule << "+";
|
||||
}
|
||||
}
|
||||
if (!node.children.empty()) {
|
||||
if (!first) {
|
||||
out << " | ";
|
||||
}
|
||||
out << "[^\"" << rejects.str() << "] " << char_rule << "*";
|
||||
}
|
||||
};
|
||||
visit(trie);
|
||||
|
||||
out << " )";
|
||||
if (!trie.is_end_of_string) {
|
||||
out << "?";
|
||||
}
|
||||
out << " [\"] space";
|
||||
return out.str();
|
||||
}
|
||||
|
||||
std::string _resolve_ref(const std::string & ref) {
|
||||
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
|
||||
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
|
||||
|
@ -408,6 +703,7 @@ private:
|
|||
std::vector<std::string> required_props;
|
||||
std::vector<std::string> optional_props;
|
||||
std::unordered_map<std::string, std::string> prop_kv_rule_names;
|
||||
std::vector<std::string> prop_names;
|
||||
for (const auto & kv : properties) {
|
||||
const auto &prop_name = kv.first;
|
||||
const auto &prop_schema = kv.second;
|
||||
|
@ -422,11 +718,18 @@ private:
|
|||
} else {
|
||||
optional_props.push_back(prop_name);
|
||||
}
|
||||
prop_names.push_back(prop_name);
|
||||
}
|
||||
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
|
||||
if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
|
||||
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
|
||||
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
|
||||
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
|
||||
std::string value_rule =
|
||||
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
|
||||
: _add_primitive("value", PRIMITIVE_RULES.at("value"));
|
||||
|
||||
auto key_rule =
|
||||
prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
|
||||
: _add_rule(sub_name + "-k", _not_strings(prop_names));
|
||||
std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
|
||||
prop_kv_rule_names["*"] = kv_rule;
|
||||
optional_props.push_back("*");
|
||||
}
|
||||
|
@ -452,15 +755,11 @@ private:
|
|||
}
|
||||
std::string k = ks[0];
|
||||
std::string kv_rule_name = prop_kv_rule_names[k];
|
||||
if (k == "*") {
|
||||
res = _add_rule(
|
||||
name + (name.empty() ? "" : "-") + "additional-kvs",
|
||||
kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
|
||||
);
|
||||
} else if (first_is_optional) {
|
||||
res = "( \",\" space " + kv_rule_name + " )?";
|
||||
std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
|
||||
if (first_is_optional) {
|
||||
res = comma_ref + (k == "*" ? "*" : "?");
|
||||
} else {
|
||||
res = kv_rule_name;
|
||||
res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
|
||||
}
|
||||
if (ks.size() > 1) {
|
||||
res += " " + _add_rule(
|
||||
|
@ -594,17 +893,19 @@ public:
|
|||
} else if (schema_type.is_array()) {
|
||||
std::vector<json> schema_types;
|
||||
for (const auto & t : schema_type) {
|
||||
schema_types.push_back({{"type", t}});
|
||||
json schema_copy(schema);
|
||||
schema_copy["type"] = t;
|
||||
schema_types.push_back(schema_copy);
|
||||
}
|
||||
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
|
||||
} else if (schema.contains("const")) {
|
||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
|
||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
|
||||
} else if (schema.contains("enum")) {
|
||||
std::vector<std::string> enum_values;
|
||||
for (const auto & v : schema["enum"]) {
|
||||
enum_values.push_back(_generate_constant_rule(v));
|
||||
}
|
||||
return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
|
||||
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
|
||||
} else if ((schema_type.is_null() || schema_type == "object")
|
||||
&& (schema.contains("properties") ||
|
||||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
|
||||
|
@ -686,6 +987,24 @@ public:
|
|||
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
|
||||
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
|
||||
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
|
||||
} else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
|
||||
int min_value = std::numeric_limits<int>::min();
|
||||
int max_value = std::numeric_limits<int>::max();
|
||||
if (schema.contains("minimum")) {
|
||||
min_value = schema["minimum"].get<int>();
|
||||
} else if (schema.contains("exclusiveMinimum")) {
|
||||
min_value = schema["exclusiveMinimum"].get<int>() + 1;
|
||||
}
|
||||
if (schema.contains("maximum")) {
|
||||
max_value = schema["maximum"].get<int>();
|
||||
} else if (schema.contains("exclusiveMaximum")) {
|
||||
max_value = schema["exclusiveMaximum"].get<int>() - 1;
|
||||
}
|
||||
std::stringstream out;
|
||||
out << "(";
|
||||
_build_min_max_int(min_value, max_value, out);
|
||||
out << ") space";
|
||||
return _add_rule(rule_name, out.str());
|
||||
} else if (schema.empty() || schema_type == "object") {
|
||||
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
|
||||
} else {
|
||||
|
|
|
@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
|
|||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
||||
|
||||
result->grammar = llama_grammar_init(
|
||||
struct llama_grammar * grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||
if (grammar == nullptr) {
|
||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||
}
|
||||
result->grammar = grammar;
|
||||
}
|
||||
|
||||
result->prev.resize(params.n_prev);
|
||||
|
@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
|
|||
if (!ctx->parsed_grammar.rules.empty()) {
|
||||
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
||||
|
||||
ctx->grammar = llama_grammar_init(
|
||||
struct llama_grammar * grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
||||
if (grammar == nullptr) {
|
||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||
}
|
||||
ctx->grammar = grammar;
|
||||
}
|
||||
|
||||
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||
|
|
|
@ -11,13 +11,16 @@ Related PRs:
|
|||
|
||||
```sh
|
||||
# CPU only
|
||||
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
|
||||
./cvector-generator -m ./llama-3.Q4_K_M.gguf
|
||||
|
||||
# With GPU
|
||||
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
|
||||
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
|
||||
|
||||
# With advanced options
|
||||
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
|
||||
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
|
||||
|
||||
# Using mean value instead of PCA
|
||||
./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
|
||||
|
||||
# To see help message
|
||||
./cvector-generator -h
|
||||
|
@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
|
|||
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
|
||||
<|im_start|>system\nYou are in a very good mood today<|im_end|>
|
||||
```
|
||||
|
||||
Example to use output file with `llama-cli`:
|
||||
|
||||
(Tips: The control vector works better when apply to layers higher than 10)
|
||||
|
||||
```sh
|
||||
./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
|
||||
```
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "pca.hpp"
|
||||
#include "mean.hpp"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
|
@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
|||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
printf("\nexample usage:\n");
|
||||
printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
|
||||
printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
||||
printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
|
||||
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
|
||||
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
||||
printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
|
||||
printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
@ -223,23 +225,30 @@ struct train_context {
|
|||
|
||||
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
|
||||
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
|
||||
void build_v_diff() {
|
||||
void build_v_diff(bool transpose) {
|
||||
printf("build_v_diff\n");
|
||||
for (int il = 0; il < n_layers - 1; il++) {
|
||||
auto & diff_tmp = v_diff_tmp[il];
|
||||
int n_elem = diff_tmp.size() / sizeof(float);
|
||||
GGML_ASSERT(n_elem % n_embd == 0);
|
||||
int n_rows = n_elem / n_embd;
|
||||
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
|
||||
struct ggml_tensor * diff = transpose
|
||||
? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
|
||||
: ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
|
||||
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
|
||||
// copy data & transpose
|
||||
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
|
||||
float * arr = (float *) diff_tmp.data();
|
||||
for (int ir = 0; ir < n_rows; ++ir) {
|
||||
for (int ic = 0; ic < n_embd; ++ic) {
|
||||
float f = arr[ir*n_embd + ic];
|
||||
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
|
||||
if (transpose) {
|
||||
// copy data & transpose
|
||||
float * arr = (float *) diff_tmp.data();
|
||||
for (int ir = 0; ir < n_rows; ++ir) {
|
||||
for (int ic = 0; ic < n_embd; ++ic) {
|
||||
float f = arr[ir*n_embd + ic];
|
||||
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// only copy
|
||||
memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
|
||||
}
|
||||
v_diff.push_back(diff);
|
||||
print_debug_tensor(diff);
|
||||
|
@ -263,8 +272,8 @@ struct tokenized_prompt {
|
|||
|
||||
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
|
||||
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
|
||||
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
|
||||
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
|
||||
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
|
||||
padding_seq(ctx, tokens_pos, max_seq_len);
|
||||
padding_seq(ctx, tokens_neg, max_seq_len);
|
||||
|
@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
|||
fprintf(stderr, "must provide at least one prompt pair\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// create templated prompts
|
||||
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
|
||||
auto format_template = [](std::string persona, std::string suffix) {
|
||||
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
|
||||
return persona + suffix;
|
||||
};
|
||||
for (size_t i = 0; i < positive_prompts.size(); ++i) {
|
||||
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
|
||||
// TODO replicate the truncations done by the python implementation
|
||||
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
|
||||
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
|
||||
}
|
||||
}
|
||||
ctx_train.positive_entries = positive_prompts;
|
||||
ctx_train.negative_entries = negative_prompts;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
|
|||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
// prepare ctx_train for PCA
|
||||
ctx_train.build_v_diff();
|
||||
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
|
||||
|
||||
// run PCA
|
||||
PCA::pca_params pca_params;
|
||||
pca_params.n_threads = params.n_threads;
|
||||
pca_params.n_batch = params.n_pca_batch;
|
||||
pca_params.n_iterations = params.n_pca_iterations;
|
||||
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||
// prepare ctx_train for PCA
|
||||
ctx_train.build_v_diff(use_pca);
|
||||
|
||||
if (use_pca) {
|
||||
// run PCA
|
||||
PCA::pca_params pca_params;
|
||||
pca_params.n_threads = params.n_threads;
|
||||
pca_params.n_batch = params.n_pca_batch;
|
||||
pca_params.n_iterations = params.n_pca_iterations;
|
||||
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||
} else {
|
||||
// run mean
|
||||
mean::run(ctx_train.v_diff, ctx_train.v_final);
|
||||
}
|
||||
|
||||
// write output vectors to gguf
|
||||
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
||||
|
|
48
examples/cvector-generator/mean.hpp
Normal file
48
examples/cvector-generator/mean.hpp
Normal file
|
@ -0,0 +1,48 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <math.h>
|
||||
|
||||
namespace mean {
|
||||
|
||||
static void run(
|
||||
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
|
||||
const std::vector<struct ggml_tensor *> & v_output) {
|
||||
printf("%s: Running mean...\n", __func__);
|
||||
for (size_t il = 0; il < v_input.size(); ++il) {
|
||||
// prepare output vector
|
||||
struct ggml_tensor * ctrl_out = v_output[il];
|
||||
ggml_format_name(ctrl_out, "direction.%ld", il+1);
|
||||
|
||||
// calculate mean vector
|
||||
struct ggml_tensor * t_layer = v_input[il];
|
||||
GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
|
||||
for (int ic = 0; ic < t_layer->ne[0]; ic++) {
|
||||
float f = 0.0;
|
||||
for (int ir = 0; ir < t_layer->ne[1]; ir++) {
|
||||
f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
|
||||
}
|
||||
f /= t_layer->ne[1];
|
||||
ggml_set_f32_1d(ctrl_out, ic, f);
|
||||
}
|
||||
|
||||
// normalize output vector
|
||||
float norm = 0.0;
|
||||
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
|
||||
float f = ggml_get_f32_1d(ctrl_out, i);
|
||||
norm += f*f;
|
||||
}
|
||||
norm = sqrt(norm);
|
||||
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
|
||||
float f = ggml_get_f32_1d(ctrl_out, i);
|
||||
ggml_set_f32_1d(ctrl_out, i, f / norm);
|
||||
}
|
||||
|
||||
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1 +1,4 @@
|
|||
[INST] Act like a person who is extremely sad. [/INST]
|
||||
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
|
||||
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
|
||||
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
|
||||
<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
|
|
@ -290,7 +290,7 @@ static void power_iteration(
|
|||
}
|
||||
|
||||
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
|
||||
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
|
||||
__func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
|
||||
}
|
||||
|
||||
// get output tensor
|
||||
|
@ -298,6 +298,9 @@ static void power_iteration(
|
|||
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
|
||||
//print_debug_tensor(output);
|
||||
ggml_gallocr_free(allocr);
|
||||
|
||||
// TODO @ngxson : The output vector is randomly inverted
|
||||
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
|
||||
}
|
||||
|
||||
static void run_pca(
|
||||
|
|
|
@ -1 +1,4 @@
|
|||
[INST] Act like a person who is extremely happy. [/INST]
|
||||
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
|
||||
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
|
||||
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
|
||||
<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
|
|
@ -101,7 +101,9 @@ int main(int argc, char** argv) {
|
|||
auto grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
|
||||
if (grammar == nullptr) {
|
||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||
}
|
||||
// Read the input file
|
||||
std::string input_str;
|
||||
{
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#! pip install pydantic
|
||||
#! python json-schema-pydantic-example.py
|
||||
|
||||
from pydantic import BaseModel, TypeAdapter
|
||||
from pydantic import BaseModel, Extra, TypeAdapter
|
||||
from annotated_types import MinLen
|
||||
from typing import Annotated, List, Optional
|
||||
import json, requests
|
||||
|
@ -50,11 +50,16 @@ else:
|
|||
if __name__ == '__main__':
|
||||
|
||||
class QAPair(BaseModel):
|
||||
class Config:
|
||||
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
|
||||
question: str
|
||||
concise_answer: str
|
||||
justification: str
|
||||
stars: Annotated[int, Field(ge=1, le=5)]
|
||||
|
||||
class PyramidalSummary(BaseModel):
|
||||
class Config:
|
||||
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
|
||||
title: str
|
||||
summary: str
|
||||
question_answers: Annotated[List[QAPair], MinLen(2)]
|
||||
|
|
|
@ -4,8 +4,7 @@ import itertools
|
|||
import json
|
||||
import re
|
||||
import sys
|
||||
from typing import Any, Dict, List, Set, Tuple, Union
|
||||
|
||||
from typing import Any, List, Optional, Set, Tuple, Union
|
||||
|
||||
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
|
||||
|
||||
|
@ -23,6 +22,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
|
|||
result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
|
||||
return f'({result})?' if min_items == 0 else result
|
||||
|
||||
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
|
||||
has_min = min_value != None
|
||||
has_max = max_value != None
|
||||
|
||||
def digit_range(from_char: str, to_char: str):
|
||||
out.append("[")
|
||||
if from_char == to_char:
|
||||
out.append(from_char)
|
||||
else:
|
||||
out.append(from_char)
|
||||
out.append("-")
|
||||
out.append(to_char)
|
||||
out.append("]")
|
||||
|
||||
def more_digits(min_digits: int, max_digits: int):
|
||||
out.append("[0-9]")
|
||||
if min_digits == max_digits and min_digits == 1:
|
||||
return
|
||||
out.append("{")
|
||||
out.append(str(min_digits))
|
||||
if max_digits != min_digits:
|
||||
out.append(",")
|
||||
if max_digits != sys.maxsize:
|
||||
out.append(str(max_digits))
|
||||
out.append("}")
|
||||
|
||||
def uniform_range(from_str: str, to_str: str):
|
||||
i = 0
|
||||
while i < len(from_str) and from_str[i] == to_str[i]:
|
||||
i += 1
|
||||
if i > 0:
|
||||
out.append("\"")
|
||||
out.append(from_str[:i])
|
||||
out.append("\"")
|
||||
if i < len(from_str):
|
||||
if i > 0:
|
||||
out.append(" ")
|
||||
sub_len = len(from_str) - i - 1
|
||||
if sub_len > 0:
|
||||
from_sub = from_str[i+1:]
|
||||
to_sub = to_str[i+1:]
|
||||
sub_zeros = "0" * sub_len
|
||||
sub_nines = "9" * sub_len
|
||||
|
||||
to_reached = False
|
||||
out.append("(")
|
||||
if from_sub == sub_zeros:
|
||||
digit_range(from_str[i], chr(ord(to_str[i]) - 1))
|
||||
out.append(" ")
|
||||
more_digits(sub_len, sub_len)
|
||||
else:
|
||||
out.append("[")
|
||||
out.append(from_str[i])
|
||||
out.append("] ")
|
||||
out.append("(")
|
||||
uniform_range(from_sub, sub_nines)
|
||||
out.append(")")
|
||||
if ord(from_str[i]) < ord(to_str[i]) - 1:
|
||||
out.append(" | ")
|
||||
if to_sub == sub_nines:
|
||||
digit_range(chr(ord(from_str[i]) + 1), to_str[i])
|
||||
to_reached = True
|
||||
else:
|
||||
digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
|
||||
out.append(" ")
|
||||
more_digits(sub_len, sub_len)
|
||||
if not to_reached:
|
||||
out.append(" | ")
|
||||
digit_range(to_str[i], to_str[i])
|
||||
out.append(" ")
|
||||
uniform_range(sub_zeros, to_sub)
|
||||
out.append(")")
|
||||
else:
|
||||
out.append("[")
|
||||
out.append(from_str[i])
|
||||
out.append("-")
|
||||
out.append(to_str[i])
|
||||
out.append("]")
|
||||
|
||||
if has_min and has_max:
|
||||
if min_value < 0 and max_value < 0:
|
||||
out.append("\"-\" (")
|
||||
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
|
||||
out.append(")")
|
||||
return
|
||||
|
||||
if min_value < 0:
|
||||
out.append("\"-\" (")
|
||||
_generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
|
||||
out.append(") | ")
|
||||
min_value = 0
|
||||
|
||||
min_s = str(min_value)
|
||||
max_s = str(max_value)
|
||||
min_digits = len(min_s)
|
||||
max_digits = len(max_s)
|
||||
|
||||
for digits in range(min_digits, max_digits):
|
||||
uniform_range(min_s, "9" * digits)
|
||||
min_s = "1" + "0" * digits
|
||||
out.append(" | ")
|
||||
uniform_range(min_s, max_s)
|
||||
return
|
||||
|
||||
less_decimals = max(decimals_left - 1, 1)
|
||||
|
||||
if has_min:
|
||||
if min_value < 0:
|
||||
out.append("\"-\" (")
|
||||
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
|
||||
out.append(") | [0] | [1-9] ")
|
||||
more_digits(0, decimals_left - 1)
|
||||
elif min_value == 0:
|
||||
if top_level:
|
||||
out.append("[0] | [1-9] ")
|
||||
more_digits(0, less_decimals)
|
||||
else:
|
||||
more_digits(1, decimals_left)
|
||||
elif min_value <= 9:
|
||||
c = str(min_value)
|
||||
range_start = '1' if top_level else '0'
|
||||
if c > range_start:
|
||||
digit_range(range_start, chr(ord(c) - 1))
|
||||
out.append(" ")
|
||||
more_digits(1, less_decimals)
|
||||
out.append(" | ")
|
||||
digit_range(c, "9")
|
||||
out.append(" ")
|
||||
more_digits(0, less_decimals)
|
||||
else:
|
||||
min_s = str(min_value)
|
||||
length = len(min_s)
|
||||
c = min_s[0]
|
||||
|
||||
if c > "1":
|
||||
digit_range("1" if top_level else "0", chr(ord(c) - 1))
|
||||
out.append(" ")
|
||||
more_digits(length, less_decimals)
|
||||
out.append(" | ")
|
||||
digit_range(c, c)
|
||||
out.append(" (")
|
||||
_generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
|
||||
out.append(")")
|
||||
if c < "9":
|
||||
out.append(" | ")
|
||||
digit_range(chr(ord(c) + 1), "9")
|
||||
out.append(" ")
|
||||
more_digits(length - 1, less_decimals)
|
||||
return
|
||||
|
||||
if has_max:
|
||||
if max_value >= 0:
|
||||
if top_level:
|
||||
out.append("\"-\" [1-9] ")
|
||||
more_digits(0, less_decimals)
|
||||
out.append(" | ")
|
||||
_generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
|
||||
else:
|
||||
out.append("\"-\" (")
|
||||
_generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
|
||||
out.append(")")
|
||||
return
|
||||
|
||||
raise RuntimeError("At least one of min_value or max_value must be set")
|
||||
|
||||
class BuiltinRule:
|
||||
def __init__(self, content: str, deps: list = None):
|
||||
|
@ -112,6 +275,51 @@ class SchemaConverter:
|
|||
|
||||
return ''.join(('(', *recurse(0), ')'))
|
||||
|
||||
def _not_strings(self, strings):
|
||||
class TrieNode:
|
||||
def __init__(self):
|
||||
self.children = {}
|
||||
self.is_end_of_string = False
|
||||
|
||||
def insert(self, string):
|
||||
node = self
|
||||
for c in string:
|
||||
node = node.children.setdefault(c, TrieNode())
|
||||
node.is_end_of_string = True
|
||||
|
||||
trie = TrieNode()
|
||||
for s in strings:
|
||||
trie.insert(s)
|
||||
|
||||
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
|
||||
out = ['["] ( ']
|
||||
|
||||
def visit(node):
|
||||
rejects = []
|
||||
first = True
|
||||
for c in sorted(node.children.keys()):
|
||||
child = node.children[c]
|
||||
rejects.append(c)
|
||||
if first:
|
||||
first = False
|
||||
else:
|
||||
out.append(' | ')
|
||||
out.append(f'[{c}]')
|
||||
if child.children:
|
||||
out.append(f' (')
|
||||
visit(child)
|
||||
out.append(')')
|
||||
elif child.is_end_of_string:
|
||||
out.append(f' {char_rule}+')
|
||||
if node.children:
|
||||
if not first:
|
||||
out.append(' | ')
|
||||
out.append(f'[^"{"".join(rejects)}] {char_rule}*')
|
||||
visit(trie)
|
||||
|
||||
out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
|
||||
return ''.join(out)
|
||||
|
||||
def _add_rule(self, name, rule):
|
||||
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
|
||||
if esc_name not in self._rules or self._rules[esc_name] == rule:
|
||||
|
@ -357,13 +565,13 @@ class SchemaConverter:
|
|||
return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
|
||||
|
||||
elif isinstance(schema_type, list):
|
||||
return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))
|
||||
return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
|
||||
|
||||
elif 'const' in schema:
|
||||
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
|
||||
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
|
||||
|
||||
elif 'enum' in schema:
|
||||
rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum']))
|
||||
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
|
||||
return self._add_rule(rule_name, rule)
|
||||
|
||||
elif schema_type in (None, 'object') and \
|
||||
|
@ -432,6 +640,24 @@ class SchemaConverter:
|
|||
|
||||
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
|
||||
|
||||
elif schema_type in (None, 'integer') and \
|
||||
('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
|
||||
min_value = None
|
||||
max_value = None
|
||||
if 'minimum' in schema:
|
||||
min_value = schema['minimum']
|
||||
elif 'exclusiveMinimum' in schema:
|
||||
min_value = schema['exclusiveMinimum'] + 1
|
||||
if 'maximum' in schema:
|
||||
max_value = schema['maximum']
|
||||
elif 'exclusiveMaximum' in schema:
|
||||
max_value = schema['exclusiveMaximum'] - 1
|
||||
|
||||
out = ["("]
|
||||
_generate_min_max_int(min_value, max_value, out)
|
||||
out.append(") space")
|
||||
return self._add_rule(rule_name, ''.join(out))
|
||||
|
||||
elif (schema_type == 'object') or (len(schema) == 0):
|
||||
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
|
||||
|
||||
|
@ -450,7 +676,7 @@ class SchemaConverter:
|
|||
self._add_primitive(dep, dep_rule)
|
||||
return n
|
||||
|
||||
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
|
||||
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
|
||||
prop_order = self._prop_order
|
||||
# sort by position in prop_order (if specified) then by original order
|
||||
sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
|
||||
|
@ -465,12 +691,16 @@ class SchemaConverter:
|
|||
required_props = [k for k in sorted_props if k in required]
|
||||
optional_props = [k for k in sorted_props if k not in required]
|
||||
|
||||
if additional_properties == True or isinstance(additional_properties, dict):
|
||||
if additional_properties != False:
|
||||
sub_name = f'{name}{"-" if name else ""}additional'
|
||||
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
|
||||
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
|
||||
self._add_primitive('value', PRIMITIVE_RULES['value'])
|
||||
key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
|
||||
else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
|
||||
|
||||
prop_kv_rule_names["*"] = self._add_rule(
|
||||
f'{sub_name}-kv',
|
||||
self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
|
||||
f'{key_rule} ":" space {value_rule}'
|
||||
)
|
||||
optional_props.append("*")
|
||||
|
||||
|
@ -485,15 +715,11 @@ class SchemaConverter:
|
|||
def get_recursive_refs(ks, first_is_optional):
|
||||
[k, *rest] = ks
|
||||
kv_rule_name = prop_kv_rule_names[k]
|
||||
if k == '*':
|
||||
res = self._add_rule(
|
||||
f'{name}{"-" if name else ""}additional-kvs',
|
||||
f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*'
|
||||
)
|
||||
elif first_is_optional:
|
||||
res = f'( "," space {kv_rule_name} )?'
|
||||
comma_ref = f'( "," space {kv_rule_name} )'
|
||||
if first_is_optional:
|
||||
res = comma_ref + ('*' if k == '*' else '?')
|
||||
else:
|
||||
res = kv_rule_name
|
||||
res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
|
||||
if len(rest) > 0:
|
||||
res += ' ' + self._add_rule(
|
||||
f'{name}{"-" if name else ""}{k}-rest',
|
||||
|
|
|
@ -40,12 +40,12 @@ static std::ostringstream * g_output_ss;
|
|||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
static bool file_exists(const std::string &path) {
|
||||
static bool file_exists(const std::string & path) {
|
||||
std::ifstream f(path.c_str());
|
||||
return f.good();
|
||||
}
|
||||
|
||||
static bool file_is_empty(const std::string &path) {
|
||||
static bool file_is_empty(const std::string & path) {
|
||||
std::ifstream f;
|
||||
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
|
||||
|
@ -118,6 +118,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
|
|||
LOG_TEE("%s", text);
|
||||
}
|
||||
|
||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
||||
llama_chat_msg new_msg{role, content};
|
||||
auto formatted = llama_chat_format_single(
|
||||
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||
chat_msgs.push_back({role, content});
|
||||
return formatted;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
|
@ -191,6 +199,7 @@ int main(int argc, char ** argv) {
|
|||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context * ctx_guidance = NULL;
|
||||
std::vector<llama_chat_msg> chat_msgs;
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
|
||||
|
@ -216,6 +225,8 @@ int main(int argc, char ** argv) {
|
|||
__func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
|
@ -250,16 +261,21 @@ int main(int argc, char ** argv) {
|
|||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
{
|
||||
auto prompt = params.conversation
|
||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
||||
: params.prompt;
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
LOG("prompt: \"%s\"\n", log_tostr(prompt));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
|
@ -479,6 +495,7 @@ int main(int argc, char ** argv) {
|
|||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
@ -794,11 +811,18 @@ int main(int argc, char ** argv) {
|
|||
is_antiprompt = true;
|
||||
}
|
||||
|
||||
chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation) {
|
||||
auto id = llama_sampling_last(ctx_sampling);
|
||||
assistant_ss << llama_token_to_piece(ctx, id, false);
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting) {
|
||||
LOG("waiting for user input\n");
|
||||
|
||||
|
@ -849,8 +873,12 @@ int main(int argc, char ** argv) {
|
|||
string_process_escapes(buffer);
|
||||
}
|
||||
|
||||
std::string user_inp = params.conversation
|
||||
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
|
||||
: std::move(buffer);
|
||||
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
|
||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
|
||||
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
|
||||
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||
|
@ -865,6 +893,9 @@ int main(int argc, char ** argv) {
|
|||
output_ss << llama_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
// reset assistant message
|
||||
assistant_ss.str("");
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
|
|
|
@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
|
|||
return minItems === 0 ? `(${result})?` : result;
|
||||
}
|
||||
|
||||
function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
|
||||
const hasMin = minValue !== null;
|
||||
const hasMax = maxValue !== null;
|
||||
|
||||
function digitRange(fromChar, toChar) {
|
||||
out.push("[");
|
||||
if (fromChar === toChar) {
|
||||
out.push(fromChar);
|
||||
} else {
|
||||
out.push(fromChar);
|
||||
out.push("-");
|
||||
out.push(toChar);
|
||||
}
|
||||
out.push("]");
|
||||
}
|
||||
|
||||
function moreDigits(minDigits, maxDigits) {
|
||||
out.push("[0-9]");
|
||||
if (minDigits === maxDigits && minDigits === 1) {
|
||||
return;
|
||||
}
|
||||
out.push("{");
|
||||
out.push(minDigits.toString());
|
||||
if (maxDigits !== minDigits) {
|
||||
out.push(",");
|
||||
if (maxDigits !== Number.MAX_SAFE_INTEGER) {
|
||||
out.push(maxDigits.toString());
|
||||
}
|
||||
}
|
||||
out.push("}");
|
||||
}
|
||||
|
||||
function uniformRange(fromStr, toStr) {
|
||||
let i = 0;
|
||||
while (i < fromStr.length && fromStr[i] === toStr[i]) {
|
||||
i++;
|
||||
}
|
||||
if (i > 0) {
|
||||
out.push("\"");
|
||||
out.push(fromStr.slice(0, i));
|
||||
out.push("\"");
|
||||
}
|
||||
if (i < fromStr.length) {
|
||||
if (i > 0) {
|
||||
out.push(" ");
|
||||
}
|
||||
const subLen = fromStr.length - i - 1;
|
||||
if (subLen > 0) {
|
||||
const fromSub = fromStr.slice(i + 1);
|
||||
const toSub = toStr.slice(i + 1);
|
||||
const subZeros = "0".repeat(subLen);
|
||||
const subNines = "9".repeat(subLen);
|
||||
|
||||
let toReached = false;
|
||||
out.push("(");
|
||||
if (fromSub === subZeros) {
|
||||
digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
|
||||
out.push(" ");
|
||||
moreDigits(subLen, subLen);
|
||||
} else {
|
||||
out.push("[");
|
||||
out.push(fromStr[i]);
|
||||
out.push("] ");
|
||||
out.push("(");
|
||||
uniformRange(fromSub, subNines);
|
||||
out.push(")");
|
||||
if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
|
||||
out.push(" | ");
|
||||
if (toSub === subNines) {
|
||||
digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
|
||||
toReached = true;
|
||||
} else {
|
||||
digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
|
||||
}
|
||||
out.push(" ");
|
||||
moreDigits(subLen, subLen);
|
||||
}
|
||||
}
|
||||
if (!toReached) {
|
||||
out.push(" | ");
|
||||
digitRange(toStr[i], toStr[i]);
|
||||
out.push(" ");
|
||||
uniformRange(subZeros, toSub);
|
||||
}
|
||||
out.push(")");
|
||||
} else {
|
||||
out.push("[");
|
||||
out.push(fromStr[i]);
|
||||
out.push("-");
|
||||
out.push(toStr[i]);
|
||||
out.push("]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (hasMin && hasMax) {
|
||||
if (minValue < 0 && maxValue < 0) {
|
||||
out.push("\"-\" (");
|
||||
_generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
|
||||
out.push(")");
|
||||
return;
|
||||
}
|
||||
|
||||
if (minValue < 0) {
|
||||
out.push("\"-\" (");
|
||||
_generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
|
||||
out.push(") | ");
|
||||
minValue = 0;
|
||||
}
|
||||
|
||||
let minS = minValue.toString();
|
||||
const maxS = maxValue.toString();
|
||||
const minDigits = minS.length;
|
||||
const maxDigits = maxS.length;
|
||||
|
||||
for (let digits = minDigits; digits < maxDigits; digits++) {
|
||||
uniformRange(minS, "9".repeat(digits));
|
||||
minS = "1" + "0".repeat(digits);
|
||||
out.push(" | ");
|
||||
}
|
||||
uniformRange(minS, maxS);
|
||||
return;
|
||||
}
|
||||
|
||||
const lessDecimals = Math.max(decimalsLeft - 1, 1);
|
||||
|
||||
if (hasMin) {
|
||||
if (minValue < 0) {
|
||||
out.push("\"-\" (");
|
||||
_generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
|
||||
out.push(") | [0] | [1-9] ");
|
||||
moreDigits(0, decimalsLeft - 1);
|
||||
} else if (minValue === 0) {
|
||||
if (topLevel) {
|
||||
out.push("[0] | [1-9] ");
|
||||
moreDigits(0, lessDecimals);
|
||||
} else {
|
||||
moreDigits(1, decimalsLeft);
|
||||
}
|
||||
} else if (minValue <= 9) {
|
||||
const c = minValue.toString();
|
||||
const range_start = topLevel ? '1' : '0';
|
||||
if (c > range_start) {
|
||||
digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
|
||||
out.push(" ");
|
||||
moreDigits(1, lessDecimals);
|
||||
out.push(" | ");
|
||||
}
|
||||
digitRange(c, "9");
|
||||
out.push(" ");
|
||||
moreDigits(0, lessDecimals);
|
||||
} else {
|
||||
const minS = minValue.toString();
|
||||
const length = minS.length;
|
||||
const c = minS[0];
|
||||
|
||||
if (c > "1") {
|
||||
digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
|
||||
out.push(" ");
|
||||
moreDigits(length, lessDecimals);
|
||||
out.push(" | ");
|
||||
}
|
||||
digitRange(c, c);
|
||||
out.push(" (");
|
||||
_generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false);
|
||||
out.push(")");
|
||||
if (c < "9") {
|
||||
out.push(" | ");
|
||||
digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
|
||||
out.push(" ");
|
||||
moreDigits(length - 1, lessDecimals);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (hasMax) {
|
||||
if (maxValue >= 0) {
|
||||
if (topLevel) {
|
||||
out.push("\"-\" [1-9] ");
|
||||
moreDigits(0, lessDecimals);
|
||||
out.push(" | ");
|
||||
}
|
||||
_generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
|
||||
} else {
|
||||
out.push("\"-\" (");
|
||||
_generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
|
||||
out.push(")");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
throw new Error("At least one of minValue or maxValue must be set");
|
||||
}
|
||||
|
||||
class BuiltinRule {
|
||||
constructor(content, deps) {
|
||||
this.content = content;
|
||||
|
@ -337,6 +532,64 @@ export class SchemaConverter {
|
|||
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
|
||||
}
|
||||
|
||||
_notStrings(strings) {
|
||||
class TrieNode {
|
||||
constructor() {
|
||||
this.children = {};
|
||||
this.isEndOfString = false;
|
||||
}
|
||||
|
||||
insert(str) {
|
||||
let node = this;
|
||||
for (const c of str) {
|
||||
node = node.children[c] = node.children[c] || new TrieNode();
|
||||
}
|
||||
node.isEndOfString = true;
|
||||
}
|
||||
}
|
||||
|
||||
const trie = new TrieNode();
|
||||
for (const s of strings) {
|
||||
trie.insert(s);
|
||||
}
|
||||
|
||||
const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
|
||||
const out = ['["] ( '];
|
||||
|
||||
const visit = (node) => {
|
||||
const rejects = [];
|
||||
let first = true;
|
||||
for (const c of Object.keys(node.children).sort()) {
|
||||
const child = node.children[c];
|
||||
rejects.push(c);
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
out.push(' | ');
|
||||
}
|
||||
out.push(`[${c}]`);
|
||||
if (Object.keys(child.children).length > 0) {
|
||||
out.push(' (');
|
||||
visit(child);
|
||||
out.push(')');
|
||||
} else if (child.isEndOfString) {
|
||||
out.push(` ${charRuleName}+`);
|
||||
}
|
||||
}
|
||||
if (Object.keys(node.children).length > 0) {
|
||||
if (!first) {
|
||||
out.push(' | ');
|
||||
}
|
||||
out.push(`[^"${rejects.join('')}] ${charRuleName}*`);
|
||||
}
|
||||
};
|
||||
|
||||
visit(trie);
|
||||
|
||||
out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`);
|
||||
return out.join('');
|
||||
}
|
||||
|
||||
_resolveRef(ref) {
|
||||
let refName = ref.split('/').pop();
|
||||
if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
|
||||
|
@ -363,11 +616,11 @@ export class SchemaConverter {
|
|||
} else if (schema.oneOf || schema.anyOf) {
|
||||
return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
|
||||
} else if (Array.isArray(schemaType)) {
|
||||
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
|
||||
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t}))));
|
||||
} else if ('const' in schema) {
|
||||
return this._addRule(ruleName, this._generateConstantRule(schema.const));
|
||||
return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
|
||||
} else if ('enum' in schema) {
|
||||
const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
|
||||
const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
|
||||
return this._addRule(ruleName, rule);
|
||||
} else if ((schemaType === undefined || schemaType === 'object') &&
|
||||
('properties' in schema ||
|
||||
|
@ -404,7 +657,7 @@ export class SchemaConverter {
|
|||
}
|
||||
}
|
||||
|
||||
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
|
||||
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
|
||||
} else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
|
||||
const items = schema.items ?? schema.prefixItems;
|
||||
if (Array.isArray(items)) {
|
||||
|
@ -435,6 +688,24 @@ export class SchemaConverter {
|
|||
const minLen = schema.minLength || 0;
|
||||
const maxLen = schema.maxLength;
|
||||
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
|
||||
} else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
|
||||
let minValue = null;
|
||||
let maxValue = null;
|
||||
if ('minimum' in schema) {
|
||||
minValue = schema.minimum;
|
||||
} else if ('exclusiveMinimum' in schema) {
|
||||
minValue = schema.exclusiveMinimum + 1;
|
||||
}
|
||||
if ('maximum' in schema) {
|
||||
maxValue = schema.maximum;
|
||||
} else if ('exclusiveMaximum' in schema) {
|
||||
maxValue = schema.exclusiveMaximum - 1;
|
||||
}
|
||||
|
||||
const out = ["("];
|
||||
_generateMinMaxInt(minValue, maxValue, out);
|
||||
out.push(") space");
|
||||
return this._addRule(ruleName, out.join(''));
|
||||
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
|
||||
return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
|
||||
} else {
|
||||
|
@ -480,12 +751,19 @@ export class SchemaConverter {
|
|||
const requiredProps = sortedProps.filter(k => required.has(k));
|
||||
const optionalProps = sortedProps.filter(k => !required.has(k));
|
||||
|
||||
if (typeof additionalProperties === 'object' || additionalProperties === true) {
|
||||
if (additionalProperties !== false) {
|
||||
const subName = `${name ?? ''}${name ? '-' : ''}additional`;
|
||||
const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
|
||||
const valueRule =
|
||||
additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
|
||||
: this._addPrimitive('value', PRIMITIVE_RULES['value']);
|
||||
|
||||
const key_rule =
|
||||
sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string'])
|
||||
: this._addRule(`${subName}-k`, this._notStrings(sortedProps));
|
||||
|
||||
propKvRuleNames['*'] = this._addRule(
|
||||
`${subName}-kv`,
|
||||
`${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
|
||||
`${key_rule} ":" space ${valueRule}`);
|
||||
optionalProps.push('*');
|
||||
}
|
||||
|
||||
|
@ -502,15 +780,11 @@ export class SchemaConverter {
|
|||
const [k, ...rest] = ks;
|
||||
const kvRuleName = propKvRuleNames[k];
|
||||
let res;
|
||||
if (k === '*') {
|
||||
res = this._addRule(
|
||||
`${name ?? ''}${name ? '-' : ''}additional-kvs`,
|
||||
`${kvRuleName} ( "," space ` + kvRuleName + ` )*`
|
||||
)
|
||||
} else if (firstIsOptional) {
|
||||
res = `( "," space ${kvRuleName} )?`;
|
||||
const commaRef = `( "," space ${kvRuleName} )`;
|
||||
if (firstIsOptional) {
|
||||
res = commaRef + (k === '*' ? '*' : '?');
|
||||
} else {
|
||||
res = kvRuleName;
|
||||
res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : '');
|
||||
}
|
||||
if (rest.length > 0) {
|
||||
res += ' ' + this._addRule(
|
||||
|
|
|
@ -3,6 +3,13 @@
|
|||
|
||||
by Humans for All.
|
||||
|
||||
## quickstart
|
||||
|
||||
To run from the build dir
|
||||
|
||||
bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
|
||||
|
||||
Continue reading for the details.
|
||||
|
||||
## overview
|
||||
|
||||
|
@ -14,6 +21,8 @@ own system prompts.
|
|||
This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
|
||||
or potentially as it is being generated, in a streamed manner from the server/ai-model.
|
||||
|
||||

|
||||
|
||||
Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
|
||||
open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
|
||||
|
||||
|
@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
|
|||
The histogram/freq based trimming logic is currently tuned for english language wrt its
|
||||
is-it-a-alpabetic|numeral-char regex match logic.
|
||||
|
||||
chatRequestOptions - maintains the list of options/fields to send along with chat request,
|
||||
apiRequestOptions - maintains the list of options/fields to send along with api request,
|
||||
irrespective of whether /chat/completions or /completions endpoint.
|
||||
|
||||
If you want to add additional options/fields to send to the server/ai-model, and or
|
||||
modify the existing options value or remove them, for now you can update this global var
|
||||
using browser's development-tools/console.
|
||||
|
||||
For string and numeric fields in chatRequestOptions, including even those added by a user
|
||||
at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
|
||||
For string, numeric and boolean fields in apiRequestOptions, including even those added by a
|
||||
user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
|
||||
created.
|
||||
|
||||
cache_prompt option supported by example/server is allowed to be controlled by user, so that
|
||||
any caching supported wrt system-prompt and chat history, if usable can get used. When chat
|
||||
history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
|
||||
wrt same, based on aspects related to model, positional encoding, attention mechanism etal.
|
||||
However system prompt should ideally get the benefit of caching.
|
||||
|
||||
headers - maintains the list of http headers sent when request is made to the server. By default
|
||||
Content-Type is set to application/json. Additionally Authorization entry is provided, which can
|
||||
be set if needed using the settings ui.
|
||||
|
@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
|
|||
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||
|
||||
|
||||
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
|
||||
implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||
some extent in a simple crude way. You may also want to control the context size enabled when
|
||||
the server loads ai-model, on the server end.
|
||||
By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
|
||||
the implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||
some extent in a simple crude way. You may also want to control the context size enabled when the
|
||||
server loads ai-model, on the server end.
|
||||
|
||||
|
||||
Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
|
||||
|
@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
|
|||
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
||||
to /completions endpoint handling code on server side.
|
||||
|
||||
NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
|
||||
wrt the set of fields sent to server along with the user query. To check how the model behaves
|
||||
NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
|
||||
wrt the set of fields sent to server along with the user query, to check how the model behaves
|
||||
wrt repeatations in general in the generated text response.
|
||||
|
||||
A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
|
||||
using the providing settings ui.
|
||||
using the provided settings ui (for settings exposed through the ui).
|
||||
|
||||
|
||||
### OpenAi / Equivalent API WebService
|
||||
|
@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
|
|||
* the baseUrl in settings ui
|
||||
* https://api.openai.com/v1 or similar
|
||||
|
||||
* Wrt request body - gMe.chatRequestOptions
|
||||
* Wrt request body - gMe.apiRequestOptions
|
||||
* model (settings ui)
|
||||
* any additional fields if required in future
|
||||
|
||||
|
|
|
@ -222,8 +222,8 @@ class SimpleChat {
|
|||
* @param {Object} obj
|
||||
*/
|
||||
request_jsonstr_extend(obj) {
|
||||
for(let k in gMe.chatRequestOptions) {
|
||||
obj[k] = gMe.chatRequestOptions[k];
|
||||
for(let k in gMe.apiRequestOptions) {
|
||||
obj[k] = gMe.apiRequestOptions[k];
|
||||
}
|
||||
if (gMe.bStream) {
|
||||
obj["stream"] = true;
|
||||
|
@ -740,11 +740,12 @@ class Me {
|
|||
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
|
||||
}
|
||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
this.chatRequestOptions = {
|
||||
this.apiRequestOptions = {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 1024,
|
||||
"n_predict": 1024,
|
||||
"cache_prompt": false,
|
||||
//"frequency_penalty": 1.2,
|
||||
//"presence_penalty": 1.2,
|
||||
};
|
||||
|
@ -800,51 +801,55 @@ class Me {
|
|||
|
||||
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
|
||||
|
||||
}
|
||||
|
||||
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
|
||||
ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
|
||||
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Auto create ui input elements for fields in ChatRequestOptions
|
||||
* Auto create ui input elements for fields in apiRequestOptions
|
||||
* Currently supports text and number field types.
|
||||
* @param {HTMLDivElement} elDiv
|
||||
*/
|
||||
show_settings_chatrequestoptions(elDiv) {
|
||||
show_settings_apirequestoptions(elDiv) {
|
||||
let typeDict = {
|
||||
"string": "text",
|
||||
"number": "number",
|
||||
};
|
||||
let fs = document.createElement("fieldset");
|
||||
let legend = document.createElement("legend");
|
||||
legend.innerText = "ChatRequestOptions";
|
||||
legend.innerText = "ApiRequestOptions";
|
||||
fs.appendChild(legend);
|
||||
elDiv.appendChild(fs);
|
||||
for(const k in this.chatRequestOptions) {
|
||||
let val = this.chatRequestOptions[k];
|
||||
for(const k in this.apiRequestOptions) {
|
||||
let val = this.apiRequestOptions[k];
|
||||
let type = typeof(val);
|
||||
if (!((type == "string") || (type == "number"))) {
|
||||
continue;
|
||||
if (((type == "string") || (type == "number"))) {
|
||||
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
|
||||
if (type == "number") {
|
||||
val = Number(val);
|
||||
}
|
||||
this.apiRequestOptions[k] = val;
|
||||
});
|
||||
fs.appendChild(inp.div);
|
||||
} else if (type == "boolean") {
|
||||
let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
|
||||
this.apiRequestOptions[k] = userVal;
|
||||
});
|
||||
fs.appendChild(bbtn.div);
|
||||
}
|
||||
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
|
||||
if (type == "number") {
|
||||
val = Number(val);
|
||||
}
|
||||
this.chatRequestOptions[k] = val;
|
||||
});
|
||||
fs.appendChild(inp.div);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -870,6 +875,23 @@ class Me {
|
|||
});
|
||||
elDiv.appendChild(bb.div);
|
||||
|
||||
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
|
||||
this.bTrimGarbage = val;
|
||||
});
|
||||
elDiv.appendChild(bb.div);
|
||||
|
||||
this.show_settings_apirequestoptions(elDiv);
|
||||
|
||||
let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
|
||||
this.apiEP = ApiEP.Type[val];
|
||||
});
|
||||
elDiv.appendChild(sel.div);
|
||||
|
||||
sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
|
||||
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
|
||||
});
|
||||
elDiv.appendChild(sel.div);
|
||||
|
||||
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
|
||||
this.bCompletionFreshChatAlways = val;
|
||||
});
|
||||
|
@ -880,23 +902,6 @@ class Me {
|
|||
});
|
||||
elDiv.appendChild(bb.div);
|
||||
|
||||
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
|
||||
this.bTrimGarbage = val;
|
||||
});
|
||||
elDiv.appendChild(bb.div);
|
||||
|
||||
let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
|
||||
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
|
||||
});
|
||||
elDiv.appendChild(sel.div);
|
||||
|
||||
sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
|
||||
this.apiEP = ApiEP.Type[val];
|
||||
});
|
||||
elDiv.appendChild(sel.div);
|
||||
|
||||
this.show_settings_chatrequestoptions(elDiv);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
BIN
examples/server/public_simplechat/simplechat_screens.webp
Normal file
BIN
examples/server/public_simplechat/simplechat_screens.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 21 KiB |
|
@ -2607,17 +2607,9 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// print sample chat example to make it clear which template is used
|
||||
{
|
||||
json chat;
|
||||
chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
|
||||
chat.push_back({{"role", "user"}, {"content", "Hello"}});
|
||||
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
|
||||
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
|
||||
|
||||
const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
|
||||
|
||||
LOG_INFO("chat template", {
|
||||
{"chat_example", chat_example},
|
||||
{"built_in", params.chat_template.empty()},
|
||||
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
|
||||
{"built_in", params.chat_template.empty()},
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -82,7 +82,7 @@ Feature: llama.cpp server
|
|||
|
||||
Examples: Prompts
|
||||
| response_format | n_predicted | re_content |
|
||||
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
|
||||
| {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" |
|
||||
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
|
||||
| {"type": "json_object"} | 10 | \{ " Jacky. |
|
||||
|
||||
|
|
|
@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
|
|||
|
||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
||||
size_t alloc_size = 0;
|
||||
// vector holding all allocated string to be passed to llama_chat_apply_template
|
||||
std::vector<std::string> str(messages.size() * 2);
|
||||
std::vector<llama_chat_message> chat(messages.size());
|
||||
std::vector<llama_chat_msg> chat;
|
||||
|
||||
for (size_t i = 0; i < messages.size(); ++i) {
|
||||
const auto & curr_msg = messages[i];
|
||||
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
|
||||
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
|
||||
alloc_size += str[i*2 + 1].length();
|
||||
chat[i].role = str[i*2 + 0].c_str();
|
||||
chat[i].content = str[i*2 + 1].c_str();
|
||||
std::string role = json_value(curr_msg, "role", std::string(""));
|
||||
std::string content = json_value(curr_msg, "content", std::string(""));
|
||||
chat.push_back({role, content});
|
||||
}
|
||||
|
||||
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
||||
std::vector<char> buf(alloc_size * 2);
|
||||
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||
|
||||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||
}
|
||||
|
||||
const std::string formatted_chat(buf.data(), res);
|
||||
|
||||
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
|
||||
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
||||
|
||||
return formatted_chat;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ struct mma_int_A_I16K4 {
|
|||
|
||||
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
|
||||
#if defined(INT8_MMA_AVAILABLE)
|
||||
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
|
||||
const int * xs = xs0 + (threadIdx.x%I)*stride;
|
||||
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
|
||||
: "+r"(x[0]), "+r"(x[1])
|
||||
: "l"(xs));
|
||||
|
|
|
@ -69,6 +69,7 @@ class GGUFReader:
|
|||
# I - same as host, S - swapped
|
||||
byte_order: Literal['I'] | Literal['S'] = 'I'
|
||||
alignment: int = GGUF_DEFAULT_ALIGNMENT
|
||||
data_offset: int
|
||||
|
||||
# Note: Internal helper, API may change.
|
||||
gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
|
||||
|
@ -88,9 +89,13 @@ class GGUFReader:
|
|||
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
|
||||
self.data = np.memmap(path, mode = mode)
|
||||
offs = 0
|
||||
|
||||
# Check for GGUF magic
|
||||
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
|
||||
raise ValueError('GGUF magic invalid')
|
||||
offs += 4
|
||||
|
||||
# Check GGUF version
|
||||
temp_version = self._get(offs, np.uint32)
|
||||
if temp_version[0] & 65535 == 0:
|
||||
# If we get 0 here that means it's (probably) a GGUF file created for
|
||||
|
@ -103,12 +108,16 @@ class GGUFReader:
|
|||
self.fields: OrderedDict[str, ReaderField] = OrderedDict()
|
||||
self.tensors: list[ReaderTensor] = []
|
||||
offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
|
||||
|
||||
# Check tensor count and kv count
|
||||
temp_counts = self._get(offs, np.uint64, 2)
|
||||
offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
|
||||
offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
|
||||
tensor_count, kv_count = temp_counts
|
||||
offs = self._build_fields(offs, kv_count)
|
||||
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
|
||||
|
||||
# Build Tensor Info Fields
|
||||
offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
|
||||
new_align = self.fields.get('general.alignment')
|
||||
if new_align is not None:
|
||||
if new_align.types != [GGUFValueType.UINT32]:
|
||||
|
@ -117,6 +126,7 @@ class GGUFReader:
|
|||
padding = offs % self.alignment
|
||||
if padding != 0:
|
||||
offs += self.alignment - padding
|
||||
self.data_offset = offs
|
||||
self._build_tensors(offs, tensors_fields)
|
||||
|
||||
_DT = TypeVar('_DT', bound = npt.DTypeLike)
|
||||
|
@ -193,18 +203,29 @@ class GGUFReader:
|
|||
# We can't deal with this one.
|
||||
raise ValueError('Unknown/unhandled field type {gtype}')
|
||||
|
||||
def _get_tensor(self, orig_offs: int) -> ReaderField:
|
||||
def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
|
||||
offs = orig_offs
|
||||
|
||||
# Get Tensor Name
|
||||
name_len, name_data = self._get_str(offs)
|
||||
offs += int(name_len.nbytes + name_data.nbytes)
|
||||
|
||||
# Get Tensor Dimensions Count
|
||||
n_dims = self._get(offs, np.uint32)
|
||||
offs += int(n_dims.nbytes)
|
||||
|
||||
# Get Tensor Dimension Array
|
||||
dims = self._get(offs, np.uint64, n_dims[0])
|
||||
offs += int(dims.nbytes)
|
||||
|
||||
# Get Tensor Encoding Scheme Type
|
||||
raw_dtype = self._get(offs, np.uint32)
|
||||
offs += int(raw_dtype.nbytes)
|
||||
|
||||
# Get Tensor Offset
|
||||
offset_tensor = self._get(offs, np.uint64)
|
||||
offs += int(offset_tensor.nbytes)
|
||||
|
||||
return ReaderField(
|
||||
orig_offs,
|
||||
str(bytes(name_data), encoding = 'utf-8'),
|
||||
|
@ -233,10 +254,10 @@ class GGUFReader:
|
|||
offs += field_size
|
||||
return offs
|
||||
|
||||
def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
|
||||
def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
|
||||
tensor_fields = []
|
||||
for _ in range(count):
|
||||
field = self._get_tensor(offs)
|
||||
field = self._get_tensor_info_field(offs)
|
||||
offs += sum(int(part.nbytes) for part in field.parts)
|
||||
tensor_fields.append(field)
|
||||
return offs, tensor_fields
|
||||
|
|
|
@ -319,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
|
|||
|
||||
markdown_content += "\n"
|
||||
|
||||
markdown_content += "### Tensor Data Offset\n"
|
||||
markdown_content += '\n'
|
||||
markdown_content += 'This table contains the offset and data segment relative to start of file\n'
|
||||
markdown_content += '\n'
|
||||
|
||||
tensor_mapping_table: list[dict[str, str | int]] = []
|
||||
for key, tensor in enumerate(reader.tensors):
|
||||
data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
|
||||
data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
|
||||
tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
|
||||
|
||||
tensors_mapping_table_header_map = [
|
||||
{'key_name':'t_id', 'header_name':'T_ID', 'align':'right'},
|
||||
{'key_name':'layer_name', 'header_name':'Tensor Layer Name', 'align':'left'},
|
||||
{'key_name':'data_offset', 'header_name':'Data Offset (B)', 'align':'right'},
|
||||
{'key_name':'data_size', 'header_name':'Data Size (B)', 'align':'right'},
|
||||
]
|
||||
|
||||
markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
|
||||
markdown_content += "\n"
|
||||
|
||||
for group in tensor_prefix_order:
|
||||
tensors = tensor_groups[group]
|
||||
group_elements = sum(tensor.n_elements for tensor in tensors)
|
||||
|
@ -370,6 +391,8 @@ def main() -> None:
|
|||
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
|
||||
parser.add_argument("--json", action="store_true", help="Produce JSON output")
|
||||
parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
|
||||
parser.add_argument("--data-offset", action="store_true", help="Start of data offset")
|
||||
parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
|
||||
parser.add_argument("--markdown", action="store_true", help="Produce markdown output")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
|
@ -377,7 +400,7 @@ def main() -> None:
|
|||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
if not args.json and not args.markdown:
|
||||
if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
|
||||
logger.info(f'* Loading: {args.model}')
|
||||
|
||||
reader = GGUFReader(args.model, 'r')
|
||||
|
@ -386,6 +409,10 @@ def main() -> None:
|
|||
dump_metadata_json(reader, args)
|
||||
elif args.markdown:
|
||||
dump_markdown_metadata(reader, args)
|
||||
elif args.data_offset:
|
||||
print(reader.data_offset) # noqa: NP100
|
||||
elif args.data_alignment:
|
||||
print(reader.alignment) # noqa: NP100
|
||||
else:
|
||||
dump_metadata(reader, args)
|
||||
|
||||
|
|
8
llama.h
8
llama.h
|
@ -67,6 +67,7 @@ extern "C" {
|
|||
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
||||
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
||||
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
||||
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
||||
};
|
||||
|
||||
// pre-tokenization types
|
||||
|
@ -859,6 +860,7 @@ extern "C" {
|
|||
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
||||
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
||||
|
||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
||||
|
@ -926,6 +928,12 @@ extern "C" {
|
|||
// Grammar
|
||||
//
|
||||
|
||||
/// Initialize a llama_grammar.
|
||||
///
|
||||
/// @param rules The rule elements of the grammar to initialize.
|
||||
/// @param n_rules The number of rules.
|
||||
/// @param start_rule_index The index of the root rule (the starting point of the grammar).
|
||||
/// @return The initialized llama_grammar or nullptr if initialization failed.
|
||||
LLAMA_API struct llama_grammar * llama_grammar_init(
|
||||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
|
|
|
@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
|||
return result;
|
||||
}
|
||||
|
||||
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
assert(offset < utf8.size());
|
||||
if (!(utf8[offset + 0] & 0x80)) {
|
||||
auto result = utf8[offset + 0];
|
||||
|
|
|
@ -48,6 +48,7 @@ struct codepoint_flags {
|
|||
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
||||
|
||||
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue