Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	.github/workflows/docker.yml
#	README.md
#	llama.cpp
#	tests/test-chat-template.cpp
#	tests/test-grammar-integration.cpp
#	tests/test-json-schema-to-grammar.cpp
#	tests/test-llama-grammar.cpp
This commit is contained in:
Concedo 2024-06-26 18:59:10 +08:00
commit f3dfa96dbc
29 changed files with 2097 additions and 431 deletions

View file

@ -20,10 +20,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
apt-get install -y libcurl4-openssl-dev curl
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

View file

@ -1264,11 +1264,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
// cvector params
if (arg == "--completions-file") {
CHECK_ARG
params.cvector_completions_file = argv[i];
return true;
}
if (arg == "--positive-file") {
CHECK_ARG
params.cvector_positive_file = argv[i];
@ -1279,11 +1274,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cvector_negative_file = argv[i];
return true;
}
if (arg == "--completions") {
CHECK_ARG
params.n_completions = std::stoi(argv[i]);
return true;
}
if (arg == "--pca-batch") {
CHECK_ARG
params.n_pca_batch = std::stoi(argv[i]);
@ -1294,6 +1284,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.n_pca_iterations = std::stoi(argv[i]);
return true;
}
if (arg == "--method") {
CHECK_ARG
std::string value(argv[i]);
/**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
else { invalid_param = true; }
return true;
}
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@ -1445,7 +1443,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
"negative prompt file to use for guidance" });
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
"set custom jinja chat template (default: template taken from model's metadata)\n"
"only commonly used templates are accepted:\n"
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "grammar" });
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
@ -1624,11 +1625,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
options.push_back({ "cvector", " --completions-file FNAME",
"completions file (default: '%s')", params.cvector_completions_file.c_str() });
options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
printf("usage: %s [options]\n", argv[0]);
@ -2605,12 +2604,67 @@ bool llama_should_add_bos_token(const llama_model * model) {
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
//
// Chat template utils
//
bool llama_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
std::string llama_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
std::vector<llama_chat_message> chat;
for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
return formatted_chat;
}
std::string llama_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass) {
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
chat_new.push_back(new_msg);
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return formatted;
}
std::string llama_chat_format_example(const struct llama_model * model,
const std::string & tmpl) {
std::vector<llama_chat_msg> msgs = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "How are you?"},
};
return llama_chat_apply_template(model, tmpl, msgs, true);
}
//
// KV cache utils
//

View file

@ -48,6 +48,12 @@ int32_t cpu_get_num_math();
// CLI argument parsing
//
// dimensionality reduction methods, used by cvector-generator
// (selected on the command line with "--method {pca,mean}")
enum dimre_method {
// chosen by "--method pca"
DIMRE_METHOD_PCA,
// chosen by "--method mean"
DIMRE_METHOD_MEAN,
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@ -255,13 +261,12 @@ struct gpt_params {
bool compute_ppl = true; // whether to compute perplexity
// cvector-generator params
int n_completions = 64;
int n_pca_batch = 20;
int n_pca_batch = 100;
int n_pca_iterations = 1000;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
};
void gpt_params_handle_model_default(gpt_params & params);
@ -382,9 +387,32 @@ bool llama_should_add_bos_token(const llama_model * model);
// Chat template utils
//
// same as llama_chat_message, but uses std::string
// (the strings are handed to the C API via c_str() by the C++ wrapper of llama_chat_apply_template)
struct llama_chat_msg {
// role of the speaker, e.g. "system", "user" or "assistant"
std::string role;
// text of the message
std::string content;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
// (the template is applied to a single dummy user message; a non-negative result means "valid")
bool llama_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// - tmpl: an empty string is forwarded to the C API as nullptr
// - add_ass: forwarded unchanged to the C API (presumably "append the assistant prefix" - confirm in llama.h)
// Returns the formatted chat as a std::string
std::string llama_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & chat,
bool add_ass);
// Format single message, while taking into account the position of that message in chat history
// Returns the text that formatting (past_msg + new_msg) adds after the formatting of past_msg alone
std::string llama_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass);
// Returns an example of formatted chat
// (a fixed system/user/assistant/user conversation, formatted with add_ass = true)
std::string llama_chat_format_example(const struct llama_model * model,
const std::string & tmpl);
//
// KV cache utils
//

View file

@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards.
 *
 * A view is the half-open range [_start, _end) of a std::string that is held by
 * reference: the referenced string must outlive the view and every view derived from it.
 */
class string_view {
    const std::string & _str;
    const size_t _start;
    const size_t _end;
public:
    // `end == npos` means "up to the end of `str`"
    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}

    size_t size() const {
        return _end - _start;
    }

    size_t length() const {
        return size();
    }

    operator std::string() const {
        return str();
    }

    // Copies the viewed characters into a new std::string.
    std::string str() const {
        return _str.substr(_start, _end - _start);
    }

    // Sub-view starting at `pos` (relative to this view) with at most `len` characters.
    // The result never extends past the end of this view.
    // Throws std::out_of_range when `pos` lies beyond the end of this view.
    string_view substr(size_t pos, size_t len = std::string::npos) const {
        if (pos > size()) {
            throw std::out_of_range("string_view substr position out of range");
        }
        const size_t new_start = _start + pos;
        const size_t available = _end - new_start;
        const size_t new_end   = (len == std::string::npos || len > available) ? _end : new_start + len;
        return string_view(_str, new_start, new_end);
    }

    // Bounds-checked element access; throws std::out_of_range.
    char operator[](size_t pos) const {
        auto index = _start + pos;
        if (index >= _end) {
            throw std::out_of_range("string_view index out of range");
        }
        return _str[index];
    }

    // Compares the viewed characters in place, without materializing temporary strings.
    bool operator==(const string_view & other) const {
        return _str.compare(_start, size(), other._str, other._start, other.size()) == 0;
    }
};
// Writes to `out` a grammar expression matching the decimal integers between
// min_value and max_value.
//
// min_value / max_value : bounds; std::numeric_limits<int>::min() / ::max() act as
//                         sentinels meaning "no lower bound" / "no upper bound"
// out                   : stream receiving the generated expression
// decimals_left         : digit budget used for the repetition counts emitted on an unbounded side
// top_level             : false for the recursive calls that describe the remaining digits of a
//                         number (or the magnitude of a negative one); it changes whether a
//                         leading digit range starts at '1' or at '0'
//
// Throws std::runtime_error when neither bound is set.
// NOTE(review): relies on the helper `repeat`, defined outside this block; presumably
// repeat(s, n) concatenates n copies of s - confirm.
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
auto has_min = min_value != std::numeric_limits<int>::min();
auto has_max = max_value != std::numeric_limits<int>::max();
// emits a one-character class: "[c]" when from == to, otherwise "[from-to]"
auto digit_range = [&](char from, char to) {
out << "[";
if (from == to) {
out << from;
} else {
out << from << "-" << to;
}
out << "]";
};
// emits "[0-9]" followed by a repetition count: nothing for exactly one digit,
// "{n}" for exactly n, "{min,max}" for a range and "{min,}" when max_digits is INT_MAX
auto more_digits = [&](int min_digits, int max_digits) {
out << "[0-9]";
if (min_digits == max_digits && min_digits == 1) {
return;
}
out << "{";
out << min_digits;
if (max_digits != min_digits) {
out << ",";
if (max_digits != std::numeric_limits<int>::max()) {
out << max_digits;
}
}
out << "}";
};
// emits an expression for the numbers between the digit strings `from` and `to`
// NOTE(review): sub_len is derived from `from` only and reused for `to`, so this
// appears to assume both strings have the same length - confirm against callers
std::function<void(const string_view &, const string_view &)> uniform_range =
[&](const string_view & from, const string_view & to) {
// length of the common prefix of both bounds
size_t i = 0;
while (i < from.length() && i < to.length() && from[i] == to[i]) {
i++;
}
if (i > 0) {
// the shared prefix is emitted as a quoted literal
out << "\"" << from.substr(0, i).str() << "\"";
}
if (i < from.length() && i < to.length()) {
if (i > 0) {
out << " ";
}
// number of digits that follow the first differing digit
auto sub_len = from.length() - i - 1;
if (sub_len > 0) {
auto from_sub = from.substr(i + 1);
auto to_sub = to.substr(i + 1);
auto sub_zeros = repeat("0", sub_len);
auto sub_nines = repeat("9", sub_len);
// set once the alternative for the leading digit to[i] has already been emitted
auto to_reached = false;
out << "(";
if (from_sub == sub_zeros) {
// lower bound ends in zeros: every leading digit below to[i] takes any trailing digits
digit_range(from[i], to[i] - 1);
out << " ";
more_digits(sub_len, sub_len);
} else {
// leading digit from[i]: trailing digits range from from_sub up to 9...9
out << "[" << from[i] << "] ";
out << "(";
uniform_range(from_sub, sub_nines);
out << ")";
if (from[i] < to[i] - 1) {
out << " | ";
if (to_sub == sub_nines) {
// upper bound ends in nines: to[i] can be folded into this digit range
digit_range(from[i] + 1, to[i]);
to_reached = true;
} else {
digit_range(from[i] + 1, to[i] - 1);
}
out << " ";
more_digits(sub_len, sub_len);
}
}
if (!to_reached) {
// leading digit to[i]: trailing digits range from 0...0 up to to_sub
out << " | ";
digit_range(to[i], to[i]);
out << " ";
uniform_range(sub_zeros, to_sub);
}
out << ")";
} else {
// only the last digit differs
out << "[" << from[i] << "-" << to[i] << "]";
}
}
};
// case 1: both bounds are set
if (has_min && has_max) {
if (min_value < 0 && max_value < 0) {
// entirely negative range: "-" followed by the negated range
out << "\"-\" (";
_build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
out << ")";
return;
}
if (min_value < 0) {
// range spans zero: emit the negative part, then continue with [0, max_value]
out << "\"-\" (";
_build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
out << ") | ";
min_value = 0;
}
auto min_s = std::to_string(min_value);
auto max_s = std::to_string(max_value);
auto min_digits = min_s.length();
auto max_digits = max_s.length();
// one alternative per digit count below that of max_value
for (auto digits = min_digits; digits < max_digits; digits++) {
uniform_range(min_s, repeat("9", digits));
min_s = "1" + repeat("0", digits);
out << " | ";
}
uniform_range(min_s, max_s);
return;
}
// digit budget passed on to the unbounded side, never below 1
auto less_decimals = std::max(decimals_left - 1, 1);
// case 2: only a lower bound
if (has_min) {
if (min_value < 0) {
out << "\"-\" (";
_build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
out << ") | [0] | [1-9] ";
more_digits(0, decimals_left - 1);
} else if (min_value == 0) {
if (top_level) {
out << "[0] | [1-9] ";
more_digits(0, less_decimals);
} else {
more_digits(1, decimals_left);
}
} else if (min_value <= 9) {
char c = '0' + min_value;
auto range_start = top_level ? '1' : '0';
if (c > range_start) {
digit_range(range_start, c - 1);
out << " ";
more_digits(1, less_decimals);
out << " | ";
}
digit_range(c, '9');
out << " ";
more_digits(0, less_decimals);
} else {
auto min_s = std::to_string(min_value);
auto len = min_s.length();
auto c = min_s[0];
if (c > '1') {
digit_range(top_level ? '1' : '0', c - 1);
out << " ";
more_digits(len, less_decimals);
out << " | ";
}
digit_range(c, c);
out << " (";
// same leading digit: recurse on the remaining digits of the lower bound
_build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
out << ")";
if (c < '9') {
out << " | ";
digit_range(c + 1, '9');
out << " ";
more_digits(len - 1, less_decimals);
}
}
return;
}
// case 3: only an upper bound
if (has_max) {
if (max_value >= 0) {
if (top_level) {
// every negative number is allowed
out << "\"-\" [1-9] ";
more_digits(0, less_decimals);
out << " | ";
}
_build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
} else {
out << "\"-\" (";
_build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
out << ")";
}
return;
}
throw std::runtime_error("At least one of min_value or max_value must be set");
}
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
struct BuiltinRule {
@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
return "\"" + escaped + "\"";
}
class SchemaConverter {
private:
std::function<json(const std::string &)> _fetch_json;
@ -388,6 +614,75 @@ private:
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
}
/*
Returns a rule that matches a JSON string that is none of the provided strings
not_strings({"a"})
-> ["] ( [a] char+ | [^"a] char* )? ["] space
not_strings({"and", "also"})
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space

The provided strings are stored in a trie, and the rule is generated by walking it:
at every node each child character gets its own alternative, and a final negated
character class accepts any other character followed by arbitrary chars.

NOTE(review): when a trie node has children, the group emitted for it requires at
least one more character, so proper prefixes of the provided strings (e.g. "a" or
"an" for {"and"}) appear to be rejected as well - confirm whether this is intended.
NOTE(review): characters are written unescaped inside "[...]"; strings containing
characters that are special inside a character class presumably need escaping - confirm.
*/
std::string _not_strings(const std::vector<std::string> & strings) {
// prefix tree over the characters of the rejected strings
struct TrieNode {
std::map<char, TrieNode> children;
bool is_end_of_string;
TrieNode() : is_end_of_string(false) {}
// adds `string` to the trie, creating the missing nodes on the way
void insert(const std::string & string) {
auto node = this;
for (char c : string) {
node = &node->children[c];
}
node->is_end_of_string = true;
}
};
TrieNode trie;
for (const auto & s : strings) {
trie.insert(s);
}
// name of the primitive "char" rule, referenced by the generated expression
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
std::ostringstream out;
out << "[\"] ( ";
// emits the alternatives for one trie node (recursive)
std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
// collects the child characters for the negated class emitted last
std::ostringstream rejects;
auto first = true;
for (const auto & kv : node.children) {
rejects << kv.first;
if (first) {
first = false;
} else {
out << " | ";
}
out << "[" << kv.first << "]";
if (!kv.second.children.empty()) {
// longer rejected strings continue here: descend
out << " (";
visit(kv.second);
out << ")";
} else if (kv.second.is_end_of_string) {
// a rejected string ends here: at least one more char is required to differ from it
out << " " << char_rule << "+";
}
}
if (!node.children.empty()) {
if (!first) {
out << " | ";
}
// any character that is neither a quote nor one of the children, then any chars
out << "[^\"" << rejects.str() << "] " << char_rule << "*";
}
};
visit(trie);
out << " )";
// the root is marked only when the empty string was provided; otherwise
// the whole group is optional so that the empty JSON string is accepted
if (!trie.is_end_of_string) {
out << "?";
}
out << " [\"] space";
return out.str();
}
std::string _resolve_ref(const std::string & ref) {
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
@ -408,6 +703,7 @@ private:
std::vector<std::string> required_props;
std::vector<std::string> optional_props;
std::unordered_map<std::string, std::string> prop_kv_rule_names;
std::vector<std::string> prop_names;
for (const auto & kv : properties) {
const auto &prop_name = kv.first;
const auto &prop_schema = kv.second;
@ -422,11 +718,18 @@ private:
} else {
optional_props.push_back(prop_name);
}
prop_names.push_back(prop_name);
}
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
std::string value_rule =
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
: _add_primitive("value", PRIMITIVE_RULES.at("value"));
auto key_rule =
prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
: _add_rule(sub_name + "-k", _not_strings(prop_names));
std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@ -452,15 +755,11 @@ private:
}
std::string k = ks[0];
std::string kv_rule_name = prop_kv_rule_names[k];
if (k == "*") {
res = _add_rule(
name + (name.empty() ? "" : "-") + "additional-kvs",
kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
);
} else if (first_is_optional) {
res = "( \",\" space " + kv_rule_name + " )?";
std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
if (first_is_optional) {
res = comma_ref + (k == "*" ? "*" : "?");
} else {
res = kv_rule_name;
res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
}
if (ks.size() > 1) {
res += " " + _add_rule(
@ -594,17 +893,19 @@ public:
} else if (schema_type.is_array()) {
std::vector<json> schema_types;
for (const auto & t : schema_type) {
schema_types.push_back({{"type", t}});
json schema_copy(schema);
schema_copy["type"] = t;
schema_types.push_back(schema_copy);
}
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
} else if (schema.contains("const")) {
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
} else if (schema.contains("enum")) {
std::vector<std::string> enum_values;
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
} else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@ -686,6 +987,24 @@ public:
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
} else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
int min_value = std::numeric_limits<int>::min();
int max_value = std::numeric_limits<int>::max();
if (schema.contains("minimum")) {
min_value = schema["minimum"].get<int>();
} else if (schema.contains("exclusiveMinimum")) {
min_value = schema["exclusiveMinimum"].get<int>() + 1;
}
if (schema.contains("maximum")) {
max_value = schema["maximum"].get<int>();
} else if (schema.contains("exclusiveMaximum")) {
max_value = schema["exclusiveMaximum"].get<int>() - 1;
}
std::stringstream out;
out << "(";
_build_min_max_int(min_value, max_value, out);
out << ") space";
return _add_rule(rule_name, out.str());
} else if (schema.empty() || schema_type == "object") {
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
} else {

View file

@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
result->grammar = llama_grammar_init(
struct llama_grammar * grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
result->grammar = grammar;
}
result->prev.resize(params.n_prev);
@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
if (!ctx->parsed_grammar.rules.empty()) {
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
ctx->grammar = llama_grammar_init(
struct llama_grammar * grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
ctx->grammar = grammar;
}
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);

View file

@ -11,13 +11,16 @@ Related PRs:
```sh
# CPU only
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
./cvector-generator -m ./llama-3.Q4_K_M.gguf
# With GPU
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
# With advanced options
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
# Using mean value instead of PCA
./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
# To see help message
./cvector-generator -h
@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
<|im_start|>system\nYou are in a very good mood today<|im_end|>
```
Example of using the output file with `llama-cli`:
(Tip: the control vector works better when applied to layers higher than 10)
```sh
./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
```

View file

@ -2,6 +2,7 @@
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"
#include "mean.hpp"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
printf("\n");
}
@ -223,23 +225,30 @@ struct train_context {
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
void build_v_diff() {
void build_v_diff(bool transpose) {
printf("build_v_diff\n");
for (int il = 0; il < n_layers - 1; il++) {
auto & diff_tmp = v_diff_tmp[il];
int n_elem = diff_tmp.size() / sizeof(float);
GGML_ASSERT(n_elem % n_embd == 0);
int n_rows = n_elem / n_embd;
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
struct ggml_tensor * diff = transpose
? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
: ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
// copy data & transpose
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
float * arr = (float *) diff_tmp.data();
for (int ir = 0; ir < n_rows; ++ir) {
for (int ic = 0; ic < n_embd; ++ic) {
float f = arr[ir*n_embd + ic];
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
if (transpose) {
// copy data & transpose
float * arr = (float *) diff_tmp.data();
for (int ir = 0; ir < n_rows; ++ir) {
for (int ic = 0; ic < n_embd; ++ic) {
float f = arr[ir*n_embd + ic];
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
}
}
} else {
// only copy
memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
}
v_diff.push_back(diff);
print_debug_tensor(diff);
@ -263,8 +272,8 @@ struct tokenized_prompt {
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
padding_seq(ctx, tokens_pos, max_seq_len);
padding_seq(ctx, tokens_neg, max_seq_len);
@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
fprintf(stderr, "must provide at least one prompt pair\n");
return 1;
}
// create templated prompts
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
auto format_template = [](std::string persona, std::string suffix) {
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
return persona + suffix;
};
for (size_t i = 0; i < positive_prompts.size(); ++i) {
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
// TODO replicate the truncations done by the python implementation
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
}
}
ctx_train.positive_entries = positive_prompts;
ctx_train.negative_entries = negative_prompts;
return 0;
}
@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);
// prepare ctx_train for PCA
ctx_train.build_v_diff();
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
// prepare ctx_train for PCA
ctx_train.build_v_diff(use_pca);
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {
// run mean
mean::run(ctx_train.v_diff, ctx_train.v_final);
}
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);

View file

@ -0,0 +1,48 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <string>
#include <vector>
#include <math.h>
namespace mean {

// Fills v_output[il] with the normalized mean of the columns of v_input[il], for every layer il.
//
// v_input  : one tensor per layer; ne[0] must equal ne[0] of the matching output tensor,
//            and the average is taken over the ne[1] dimension
// v_output : one pre-allocated tensor per layer; it is renamed "direction.<il+1>"
//
// The mean vector is scaled to unit L2 norm. When the mean is exactly zero it is
// left as zeros, because dividing by a zero norm would fill the output with NaN.
static void run(
        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
        const std::vector<struct ggml_tensor *> & v_output) {
    printf("%s: Running mean...\n", __func__);
    for (size_t il = 0; il < v_input.size(); ++il) {
        // prepare output vector
        struct ggml_tensor * ctrl_out = v_output[il];
        // il is a size_t: "%zu" is the matching conversion ("%ld" is wrong where long is 32-bit)
        ggml_format_name(ctrl_out, "direction.%zu", il+1);

        // calculate mean vector
        struct ggml_tensor * t_layer = v_input[il];
        GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
        for (int ic = 0; ic < t_layer->ne[0]; ic++) {
            float f = 0.0;
            for (int ir = 0; ir < t_layer->ne[1]; ir++) {
                f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
            }
            f /= t_layer->ne[1];
            ggml_set_f32_1d(ctrl_out, ic, f);
        }

        // normalize output vector
        float norm = 0.0;
        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
            float f = ggml_get_f32_1d(ctrl_out, i);
            norm += f*f;
        }
        norm = sqrt(norm);
        // a zero vector has no direction; skip the division instead of producing NaN
        if (norm > 0.0f) {
            for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
                float f = ggml_get_f32_1d(ctrl_out, i);
                ggml_set_f32_1d(ctrl_out, i, f / norm);
            }
        }
        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
    }
}

}

View file

@ -1 +1,4 @@
[INST] Act like a person who is extremely sad. [/INST]
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
<|start_header_id|>system<|end_header_id|>\n\nYou are the saddest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow

View file

@ -290,7 +290,7 @@ static void power_iteration(
}
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
__func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
}
// get output tensor
@ -298,6 +298,9 @@ static void power_iteration(
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
//print_debug_tensor(output);
ggml_gallocr_free(allocr);
// TODO @ngxson : The output vector is randomly inverted
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
}
static void run_pca(

View file

@ -1 +1,4 @@
[INST] Act like a person who is extremely happy. [/INST]
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!

View file

@ -101,7 +101,9 @@ int main(int argc, char** argv) {
auto grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}
// Read the input file
std::string input_str;
{

View file

@ -3,7 +3,7 @@
#! pip install pydantic
#! python json-schema-pydantic-example.py
from pydantic import BaseModel, TypeAdapter
from pydantic import BaseModel, Extra, Field, TypeAdapter
from annotated_types import MinLen
from typing import Annotated, List, Optional
import json, requests
@ -50,11 +50,16 @@ else:
if __name__ == '__main__':
class QAPair(BaseModel):
class Config:
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
question: str
concise_answer: str
justification: str
stars: Annotated[int, Field(ge=1, le=5)]
class PyramidalSummary(BaseModel):
class Config:
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
title: str
summary: str
question_answers: Annotated[List[QAPair], MinLen(2)]

View file

@ -4,8 +4,7 @@ import itertools
import json
import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
from typing import Any, List, Optional, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
@ -23,6 +22,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
return f'({result})?' if min_items == 0 else result
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
has_min = min_value != None
has_max = max_value != None
def digit_range(from_char: str, to_char: str):
out.append("[")
if from_char == to_char:
out.append(from_char)
else:
out.append(from_char)
out.append("-")
out.append(to_char)
out.append("]")
def more_digits(min_digits: int, max_digits: int):
out.append("[0-9]")
if min_digits == max_digits and min_digits == 1:
return
out.append("{")
out.append(str(min_digits))
if max_digits != min_digits:
out.append(",")
if max_digits != sys.maxsize:
out.append(str(max_digits))
out.append("}")
def uniform_range(from_str: str, to_str: str):
i = 0
while i < len(from_str) and from_str[i] == to_str[i]:
i += 1
if i > 0:
out.append("\"")
out.append(from_str[:i])
out.append("\"")
if i < len(from_str):
if i > 0:
out.append(" ")
sub_len = len(from_str) - i - 1
if sub_len > 0:
from_sub = from_str[i+1:]
to_sub = to_str[i+1:]
sub_zeros = "0" * sub_len
sub_nines = "9" * sub_len
to_reached = False
out.append("(")
if from_sub == sub_zeros:
digit_range(from_str[i], chr(ord(to_str[i]) - 1))
out.append(" ")
more_digits(sub_len, sub_len)
else:
out.append("[")
out.append(from_str[i])
out.append("] ")
out.append("(")
uniform_range(from_sub, sub_nines)
out.append(")")
if ord(from_str[i]) < ord(to_str[i]) - 1:
out.append(" | ")
if to_sub == sub_nines:
digit_range(chr(ord(from_str[i]) + 1), to_str[i])
to_reached = True
else:
digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
out.append(" ")
more_digits(sub_len, sub_len)
if not to_reached:
out.append(" | ")
digit_range(to_str[i], to_str[i])
out.append(" ")
uniform_range(sub_zeros, to_sub)
out.append(")")
else:
out.append("[")
out.append(from_str[i])
out.append("-")
out.append(to_str[i])
out.append("]")
if has_min and has_max:
if min_value < 0 and max_value < 0:
out.append("\"-\" (")
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
out.append(")")
return
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
out.append(") | ")
min_value = 0
min_s = str(min_value)
max_s = str(max_value)
min_digits = len(min_s)
max_digits = len(max_s)
for digits in range(min_digits, max_digits):
uniform_range(min_s, "9" * digits)
min_s = "1" + "0" * digits
out.append(" | ")
uniform_range(min_s, max_s)
return
less_decimals = max(decimals_left - 1, 1)
if has_min:
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
out.append(") | [0] | [1-9] ")
more_digits(0, decimals_left - 1)
elif min_value == 0:
if top_level:
out.append("[0] | [1-9] ")
more_digits(0, less_decimals)
else:
more_digits(1, decimals_left)
elif min_value <= 9:
c = str(min_value)
range_start = '1' if top_level else '0'
if c > range_start:
digit_range(range_start, chr(ord(c) - 1))
out.append(" ")
more_digits(1, less_decimals)
out.append(" | ")
digit_range(c, "9")
out.append(" ")
more_digits(0, less_decimals)
else:
min_s = str(min_value)
length = len(min_s)
c = min_s[0]
if c > "1":
digit_range("1" if top_level else "0", chr(ord(c) - 1))
out.append(" ")
more_digits(length, less_decimals)
out.append(" | ")
digit_range(c, c)
out.append(" (")
_generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
out.append(")")
if c < "9":
out.append(" | ")
digit_range(chr(ord(c) + 1), "9")
out.append(" ")
more_digits(length - 1, less_decimals)
return
if has_max:
if max_value >= 0:
if top_level:
out.append("\"-\" [1-9] ")
more_digits(0, less_decimals)
out.append(" | ")
_generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
else:
out.append("\"-\" (")
_generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
out.append(")")
return
raise RuntimeError("At least one of min_value or max_value must be set")
class BuiltinRule:
def __init__(self, content: str, deps: list = None):
@ -112,6 +275,51 @@ class SchemaConverter:
return ''.join(('(', *recurse(0), ')'))
def _not_strings(self, strings):
    """Return a rule body built from a trie of `strings` using negated character classes.

    The result starts with an opening quote, contains one alternative per trie
    branch plus a catch-all `[^"...]` alternative, and ends with a closing quote
    and `space`. Presumably intended to match any quoted string other than the
    given ones -- confirm against the caller.
    """
    class TrieNode:
        def __init__(self):
            self.children = {}
            self.is_end_of_string = False

        def insert(self, string):
            # Walk/create one node per character, then mark the last one as terminal.
            node = self
            for c in string:
                node = node.children.setdefault(c, TrieNode())
            node.is_end_of_string = True

    trie = TrieNode()
    for s in strings:
        trie.insert(s)

    char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
    out = ['["] ( ']

    def visit(node):
        keys = sorted(node.children.keys())
        for idx, c in enumerate(keys):
            child = node.children[c]
            if idx > 0:
                out.append(' | ')
            out.append(f'[{c}]')
            if child.children:
                out.append(' (')
                visit(child)
                out.append(')')
            elif child.is_end_of_string:
                out.append(f' {char_rule}+')
        if keys:
            # Catch-all branch: any character that is neither a quote nor one of this node's edges.
            out.append(' | ')
            out.append(f'[^"{"".join(keys)}] {char_rule}*')

    visit(trie)

    out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
    return ''.join(out)
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
@ -357,13 +565,13 @@ class SchemaConverter:
return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
elif isinstance(schema_type, list):
return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))
return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
elif 'const' in schema:
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
elif 'enum' in schema:
rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum']))
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'object') and \
@ -432,6 +640,24 @@ class SchemaConverter:
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif schema_type in (None, 'integer') and \
('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
min_value = None
max_value = None
if 'minimum' in schema:
min_value = schema['minimum']
elif 'exclusiveMinimum' in schema:
min_value = schema['exclusiveMinimum'] + 1
if 'maximum' in schema:
max_value = schema['maximum']
elif 'exclusiveMaximum' in schema:
max_value = schema['exclusiveMaximum'] - 1
out = ["("]
_generate_min_max_int(min_value, max_value, out)
out.append(") space")
return self._add_rule(rule_name, ''.join(out))
elif (schema_type == 'object') or (len(schema) == 0):
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
@ -450,7 +676,7 @@ class SchemaConverter:
self._add_primitive(dep, dep_rule)
return n
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
prop_order = self._prop_order
# sort by position in prop_order (if specified) then by original order
sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
@ -465,12 +691,16 @@ class SchemaConverter:
required_props = [k for k in sorted_props if k in required]
optional_props = [k for k in sorted_props if k not in required]
if additional_properties == True or isinstance(additional_properties, dict):
if additional_properties != False:
sub_name = f'{name}{"-" if name else ""}additional'
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
self._add_primitive('value', PRIMITIVE_RULES['value'])
key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv',
self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
f'{key_rule} ":" space {value_rule}'
)
optional_props.append("*")
@ -485,15 +715,11 @@ class SchemaConverter:
def get_recursive_refs(ks, first_is_optional):
[k, *rest] = ks
kv_rule_name = prop_kv_rule_names[k]
if k == '*':
res = self._add_rule(
f'{name}{"-" if name else ""}additional-kvs',
f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*'
)
elif first_is_optional:
res = f'( "," space {kv_rule_name} )?'
comma_ref = f'( "," space {kv_rule_name} )'
if first_is_optional:
res = comma_ref + ('*' if k == '*' else '?')
else:
res = kv_rule_name
res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
if len(rest) > 0:
res += ' ' + self._add_rule(
f'{name}{"-" if name else ""}{k}-rest',

View file

@ -40,12 +40,12 @@ static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static bool file_exists(const std::string &path) {
static bool file_exists(const std::string & path) {
std::ifstream f(path.c_str());
return f.good();
}
static bool file_is_empty(const std::string &path) {
static bool file_is_empty(const std::string & path) {
std::ifstream f;
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
@ -118,6 +118,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
LOG_TEE("%s", text);
}
// Formats a single new chat message against the existing history using the
// template in g_params->chat_template, appends the message to chat_msgs, and
// returns the formatted text.
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
    // the last argument of llama_chat_format_single is true only for user messages
    const bool is_user_msg = role == "user";

    llama_chat_msg new_msg{role, content};
    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, is_user_msg);

    // the history is extended only after formatting, so the call above sees the previous messages
    chat_msgs.push_back(new_msg);
    return formatted;
}
int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
@ -191,6 +199,7 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
std::vector<llama_chat_msg> chat_msgs;
g_model = &model;
g_ctx = &ctx;
@ -216,6 +225,8 @@ int main(int argc, char ** argv) {
__func__, n_ctx_train, n_ctx);
}
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
// print system information
{
LOG_TEE("\n");
@ -250,16 +261,21 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
}
{
auto prompt = params.conversation
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
}
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG("prompt: \"%s\"\n", log_tostr(prompt));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
@ -479,6 +495,7 @@ int main(int argc, char ** argv) {
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
@ -794,11 +811,18 @@ int main(int argc, char ** argv) {
is_antiprompt = true;
}
// assistant_ss holds the text generated by the model, so it goes into the history as an "assistant" message
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
is_interacting = true;
printf("\n");
}
}
// if current token is not EOG, we add it to current assistant message
if (params.conversation) {
auto id = llama_sampling_last(ctx_sampling);
assistant_ss << llama_token_to_piece(ctx, id, false);
}
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
@ -849,8 +873,12 @@ int main(int argc, char ** argv) {
string_process_escapes(buffer);
}
std::string user_inp = params.conversation
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@ -865,6 +893,9 @@ int main(int argc, char ** argv) {
output_ss << llama_token_to_piece(ctx, token);
}
// reset assistant message
assistant_ss.str("");
n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
} else {

View file

@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
return minItems === 0 ? `(${result})?` : result;
}
/**
 * Pushes onto `out` the string pieces of a grammar expression for a bounded integer.
 * The caller joins the pieces with out.join('').
 *
 * @param {number|null} minValue lower bound, or null when only an upper bound is given
 * @param {number|null} maxValue upper bound, or null when only a lower bound is given
 * @param {string[]} out array the generated fragments are pushed onto (mutated in place)
 * @param {number} decimalsLeft digit budget used for repetition counts of open-ended ranges
 * @param {boolean} topLevel false for the recursive calls that describe trailing digits
 * @throws {Error} when both minValue and maxValue are null
 */
function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
  const hasMin = minValue !== null;
  const hasMax = maxValue !== null;

  const charOf = (code) => String.fromCharCode(code);

  // Emits "[x]" for a single digit or "[x-y]" for a span.
  const digitRange = (lo, hi) => {
    if (lo === hi) {
      out.push("[", lo, "]");
    } else {
      out.push("[", lo, "-", hi, "]");
    }
  };

  // Emits "[0-9]" plus a "{min,max}" repetition, omitted when exactly one digit is wanted.
  const moreDigits = (minCount, maxCount) => {
    out.push("[0-9]");
    if (minCount === 1 && maxCount === 1) {
      return;
    }
    out.push("{", minCount.toString());
    if (maxCount !== minCount) {
      out.push(",");
      // Number.MAX_SAFE_INTEGER acts as "no upper limit": the max is left out of the braces.
      if (maxCount !== Number.MAX_SAFE_INTEGER) {
        out.push(maxCount.toString());
      }
    }
    out.push("}");
  };

  // Both arguments are digit strings; callers in this function always pass equal lengths.
  const uniformRange = (fromStr, toStr) => {
    let i = 0;
    while (i < fromStr.length && fromStr[i] === toStr[i]) {
      i++;
    }
    if (i > 0) {
      // The shared prefix is emitted as a quoted literal.
      out.push("\"", fromStr.slice(0, i), "\"");
    }
    if (i >= fromStr.length) {
      return;
    }
    if (i > 0) {
      out.push(" ");
    }

    const subLen = fromStr.length - i - 1;
    if (subLen <= 0) {
      out.push("[", fromStr[i], "-", toStr[i], "]");
      return;
    }

    const fromSub = fromStr.slice(i + 1);
    const toSub = toStr.slice(i + 1);
    const subZeros = "0".repeat(subLen);
    const subNines = "9".repeat(subLen);
    const fromCode = fromStr.charCodeAt(i);
    const toCode = toStr.charCodeAt(i);
    let toReached = false;

    out.push("(");
    if (fromSub === subZeros) {
      digitRange(fromStr[i], charOf(toCode - 1));
      out.push(" ");
      moreDigits(subLen, subLen);
    } else {
      out.push("[", fromStr[i], "] ", "(");
      uniformRange(fromSub, subNines);
      out.push(")");
      if (fromCode < toCode - 1) {
        out.push(" | ");
        if (toSub === subNines) {
          digitRange(charOf(fromCode + 1), toStr[i]);
          toReached = true;
        } else {
          digitRange(charOf(fromCode + 1), charOf(toCode - 1));
        }
        out.push(" ");
        moreDigits(subLen, subLen);
      }
    }
    if (!toReached) {
      out.push(" | ");
      digitRange(toStr[i], toStr[i]);
      out.push(" ");
      uniformRange(subZeros, toSub);
    }
    out.push(")");
  };

  if (hasMin && hasMax) {
    if (minValue < 0 && maxValue < 0) {
      // Entirely negative range: emit the sign, then the mirrored positive range.
      out.push("\"-\" (");
      _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
      out.push(")");
      return;
    }

    if (minValue < 0) {
      // Range straddles zero: negative part first, then continue from 0.
      out.push("\"-\" (");
      _generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
      out.push(") | ");
      minValue = 0;
    }

    let lower = minValue.toString();
    const upper = maxValue.toString();
    const lowerDigits = lower.length;
    const upperDigits = upper.length;

    // One alternative per digit count below upperDigits, then the final same-length range.
    for (let digits = lowerDigits; digits < upperDigits; digits++) {
      uniformRange(lower, "9".repeat(digits));
      lower = "1" + "0".repeat(digits);
      out.push(" | ");
    }
    uniformRange(lower, upper);
    return;
  }

  const lessDecimals = Math.max(decimalsLeft - 1, 1);

  if (hasMin) {
    if (minValue < 0) {
      out.push("\"-\" (");
      _generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
      out.push(") | [0] | [1-9] ");
      moreDigits(0, decimalsLeft - 1);
    } else if (minValue === 0) {
      if (topLevel) {
        out.push("[0] | [1-9] ");
        moreDigits(0, lessDecimals);
      } else {
        moreDigits(1, decimalsLeft);
      }
    } else if (minValue <= 9) {
      const digit = minValue.toString();
      const rangeStart = topLevel ? '1' : '0';
      if (digit > rangeStart) {
        digitRange(rangeStart, charOf(digit.charCodeAt(0) - 1));
        out.push(" ");
        moreDigits(1, lessDecimals);
        out.push(" | ");
      }
      digitRange(digit, "9");
      out.push(" ");
      moreDigits(0, lessDecimals);
    } else {
      const minStr = minValue.toString();
      const len = minStr.length;
      const lead = minStr[0];

      if (lead > "1") {
        digitRange(topLevel ? "1" : "0", charOf(lead.charCodeAt(0) - 1));
        out.push(" ");
        moreDigits(len, lessDecimals);
        out.push(" | ");
      }
      digitRange(lead, lead);
      out.push(" (");
      _generateMinMaxInt(parseInt(minStr.slice(1)), null, out, lessDecimals, false);
      out.push(")");
      if (lead < "9") {
        out.push(" | ");
        digitRange(charOf(lead.charCodeAt(0) + 1), "9");
        out.push(" ");
        moreDigits(len - 1, lessDecimals);
      }
    }
    return;
  }

  if (hasMax) {
    if (maxValue >= 0) {
      if (topLevel) {
        out.push("\"-\" [1-9] ");
        moreDigits(0, lessDecimals);
        out.push(" | ");
      }
      _generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
    } else {
      out.push("\"-\" (");
      _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
      out.push(")");
    }
    return;
  }

  throw new Error("At least one of minValue or maxValue must be set");
}
class BuiltinRule {
constructor(content, deps) {
this.content = content;
@ -337,6 +532,64 @@ export class SchemaConverter {
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
}
_notStrings(strings) {
class TrieNode {
constructor() {
this.children = {};
this.isEndOfString = false;
}
insert(str) {
let node = this;
for (const c of str) {
node = node.children[c] = node.children[c] || new TrieNode();
}
node.isEndOfString = true;
}
}
const trie = new TrieNode();
for (const s of strings) {
trie.insert(s);
}
const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
const out = ['["] ( '];
const visit = (node) => {
const rejects = [];
let first = true;
for (const c of Object.keys(node.children).sort()) {
const child = node.children[c];
rejects.push(c);
if (first) {
first = false;
} else {
out.push(' | ');
}
out.push(`[${c}]`);
if (Object.keys(child.children).length > 0) {
out.push(' (');
visit(child);
out.push(')');
} else if (child.isEndOfString) {
out.push(` ${charRuleName}+`);
}
}
if (Object.keys(node.children).length > 0) {
if (!first) {
out.push(' | ');
}
out.push(`[^"${rejects.join('')}] ${charRuleName}*`);
}
};
visit(trie);
out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`);
return out.join('');
}
_resolveRef(ref) {
let refName = ref.split('/').pop();
if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
@ -363,11 +616,11 @@ export class SchemaConverter {
} else if (schema.oneOf || schema.anyOf) {
return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
} else if (Array.isArray(schemaType)) {
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t}))));
} else if ('const' in schema) {
return this._addRule(ruleName, this._generateConstantRule(schema.const));
return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
} else if ('enum' in schema) {
const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
return this._addRule(ruleName, rule);
} else if ((schemaType === undefined || schemaType === 'object') &&
('properties' in schema ||
@ -404,7 +657,7 @@ export class SchemaConverter {
}
}
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
} else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
const items = schema.items ?? schema.prefixItems;
if (Array.isArray(items)) {
@ -435,6 +688,24 @@ export class SchemaConverter {
const minLen = schema.minLength || 0;
const maxLen = schema.maxLength;
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
} else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
let minValue = null;
let maxValue = null;
if ('minimum' in schema) {
minValue = schema.minimum;
} else if ('exclusiveMinimum' in schema) {
minValue = schema.exclusiveMinimum + 1;
}
if ('maximum' in schema) {
maxValue = schema.maximum;
} else if ('exclusiveMaximum' in schema) {
maxValue = schema.exclusiveMaximum - 1;
}
const out = ["("];
_generateMinMaxInt(minValue, maxValue, out);
out.push(") space");
return this._addRule(ruleName, out.join(''));
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
} else {
@ -480,12 +751,19 @@ export class SchemaConverter {
const requiredProps = sortedProps.filter(k => required.has(k));
const optionalProps = sortedProps.filter(k => !required.has(k));
if (typeof additionalProperties === 'object' || additionalProperties === true) {
if (additionalProperties !== false) {
const subName = `${name ?? ''}${name ? '-' : ''}additional`;
const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
const valueRule =
additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
: this._addPrimitive('value', PRIMITIVE_RULES['value']);
const key_rule =
sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string'])
: this._addRule(`${subName}-k`, this._notStrings(sortedProps));
propKvRuleNames['*'] = this._addRule(
`${subName}-kv`,
`${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
`${key_rule} ":" space ${valueRule}`);
optionalProps.push('*');
}
@ -502,15 +780,11 @@ export class SchemaConverter {
const [k, ...rest] = ks;
const kvRuleName = propKvRuleNames[k];
let res;
if (k === '*') {
res = this._addRule(
`${name ?? ''}${name ? '-' : ''}additional-kvs`,
`${kvRuleName} ( "," space ` + kvRuleName + ` )*`
)
} else if (firstIsOptional) {
res = `( "," space ${kvRuleName} )?`;
const commaRef = `( "," space ${kvRuleName} )`;
if (firstIsOptional) {
res = commaRef + (k === '*' ? '*' : '?');
} else {
res = kvRuleName;
res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : '');
}
if (rest.length > 0) {
res += ' ' + this._addRule(

View file

@ -3,6 +3,13 @@
by Humans for All.
## quickstart
To run from the build dir
bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
Continue reading for the details.
## overview
@ -14,6 +21,8 @@ own system prompts.
This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
or potentially as it is being generated, in a streamed manner from the server/ai-model.
![Chat and Settings screens](./simplechat_screens.webp "Chat and Settings screens")
Auto saves the chat session locally as and when the chat is progressing and in turn at a later time when you
open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
The histogram/freq based trimming logic is currently tuned for english language wrt its
is-it-a-alphabetic|numeral-char regex match logic.
chatRequestOptions - maintains the list of options/fields to send along with chat request,
apiRequestOptions - maintains the list of options/fields to send along with api request,
irrespective of whether /chat/completions or /completions endpoint.
If you want to add additional options/fields to send to the server/ai-model, and or
modify the existing options value or remove them, for now you can update this global var
using browser's development-tools/console.
For string and numeric fields in chatRequestOptions, including even those added by a user
at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
For string, numeric and boolean fields in apiRequestOptions, including even those added by a
user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
created.
cache_prompt option supported by example/server is allowed to be controlled by user, so that
any caching supported wrt system-prompt and chat history, if usable can get used. When chat
history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
wrt same, based on aspects related to model, positional encoding, attention mechanism et al.
However system prompt should ideally get the benefit of caching.
headers - maintains the list of http headers sent when request is made to the server. By default
Content-Type is set to application/json. Additionally Authorization entry is provided, which can
be set if needed using the settings ui.
@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
implications of loading of the ai-model's context window by chat history, wrt chat response to
some extent in a simple crude way. You may also want to control the context size enabled when
the server loads ai-model, on the server end.
By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
the implications of loading of the ai-model's context window by chat history, wrt chat response to
some extent in a simple crude way. You may also want to control the context size enabled when the
server loads ai-model, on the server end.
Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
to /completions endpoint handling code on server side.
NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
wrt the set of fields sent to server along with the user query. To check how the model behaves
NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
wrt the set of fields sent to server along with the user query, to check how the model behaves
wrt repetitions in general in the generated text response.
A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
using the providing settings ui.
using the provided settings ui (for settings exposed through the ui).
### OpenAi / Equivalent API WebService
@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
* the baseUrl in settings ui
* https://api.openai.com/v1 or similar
* Wrt request body - gMe.chatRequestOptions
* Wrt request body - gMe.apiRequestOptions
* model (settings ui)
* any additional fields if required in future

View file

@ -222,8 +222,8 @@ class SimpleChat {
* @param {Object} obj
*/
request_jsonstr_extend(obj) {
for(let k in gMe.chatRequestOptions) {
obj[k] = gMe.chatRequestOptions[k];
for(let k in gMe.apiRequestOptions) {
obj[k] = gMe.apiRequestOptions[k];
}
if (gMe.bStream) {
obj["stream"] = true;
@ -740,11 +740,12 @@ class Me {
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
}
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
this.chatRequestOptions = {
this.apiRequestOptions = {
"model": "gpt-3.5-turbo",
"temperature": 0.7,
"max_tokens": 1024,
"n_predict": 1024,
"cache_prompt": false,
//"frequency_penalty": 1.2,
//"presence_penalty": 1.2,
};
@ -800,51 +801,55 @@ class Me {
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
}
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
}
/**
* Auto create ui input elements for fields in ChatRequestOptions
* Auto create ui input elements for fields in apiRequestOptions
* Currently supports text, number and boolean field types.
* @param {HTMLDivElement} elDiv
*/
show_settings_chatrequestoptions(elDiv) {
show_settings_apirequestoptions(elDiv) {
let typeDict = {
"string": "text",
"number": "number",
};
let fs = document.createElement("fieldset");
let legend = document.createElement("legend");
legend.innerText = "ChatRequestOptions";
legend.innerText = "ApiRequestOptions";
fs.appendChild(legend);
elDiv.appendChild(fs);
for(const k in this.chatRequestOptions) {
let val = this.chatRequestOptions[k];
for(const k in this.apiRequestOptions) {
let val = this.apiRequestOptions[k];
let type = typeof(val);
if (!((type == "string") || (type == "number"))) {
continue;
if (((type == "string") || (type == "number"))) {
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
if (type == "number") {
val = Number(val);
}
this.apiRequestOptions[k] = val;
});
fs.appendChild(inp.div);
} else if (type == "boolean") {
let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
this.apiRequestOptions[k] = userVal;
});
fs.appendChild(bbtn.div);
}
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
if (type == "number") {
val = Number(val);
}
this.chatRequestOptions[k] = val;
});
fs.appendChild(inp.div);
}
}
@ -870,6 +875,23 @@ class Me {
});
elDiv.appendChild(bb.div);
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
this.bTrimGarbage = val;
});
elDiv.appendChild(bb.div);
this.show_settings_apirequestoptions(elDiv);
let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
this.apiEP = ApiEP.Type[val];
});
elDiv.appendChild(sel.div);
sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
});
elDiv.appendChild(sel.div);
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
this.bCompletionFreshChatAlways = val;
});
@ -880,23 +902,6 @@ class Me {
});
elDiv.appendChild(bb.div);
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
this.bTrimGarbage = val;
});
elDiv.appendChild(bb.div);
let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
});
elDiv.appendChild(sel.div);
sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
this.apiEP = ApiEP.Type[val];
});
elDiv.appendChild(sel.div);
this.show_settings_chatrequestoptions(elDiv);
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

View file

@ -2607,17 +2607,9 @@ int main(int argc, char ** argv) {
// print sample chat example to make it clear which template is used
{
json chat;
chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
chat.push_back({{"role", "user"}, {"content", "Hello"}});
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
LOG_INFO("chat template", {
{"chat_example", chat_example},
{"built_in", params.chat_template.empty()},
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
{"built_in", params.chat_template.empty()},
});
}

View file

@ -82,7 +82,7 @@ Feature: llama.cpp server
Examples: Prompts
| response_format | n_predicted | re_content |
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
| {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" |
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
| {"type": "json_object"} | 10 | \{ " Jacky. |

View file

@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
// vector holding all allocated string to be passed to llama_chat_apply_template
std::vector<std::string> str(messages.size() * 2);
std::vector<llama_chat_message> chat(messages.size());
std::vector<llama_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length();
chat[i].role = str[i*2 + 0].c_str();
chat[i].content = str[i*2 + 1].c_str();
std::string role = json_value(curr_msg, "role", std::string(""));
std::string content = json_value(curr_msg, "content", std::string(""));
chat.push_back({role, content});
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size * 2);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
}
const std::string formatted_chat(buf.data(), res);
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}

View file

@ -23,7 +23,7 @@ struct mma_int_A_I16K4 {
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE)
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
const int * xs = xs0 + (threadIdx.x%I)*stride;
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "+r"(x[0]), "+r"(x[1])
: "l"(xs));

View file

@ -69,6 +69,7 @@ class GGUFReader:
# I - same as host, S - swapped
byte_order: Literal['I'] | Literal['S'] = 'I'
alignment: int = GGUF_DEFAULT_ALIGNMENT
data_offset: int
# Note: Internal helper, API may change.
gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
@ -88,9 +89,13 @@ class GGUFReader:
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
self.data = np.memmap(path, mode = mode)
offs = 0
# Check for GGUF magic
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
raise ValueError('GGUF magic invalid')
offs += 4
# Check GGUF version
temp_version = self._get(offs, np.uint32)
if temp_version[0] & 65535 == 0:
# If we get 0 here that means it's (probably) a GGUF file created for
@ -103,12 +108,16 @@ class GGUFReader:
self.fields: OrderedDict[str, ReaderField] = OrderedDict()
self.tensors: list[ReaderTensor] = []
offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
# Check tensor count and kv count
temp_counts = self._get(offs, np.uint64, 2)
offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
tensor_count, kv_count = temp_counts
offs = self._build_fields(offs, kv_count)
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
# Build Tensor Info Fields
offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
new_align = self.fields.get('general.alignment')
if new_align is not None:
if new_align.types != [GGUFValueType.UINT32]:
@ -117,6 +126,7 @@ class GGUFReader:
padding = offs % self.alignment
if padding != 0:
offs += self.alignment - padding
self.data_offset = offs
self._build_tensors(offs, tensors_fields)
_DT = TypeVar('_DT', bound = npt.DTypeLike)
@ -193,18 +203,29 @@ class GGUFReader:
# We can't deal with this one.
raise ValueError('Unknown/unhandled field type {gtype}')
def _get_tensor(self, orig_offs: int) -> ReaderField:
def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
offs = orig_offs
# Get Tensor Name
name_len, name_data = self._get_str(offs)
offs += int(name_len.nbytes + name_data.nbytes)
# Get Tensor Dimensions Count
n_dims = self._get(offs, np.uint32)
offs += int(n_dims.nbytes)
# Get Tensor Dimension Array
dims = self._get(offs, np.uint64, n_dims[0])
offs += int(dims.nbytes)
# Get Tensor Encoding Scheme Type
raw_dtype = self._get(offs, np.uint32)
offs += int(raw_dtype.nbytes)
# Get Tensor Offset
offset_tensor = self._get(offs, np.uint64)
offs += int(offset_tensor.nbytes)
return ReaderField(
orig_offs,
str(bytes(name_data), encoding = 'utf-8'),
@ -233,10 +254,10 @@ class GGUFReader:
offs += field_size
return offs
def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
tensor_fields = []
for _ in range(count):
field = self._get_tensor(offs)
field = self._get_tensor_info_field(offs)
offs += sum(int(part.nbytes) for part in field.parts)
tensor_fields.append(field)
return offs, tensor_fields

View file

@ -319,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
markdown_content += "\n"
markdown_content += "### Tensor Data Offset\n"
markdown_content += '\n'
markdown_content += 'This table contains the offset and data segment relative to start of file\n'
markdown_content += '\n'
tensor_mapping_table: list[dict[str, str | int]] = []
for key, tensor in enumerate(reader.tensors):
data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
tensors_mapping_table_header_map = [
{'key_name':'t_id', 'header_name':'T_ID', 'align':'right'},
{'key_name':'layer_name', 'header_name':'Tensor Layer Name', 'align':'left'},
{'key_name':'data_offset', 'header_name':'Data Offset (B)', 'align':'right'},
{'key_name':'data_size', 'header_name':'Data Size (B)', 'align':'right'},
]
markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
markdown_content += "\n"
for group in tensor_prefix_order:
tensors = tensor_groups[group]
group_elements = sum(tensor.n_elements for tensor in tensors)
@ -370,6 +391,8 @@ def main() -> None:
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
parser.add_argument("--json", action="store_true", help="Produce JSON output")
parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
parser.add_argument("--data-offset", action="store_true", help="Start of data offset")
parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
parser.add_argument("--markdown", action="store_true", help="Produce markdown output")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
@ -377,7 +400,7 @@ def main() -> None:
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if not args.json and not args.markdown:
if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
logger.info(f'* Loading: {args.model}')
reader = GGUFReader(args.model, 'r')
@ -386,6 +409,10 @@ def main() -> None:
dump_metadata_json(reader, args)
elif args.markdown:
dump_markdown_metadata(reader, args)
elif args.data_offset:
print(reader.data_offset) # noqa: NP100
elif args.data_alignment:
print(reader.alignment) # noqa: NP100
else:
dump_metadata(reader, args)

1019
llama.cpp

File diff suppressed because it is too large Load diff

View file

@ -67,6 +67,7 @@ extern "C" {
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
};
// pre-tokenization types
@ -859,6 +860,7 @@ extern "C" {
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
// Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
@ -926,6 +928,12 @@ extern "C" {
// Grammar
//
/// Initialize a llama_grammar.
///
/// @param rules The rule elements of the grammar to initialize.
/// @param n_rules The number of rules.
/// @param start_rule_index The index of the root rule (the starting point of the grammar).
/// @return The initialized llama_grammar or nullptr if initialization failed.
LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,

View file

@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
return result;
}
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
assert(offset < utf8.size());
if (!(utf8[offset + 0] & 0x80)) {
auto result = utf8[offset + 0];

View file

@ -48,6 +48,7 @@ struct codepoint_flags {
std::string unicode_cpt_to_utf8(uint32_t cp);
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);