Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/full.Dockerfile
#	.devops/main-cuda.Dockerfile
#	.devops/main-rocm.Dockerfile
#	.devops/main-vulkan.Dockerfile
#	.devops/main.Dockerfile
#	.devops/server-cuda.Dockerfile
#	.devops/server.Dockerfile
#	README.md
#	common/CMakeLists.txt
#	grammars/README.md
#	tests/test-grammar-integration.cpp
#	tests/test-grammar-parser.cpp
#	tests/test-json-schema-to-grammar.cpp
Concedo committed 2024-06-09 17:30:05 +08:00
commit 562d980140
25 changed files with 881 additions and 676 deletions

common/common.cpp · View file

@@ -201,19 +201,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
         }
         params.hf_file = params.model;
     } else if (params.model.empty()) {
-        std::string cache_directory = fs_get_cache_directory();
-        const bool success = fs_create_directory_with_parents(cache_directory);
-        if (!success) {
-            throw std::runtime_error("failed to create cache directory: " + cache_directory);
-        }
-        params.model = cache_directory + string_split(params.hf_file, '/').back();
+        params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
     }
 } else if (!params.model_url.empty()) {
     if (params.model.empty()) {
         auto f = string_split(params.model_url, '#').front();
         f = string_split(f, '?').front();
-        f = string_split(f, '/').back();
-        params.model = "models/" + f;
+        params.model = fs_get_cache_file(string_split(f, '/').back());
     }
 } else if (params.model.empty()) {
     params.model = DEFAULT_MODEL_PATH;
@@ -274,6 +268,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
+        params = params_org;
         return false;
     }
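The added `params = params_org;` line restores the caller's struct after a failed parse, so a bad argument cannot leave `params` half-mutated. A minimal sketch of the pattern (names here are illustrative, not from the diff):

```cpp
#include <stdexcept>
#include <string>

struct options { int verbosity = 0; };

// Sketch of the snapshot/rollback pattern: copy the struct before mutating it
// in place, and restore the copy when parsing throws, so a caller never
// observes partially-applied options after a failed parse.
static bool parse(options & opts, const std::string & arg) {
    const options opts_org = opts;       // snapshot, like params_org above
    try {
        opts.verbosity = std::stoi(arg); // mutates in place; throws on junk
    } catch (const std::invalid_argument &) {
        opts = opts_org;                 // roll back on error
        return false;
    }
    return true;
}
```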
@@ -409,6 +404,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--in-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        std::ifstream file(argv[i]);
+        if (!file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+            invalid_param = true;
+            return true;
+        }
+        params.in_files.push_back(argv[i]);
+        return true;
+    }
     if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1082,7 +1091,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     if (arg == "-v" || arg == "--verbose") {
-        params.verbose = true;
+        params.verbosity = 1;
+        return true;
+    }
+    if (arg == "--verbosity") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.verbosity = std::stoi(argv[i]);
         return true;
     }
     if (arg == "--verbose-prompt") {
@@ -1392,6 +1409,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.timeout_write = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--threads-http") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_threads_http = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "-spf" || arg == "--system-prompt-file") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1461,6 +1486,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chat_template = argv[i];
         return true;
     }
+    if (arg == "--slot-prompt-similarity" || arg == "-sps") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.slot_prompt_similarity = std::stof(argv[i]);
+        return true;
+    }
     if (arg == "-pps") {
         params.is_pp_shared = true;
         return true;
@@ -1538,6 +1571,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.i_pos = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "-o" || arg == "--output" || arg == "--output-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.out_file = argv[i];
+        return true;
+    }
+    if (arg == "-ofreq" || arg == "--output-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_out_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--save-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_save_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--process-output") {
+        params.process_output = true;
+        return true;
+    }
+    if (arg == "--no-ppl") {
+        params.compute_ppl = false;
+        return true;
+    }
+    if (arg == "--chunk" || arg == "--from-chunk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_chunk = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
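All six new imatrix flags follow the parser's recurring consume-next-argv pattern; a minimal free-standing sketch (the helper name is hypothetical):

```cpp
#include <string>

// Hypothetical helper illustrating the pattern used by every value-taking
// option above: advance i to the option's value, flag invalid_param when the
// value is missing, and always return true so the caller knows the arg was
// recognized. std::stoi throws std::invalid_argument on junk input, which
// gpt_params_parse catches, reports, and answers with the params_org rollback.
static bool parse_int_arg(int argc, char ** argv, int & i, int & out, bool & invalid_param) {
    if (++i >= argc) {
        invalid_param = true;
        return true;
    }
    out = std::stoi(argv[i]);
    return true;
}
```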
@@ -1613,6 +1686,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
     options.push_back({ "*", " --version", "show version and build info" });
     options.push_back({ "*", "-v, --verbose", "print verbose information" });
+    options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
     options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
     options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
@@ -1638,6 +1712,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
     options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
     options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+    options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
     options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
     options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
     options.push_back({ "*", " --no-escape", "do not process escape sequences" });
@@ -1805,6 +1880,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
     options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });

+    options.push_back({ "imatrix" });
+    options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
+    options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
+    options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
+    options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
+    options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
+
     options.push_back({ "bench" });
     options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
     options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
@@ -1821,6 +1904,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
     options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
     options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
+    options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
     options.push_back({ "server", " --system-prompt-file FNAME",
                        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
     options.push_back({ "server", " --log-format {text,json}",
@@ -1832,6 +1916,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                        "set custom jinja chat template (default: template taken from model's metadata)\n"
                        "only commonly used templates are accepted:\n"
                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+    options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
+                       "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });

 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2188,6 +2274,16 @@ std::string fs_get_cache_directory() {
     return ensure_trailing_slash(cache_directory);
 }

+std::string fs_get_cache_file(const std::string & filename) {
+    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
+    std::string cache_directory = fs_get_cache_directory();
+    const bool success = fs_create_directory_with_parents(cache_directory);
+    if (!success) {
+        throw std::runtime_error("failed to create cache directory: " + cache_directory);
+    }
+    return cache_directory + filename;
+}
+
 //
 // Model utils
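A hedged sketch of what the new helper yields; the cache root comes from `fs_get_cache_directory()`, and the exact path shown is an assumption that varies by platform and environment:

```cpp
#include <string>

// Sketch only (paths are illustrative): the helper joins a bare filename onto
// the per-user cache directory, creating that directory on first use; a name
// containing a path separator would trip the GGML_ASSERT guard above.
std::string cached_model_path() {
    return fs_get_cache_file("some-model.gguf");
    // e.g. "~/.cache/llama.cpp/some-model.gguf" on Linux (assumed layout)
}
```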

common/common.h · View file

@@ -67,7 +67,6 @@ struct gpt_params {
     float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
@@ -82,13 +81,13 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers = ""; // comma separated list of RPC servers

     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;

     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
@@ -131,7 +130,9 @@ struct gpt_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
     std::string logits_file = ""; // file for saving *all* logits
+    std::string rpc_servers = ""; // comma separated list of RPC servers

+    std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -141,6 +142,7 @@ struct gpt_params {
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

+    int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
@@ -152,7 +154,7 @@ struct gpt_params {
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

     bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
@@ -180,7 +182,6 @@ struct gpt_params {
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
-    bool verbose = false;
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool infill = false; // use infill mode
@@ -197,10 +198,10 @@ struct gpt_params {
     std::vector<std::string> image; // path to image file(s)

     // server params
-    int32_t port = 8080;
-    int32_t timeout_read = 600;
-    int32_t timeout_write = timeout_read;
-    int32_t n_threads_http = -1;
+    int32_t port = 8080; // server listens on this network port
+    int32_t timeout_read = 600; // http read timeout in seconds
+    int32_t timeout_write = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests

     std::string hostname = "127.0.0.1";
     std::string public_path = "";
@@ -219,6 +220,8 @@ struct gpt_params {
     std::string slot_save_path;

+    float slot_prompt_similarity = 0.5f;
+
     // batched-bench params
     bool is_pp_shared = false;
@@ -236,6 +239,16 @@ struct gpt_params {
     // passkey params
     int32_t n_junk = 250; // number of times to repeat the junk text
     int32_t i_pos = -1; // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk = 0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl = true; // whether to compute perplexity
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -281,6 +294,7 @@ bool fs_validate_filename(const std::string & filename);
 bool fs_create_directory_with_parents(const std::string & path);

 std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);

 //
 // Model utils

common/grammar-parser.cpp · View file

@@ -46,8 +46,12 @@ namespace grammar_parser {
         state.rules[rule_id] = rule;
     }

+    static bool is_digit_char(char c) {
+        return '0' <= c && c <= '9';
+    }
+
     static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
     }

     static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {
         return pos;
     }

+    static const char * parse_int(const char * src) {
+        const char * pos = src;
+        while (is_digit_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) {
+            throw std::runtime_error(std::string("expecting integer at ") + src);
+        }
+        return pos;
+    }
+
     static std::pair<uint32_t, const char *> parse_char(const char * src) {
         if (*src == '\\') {
             switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {
             bool is_nested) {
         size_t last_sym_start = out_elements.size();
         const char * pos = src;
+
+        auto handle_repetitions = [&](int min_times, int max_times) {
+
+            if (last_sym_start == out_elements.size()) {
+                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+            }
+
+            // apply transformation to previous symbol (last_sym_start to end) according to
+            // the following rewrite rules:
+            // S{m,n} --> S S S (m times) S'(n-m)
+            //            S'(x)   ::= S S'(x-1) |
+            //            (... n-m definitions of these S' rules ...)
+            //            S'(1)   ::= S |
+            // S{m,}  --> S S S (m times) S'
+            //            S'      ::= S S' |
+            // S*     --> S{0,}
+            //        --> S'      ::= S S' |
+            // S+     --> S{1,}
+            //        --> S S'
+            //            S'      ::= S S' |
+            // S?     --> S{0,1}
+            //        --> S'
+            //            S'      ::= S |
+
+            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+            if (min_times == 0) {
+                out_elements.resize(last_sym_start);
+            } else {
+                // Repeat the previous elements (min_times - 1) times
+                for (int i = 1; i < min_times; i++) {
+                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+                }
+            }
+
+            uint32_t last_rec_rule_id = 0;
+            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+            std::vector<llama_grammar_element> rec_rule(previous_elements);
+            for (int i = 0; i < n_opt; i++) {
+                rec_rule.resize(previous_elements.size());
+                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+                if (i > 0 || max_times < 0) {
+                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+                }
+                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, rec_rule_id, rec_rule);
+                last_rec_rule_id = rec_rule_id;
+            }
+            if (n_opt > 0) {
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+            }
+        };
+
         while (*pos) {
             if (*pos == '"') { // literal string
                 pos++;
@@ -197,40 +266,51 @@ namespace grammar_parser {
                     throw std::runtime_error(std::string("expecting ')' at ") + pos);
                 }
                 pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
-                }
-
-                // apply transformation to previous symbol (last_sym_start to end) according to
-                // rewrite rules:
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<llama_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                }
-                // mark start of alternate def
-                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                if (*pos == '+') {
-                    // add preceding symbol as alternate only for '+' (otherwise empty)
-                    sub_rule.insert(
-                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
-                // in original rule, replace previous symbol with reference to generated rule
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+            } else if (*pos == '.') { // any char
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
                 pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, -1);
+            } else if (*pos == '+') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(1, -1);
+            } else if (*pos == '?') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, 1);
+            } else if (*pos == '{') {
+                pos = parse_space(pos + 1, is_nested);
+
+                if (!is_digit_char(*pos)) {
+                    throw std::runtime_error(std::string("expecting an int at ") + pos);
+                }
+                const char * int_end = parse_int(pos);
+                int min_times = std::stoul(std::string(pos, int_end - pos));
+                pos = parse_space(int_end, is_nested);
+
+                int max_times = -1;
+
+                if (*pos == '}') {
+                    max_times = min_times;
+                    pos = parse_space(pos + 1, is_nested);
+                } else if (*pos == ',') {
+                    pos = parse_space(pos + 1, is_nested);
+
+                    if (is_digit_char(*pos)) {
+                        const char * int_end = parse_int(pos);
+                        max_times = std::stoul(std::string(pos, int_end - pos));
+                        pos = parse_space(int_end, is_nested);
+                    }
+
+                    if (*pos != '}') {
+                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
+                    }
+                    pos = parse_space(pos + 1, is_nested);
+                } else {
+                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                }
+                handle_repetitions(min_times, max_times);
             } else {
                 break;
             }
@@ -325,6 +405,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT: return true;
             case LLAMA_GRETYPE_CHAR_ALT: return true;
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            case LLAMA_GRETYPE_CHAR_ANY: return true;
             default: return false;
         }
     }
@@ -339,6 +420,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
             case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
+            case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
         }
         switch (elem.type) {
             case LLAMA_GRETYPE_END:
@@ -350,6 +432,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT:
             case LLAMA_GRETYPE_CHAR_RNG_UPPER:
             case LLAMA_GRETYPE_CHAR_ALT:
+            case LLAMA_GRETYPE_CHAR_ANY:
                 fprintf(file, "(\"");
                 print_grammar_char(file, elem.value);
                 fprintf(file, "\") ");
@@ -407,11 +490,15 @@ namespace grammar_parser {
                     }
                     print_grammar_char(file, elem.value);
                     break;
+                case LLAMA_GRETYPE_CHAR_ANY:
+                    fprintf(file, ".");
+                    break;
             }
             if (is_char_element(elem)) {
                 switch (rule[i + 1].type) {
                     case LLAMA_GRETYPE_CHAR_ALT:
                     case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                    case LLAMA_GRETYPE_CHAR_ANY:
                         break;
                     default:
                         fprintf(file, "] ");
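To make the new `{m,n}` and `.` operators concrete, a small hedged example driving this parser (entry points as declared in `grammar-parser.h`):

```cpp
#include "grammar-parser.h"
#include <cstdio>

int main() {
    // '{m,n}' bounded repetition and '.' (any character) are handled by the
    // code added above; previously only *, + and ? were accepted.
    const char * gbnf =
        "root   ::= digits \"-\" tail\n"
        "digits ::= [0-9]{2,4}\n" // 2 to 4 digits, expanded via handle_repetitions
        "tail   ::= .{1,8}\n";    // 1 to 8 arbitrary characters (CHAR_ANY)
    grammar_parser::parse_state state = grammar_parser::parse(gbnf);
    grammar_parser::print_grammar(stdout, state); // CHAR_ANY prints as '.'
    return 0;
}
```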

common/json-schema-to-grammar.cpp · View file

@@ -16,58 +16,27 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
 static std::string repeat(const std::string & str, size_t n);

-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
-    if (separator_rule.empty()) {
-        if (min_items == 0 && max_items == 1) {
-            return item_rule + "?";
-        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
-            return item_rule + "+";
-        }
-    }
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
+    auto has_max = max_items != std::numeric_limits<int>::max();
+
+    if (min_items == 0 && max_items == 1) {
+        return item_rule + "?";
+    }
+
+    if (separator_rule.empty()) {
+        if (min_items == 1 && !has_max) {
+            return item_rule + "+";
+        } else if (min_items == 0 && !has_max) {
+            return item_rule + "*";
+        } else {
+            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
+        }
+    }

-    std::string result;
-    if (min_items > 0) {
-        if (item_rule_is_literal && separator_rule.empty()) {
-            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
-        } else {
-            std::vector<std::string> items(min_items, item_rule);
-            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
-        }
-    }
-
-    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
-        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
-
-        if (up_to_n == 0) {
-            return "";
-        } else if (up_to_n == 1) {
-            return "(" + content + ")?";
-        } else if (!separator_rule.empty() && !prefix_with_sep) {
-            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
-        } else {
-            std::string res = repeat("(" + content + " ", up_to_n);
-            // strip trailing space
-            res = res.substr(0, res.length() - 1);
-            res += repeat(")?", up_to_n);
-            return res;
-        }
-    };
-
-    if (min_items > 0 && max_items != min_items) {
-        result += " ";
-    }
-
-    if (max_items != std::numeric_limits<int>::max()) {
-        result += opt_repetitions(max_items - min_items, min_items > 0);
-    } else {
-        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
-        if (min_items == 0 && !separator_rule.empty()) {
-            result = "(" + item_rule + " " + item_operator + "*)?";
-        } else {
-            result += item_operator + "*";
-        }
-    }
-
+    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
+    if (min_items == 0) {
+        result = "(" + result + ")?";
+    }
     return result;
 }
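For reference, the outputs the rewritten helper should now produce, traced from the code above (a sketch, not part of the diff):

```cpp
#include <climits> // INT_MAX stands in for std::numeric_limits<int>::max()

// Traced outputs of the rewritten helper; comments show the grammar
// fragments the new code returns, leaning on the grammar-level {m,n}
// operator added in grammar-parser.cpp instead of unrolled alternatives.
static void build_repetition_examples() {
    auto a = build_repetition("[0-9]", 0, 1);       // "[0-9]?"
    auto b = build_repetition("[0-9]", 1, INT_MAX); // "[0-9]+"
    auto c = build_repetition("[0-9]", 0, INT_MAX); // "[0-9]*"
    auto d = build_repetition("[0-9]", 3, 5);       // "[0-9]{3,5}"
    auto e = build_repetition("item", 2, 4, "\",\" space");
    // "item (\",\" space item){1,3}" -- the separator case recurses once
    (void)a; (void)b; (void)c; (void)d; (void)e;
}
```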
@@ -78,30 +47,24 @@ struct BuiltinRule {
     std::vector<std::string> deps;
 };

-const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
-
 std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
     {"boolean", {"(\"true\" | \"false\") space", {}}},
-    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
-    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+    {"decimal-part", {"[0-9]{1,16}", {}}},
+    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
     {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
     {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
     {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
     {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
     {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
-    {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+    {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
     {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
     {"null", {"\"null\" space", {}}},
 };

 std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
-    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
-    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
     {"date-time", {"date \"T\" time", {"date", "time"}}},
     {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
     {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -385,8 +348,7 @@ private:
                 sub_is_literal ? "\"" + sub + "\"" : sub,
                 min_times,
                 max_times,
-                "",
-                sub_is_literal
+                ""
             );
             seq.back().second = false;
         } else {

convert-hf-to-gguf-update.py · View file

@@ -83,6 +83,7 @@ models = [
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
     {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
 ]

convert-hf-to-gguf.py · View file

@@ -47,11 +47,12 @@ class Model:
     _model_classes: dict[str, type[Model]] = {}

     dir_model: Path
-    ftype: int
+    ftype: gguf.LlamaFileType
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
     lazy: bool
+    model_name: str | None
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
@@ -64,7 +65,7 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
@@ -73,10 +74,11 @@ class Model:
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager
-        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.model_name = model_name
+        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
-            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -94,7 +96,7 @@ class Model:
             ftype_lw: str = ftype_up.lower()
             # allow templating the file name with the output ftype, useful with the "auto" ftype
             self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)

     @classmethod
     def __init_subclass__(cls):
@@ -182,7 +184,7 @@ class Model:
         return new_name

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_block_count(self.block_count)

         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -324,21 +326,21 @@ class Model:
     def write(self):
         self.write_tensors()
-        self.gguf_writer.write_header_to_file()
+        self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()

     def write_vocab(self):
-        self.gguf_writer.write_header_to_file()
+        self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()

     @staticmethod
-    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
         part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(suffix):
+            if filename.startswith(prefix) and filename.endswith(suffix):
                 part_names.append(filename)

         part_names.sort()
@@ -475,6 +477,9 @@ class Model:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"

         if res is None:
             logger.warning("\n")
@@ -662,7 +667,7 @@ class GPTNeoXModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -795,7 +800,7 @@ class MPTModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_block_count(block_count)
@@ -847,7 +852,7 @@ class OrionModel(Model):
             raise ValueError("gguf: can not find ctx length parameter.")

         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -884,7 +889,7 @@ class BaichuanModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -1007,7 +1012,7 @@ class XverseModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -1203,7 +1208,7 @@ class StableLMModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1678,7 +1683,7 @@ class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
         self.gguf_writer.add_context_length(self.hparams["n_ctx"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -2245,7 +2250,7 @@ class GemmaModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -2345,7 +2350,7 @@ class MambaModel(Model):
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2452,11 +2457,13 @@ class JinaBertV2Model(BertModel):
     def get_tensors(self):
         for name, data in super().get_tensors():
-            if 'gated_layers' in name:
+            if 'gated_layer' in name:
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
                 yield name1, d1
                 yield name2, d2
                 continue
@@ -2847,7 +2854,7 @@ def main() -> None:
         logger.error(f"Model {hparams['architectures'][0]} is not supported")
         sys.exit(1)

-    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)

     logger.info("Set model parameters")
     model_instance.set_gguf_parameters()

examples/gguf-split/gguf-split.cpp · View file

@@ -62,10 +62,10 @@ static size_t split_str_to_n_bytes(std::string str) {
     int n;
     if (str.back() == 'M') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024; // megabytes
+        n_bytes = (size_t)n * 1000 * 1000; // megabytes
     } else if (str.back() == 'G') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
+        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
     } else {
         throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
     }
@@ -285,7 +285,7 @@ struct split_strategy {
         struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
         total_size += ggml_nbytes(t);
     }
-    total_size = total_size / 1024 / 1024; // convert to megabytes
+    total_size = total_size / 1000 / 1000; // convert to megabytes
     printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
     i_split++;
 }
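Note the semantic change in this file: split sizes switch from binary (1024-based) to decimal (SI, 1000-based) units. A hedged sketch of the effect:

```cpp
// Sketch: under the new code, "M" and "G" are decimal megabytes/gigabytes.
size_t n = split_str_to_n_bytes("500M"); // 500 * 1000 * 1000 = 500000000 bytes
// The same input used to mean 500 * 1024 * 1024 = 524288000 bytes, so a given
// size limit now produces slightly smaller shards than before.
```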

examples/imatrix/README.md · View file

@@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 ## Usage

 ```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
-          [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./imatrix \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
 ```

 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.

 For faster computation, make sure to use GPU offloading via the `-ngl` argument

examples/imatrix/imatrix.cpp · View file

@@ -18,39 +18,37 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+    LOG_TEE("\n");
+}
+
 struct Stats {
     std::vector<float> values;
     std::vector<int> counts;
     int ncall = 0;
 };

-struct StatParams {
-    std::string dataset;
-    std::string ofile = "imatrix.dat";
-    int n_output_frequency = 10;
-    int verbosity = 1;
-    int keep_every = 0;
-    bool collect_output_weight = false;
-};
-
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
+    void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
-    bool load_imatrix(const char * file_name, bool add);
-    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+    void save_imatrix(int ncall = -1) const;
+    bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-    StatParams m_params;
+    gpt_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
     std::vector<float> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
-    //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
 };

 // remove any prefix and suffixes from the name
@ -86,7 +84,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (t->op != GGML_OP_MUL_MAT) return false; if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)? // why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
return true; return true;
} }
@ -154,21 +152,25 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j]; e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++; e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
} }
} }
} }
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {
m_last_call = e.ncall; m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
keep_imatrix(m_last_call); save_imatrix(m_last_call);
} }
} }
} }
} else { } else {
auto& e = m_stats[wname]; auto & e = m_stats[wname];
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(src1->ne[0], 0); e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0); e.counts.resize(src1->ne[0], 0);
@ -186,15 +188,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j]; e.values[j] += x[j]*x[j];
e.counts[j]++; e.counts[j]++;
if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
} }
} }
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {
m_last_call = e.ncall; m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
keep_imatrix(m_last_call); save_imatrix(m_last_call);
} }
} }
} }
@ -202,19 +208,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
return true; return true;
} }
void IMatrixCollector::save_imatrix() const { void IMatrixCollector::save_imatrix(int ncall) const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); auto fname = m_params.out_file;
} if (fname.empty()) {
fname = "imatrix.dat";
}
void IMatrixCollector::keep_imatrix(int ncall) const { if (ncall > 0) {
auto file_name = m_params.ofile; fname += ".at_";
if (file_name.empty()) file_name = "imatrix.dat"; fname += std::to_string(ncall);
file_name += ".at_"; }
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str(), m_params.dataset.c_str());
}
void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size(); int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries)); out.write((const char *) &n_entries, sizeof(n_entries));
@ -237,26 +241,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
// Write the number of calls the matrix was computed with // Write the number of calls the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call)); out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the dataset name at the end of the file so it can later be specified in quantize // Write the input filename at the end of the file so it can later be specified in quantize
int n_dataset = strlen(dataset); {
out.write((const char *) &n_dataset, sizeof(n_dataset)); int len = m_params.prompt_file.size();
out.write(dataset, n_dataset); out.write((const char *) &len, sizeof(len));
out.write(m_params.prompt_file.c_str(), len);
}
if (m_params.verbosity > 0) { if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
} }
} }
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) { bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(imatrix_file, std::ios::binary); std::ifstream in(fname, std::ios::binary);
if (!in) { if (!in) {
printf("%s: failed to open %s\n",__func__,imatrix_file); printf("%s: failed to open %s\n",__func__, fname);
return false; return false;
} }
int n_entries; int n_entries;
in.read((char*)&n_entries, sizeof(n_entries)); in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) { if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file); printf("%s: no data in file %s\n", __func__, fname);
return false; return false;
} }
for (int i = 0; i < n_entries; ++i) { for (int i = 0; i < n_entries; ++i) {
@ -264,23 +270,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
std::vector<char> name_as_vec(len+1); std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len); in.read((char *)name_as_vec.data(), len);
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false; return false;
} }
name_as_vec[len] = 0; name_as_vec[len] = 0;
std::string name{name_as_vec.data()}; std::string name{name_as_vec.data()};
auto& e = imatrix_data[std::move(name)]; auto & e = m_stats[std::move(name)];
int ncall; int ncall;
in.read((char*)&ncall, sizeof(ncall)); in.read((char*)&ncall, sizeof(ncall));
int nval; int nval;
in.read((char *)&nval, sizeof(nval)); in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) { if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i); printf("%s: failed reading number of values for entry %d\n",__func__,i);
imatrix_data = {}; m_stats = {};
return false; return false;
} }
// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(nval, 0); e.values.resize(nval, 0);
e.counts.resize(nval, 0); e.counts.resize(nval, 0);
@ -290,7 +295,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
in.read((char*)tmp.data(), nval*sizeof(float)); in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i); printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {}; m_stats = {};
return false; return false;
} }
@ -305,13 +310,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
return true; return true;
} }
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
if (!add) {
m_stats.clear();
}
return load_imatrix(file_name, m_stats);
}
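To make the on-disk layout implied by save_imatrix()/load_imatrix() above concrete, here is a minimal Python sketch of a reader. It assumes a little-endian platform with 4-byte `int`, and that the per-entry layout matches what load_imatrix() reads (name length, name bytes, ncall, nval, then nval float32 values), followed by the trailing chunk counter and input file name written by save_imatrix(). It is an illustration, not a supported tool.

    import struct

    def read_imatrix(path):
        # layout inferred from load_imatrix()/save_imatrix() above:
        # little-endian, 4-byte ints, float32 values
        with open(path, "rb") as f:
            (n_entries,) = struct.unpack("<i", f.read(4))
            entries = {}
            for _ in range(n_entries):
                (name_len,) = struct.unpack("<i", f.read(4))
                name = f.read(name_len).decode("utf-8")
                ncall, nval = struct.unpack("<2i", f.read(8))
                values = struct.unpack(f"<{nval}f", f.read(4 * nval))
                entries[name] = (ncall, values)
            (last_call,) = struct.unpack("<i", f.read(4))  # number of chunks
            (len_ds,) = struct.unpack("<i", f.read(4))
            dataset = f.read(len_ds).decode("utf-8")       # input file name
        return entries, last_call, dataset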
static IMatrixCollector g_collector; static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -325,7 +323,7 @@ struct results_log_softmax {
float prob; float prob;
}; };
static std::vector<float> softmax(const std::vector<float>& logits) { static std::vector<float> softmax(const std::vector<float> & logits) {
std::vector<float> probs(logits.size()); std::vector<float> probs(logits.size());
float max_logit = logits[0]; float max_logit = logits[0];
for (float v : logits) { for (float v : logits) {
@ -359,8 +357,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
static void process_logits( static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history double & nll, double & nll2, float * logit_history, float * prob_history) {
) {
std::mutex mutex; std::mutex mutex;
int counter = 0; int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@ -392,8 +389,7 @@ static void process_logits(
} }
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -406,13 +402,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (from_chunk > 0) { if (params.i_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false; return false;
} }
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
} }
if (int(tokens.size()) < 2*n_ctx) { if (int(tokens.size()) < 2*n_ctx) {
@ -425,7 +421,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<float> logit_history; std::vector<float> logit_history;
std::vector<float> prob_history; std::vector<float> prob_history;
if (compute_ppl) { if (params.compute_ppl) {
logit_history.resize(tokens.size()); logit_history.resize(tokens.size());
prob_history.resize(tokens.size()); prob_history.resize(tokens.size());
} }
@ -447,7 +443,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
const int num_batches = (n_ctx + n_batch - 1) / n_batch; const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits; std::vector<float> logits;
if (compute_ppl && num_batches > 1) { if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab); logits.reserve((size_t)n_ctx * n_vocab);
} }
@ -483,7 +479,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
// restore the original token in case it was set to BOS // restore the original token in case it was set to BOS
tokens[batch_start] = token_org; tokens[batch_start] = token_org;
if (compute_ppl && num_batches > 1) { if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
} }
@ -502,7 +498,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
if (compute_ppl) { if (params.compute_ppl) {
const int first = n_ctx/2; const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@ -517,7 +513,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
} }
printf("\n"); printf("\n");
if (compute_ppl) { if (params.compute_ppl) {
nll2 /= count; nll2 /= count;
nll /= count; nll /= count;
const double ppl = exp(nll); const double ppl = exp(nll);
@ -534,109 +530,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
StatParams sparams;
std::string prev_result_file;
std::string combine_files;
bool compute_ppl = true;
int from_chunk = 0;
std::vector<char*> args;
args.push_back(argv[0]);
int iarg = 1;
for (; iarg < argc-1; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-o" || arg == "--output-file") {
sparams.ofile = argv[++iarg];
}
else if (arg == "-ofreq" || arg == "--output-frequency") {
sparams.n_output_frequency = std::stoi(argv[++iarg]);
}
else if (arg == "-ow" || arg == "--output-weight") {
sparams.collect_output_weight = std::stoi(argv[++iarg]);
}
else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else if (arg == "--continue-from") {
prev_result_file = argv[++iarg];
} else if (arg == "--combine") {
combine_files = argv[++iarg];
}
else if (arg == "--from-chunk") {
from_chunk = std::stoi(argv[++iarg]);
} else {
args.push_back(argv[iarg]);
}
}
if (iarg < argc) {
std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]);
}
}
gpt_params params; gpt_params params;
params.n_batch = 512;
params.n_ctx = 512;
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params); print_usage(argc, argv, params);
return 1; return 1;
} }
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info(); g_collector.set_params(params);
if (params.seed == LLAMA_DEFAULT_SEED) { for (const auto & in_file : params.in_files) {
params.seed = time(NULL); printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
} if (!g_collector.load_imatrix(in_file.c_str())) {
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) {
std::vector<std::string> files;
size_t pos = 0;
while (true) {
auto new_pos = combine_files.find(',', pos);
if (new_pos != std::string::npos) {
files.emplace_back(combine_files.substr(pos, new_pos - pos));
pos = new_pos + 1;
} else {
files.emplace_back(combine_files.substr(pos));
break;
}
}
if (files.size() < 2) {
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
return 1;
}
printf("Combining the following %d files\n", int(files.size()));
for (auto& file : files) {
printf(" %s\n", file.c_str());
if (!g_collector.load_imatrix(file.c_str(), true)) {
fprintf(stderr, "Failed to load %s\n", file.c_str());
return 1; return 1;
} }
} }
if (params.in_files.size() > 1) {
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix(); g_collector.save_imatrix();
return 0;
}
if (!prev_result_file.empty()) {
if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
return 1;
}
} }
llama_backend_init(); llama_backend_init();
@ -651,6 +570,7 @@ int main(int argc, char ** argv) {
// init // init
llama_model * model; llama_model * model;
llama_context * ctx; llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__); fprintf(stderr, "%s : failed to init\n", __func__);
@ -669,8 +589,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); if (!compute_imatrix(ctx, params)) {
if (!OK) {
return 1; return 1;
} }
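With the old ad-hoc `--combine`/`--continue-from` flags gone, merging previously saved imatrices now goes through the common `--in-file` option: every given file is loaded into the collector, and when more than one is supplied the merged result is written out immediately. An illustrative invocation (file names are placeholders; `-o` is assumed to map to `params.out_file`):

    ./imatrix --in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat -o imatrix-combined.dat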

View file

@ -6,52 +6,22 @@ import re
import sys import sys
from typing import Any, Dict, List, Set, Tuple, Union from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
if not separator_rule: def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
if min_items == 0 and max_items == 1: if min_items == 0 and max_items == 1:
return f'{item_rule}?' return f'{item_rule}?'
elif min_items == 1 and max_items is None:
if not separator_rule:
if min_items == 1 and max_items is None:
return f'{item_rule}+' return f'{item_rule}+'
elif min_items == 0 and max_items is None:
result = '' return f'{item_rule}*'
if min_items > 0:
if item_rule_is_literal and separator_rule is None:
result = '"' + (item_rule[1:-1] * min_items) + '"'
else: else:
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items) return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
def opt_repetitions(up_to_n, prefix_with_sep=False): result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
''' return f'({result})?' if min_items == 0 else result
- n=4, no sep: '(a (a (a (a)?)?)?)?'
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
'''
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
if up_to_n == 0:
return ''
elif up_to_n == 1:
return f'({content})?'
elif separator_rule and not prefix_with_sep:
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
else:
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
if min_items > 0 and max_items != min_items:
result += ' '
if max_items is not None:
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
else:
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
if min_items == 0 and separator_rule:
result = f'({item_rule} {item_operator}*)?'
else:
result += f'{item_operator}*'
return result
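As a quick sketch of what the simplified helper now emits, the following spot checks follow directly from the code above (bounded counts map onto GBNF `{m,n}` quantifiers; with a separator, the first item is emitted and then `(sep item)` is repeated):

    assert _build_repetition('[0-9]', 0, 15) == '[0-9]{0,15}'
    assert _build_repetition('[0-9]', 1, None) == '[0-9]+'
    assert _build_repetition('item', 2, None, separator_rule='","') == 'item ("," item)+'
    assert _build_repetition('item', 0, 3, separator_rule='","') == '(item ("," item){0,2})?'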
class BuiltinRule: class BuiltinRule:
@ -59,31 +29,29 @@ class BuiltinRule:
self.content = content self.content = content
self.deps = deps or [] self.deps = deps or []
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in # whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality? # whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?' SPACE_RULE = '" "?'
PRIMITIVE_RULES = { PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []), 'boolean' : BuiltinRule('("true" | "false") space', []),
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []), 'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []), 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []), 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []), 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []), 'null' : BuiltinRule('"null" space', []),
} }
# TODO: support "uri", "email" string formats # TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = { STRING_FORMAT_RULES = {
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
@ -333,7 +301,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id sub_rule_ids[sub] = id
sub = id sub = id
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False) seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
else: else:
literal = '' literal = ''
while i < length: while i < length:

View file

@ -624,7 +624,7 @@ string ::= "\"" (
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" ws )* "\"" ws
ws ::= ([ \t\n] ws)? ws ::= ([ \t\n] ws)?
float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
integer ::= [0-9]+""" integer ::= [0-9]+"""

View file

@ -6,10 +6,6 @@
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#include "ggml-rpc.h" #include "ggml-rpc.h"
#ifdef _WIN32 #ifdef _WIN32
# include <windows.h> # include <windows.h>
@ -83,12 +79,6 @@ static ggml_backend_t create_backend() {
if (!backend) { if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
} }
#elif GGML_USE_SYCL
fprintf(stderr, "%s: using SYCL backend\n", __func__);
backend = ggml_backend_sycl_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
}
#endif #endif
// if there aren't GPU Backends fallback to CPU backend // if there aren't GPU Backends fallback to CPU backend

View file

@ -279,7 +279,7 @@ node index.js
`id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1` `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1`
`cache_prompt`: Re-use the previously cached prompt from the last request if possible. This may avoid re-processing the prompt from scratch. Default: `false` `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation), enabling this option can cause nondeterministic results. Default: `false`
`system_prompt`: Change the system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime) `system_prompt`: Change the system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
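A minimal sketch of exercising these fields against a running server (assuming the default `/completion` endpoint on `localhost:8080`; the prompt and field values are placeholders):

    import requests

    payload = {
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 64,
        "cache_prompt": True,   # re-use the KV cache for the shared prefix
        "id_slot": -1,          # let the server pick a slot
    }
    r = requests.post("http://localhost:8080/completion", json=payload)
    print(r.json()["content"])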

View file

@ -2,57 +2,26 @@
const SPACE_RULE = '" "?'; const SPACE_RULE = '" "?';
function _buildRepetition(itemRule, minItems, maxItems, opts={}) { function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
if (minItems === 0 && maxItems === 1) {
return `${itemRule}?`;
}
const separatorRule = opts.separatorRule ?? ''; const separatorRule = opts.separatorRule ?? '';
const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
if (separatorRule === '') { if (separatorRule === '') {
if (minItems === 0 && maxItems === 1) { if (minItems === 1 && maxItems === undefined) {
return `${itemRule}?`;
} else if (minItems === 1 && maxItems === undefined) {
return `${itemRule}+`; return `${itemRule}+`;
} } else if (minItems === 0 && maxItems === undefined) {
} return `${itemRule}*`;
let result = '';
if (minItems > 0) {
if (itemRuleIsLiteral && separatorRule === '') {
result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
} else { } else {
result = Array.from({ length: minItems }, () => itemRule) return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`;
.join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
} }
} }
const optRepetitions = (upToN, prefixWithSep=false) => { const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined);
const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule; return minItems === 0 ? `(${result})?` : result;
if (upToN === 0) {
return '';
} else if (upToN === 1) {
return `(${content})?`;
} else if (separatorRule !== '' && !prefixWithSep) {
return `(${content} ${optRepetitions(upToN - 1, true)})?`;
} else {
return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
}
};
if (minItems > 0 && maxItems !== minItems) {
result += ' ';
}
if (maxItems !== undefined) {
result += optRepetitions(maxItems - minItems, minItems > 0);
} else {
const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
if (minItems === 0 && separatorRule !== '') {
result = `(${itemRule} ${itemOperator}*)?`;
} else {
result += `${itemOperator}*`;
}
}
return result;
} }
class BuiltinRule { class BuiltinRule {
@ -62,27 +31,25 @@ class BuiltinRule {
} }
} }
const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
const PRIMITIVE_RULES = { const PRIMITIVE_RULES = {
boolean : new BuiltinRule('("true" | "false") space', []), boolean : new BuiltinRule('("true" | "false") space', []),
'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []), 'decimal-part' : new BuiltinRule('[0-9]{1,16}', []),
'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []), 'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']), integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []), uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []), char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']), string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []), null : new BuiltinRule('"null" space', []),
}; };
// TODO: support "uri", "email" string formats // TODO: support "uri", "email" string formats
const STRING_FORMAT_RULES = { const STRING_FORMAT_RULES = {
'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : new BuiltinRule('date "T" time', ['date', 'time']), 'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']), 'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']), 'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),

View file

@ -648,6 +648,9 @@ struct server_context {
server_metrics metrics; server_metrics metrics;
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
~server_context() { ~server_context() {
if (ctx) { if (ctx) {
llama_free(ctx); llama_free(ctx);
@ -796,24 +799,88 @@ struct server_context {
return prompt_tokens; return prompt_tokens;
} }
server_slot * get_slot(int id) { server_slot * get_slot_by_id(int id) {
int64_t t_last = ggml_time_us();
server_slot * last_used = nullptr;
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
if (slot.id == id && slot.available()) { if (slot.id == id) {
return &slot; return &slot;
} }
}
// among all available slots, find the one that has been least recently used return nullptr;
if (slot.available() && slot.t_last_used < t_last) { }
last_used = &slot;
server_slot * get_available_slot(const std::string & prompt) {
server_slot * ret = nullptr;
// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
int max_lcp_len = 0;
float similarity = 0;
for (server_slot & slot : slots) {
// skip the slot if it is not available
if (!slot.available()) {
continue;
}
// skip the slot if it does not contain a prompt
if (!slot.prompt.is_string()) {
continue;
}
// current slot's prompt
std::string slot_prompt = slot.prompt.get<std::string>();
// length of the current slot's prompt
int slot_prompt_len = slot_prompt.size();
// length of the Longest Common Prefix between the current slot's prompt and the input prompt
int lcp_len = common_part(slot_prompt, prompt);
// fraction of the common prefix length relative to the current slot's prompt length
similarity = static_cast<float>(lcp_len) / slot_prompt_len;
// select the current slot if the criteria match
if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
max_lcp_len = lcp_len;
ret = &slot;
}
}
if (ret != nullptr) {
LOG_VERBOSE("selected slot by lcp similarity", {
{"id_slot", ret->id},
{"max_lcp_len", max_lcp_len},
{"similarity", similarity},
});
}
}
// find the slot that has been least recently used
if (ret == nullptr) {
int64_t t_last = ggml_time_us();
for (server_slot & slot : slots) {
// skip the slot if it is not available
if (!slot.available()) {
continue;
}
// select the current slot if the criteria match
if (slot.t_last_used < t_last) {
t_last = slot.t_last_used; t_last = slot.t_last_used;
ret = &slot;
} }
} }
return last_used; if (ret != nullptr) {
LOG_VERBOSE("selected slot by lru", {
{"id_slot", ret->id},
{"t_last", t_last},
});
}
}
return ret;
} }
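Read as a minimal Python sketch (not the server code itself), the selection above boils down to: prefer the available slot whose cached prompt shares the longest common prefix with the incoming prompt, provided that prefix covers more than `slot_prompt_similarity` of the slot's prompt; otherwise fall back to the least recently used available slot.

    def common_part(a, b):
        # length of the longest common prefix of two strings
        n = 0
        for x, y in zip(a, b):
            if x != y:
                break
            n += 1
        return n

    def pick_slot(slots, prompt, slot_prompt_similarity):
        # slots: list of dicts with "available", "prompt", "t_last_used"
        best, best_lcp = None, 0
        if slot_prompt_similarity != 0.0 and prompt:
            for s in slots:
                if not s["available"] or not s["prompt"]:
                    continue
                lcp = common_part(s["prompt"], prompt)
                if lcp > best_lcp and lcp / len(s["prompt"]) > slot_prompt_similarity:
                    best, best_lcp = s, lcp
        if best is None:
            candidates = [s for s in slots if s["available"]]
            if candidates:
                best = min(candidates, key=lambda s: s["t_last_used"])
        return best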
bool launch_slot_with_task(server_slot & slot, const server_task & task) { bool launch_slot_with_task(server_slot & slot, const server_task & task) {
@ -889,7 +956,7 @@ struct server_context {
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
// get prompt // get prompt
{ if (!task.infill) {
const auto & prompt = data.find("prompt"); const auto & prompt = data.find("prompt");
if (prompt == data.end()) { if (prompt == data.end()) {
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@ -1516,13 +1583,29 @@ struct server_context {
switch (task.type) { switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION: case SERVER_TASK_TYPE_COMPLETION:
{ {
server_slot * slot = get_slot(json_value(task.data, "id_slot", -1)); int id_slot = json_value(task.data, "id_slot", -1);
std::string prompt = json_value(task.data, "prompt", std::string());
server_slot * slot;
if (id_slot != -1) {
slot = get_slot_by_id(id_slot);
} else {
slot = get_available_slot(prompt);
}
if (slot == nullptr) { if (slot == nullptr) {
// if no slot is available, we defer this task for processing later // if no slot is available, we defer this task for processing later
LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
queue_tasks.defer(task); queue_tasks.defer(task);
break; break;
} }
if (!slot->available()) {
// if requested slot is unavailable, we defer this task for processing later
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
if (task.data.contains("system_prompt")) { if (task.data.contains("system_prompt")) {
std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
@ -1639,11 +1722,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_SAVE: case SERVER_TASK_TYPE_SLOT_SAVE:
{ {
int id_slot = task.data.at("id_slot"); int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break; break;
} }
if (!slot->available()) {
// if requested slot is unavailable, we defer this task for processing later
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
const size_t token_count = slot->cache_tokens.size(); const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us(); const int64_t t_start = ggml_time_us();
@ -1674,11 +1763,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_RESTORE: case SERVER_TASK_TYPE_SLOT_RESTORE:
{ {
int id_slot = task.data.at("id_slot"); int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break; break;
} }
if (!slot->available()) {
// if requested slot is unavailable, we defer this task for processing later
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
const int64_t t_start = ggml_time_us(); const int64_t t_start = ggml_time_us();
@ -1716,11 +1811,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_ERASE: case SERVER_TASK_TYPE_SLOT_ERASE:
{ {
int id_slot = task.data.at("id_slot"); int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break; break;
} }
if (!slot->available()) {
// if requested slot is unavailable, we defer this task for processing later
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
// Erase token cache // Erase token cache
const size_t n_erased = slot->cache_tokens.size(); const size_t n_erased = slot->cache_tokens.size();
@ -2361,7 +2462,7 @@ int main(int argc, char ** argv) {
// TODO: not great to use extern vars // TODO: not great to use extern vars
server_log_json = params.log_json; server_log_json = params.log_json;
server_verbose = params.verbose; server_verbose = params.verbosity > 0;
// struct that contains llama context and inference // struct that contains llama context and inference
server_context ctx_server; server_context ctx_server;
@ -2468,6 +2569,9 @@ int main(int argc, char ** argv) {
log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
} }
// Necessary similarity of prompt for slot selection
ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
// load the model // load the model
if (!ctx_server.load_model(params)) { if (!ctx_server.load_model(params)) {
state.store(SERVER_STATE_ERROR); state.store(SERVER_STATE_ERROR);

View file

@ -253,6 +253,13 @@ static size_t common_part(const std::vector<llama_token> & a, const std::vector<
return i; return i;
} }
static size_t common_part(const std::string & a, const std::string & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
return i;
}
static bool ends_with(const std::string & str, const std::string & suffix) { static bool ends_with(const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
} }

View file

@ -9108,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
// find the sum of exps in the block // find the sum of exps in the block
tmp = warp_reduce_sum(tmp, item_ct1); tmp = warp_reduce_sum(tmp, item_ct1);
if (block_size > WARP_SIZE) { if (block_size > WARP_SIZE) {
item_ct1.barrier(sycl::access::fence_space::local_space);
if (warp_id == 0) { if (warp_id == 0) {
buf[lane_id] = 0.f; buf[lane_id] = 0.f;
} }

View file

@ -345,15 +345,12 @@ struct vk_context {
}; };
struct ggml_tensor_extra_gpu { struct ggml_tensor_extra_gpu {
bool ready;
size_t ctx_idx; size_t ctx_idx;
vk_buffer_ref buffer_gpu; vk_buffer_ref buffer_gpu;
uint64_t offset; uint64_t offset;
void reset() { void reset() {
ready = false;
ctx_idx = 0; ctx_idx = 0;
buffer_gpu.reset(); buffer_gpu.reset();
offset = 0; offset = 0;
@ -2949,7 +2946,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
vk_buffer d_X; vk_buffer d_X;
@ -2958,12 +2955,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
uint64_t y_buf_offset = 0; uint64_t y_buf_offset = 0;
if (!src0_uma) { if (!src0_uma) {
d_Qx = extra_src0->buffer_gpu.lock(); d_Qx = extra_src0->buffer_gpu.lock();
qx_buf_offset = extra_src0->offset; qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
if (!src1_uma) { if (!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qy != nullptr); GGML_ASSERT(d_Qy != nullptr);
} }
if (qx_needs_dequant) { if (qx_needs_dequant) {
@ -3114,7 +3111,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_X; vk_buffer d_X;
uint64_t x_buf_offset = 0; uint64_t x_buf_offset = 0;
@ -3122,12 +3119,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
uint64_t y_buf_offset = 0; uint64_t y_buf_offset = 0;
if(!src0_uma) { if(!src0_uma) {
d_Qx = extra_src0->buffer_gpu.lock(); d_Qx = extra_src0->buffer_gpu.lock();
qx_buf_offset = extra_src0->offset; qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
if(!src1_uma) { if(!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qy != nullptr); GGML_ASSERT(d_Qy != nullptr);
} }
if (qx_needs_dequant) { if (qx_needs_dequant) {
@ -3246,14 +3243,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
const uint64_t qx_buf_offset = extra_src0->offset; const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
if (!src1_uma) { if (!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
@ -3323,14 +3320,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
const uint64_t qx_buf_offset = extra_src0->offset; const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
if (!src1_uma) { if (!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
@ -3459,7 +3456,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_X; vk_buffer d_X;
uint64_t x_buf_offset = 0; uint64_t x_buf_offset = 0;
@ -3467,17 +3464,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
uint64_t y_buf_offset = 0; uint64_t y_buf_offset = 0;
if (!src0_uma) { if (!src0_uma) {
d_Qx = extra_src0->buffer_gpu.lock(); d_Qx = extra_src0->buffer_gpu.lock();
qx_buf_offset = extra_src0->offset; qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
if (!src1_uma) { if (!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qy != nullptr); GGML_ASSERT(d_Qy != nullptr);
} }
if (!ids_uma) { if (!ids_uma) {
d_ids = extra_ids->buffer_gpu.lock(); d_ids = extra_ids->buffer_gpu.lock();
ids_buf_offset = extra_ids->offset; ids_buf_offset = extra_ids->offset + ids->view_offs;
GGML_ASSERT(d_ids != nullptr); GGML_ASSERT(d_ids != nullptr);
} }
if (qx_needs_dequant) { if (qx_needs_dequant) {
@ -3636,7 +3633,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_X; vk_buffer d_X;
uint64_t x_buf_offset = 0; uint64_t x_buf_offset = 0;
@ -3644,17 +3641,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
uint64_t y_buf_offset = 0; uint64_t y_buf_offset = 0;
if(!src0_uma) { if(!src0_uma) {
d_Qx = extra_src0->buffer_gpu.lock(); d_Qx = extra_src0->buffer_gpu.lock();
qx_buf_offset = extra_src0->offset; qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
if(!src1_uma) { if(!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qy != nullptr); GGML_ASSERT(d_Qy != nullptr);
} }
if(!ids_uma) { if(!ids_uma) {
d_ids = extra_ids->buffer_gpu.lock(); d_ids = extra_ids->buffer_gpu.lock();
ids_buf_offset = extra_ids->offset; ids_buf_offset = extra_ids->offset + ids->view_offs;
GGML_ASSERT(d_ids != nullptr); GGML_ASSERT(d_ids != nullptr);
} }
if (qx_needs_dequant) { if (qx_needs_dequant) {
@ -3769,9 +3766,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
const vk_buffer src_buf = extra_src0->buffer_gpu.lock(); const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
const uint64_t src_offset = extra_src0->offset; const uint64_t src_offset = extra_src0->offset + src0->view_offs;
vk_buffer dst_buf = extra->buffer_gpu.lock(); vk_buffer dst_buf = extra->buffer_gpu.lock();
const uint64_t dst_offset = extra->offset; const uint64_t dst_offset = extra->offset + dst->view_offs;
std::vector<vk::BufferCopy> copies; std::vector<vk::BufferCopy> copies;
@ -4062,21 +4059,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
} }
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
if(!src0_uma) { if(!src0_uma) {
d_X = extra_src0->buffer_gpu.lock(); d_X = extra_src0->buffer_gpu.lock();
x_buf_offset = extra_src0->offset; x_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_X != nullptr); GGML_ASSERT(d_X != nullptr);
} }
if (use_src1 && !src1_uma) { if (use_src1 && !src1_uma) {
d_Y = extra_src1->buffer_gpu.lock(); d_Y = extra_src1->buffer_gpu.lock();
y_buf_offset = extra_src1->offset; y_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Y != nullptr); GGML_ASSERT(d_Y != nullptr);
} }
if (use_src2 && !src2_uma) { if (use_src2 && !src2_uma) {
d_Z = extra_src2->buffer_gpu.lock(); d_Z = extra_src2->buffer_gpu.lock();
z_buf_offset = extra_src2->offset; z_buf_offset = extra_src2->offset + src2->view_offs;
GGML_ASSERT(d_Z != nullptr); GGML_ASSERT(d_Z != nullptr);
} }
@ -4336,7 +4333,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type); const uint32_t dst_type_size = ggml_type_size(dst->type);
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
(uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src0),
@ -5569,6 +5566,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
const ggml_tensor * src2 = node->src[2]; const ggml_tensor * src2 = node->src[2];
switch (node->op) { switch (node->op) {
// Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NONE:
return;
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) { switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
@ -5590,10 +5594,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
@ -5601,7 +5601,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
case GGML_OP_NONE:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
break; break;
@@ -5654,12 +5653,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5705,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5788,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
-    GGML_ASSERT(extra->ready);
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5812,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
-    extra->ready = false;
     return true;
 }
@@ -5943,8 +5931,10 @@ struct ggml_backend_vk_buffer_context {
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
+        if (temp_tensor_extras != nullptr) {
             delete[] temp_tensor_extras;
+        }
     }
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
@@ -5990,18 +5980,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -6014,7 +6002,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
     vk_buffer buf = extra->buffer_gpu.lock();
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6015,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
     vk_buffer buf = extra->buffer_gpu.lock();
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
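The recurring `+ tensor->view_offs` edits all implement one rule: a view now shares its parent's `extra`, so the byte address of any access is the backing allocation's offset, plus the view's displacement inside the parent, plus the caller-supplied offset. A tiny sketch of that arithmetic (hypothetical helper, values illustrative):

# A view shares its parent's buffer and extra, so every read/write must add
# the view's own displacement on top of the allocation offset.
def effective_offset(alloc_offset: int, view_offs: int, user_offset: int) -> int:
    return alloc_offset + view_offs + user_offset

# e.g. parent allocated at byte 4096, view starts 128 bytes in, caller reads at +16
assert effective_offset(4096, 128, 16) == 4240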
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6026,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
         return true;
     }
@@ -6264,7 +6252,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
     vk_buffer buf = extra->buffer_gpu.lock();
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6272,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
     vk_buffer buf = extra->buffer_gpu.lock();
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6293,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
         return true;
     }
@@ -6478,11 +6466,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const 
         //     return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         // } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
             return true;
-            } break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6725,7 +6709,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6793,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6835,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6893,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7076,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
     float first_error_result = -1.0f;
@@ -415,6 +415,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
         MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.ATTN_NORM_2,
         MODEL_TENSOR.ATTN_OUT_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_Q_NORM,
@@ -5,6 +5,7 @@ import os
 import shutil
 import struct
 import tempfile
+from dataclasses import dataclass
 from enum import Enum, auto
 from io import BufferedWriter
 from typing import IO, Any, Sequence, Mapping
@@ -30,17 +31,36 @@ from .quants import quant_shape_from_byte_shape
 logger = logging.getLogger(__name__)
 
+@dataclass
+class TensorInfo:
+    shape: Sequence[int]
+    dtype: GGMLQuantizationType
+    nbytes: int
+    tensor: np.ndarray[Any, Any] | None = None
+
+
+@dataclass
+class GGUFValue:
+    value: Any
+    type: GGUFValueType
+
+
 class WriterState(Enum):
+    NO_FILE = auto()
     EMPTY   = auto()
     HEADER  = auto()
     KV_DATA = auto()
     TI_DATA = auto()
+    WEIGHTS = auto()
 
 
 class GGUFWriter:
-    fout: BufferedWriter
+    fout: BufferedWriter | None
+    path: os.PathLike[str] | str | None
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-    tensors: list[np.ndarray[Any, Any]]
+    tensors: dict[str, TensorInfo]
+    kv_data: dict[str, GGUFValue]
+    state: WriterState
 
     _simple_value_packing = {
         GGUFValueType.UINT8:  "B",
         GGUFValueType.INT8:   "b",
@@ -56,141 +76,140 @@ class GGUFWriter:
     }
 
     def __init__(
-        self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True,
+        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False,
         endianess: GGUFEndian = GGUFEndian.LITTLE,
     ):
-        self.fout = open(path, "wb")
+        self.fout = None
+        self.path = path
         self.arch = arch
         self.endianess = endianess
-        self.offset_tensor = 0
         self.data_alignment = GGUF_DEFAULT_ALIGNMENT
-        self.kv_data = bytearray()
-        self.kv_data_count = 0
-        self.ti_data = bytearray()
-        self.ti_data_count = 0
-        self.ti_names = set()
         self.use_temp_file = use_temp_file
         self.temp_file = None
-        self.tensors = []
+        self.tensors = dict()
+        self.kv_data = dict()
         logger.info("gguf: This GGUF file is for {0} Endian only".format(
             "Big" if self.endianess == GGUFEndian.BIG else "Little",
         ))
-        self.state = WriterState.EMPTY
+        self.state = WriterState.NO_FILE
 
         self.add_architecture()
 
-    def write_header_to_file(self) -> None:
+    def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None:
+        if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
+            # allow calling this multiple times as long as the path is the same
+            return
+        if self.state is not WriterState.NO_FILE:
+            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
+
+        if path is not None:
+            self.path = path
+
+        if self.path is not None:
+            if self.fout is not None:
+                self.fout.close()
+            self.fout = open(self.path, "wb")
+            self.state = WriterState.EMPTY
+
+    def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None:
+        self.open_output_file(path)
+
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
 
         self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
         self._write_packed("I", GGUF_VERSION)
-        self._write_packed("Q", self.ti_data_count)
-        self._write_packed("Q", self.kv_data_count)
+        self._write_packed("Q", len(self.tensors))
+        self._write_packed("Q", len(self.kv_data))
        self.flush()
         self.state = WriterState.HEADER
 
     def write_kv_data_to_file(self) -> None:
         if self.state is not WriterState.HEADER:
             raise ValueError(f'Expected output file to contain the header, got {self.state}')
+        assert self.fout is not None
 
-        self.fout.write(self.kv_data)
+        kv_data = bytearray()
+
+        for key, val in self.kv_data.items():
+            kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
+            kv_data += self._pack_val(val.value, val.type, add_vtype=True)
+
+        self.fout.write(kv_data)
         self.flush()
         self.state = WriterState.KV_DATA
 
     def write_ti_data_to_file(self) -> None:
         if self.state is not WriterState.KV_DATA:
             raise ValueError(f'Expected output file to contain KV data, got {self.state}')
+        assert self.fout is not None
 
-        self.fout.write(self.ti_data)
+        ti_data = bytearray()
+        offset_tensor = 0
+
+        for name, ti in self.tensors.items():
+            ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
+            n_dims = len(ti.shape)
+            ti_data += self._pack("I", n_dims)
+            for i in range(n_dims):
+                ti_data += self._pack("Q", ti.shape[n_dims - 1 - i])
+            ti_data += self._pack("I", ti.dtype)
+            ti_data += self._pack("Q", offset_tensor)
+            offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
+
+        self.fout.write(ti_data)
         self.flush()
         self.state = WriterState.TI_DATA
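The new write_ti_data_to_file computes tensor offsets on the fly instead of accumulating self.offset_tensor at add time. The alignment arithmetic can be sketched standalone; the round-up body of ggml_pad is an assumption here, since it lies outside this hunk:

# Sketch of the on-the-fly offset assignment above.
GGUF_DEFAULT_ALIGNMENT = 32  # assumed default, matching gguf's constant name

def ggml_pad(x: int, n: int) -> int:
    return ((x + n - 1) // n) * n  # assumed: round x up to a multiple of n

offsets = {}
offset_tensor = 0
for name, nbytes in [("token_embd.weight", 1000), ("blk.0.attn_q.weight", 50)]:
    offsets[name] = offset_tensor
    offset_tensor += ggml_pad(nbytes, GGUF_DEFAULT_ALIGNMENT)

assert offsets["blk.0.attn_q.weight"] == 1024  # 1000 rounded up to the next 32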
-    def add_key(self, key: str) -> None:
-        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
+    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
+        if key in self.kv_data:
+            raise ValueError(f'Duplicated key name {key!r}')
+
+        self.kv_data[key] = GGUFValue(value=val, type=vtype)
 
     def add_uint8(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT8)
+        self.add_key_value(key,val, GGUFValueType.UINT8)
 
     def add_int8(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT8)
+        self.add_key_value(key, val, GGUFValueType.INT8)
 
     def add_uint16(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT16)
+        self.add_key_value(key, val, GGUFValueType.UINT16)
 
     def add_int16(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT16)
+        self.add_key_value(key, val, GGUFValueType.INT16)
 
     def add_uint32(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT32)
+        self.add_key_value(key, val, GGUFValueType.UINT32)
 
     def add_int32(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT32)
+        self.add_key_value(key, val, GGUFValueType.INT32)
 
     def add_float32(self, key: str, val: float) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.FLOAT32)
+        self.add_key_value(key, val, GGUFValueType.FLOAT32)
 
     def add_uint64(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT64)
 
     def add_int64(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT64)
+        self.add_key_value(key, val, GGUFValueType.INT64)
 
     def add_float64(self, key: str, val: float) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.FLOAT64)
+        self.add_key_value(key, val, GGUFValueType.FLOAT64)
 
     def add_bool(self, key: str, val: bool) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.BOOL)
+        self.add_key_value(key, val, GGUFValueType.BOOL)
 
     def add_string(self, key: str, val: str) -> None:
         if not val:
             return
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.STRING)
+        self.add_key_value(key, val, GGUFValueType.STRING)
 
     def add_array(self, key: str, val: Sequence[Any]) -> None:
         if not isinstance(val, Sequence):
             raise ValueError("Value must be a sequence for array type")
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.ARRAY)
-
-    def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None:
-        if vtype is None:
-            vtype = GGUFValueType.get_type(val)
-
-        if add_vtype:
-            self.kv_data += self._pack("I", vtype)
-            self.kv_data_count += 1
-
-        pack_fmt = self._simple_value_packing.get(vtype)
-        if pack_fmt is not None:
-            self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
-        elif vtype == GGUFValueType.STRING:
-            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
-            self.kv_data += self._pack("Q", len(encoded_val))
-            self.kv_data += encoded_val
-        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
-            ltype = GGUFValueType.get_type(val[0])
-            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
-                raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += self._pack("I", ltype)
-            self.kv_data += self._pack("Q", len(val))
-            for item in val:
-                self.add_val(item, add_vtype=False)
-        else:
-            raise ValueError("Invalid GGUF metadata value type or value")
+        self.add_key_value(key, val, GGUFValueType.ARRAY)
 
     @staticmethod
     def ggml_pad(x: int, n: int) -> int:
@@ -200,16 +219,12 @@ class GGUFWriter:
         self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
         tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
     ) -> None:
-        if self.state is not WriterState.EMPTY:
-            raise ValueError(f'Expected output file to be empty, got {self.state}')
+        if self.state is not WriterState.NO_FILE:
+            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
 
-        if name in self.ti_names:
-            raise ValueError(f'Duplicated tensor name {name}')
-        self.ti_names.add(name)
+        if name in self.tensors:
+            raise ValueError(f'Duplicated tensor name {name!r}')
 
-        encoded_name = name.encode("utf-8")
-        self.ti_data += self._pack("Q", len(encoded_name))
-        self.ti_data += encoded_name
         if raw_dtype is None:
             if tensor_dtype == np.float16:
                 dtype = GGMLQuantizationType.F16
@@ -231,14 +246,8 @@ class GGUFWriter:
             dtype = raw_dtype
             if tensor_dtype == np.uint8:
                 tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
-        n_dims = len(tensor_shape)
-        self.ti_data += self._pack("I", n_dims)
-        for i in range(n_dims):
-            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
-        self.ti_data += self._pack("I", dtype)
-        self.ti_data += self._pack("Q", self.offset_tensor)
-        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
-        self.ti_data_count += 1
+
+        self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
 
     def add_tensor(
         self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
@@ -252,10 +261,10 @@ class GGUFWriter:
             self.temp_file = fp
 
         shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
-        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)
 
         if self.temp_file is None:
-            self.tensors.append(tensor)
+            self.tensors[name].tensor = tensor
             return
 
         tensor.tofile(self.temp_file)
@@ -267,8 +276,9 @@ class GGUFWriter:
             fp.write(bytes([0] * pad))
 
     def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
-        if self.state is not WriterState.TI_DATA:
-            raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
+        if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
+            raise ValueError(f'Expected output file to contain tensor info or weights, got {self.state}')
+        assert self.fout is not None
 
         if self.endianess == GGUFEndian.BIG:
             tensor.byteswap(inplace=True)
@@ -276,50 +286,51 @@ class GGUFWriter:
         tensor.tofile(self.fout)
         self.write_padding(self.fout, tensor.nbytes)
 
+        self.state = WriterState.WEIGHTS
+
     def write_tensors_to_file(self, *, progress: bool = False) -> None:
         self.write_ti_data_to_file()
+        assert self.fout is not None
 
         self.write_padding(self.fout, self.fout.tell())
 
         if self.temp_file is None:
-            self.tensors.reverse() # to pop from the "beginning" in constant time
+            bar = None
 
             if progress:
                 from tqdm import tqdm
 
-                total_bytes = sum(t.nbytes for t in self.tensors)
+                total_bytes = sum(t.nbytes for t in self.tensors.values())
 
                 bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
 
-                while True:
-                    try:
-                        tensor = self.tensors.pop()
-                    except IndexError:
-                        break
-                    tensor.tofile(self.fout)
-                    bar.update(tensor.nbytes)
-                    self.write_padding(self.fout, tensor.nbytes)
-                return
-            while True:
-                try:
-                    tensor = self.tensors.pop()
-                except IndexError:
-                    break
-                tensor.tofile(self.fout)
-                self.write_padding(self.fout, tensor.nbytes)
-            return
-
-        self.temp_file.seek(0)
-
-        shutil.copyfileobj(self.temp_file, self.fout)
-        self.flush()
-        self.temp_file.close()
+            # relying on the fact that Python dicts preserve insertion order (since 3.7)
+            for ti in self.tensors.values():
+                assert ti.tensor is not None  # can only iterate once over the tensors
+                assert ti.tensor.nbytes == ti.nbytes
+                ti.tensor.tofile(self.fout)
+                if bar is not None:
+                    bar.update(ti.nbytes)
+                self.write_padding(self.fout, ti.nbytes)
+                ti.tensor = None
+        else:
+            self.temp_file.seek(0)
+
+            shutil.copyfileobj(self.temp_file, self.fout)
+            self.flush()
+            self.temp_file.close()
+
+        self.state = WriterState.WEIGHTS
 
     def flush(self) -> None:
+        assert self.fout is not None
         self.fout.flush()
 
     def close(self) -> None:
+        if self.fout is not None:
             self.fout.close()
+            self.fout = None
 
     def add_architecture(self) -> None:
         self.add_string(Keys.General.ARCHITECTURE, self.arch)
@@ -449,7 +460,7 @@ class GGUFWriter:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
 
-    def add_rope_scaling_attn_factors(self, value: Sequence[float]) -> None:
+    def add_rope_scaling_attn_factors(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
 
     def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
@@ -571,5 +582,32 @@ class GGUFWriter:
         pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
         return struct.pack(f'{pack_prefix}{fmt}', value)
 
+    def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
+        kv_data = bytearray()
+
+        if add_vtype:
+            kv_data += self._pack("I", vtype)
+
+        pack_fmt = self._simple_value_packing.get(vtype)
+        if pack_fmt is not None:
+            kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
+        elif vtype == GGUFValueType.STRING:
+            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
+            kv_data += self._pack("Q", len(encoded_val))
+            kv_data += encoded_val
+        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
+            ltype = GGUFValueType.get_type(val[0])
+            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
+                raise ValueError("All items in a GGUF array should be of the same type")
+            kv_data += self._pack("I", ltype)
+            kv_data += self._pack("Q", len(val))
+            for item in val:
+                kv_data += self._pack_val(item, ltype, add_vtype=False)
+        else:
+            raise ValueError("Invalid GGUF metadata value type or value")
+
+        return kv_data
+
     def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
+        assert self.fout is not None
         self.fout.write(self._pack(fmt, value, skip_pack_prefix))
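Taken together, the refactor turns GGUFWriter into a buffered builder: keys and tensors accumulate in dicts, and nothing touches disk until a path is supplied. A minimal usage sketch of that flow (file name and KV key are illustrative):

# Deferred-open flow: no file exists until write_header_to_file() is called.
import numpy as np
from gguf import GGUFWriter  # the gguf-py package this diff modifies

writer = GGUFWriter(path=None, arch="llama")    # nothing opened yet
writer.add_uint32("example.block_count", 32)    # buffered in writer.kv_data
writer.add_tensor("token_embd.weight", np.zeros((4, 4), dtype=np.float32))

writer.write_header_to_file(path="model.gguf")  # file is created here
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)     # needs tqdm when progress=True
writer.close()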
@@ -102,6 +102,7 @@ class TensorNameMap:
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn",      # falcon40b
+            "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
         ),
 
         # Attention query-key-value
@@ -311,6 +312,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.c_proj",           # starcoder2
             "encoder.layer.{bid}.mlp.wo",              # jina-bert-v2
             "model.layers.{bid}.residual_mlp.w2",      # arctic
+            "encoder.layer.{bid}.mlp.down_layer",      # jina-bert-v2
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -350,6 +352,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm2",                 # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
             "encoder.layer.{bid}.mlp.layernorm",          # jina-bert-v2
+            "encoder.layer.{bid}.layer_norm_2"            # jina-v2-code
         ),
 
         MODEL_TENSOR.SSM_IN: (
@@ -101,8 +101,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
             logger.debug(f'Copying {field.name}')
 
             if val.value is not None:
-                writer.add_key(field.name)
-                writer.add_val(val.value, val.type)
+                writer.add_key_value(field.name, val.value, val.type)
 
     if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
         logger.debug('Adding chat template(s)')
@@ -111,8 +110,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
     for key, val in new_metadata.items():
         logger.debug(f'Adding {key}: "{val.value}" {val.description}')
-        writer.add_key(key)
-        writer.add_val(val.value, val.type)
+        writer.add_key_value(key, val.value, val.type)
 
     total_bytes = 0
@@ -3671,7 +3671,7 @@ Current version: 145
 const XTTS_ID = 1000;
 const ALLTALK_ID = 1001;
 const HD_RES_PX = 512;
-const NO_HD_RES_PX = 256;
+const NO_HD_RES_PX = 320;
 
 //all configurable globals
 var perfdata = null; //if it's null, we are not connected
@@ -12691,6 +12691,29 @@ Current version: 145
 //original source https://github.com/kdavis-mozilla/vad.js Copyright (c) 2015, Kelly Daviss
 let VAD=function(t){for(var e in this.options={fftSize:512,bufferLen:512,voice_stop:function(){},voice_start:function(){},smoothingTimeConstant:.99,energy_offset:1e-8,energy_threshold_ratio_pos:2,energy_threshold_ratio_neg:.5,energy_integration:1,filter:[{f:200,v:0},{f:2e3,v:1}],source:null,context:null},t)t.hasOwnProperty(e)&&(this.options[e]=t[e]);if(!this.options.source)throw Error("The options must specify a MediaStreamAudioSourceNode.");this.options.context=this.options.source.context,this.hertzPerBin=this.options.context.sampleRate/this.options.fftSize,this.iterationFrequency=this.options.context.sampleRate/this.options.bufferLen,this.iterationPeriod=1/this.iterationFrequency,this.setFilter=function(t){this.filter=[];for(var e=0,i=this.options.fftSize/2;e<i;e++){this.filter[e]=0;for(var s=0,o=t.length;s<o;s++)if(e*this.hertzPerBin<t[s].f){this.filter[e]=t[s].v;break}}},this.setFilter(this.options.filter),this.ready={},this.vadState=!1,this.energy_offset=this.options.energy_offset,this.energy_threshold_pos=this.energy_offset*this.options.energy_threshold_ratio_pos,this.energy_threshold_neg=this.energy_offset*this.options.energy_threshold_ratio_neg,this.voiceTrend=0,this.voiceTrendMax=10,this.voiceTrendMin=-10,this.voiceTrendStart=5,this.voiceTrendEnd=-5,this.analyser=this.options.context.createAnalyser(),this.analyser.smoothingTimeConstant=this.options.smoothingTimeConstant,this.analyser.fftSize=this.options.fftSize,this.floatFrequencyData=new Float32Array(this.analyser.frequencyBinCount),this.floatFrequencyDataLinear=new Float32Array(this.floatFrequencyData.length),this.options.source.connect(this.analyser),this.scriptProcessorNode=this.options.context.createScriptProcessor(this.options.bufferLen,1,1),this.scriptProcessorNode.connect(this.options.context.destination);var i=this;this.scriptProcessorNode.onaudioprocess=function(t){i.analyser.getFloatFrequencyData(i.floatFrequencyData),i.update(),i.monitor()},this.options.source.connect(this.scriptProcessorNode),this.update=function(){for(var t=this.floatFrequencyData,e=0,i=t.length;e<i;e++)this.floatFrequencyDataLinear[e]=Math.pow(10,t[e]/10);this.ready={}},this.getEnergy=function(){if(this.ready.energy)return this.energy;for(var t=0,e=this.floatFrequencyDataLinear,i=0,s=e.length;i<s;i++)t+=this.filter[i]*e[i]*e[i];return this.energy=t,this.ready.energy=!0,t},this.monitor=function(){var t=this.getEnergy()-this.energy_offset;t>this.energy_threshold_pos?this.voiceTrend=this.voiceTrend+1>this.voiceTrendMax?this.voiceTrendMax:this.voiceTrend+1:t<-this.energy_threshold_neg?this.voiceTrend=this.voiceTrend-1<this.voiceTrendMin?this.voiceTrendMin:this.voiceTrend-1:this.voiceTrend>0?this.voiceTrend--:this.voiceTrend<0&&this.voiceTrend++;var e=!1,i=!1;this.voiceTrend>this.voiceTrendStart?e=!0:this.voiceTrend<this.voiceTrendEnd&&(i=!0);var s=t*this.iterationPeriod*this.options.energy_integration;return s>0||!i?this.energy_offset+=s:this.energy_offset+=10*s,this.energy_offset=this.energy_offset<0?0:this.energy_offset,this.energy_threshold_pos=this.energy_offset*this.options.energy_threshold_ratio_pos,this.energy_threshold_neg=this.energy_offset*this.options.energy_threshold_ratio_neg,e&&!this.vadState&&(this.vadState=!0,this.options.voice_start()),i&&this.vadState&&(this.vadState=!1,this.options.voice_stop()),t}};
+//resample audio freq (to 16khz)
+function resampleAudioBuffer(buffer, resampledRate, callback) {
+    const originalSampleRate = buffer.sampleRate;
+    const channels = buffer.numberOfChannels;
+    const length = buffer.length;
+    const ratio = originalSampleRate / resampledRate;
+    const newLength = Math.round(length / ratio);
+    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+    const resampledBuffer = audioContext.createBuffer(channels, newLength, resampledRate);
+    const offlineContext = new OfflineAudioContext(channels, newLength, resampledRate);
+    const source = offlineContext.createBufferSource();
+    source.buffer = buffer;
+    source.connect(offlineContext.destination);
+    const startRendering = offlineContext.startRendering();
+    startRendering.then((renderedBuffer) => {
+        callback(renderedBuffer);
+    }).catch((error) => {
+        console.error('Resample Err:', error);
+    });
+    source.start();
+    return resampledBuffer;
+}
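The new helper delegates the actual conversion to an OfflineAudioContext rendered at the target rate; only the output length is computed by hand. For intuition, the same length math with a naive linear-interpolation resampler (illustration only, not the code above; the browser's resampler is higher quality):

# Naive linear resampler showing new_length = round(length / (orig/target)).
def resample(samples: list[float], original_rate: int, target_rate: int) -> list[float]:
    ratio = original_rate / target_rate
    new_length = round(len(samples) / ratio)
    out = []
    for i in range(new_length):
        pos = i * ratio                      # position in the source signal
        j = int(pos)
        frac = pos - j
        a = samples[min(j, len(samples) - 1)]
        b = samples[min(j + 1, len(samples) - 1)]
        out.append(a * (1.0 - frac) + b * frac)
    return out

assert len(resample([0.0] * 48000, 48000, 16000)) == 16000  # 48 kHz -> 16 kHz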
 let audioBufferToWavBlob = function (audioBuffer) {
     let writeWavString = function (view, offset, string) {
         for (let i = 0; i < string.length; i++) {
@@ -12738,7 +12761,7 @@ Current version: 145
     reader.onloadend = function() {
         let arrayBuffer = reader.result;
         window.AudioContext = window.AudioContext || window.webkitAudioContext;
-        let audioContext = new AudioContext({ sampleRate: 16000 });
+        let audioContext = new AudioContext();
         audioContext.decodeAudioData(arrayBuffer, (buffer)=>{
             onDone(buffer);
         },(err)=>
@@ -12778,20 +12801,24 @@ Current version: 145
     if(preaudioblobs.length<2)
     {
         audioBlobToDecodedAudioBuffer(completeRecording,(buffer)=>{
-            let wavblob = audioBufferToWavBlob(buffer);
+            resampleAudioBuffer(buffer,16000,(rsBuffer)=>{
+                let wavblob = audioBufferToWavBlob(rsBuffer);
                 audiodatareader.readAsDataURL(wavblob);
+            });
         });
     } else {
         audioBlobToDecodedAudioBuffer(completeRecording,(buffer)=>{
             audioBlobToDecodedAudioBuffer(preaudioblobs[0],(buffer2)=>{
                 audioBlobToDecodedAudioBuffer(preaudioblobs[1],(buffer3)=>{
                     let prefix = concatenateAudioBuffers(buffer2,buffer3);
                     let finalbuf = concatenateAudioBuffers(prefix,buffer);
-                    let wavblob = audioBufferToWavBlob(finalbuf);
+                    resampleAudioBuffer(finalbuf,16000,(rsBuffer)=>{
+                        let wavblob = audioBufferToWavBlob(rsBuffer);
                         audiodatareader.readAsDataURL(wavblob);
+                    });
                 });
             });
         });
     }
 
     audiodatareader.onloadend = function () {
@@ -12825,7 +12852,7 @@ Current version: 145
     voicerecorder = new MediaRecorder(stream);
     voicerecorder.addEventListener('dataavailable', onRecordingReady);
     window.AudioContext = window.AudioContext || window.webkitAudioContext;
-    let audioContext = new AudioContext({ sampleRate: 16000 });
+    let audioContext = new AudioContext();
     let source = audioContext.createMediaStreamSource(stream);
     let options = {
         source: source,
@@ -728,6 +728,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
         { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
         { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+        { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
         { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
         { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
         { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -4715,8 +4716,7 @@ static void llm_load_vocab(
             LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
             LLAMA_LOG_WARN("%s:                                             \n", __func__);
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (
-                tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3" ||
@@ -4743,7 +4743,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -5634,6 +5635,9 @@ static bool llm_load_tensors(
                 layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                 layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd});
+                layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                 layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
@@ -8597,6 +8601,11 @@ struct llm_build_context {
                 // attention layer norm
                 cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
+
+                if (model.layers[il].attn_norm_2 != nullptr) {
+                    cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                    cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+                }
+
                 struct ggml_tensor * ffn_inp = cur;
                 cb(ffn_inp, "ffn_inp", il);
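The jina-v2-code path above inserts an optional second attention norm that first re-adds the layer input. In outline (hypothetical helpers standing in for ggml_add and llm_build_norm, values illustrative):

# Shape of the optional second norm: residual add, then normalize.
def attn_block_output(cur, layer_input, attn_norm_2=None):
    if attn_norm_2 is not None:
        cur = [c + x for c, x in zip(cur, layer_input)]  # re-add the layer input
        cur = attn_norm_2(cur)                           # second layer norm
    return cur  # becomes ffn_inp

out = attn_block_output([1.0, 2.0], [0.5, 0.5], attn_norm_2=lambda v: [x / max(v) for x in v])
assert out == [0.6, 1.0]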
@@ -13940,7 +13949,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t                chr) {
     bool found            = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
@@ -13949,6 +13958,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13966,7 +13979,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8      partial_utf8) {
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
     uint32_t partial_value = partial_utf8.value;
@@ -13996,6 +14009,9 @@ static bool llama_grammar_match_partial_char(
                 return is_positive_char;
             }
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
         } else {
             // exact char match, e.g. [a] or "a"
             if (low <= pos->value && pos->value <= high) {
@@ -14056,6 +14072,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
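The new LLAMA_GRETYPE_CHAR_ANY element lets "." in a GBNF grammar match any character; it counts as a positive element and consumes a single element (pos += 1) rather than a range pair. A simplified sketch of the matcher (code points only; the real code also walks CHAR_ALT chains and handles partial UTF-8):

# Simplified llama_grammar_match_char over (kind, value) elements.
CHAR, CHAR_RNG_UPPER, CHAR_ANY = "char", "rng_upper", "any"

def match_char(elements, chr_code: int) -> bool:
    found = False
    for i, (kind, value) in enumerate(elements):
        if kind == CHAR_RNG_UPPER:      # pairs with the preceding CHAR: [a-z]
            found = found or (elements[i - 1][1] <= chr_code <= value)
        elif kind == CHAR_ANY:          # "." matches anything
            found = True
        elif kind == CHAR:              # exact char, e.g. "a"
            found = found or chr_code == value
    return found

assert match_char([(CHAR_ANY, 0)], ord("x"))
assert match_char([(CHAR, ord("a")), (CHAR_RNG_UPPER, ord("z"))], ord("m"))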
@@ -15543,6 +15560,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
             qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
         }
     }
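The quantizer now rejects importance matrices containing NaN or infinity up front, instead of letting them poison the error-weighted quantization later. The same check in standalone form (tensor name and weights are illustrative):

import math

def validate_imatrix(imatrix_data: dict[str, list[float]]) -> None:
    for name, weights in imatrix_data.items():
        for f in weights:
            if not math.isfinite(f):  # rejects nan, +inf and -inf alike
                raise RuntimeError(f"imatrix contains non-finite value {f} in {name!r}")

validate_imatrix({"blk.0.attn_q.weight": [0.5, 1.25, 3.0]})  # passes silently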
@@ -365,6 +365,9 @@ extern "C" {
         // modifies a preceding LLAMA_GRETYPE_CHAR or
         // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         LLAMA_GRETYPE_CHAR_ALT       = 6,
+
+        // any character (.)
+        LLAMA_GRETYPE_CHAR_ANY       = 7,
     };
 
     typedef struct llama_grammar_element {