Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/full.Dockerfile
#	.devops/main-cuda.Dockerfile
#	.devops/main-rocm.Dockerfile
#	.devops/main-vulkan.Dockerfile
#	.devops/main.Dockerfile
#	.devops/server-cuda.Dockerfile
#	.devops/server.Dockerfile
#	README.md
#	common/CMakeLists.txt
#	grammars/README.md
#	tests/test-grammar-integration.cpp
#	tests/test-grammar-parser.cpp
#	tests/test-json-schema-to-grammar.cpp
Concedo committed 2024-06-09 17:30:05 +08:00
commit 562d980140
25 changed files with 881 additions and 676 deletions

common/common.cpp · View file

@@ -201,19 +201,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
         }
         params.hf_file = params.model;
     } else if (params.model.empty()) {
-        std::string cache_directory = fs_get_cache_directory();
-        const bool success = fs_create_directory_with_parents(cache_directory);
-        if (!success) {
-            throw std::runtime_error("failed to create cache directory: " + cache_directory);
-        }
-        params.model = cache_directory + string_split(params.hf_file, '/').back();
+        params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
     }
 } else if (!params.model_url.empty()) {
     if (params.model.empty()) {
         auto f = string_split(params.model_url, '#').front();
         f = string_split(f, '?').front();
-        f = string_split(f, '/').back();
-        params.model = "models/" + f;
+        params.model = fs_get_cache_file(string_split(f, '/').back());
     }
 } else if (params.model.empty()) {
     params.model = DEFAULT_MODEL_PATH;
@@ -274,6 +268,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
+        params = params_org;
         return false;
     }
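The added `params = params_org;` line restores the caller's struct after a failed parse, so a bad argument cannot leave `params` half-mutated. A minimal sketch of the pattern (names here are illustrative, not from the diff):

```cpp
#include <stdexcept>
#include <string>

struct options { int verbosity = 0; };

// Sketch of the snapshot/rollback pattern: copy the struct before mutating it
// in place, and restore the copy when parsing throws, so a caller never
// observes partially-applied options after a failed parse.
static bool parse(options & opts, const std::string & arg) {
    const options opts_org = opts;       // snapshot, like params_org above
    try {
        opts.verbosity = std::stoi(arg); // mutates in place; throws on junk
    } catch (const std::invalid_argument &) {
        opts = opts_org;                 // roll back on error
        return false;
    }
    return true;
}
```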
@@ -409,6 +404,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--in-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        std::ifstream file(argv[i]);
+        if (!file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+            invalid_param = true;
+            return true;
+        }
+        params.in_files.push_back(argv[i]);
+        return true;
+    }
     if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1082,7 +1091,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     if (arg == "-v" || arg == "--verbose") {
-        params.verbose = true;
+        params.verbosity = 1;
+        return true;
+    }
+    if (arg == "--verbosity") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.verbosity = std::stoi(argv[i]);
         return true;
     }
     if (arg == "--verbose-prompt") {
@@ -1392,6 +1409,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.timeout_write = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--threads-http") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_threads_http = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "-spf" || arg == "--system-prompt-file") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1461,6 +1486,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chat_template = argv[i];
         return true;
     }
+    if (arg == "--slot-prompt-similarity" || arg == "-sps") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.slot_prompt_similarity = std::stof(argv[i]);
+        return true;
+    }
     if (arg == "-pps") {
         params.is_pp_shared = true;
         return true;
@@ -1538,6 +1571,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.i_pos = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "-o" || arg == "--output" || arg == "--output-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.out_file = argv[i];
+        return true;
+    }
+    if (arg == "-ofreq" || arg == "--output-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_out_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--save-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_save_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--process-output") {
+        params.process_output = true;
+        return true;
+    }
+    if (arg == "--no-ppl") {
+        params.compute_ppl = false;
+        return true;
+    }
+    if (arg == "--chunk" || arg == "--from-chunk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_chunk = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
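All six new imatrix flags follow the parser's recurring consume-next-argv pattern; a minimal free-standing sketch (the helper name is hypothetical):

```cpp
#include <string>

// Hypothetical helper illustrating the pattern used by every value-taking
// option above: advance i to the option's value, flag invalid_param when the
// value is missing, and always return true so the caller knows the arg was
// recognized. std::stoi throws std::invalid_argument on junk input, which
// gpt_params_parse catches, reports, and answers with the params_org rollback.
static bool parse_int_arg(int argc, char ** argv, int & i, int & out, bool & invalid_param) {
    if (++i >= argc) {
        invalid_param = true;
        return true;
    }
    out = std::stoi(argv[i]);
    return true;
}
```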
@@ -1613,6 +1686,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
     options.push_back({ "*", " --version", "show version and build info" });
     options.push_back({ "*", "-v, --verbose", "print verbose information" });
+    options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
     options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
     options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
@@ -1638,6 +1712,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
     options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
     options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+    options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
     options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
     options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
     options.push_back({ "*", " --no-escape", "do not process escape sequences" });
@@ -1805,6 +1880,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
     options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });

+    options.push_back({ "imatrix" });
+    options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
+    options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
+    options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
+    options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
+    options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
+
     options.push_back({ "bench" });
     options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
     options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
@@ -1821,6 +1904,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
     options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
     options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
+    options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
     options.push_back({ "server", " --system-prompt-file FNAME",
                        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
     options.push_back({ "server", " --log-format {text,json}",
@@ -1832,6 +1916,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                        "set custom jinja chat template (default: template taken from model's metadata)\n"
                        "only commonly used templates are accepted:\n"
                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+    options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
+                       "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });

 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2188,6 +2274,16 @@ std::string fs_get_cache_directory() {
     return ensure_trailing_slash(cache_directory);
 }

+std::string fs_get_cache_file(const std::string & filename) {
+    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
+    std::string cache_directory = fs_get_cache_directory();
+    const bool success = fs_create_directory_with_parents(cache_directory);
+    if (!success) {
+        throw std::runtime_error("failed to create cache directory: " + cache_directory);
+    }
+    return cache_directory + filename;
+}
+
 //
 // Model utils
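A hedged sketch of what the new helper yields; the cache root comes from `fs_get_cache_directory()`, and the exact path shown is an assumption that varies by platform and environment:

```cpp
#include <string>

// Sketch only (paths are illustrative): the helper joins a bare filename onto
// the per-user cache directory, creating that directory on first use; a name
// containing a path separator would trip the GGML_ASSERT guard above.
std::string cached_model_path() {
    return fs_get_cache_file("some-model.gguf");
    // e.g. "~/.cache/llama.cpp/some-model.gguf" on Linux (assumed layout)
}
```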

common/common.h · View file

@@ -67,7 +67,6 @@ struct gpt_params {
     float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
@@ -82,13 +81,13 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers = ""; // comma separated list of RPC servers

     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;

     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
@@ -131,7 +130,9 @@ struct gpt_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
     std::string logits_file = ""; // file for saving *all* logits
+    std::string rpc_servers = ""; // comma separated list of RPC servers

+    std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -141,6 +142,7 @@ struct gpt_params {
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

+    int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
@@ -152,7 +154,7 @@ struct gpt_params {
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

     bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
@@ -180,7 +182,6 @@ struct gpt_params {
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
-    bool verbose = false;
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool infill = false; // use infill mode
@@ -197,10 +198,10 @@ struct gpt_params {
     std::vector<std::string> image; // path to image file(s)

     // server params
-    int32_t port = 8080;
-    int32_t timeout_read = 600;
-    int32_t timeout_write = timeout_read;
-    int32_t n_threads_http = -1;
+    int32_t port = 8080; // server listens on this network port
+    int32_t timeout_read = 600; // http read timeout in seconds
+    int32_t timeout_write = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests

     std::string hostname = "127.0.0.1";
     std::string public_path = "";
@@ -219,6 +220,8 @@ struct gpt_params {
     std::string slot_save_path;

+    float slot_prompt_similarity = 0.5f;
+
     // batched-bench params
     bool is_pp_shared = false;
@@ -236,6 +239,16 @@ struct gpt_params {
     // passkey params
     int32_t n_junk = 250; // number of times to repeat the junk text
     int32_t i_pos = -1; // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk = 0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl = true; // whether to compute perplexity
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -281,6 +294,7 @@ bool fs_validate_filename(const std::string & filename);
 bool fs_create_directory_with_parents(const std::string & path);

 std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);

 //
 // Model utils

common/grammar-parser.cpp · View file

@@ -46,8 +46,12 @@ namespace grammar_parser {
         state.rules[rule_id] = rule;
     }

+    static bool is_digit_char(char c) {
+        return '0' <= c && c <= '9';
+    }
+
     static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
     }

     static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {
         return pos;
     }

+    static const char * parse_int(const char * src) {
+        const char * pos = src;
+        while (is_digit_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) {
+            throw std::runtime_error(std::string("expecting integer at ") + src);
+        }
+        return pos;
+    }
+
     static std::pair<uint32_t, const char *> parse_char(const char * src) {
         if (*src == '\\') {
             switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {
             bool is_nested) {
         size_t last_sym_start = out_elements.size();
         const char * pos = src;
+
+        auto handle_repetitions = [&](int min_times, int max_times) {
+
+            if (last_sym_start == out_elements.size()) {
+                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
+            }
+
+            // apply transformation to previous symbol (last_sym_start to end) according to
+            // the following rewrite rules:
+            // S{m,n} --> S S S (m times) S'(n-m)
+            //            S'(x)   ::= S S'(x-1) |
+            //            (... n-m definitions of these S' rules ...)
+            //            S'(1)   ::= S |
+            // S{m,}  --> S S S (m times) S'
+            //            S'      ::= S S' |
+            // S*     --> S{0,}
+            //        --> S'      ::= S S' |
+            // S+     --> S{1,}
+            //        --> S S'
+            //            S'      ::= S S' |
+            // S?     --> S{0,1}
+            //        --> S'
+            //            S'      ::= S |
+
+            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
+            if (min_times == 0) {
+                out_elements.resize(last_sym_start);
+            } else {
+                // Repeat the previous elements (min_times - 1) times
+                for (int i = 1; i < min_times; i++) {
+                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
+                }
+            }
+
+            uint32_t last_rec_rule_id = 0;
+            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+
+            std::vector<llama_grammar_element> rec_rule(previous_elements);
+            for (int i = 0; i < n_opt; i++) {
+                rec_rule.resize(previous_elements.size());
+                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
+                if (i > 0 || max_times < 0) {
+                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+                }
+                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, rec_rule_id, rec_rule);
+                last_rec_rule_id = rec_rule_id;
+            }
+            if (n_opt > 0) {
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
+            }
+        };
+
         while (*pos) {
             if (*pos == '"') { // literal string
                 pos++;
@@ -197,40 +266,51 @@ namespace grammar_parser {
                     throw std::runtime_error(std::string("expecting ')' at ") + pos);
                 }
                 pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
-                }
-
-                // apply transformation to previous symbol (last_sym_start to end) according to
-                // rewrite rules:
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<llama_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                }
-                // mark start of alternate def
-                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                if (*pos == '+') {
-                    // add preceding symbol as alternate only for '+' (otherwise empty)
-                    sub_rule.insert(
-                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
-                // in original rule, replace previous symbol with reference to generated rule
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+            } else if (*pos == '.') { // any char
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
                 pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, -1);
+            } else if (*pos == '+') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(1, -1);
+            } else if (*pos == '?') {
+                pos = parse_space(pos + 1, is_nested);
+                handle_repetitions(0, 1);
+            } else if (*pos == '{') {
+                pos = parse_space(pos + 1, is_nested);
+
+                if (!is_digit_char(*pos)) {
+                    throw std::runtime_error(std::string("expecting an int at ") + pos);
+                }
+                const char * int_end = parse_int(pos);
+                int min_times = std::stoul(std::string(pos, int_end - pos));
+                pos = parse_space(int_end, is_nested);
+
+                int max_times = -1;
+
+                if (*pos == '}') {
+                    max_times = min_times;
+                    pos = parse_space(pos + 1, is_nested);
+                } else if (*pos == ',') {
+                    pos = parse_space(pos + 1, is_nested);
+
+                    if (is_digit_char(*pos)) {
+                        const char * int_end = parse_int(pos);
+                        max_times = std::stoul(std::string(pos, int_end - pos));
+                        pos = parse_space(int_end, is_nested);
+                    }
+
+                    if (*pos != '}') {
+                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
+                    }
+                    pos = parse_space(pos + 1, is_nested);
+                } else {
+                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                }
+                handle_repetitions(min_times, max_times);
             } else {
                 break;
             }
@@ -325,6 +405,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT: return true;
             case LLAMA_GRETYPE_CHAR_ALT: return true;
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            case LLAMA_GRETYPE_CHAR_ANY: return true;
             default: return false;
         }
     }
@@ -339,6 +420,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
             case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
+            case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
         }
         switch (elem.type) {
             case LLAMA_GRETYPE_END:
@@ -350,6 +432,7 @@ namespace grammar_parser {
             case LLAMA_GRETYPE_CHAR_NOT:
             case LLAMA_GRETYPE_CHAR_RNG_UPPER:
             case LLAMA_GRETYPE_CHAR_ALT:
+            case LLAMA_GRETYPE_CHAR_ANY:
                 fprintf(file, "(\"");
                 print_grammar_char(file, elem.value);
                 fprintf(file, "\") ");
@@ -407,11 +490,15 @@ namespace grammar_parser {
                     }
                     print_grammar_char(file, elem.value);
                     break;
+                case LLAMA_GRETYPE_CHAR_ANY:
+                    fprintf(file, ".");
+                    break;
             }
             if (is_char_element(elem)) {
                 switch (rule[i + 1].type) {
                     case LLAMA_GRETYPE_CHAR_ALT:
                     case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                    case LLAMA_GRETYPE_CHAR_ANY:
                         break;
                     default:
                         fprintf(file, "] ");
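To make the new `{m,n}` and `.` operators concrete, a small hedged example driving this parser (entry points as declared in `grammar-parser.h`):

```cpp
#include "grammar-parser.h"
#include <cstdio>

int main() {
    // '{m,n}' bounded repetition and '.' (any character) are handled by the
    // code added above; previously only *, + and ? were accepted.
    const char * gbnf =
        "root   ::= digits \"-\" tail\n"
        "digits ::= [0-9]{2,4}\n" // 2 to 4 digits, expanded via handle_repetitions
        "tail   ::= .{1,8}\n";    // 1 to 8 arbitrary characters (CHAR_ANY)
    grammar_parser::parse_state state = grammar_parser::parse(gbnf);
    grammar_parser::print_grammar(stdout, state); // CHAR_ANY prints as '.'
    return 0;
}
```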

common/json-schema-to-grammar.cpp · View file

@@ -16,58 +16,27 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
 static std::string repeat(const std::string & str, size_t n);

-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
-    if (separator_rule.empty()) {
-        if (min_items == 0 && max_items == 1) {
-            return item_rule + "?";
-        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
-            return item_rule + "+";
-        }
-    }
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
+    auto has_max = max_items != std::numeric_limits<int>::max();
+
+    if (min_items == 0 && max_items == 1) {
+        return item_rule + "?";
+    }
+
+    if (separator_rule.empty()) {
+        if (min_items == 1 && !has_max) {
+            return item_rule + "+";
+        } else if (min_items == 0 && !has_max) {
+            return item_rule + "*";
+        } else {
+            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
+        }
+    }

-    std::string result;
-    if (min_items > 0) {
-        if (item_rule_is_literal && separator_rule.empty()) {
-            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
-        } else {
-            std::vector<std::string> items(min_items, item_rule);
-            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
-        }
-    }
-
-    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
-        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
-
-        if (up_to_n == 0) {
-            return "";
-        } else if (up_to_n == 1) {
-            return "(" + content + ")?";
-        } else if (!separator_rule.empty() && !prefix_with_sep) {
-            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
-        } else {
-            std::string res = repeat("(" + content + " ", up_to_n);
-            // strip trailing space
-            res = res.substr(0, res.length() - 1);
-            res += repeat(")?", up_to_n);
-            return res;
-        }
-    };
-
-    if (min_items > 0 && max_items != min_items) {
-        result += " ";
-    }
-
-    if (max_items != std::numeric_limits<int>::max()) {
-        result += opt_repetitions(max_items - min_items, min_items > 0);
-    } else {
-        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
-        if (min_items == 0 && !separator_rule.empty()) {
-            result = "(" + item_rule + " " + item_operator + "*)?";
-        } else {
-            result += item_operator + "*";
-        }
-    }
-
+    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
+    if (min_items == 0) {
+        result = "(" + result + ")?";
+    }
     return result;
 }
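For reference, the outputs the rewritten helper should now produce, traced from the code above (a sketch, not part of the diff):

```cpp
#include <climits> // INT_MAX stands in for std::numeric_limits<int>::max()

// Traced outputs of the rewritten helper; comments show the grammar
// fragments the new code returns, leaning on the grammar-level {m,n}
// operator added in grammar-parser.cpp instead of unrolled alternatives.
static void build_repetition_examples() {
    auto a = build_repetition("[0-9]", 0, 1);       // "[0-9]?"
    auto b = build_repetition("[0-9]", 1, INT_MAX); // "[0-9]+"
    auto c = build_repetition("[0-9]", 0, INT_MAX); // "[0-9]*"
    auto d = build_repetition("[0-9]", 3, 5);       // "[0-9]{3,5}"
    auto e = build_repetition("item", 2, 4, "\",\" space");
    // "item (\",\" space item){1,3}" -- the separator case recurses once
    (void)a; (void)b; (void)c; (void)d; (void)e;
}
```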
@@ -78,30 +47,24 @@ struct BuiltinRule {
     std::vector<std::string> deps;
 };

-const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
-
 std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
     {"boolean", {"(\"true\" | \"false\") space", {}}},
-    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
-    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+    {"decimal-part", {"[0-9]{1,16}", {}}},
+    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
     {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
     {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
     {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
     {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
     {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-              "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
-    {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
+    {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
     {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
     {"null", {"\"null\" space", {}}},
 };

 std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
-    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
-    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
     {"date-time", {"date \"T\" time", {"date", "time"}}},
     {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
     {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -385,8 +348,7 @@ private:
                 sub_is_literal ? "\"" + sub + "\"" : sub,
                 min_times,
                 max_times,
-                "",
-                sub_is_literal
+                ""
             );
             seq.back().second = false;
         } else {

convert-hf-to-gguf-update.py · View file

@@ -83,6 +83,7 @@ models = [
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
     {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
 ]

convert-hf-to-gguf.py · View file

@@ -47,11 +47,12 @@ class Model:
     _model_classes: dict[str, type[Model]] = {}

     dir_model: Path
-    ftype: int
+    ftype: gguf.LlamaFileType
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
     lazy: bool
+    model_name: str | None
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
@@ -64,7 +65,7 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
@@ -73,10 +74,11 @@ class Model:
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
         self.lazy = not eager
-        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.model_name = model_name
+        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
         self.is_safetensors = len(self.part_names) > 0
         if not self.is_safetensors:
-            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
+            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -94,7 +96,7 @@ class Model:
             ftype_lw: str = ftype_up.lower()
             # allow templating the file name with the output ftype, useful with the "auto" ftype
             self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)

     @classmethod
     def __init_subclass__(cls):
@@ -182,7 +184,7 @@ class Model:
         return new_name

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_block_count(self.block_count)

         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -324,21 +326,21 @@ class Model:
     def write(self):
         self.write_tensors()
-        self.gguf_writer.write_header_to_file()
+        self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.write_tensors_to_file(progress=True)
         self.gguf_writer.close()

     def write_vocab(self):
-        self.gguf_writer.write_header_to_file()
+        self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()

     @staticmethod
-    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
         part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(suffix):
+            if filename.startswith(prefix) and filename.endswith(suffix):
                 part_names.append(filename)

         part_names.sort()
@@ -475,6 +477,9 @@ class Model:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"

         if res is None:
             logger.warning("\n")
@@ -662,7 +667,7 @@ class GPTNeoXModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -795,7 +800,7 @@ class MPTModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_block_count(block_count)
@@ -847,7 +852,7 @@ class OrionModel(Model):
             raise ValueError("gguf: can not find ctx length parameter.")

         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -884,7 +889,7 @@ class BaichuanModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -1007,7 +1012,7 @@ class XverseModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
@@ -1203,7 +1208,7 @@ class StableLMModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1678,7 +1683,7 @@ class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
         self.gguf_writer.add_context_length(self.hparams["n_ctx"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -2245,7 +2250,7 @@ class GemmaModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -2345,7 +2350,7 @@ class MambaModel(Model):
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@@ -2452,11 +2457,13 @@ class JinaBertV2Model(BertModel):
     def get_tensors(self):
         for name, data in super().get_tensors():
-            if 'gated_layers' in name:
+            if 'gated_layer' in name:
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
                 yield name1, d1
                 yield name2, d2
                 continue
@@ -2847,7 +2854,7 @@ def main() -> None:
         logger.error(f"Model {hparams['architectures'][0]} is not supported")
         sys.exit(1)

-    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)

     logger.info("Set model parameters")
     model_instance.set_gguf_parameters()

examples/gguf-split/gguf-split.cpp · View file

@@ -62,10 +62,10 @@ static size_t split_str_to_n_bytes(std::string str) {
     int n;
     if (str.back() == 'M') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024; // megabytes
+        n_bytes = (size_t)n * 1000 * 1000; // megabytes
     } else if (str.back() == 'G') {
         sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
+        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
     } else {
         throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
     }
@@ -285,7 +285,7 @@ struct split_strategy {
         struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
         total_size += ggml_nbytes(t);
     }
-    total_size = total_size / 1024 / 1024; // convert to megabytes
+    total_size = total_size / 1000 / 1000; // convert to megabytes
     printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
     i_split++;
 }
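Note the semantic change in this file: split sizes switch from binary (1024-based) to decimal (SI, 1000-based) units. A hedged sketch of the effect:

```cpp
// Sketch: under the new code, "M" and "G" are decimal megabytes/gigabytes.
size_t n = split_str_to_n_bytes("500M"); // 500 * 1000 * 1000 = 500000000 bytes
// The same input used to mean 500 * 1024 * 1024 = 524288000 bytes, so a given
// size limit now produces slightly smaller shards than before.
```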

examples/imatrix/README.md · View file

@@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 ## Usage

 ```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
-          [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./imatrix \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
 ```

 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.

 For faster computation, make sure to use GPU offloading via the `-ngl` argument

examples/imatrix/imatrix.cpp · View file

@@ -18,39 +18,37 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+    LOG_TEE("\n");
+}
+
 struct Stats {
     std::vector<float> values;
     std::vector<int> counts;
     int ncall = 0;
 };

-struct StatParams {
-    std::string dataset;
-    std::string ofile = "imatrix.dat";
-    int n_output_frequency = 10;
-    int verbosity = 1;
-    int keep_every = 0;
-    bool collect_output_weight = false;
-};
-
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
+    void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
-    bool load_imatrix(const char * file_name, bool add);
-    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+    void save_imatrix(int ncall = -1) const;
+    bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-    StatParams m_params;
+    gpt_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
     std::vector<float> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
-    //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
 };

 // remove any prefix and suffixes from the name
@ -86,7 +84,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (t->op != GGML_OP_MUL_MAT) return false; if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)? // why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
return true; return true;
} }
@ -154,21 +152,25 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j]; e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++; e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
} }
} }
} }
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {
m_last_call = e.ncall; m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
keep_imatrix(m_last_call); save_imatrix(m_last_call);
} }
} }
} }
} else { } else {
auto& e = m_stats[wname]; auto & e = m_stats[wname];
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(src1->ne[0], 0); e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0); e.counts.resize(src1->ne[0], 0);
@ -186,15 +188,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j]; e.values[j] += x[j]*x[j];
e.counts[j]++; e.counts[j]++;
if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
} }
} }
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {
m_last_call = e.ncall; m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
keep_imatrix(m_last_call); save_imatrix(m_last_call);
} }
} }
} }
@ -202,19 +208,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
return true; return true;
} }
void IMatrixCollector::save_imatrix() const { void IMatrixCollector::save_imatrix(int ncall) const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); auto fname = m_params.out_file;
} if (fname.empty()) {
fname = "imatrix.dat";
}
void IMatrixCollector::keep_imatrix(int ncall) const { if (ncall > 0) {
auto file_name = m_params.ofile; fname += ".at_";
if (file_name.empty()) file_name = "imatrix.dat"; fname += std::to_string(ncall);
file_name += ".at_"; }
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str(), m_params.dataset.c_str());
}
void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size(); int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries)); out.write((const char *) &n_entries, sizeof(n_entries));
@ -237,26 +241,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
// Write the number of calls the matrix was computed with // Write the number of calls the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call)); out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the dataset name at the end of the file so it can later be specified in quantize // Write the input filename at the end of the file so it can later be specified in quantize
int n_dataset = strlen(dataset); {
out.write((const char *) &n_dataset, sizeof(n_dataset)); int len = m_params.prompt_file.size();
out.write(dataset, n_dataset); out.write((const char *) &len, sizeof(len));
out.write(m_params.prompt_file.c_str(), len);
}
if (m_params.verbosity > 0) { if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
} }
} }
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) { bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(imatrix_file, std::ios::binary); std::ifstream in(fname, std::ios::binary);
if (!in) { if (!in) {
printf("%s: failed to open %s\n",__func__,imatrix_file); printf("%s: failed to open %s\n",__func__, fname);
return false; return false;
} }
int n_entries; int n_entries;
in.read((char*)&n_entries, sizeof(n_entries)); in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) { if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file); printf("%s: no data in file %s\n", __func__, fname);
return false; return false;
} }
for (int i = 0; i < n_entries; ++i) { for (int i = 0; i < n_entries; ++i) {
@ -264,23 +270,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
std::vector<char> name_as_vec(len+1); std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len); in.read((char *)name_as_vec.data(), len);
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false; return false;
} }
name_as_vec[len] = 0; name_as_vec[len] = 0;
std::string name{name_as_vec.data()}; std::string name{name_as_vec.data()};
auto& e = imatrix_data[std::move(name)]; auto & e = m_stats[std::move(name)];
int ncall; int ncall;
in.read((char*)&ncall, sizeof(ncall)); in.read((char*)&ncall, sizeof(ncall));
int nval; int nval;
in.read((char *)&nval, sizeof(nval)); in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) { if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i); printf("%s: failed reading number of values for entry %d\n",__func__,i);
imatrix_data = {}; m_stats = {};
return false; return false;
} }
// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(nval, 0); e.values.resize(nval, 0);
e.counts.resize(nval, 0); e.counts.resize(nval, 0);
@ -290,7 +295,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
in.read((char*)tmp.data(), nval*sizeof(float)); in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i); printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {}; m_stats = {};
return false; return false;
} }
@ -305,13 +310,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
return true; return true;
} }
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
if (!add) {
m_stats.clear();
}
return load_imatrix(file_name, m_stats);
}
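To make the on-disk layout implied by save_imatrix()/load_imatrix() above concrete, here is a minimal Python sketch of a reader. It assumes a little-endian platform with 4-byte `int`, and that the per-entry layout matches what load_imatrix() reads (name length, name bytes, ncall, nval, then nval float32 values), followed by the trailing chunk counter and input file name written by save_imatrix(). It is an illustration, not a supported tool.

    import struct

    def read_imatrix(path):
        # layout inferred from load_imatrix()/save_imatrix() above:
        # little-endian, 4-byte ints, float32 values
        with open(path, "rb") as f:
            (n_entries,) = struct.unpack("<i", f.read(4))
            entries = {}
            for _ in range(n_entries):
                (name_len,) = struct.unpack("<i", f.read(4))
                name = f.read(name_len).decode("utf-8")
                ncall, nval = struct.unpack("<2i", f.read(8))
                values = struct.unpack(f"<{nval}f", f.read(4 * nval))
                entries[name] = (ncall, values)
            (last_call,) = struct.unpack("<i", f.read(4))  # number of chunks
            (len_ds,) = struct.unpack("<i", f.read(4))
            dataset = f.read(len_ds).decode("utf-8")       # input file name
        return entries, last_call, dataset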
static IMatrixCollector g_collector; static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -325,7 +323,7 @@ struct results_log_softmax {
float prob; float prob;
}; };
static std::vector<float> softmax(const std::vector<float>& logits) { static std::vector<float> softmax(const std::vector<float> & logits) {
std::vector<float> probs(logits.size()); std::vector<float> probs(logits.size());
float max_logit = logits[0]; float max_logit = logits[0];
for (float v : logits) { for (float v : logits) {
@ -359,8 +357,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
static void process_logits( static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history double & nll, double & nll2, float * logit_history, float * prob_history) {
) {
std::mutex mutex; std::mutex mutex;
int counter = 0; int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@ -392,8 +389,7 @@ static void process_logits(
} }
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -406,13 +402,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim2 = std::chrono::high_resolution_clock::now(); auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (from_chunk > 0) { if (params.i_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false; return false;
} }
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
} }
if (int(tokens.size()) < 2*n_ctx) { if (int(tokens.size()) < 2*n_ctx) {
@ -425,7 +421,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<float> logit_history; std::vector<float> logit_history;
std::vector<float> prob_history; std::vector<float> prob_history;
if (compute_ppl) { if (params.compute_ppl) {
logit_history.resize(tokens.size()); logit_history.resize(tokens.size());
prob_history.resize(tokens.size()); prob_history.resize(tokens.size());
} }
@ -447,7 +443,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
const int num_batches = (n_ctx + n_batch - 1) / n_batch; const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits; std::vector<float> logits;
if (compute_ppl && num_batches > 1) { if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab); logits.reserve((size_t)n_ctx * n_vocab);
} }
@ -483,7 +479,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
// restore the original token in case it was set to BOS // restore the original token in case it was set to BOS
tokens[batch_start] = token_org; tokens[batch_start] = token_org;
if (compute_ppl && num_batches > 1) { if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
} }
@ -502,7 +498,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
if (compute_ppl) { if (params.compute_ppl) {
const int first = n_ctx/2; const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@ -517,7 +513,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
} }
printf("\n"); printf("\n");
if (compute_ppl) { if (params.compute_ppl) {
nll2 /= count; nll2 /= count;
nll /= count; nll /= count;
const double ppl = exp(nll); const double ppl = exp(nll);
@ -534,109 +530,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
StatParams sparams;
std::string prev_result_file;
std::string combine_files;
bool compute_ppl = true;
int from_chunk = 0;
std::vector<char*> args;
args.push_back(argv[0]);
int iarg = 1;
for (; iarg < argc-1; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-o" || arg == "--output-file") {
sparams.ofile = argv[++iarg];
}
else if (arg == "-ofreq" || arg == "--output-frequency") {
sparams.n_output_frequency = std::stoi(argv[++iarg]);
}
else if (arg == "-ow" || arg == "--output-weight") {
sparams.collect_output_weight = std::stoi(argv[++iarg]);
}
else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else if (arg == "--continue-from") {
prev_result_file = argv[++iarg];
} else if (arg == "--combine") {
combine_files = argv[++iarg];
}
else if (arg == "--from-chunk") {
from_chunk = std::stoi(argv[++iarg]);
} else {
args.push_back(argv[iarg]);
}
}
if (iarg < argc) {
std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]);
}
}
gpt_params params; gpt_params params;
params.n_batch = 512;
params.n_ctx = 512;
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params); print_usage(argc, argv, params);
return 1; return 1;
} }
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info(); g_collector.set_params(params);
if (params.seed == LLAMA_DEFAULT_SEED) { for (const auto & in_file : params.in_files) {
params.seed = time(NULL); printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
} if (!g_collector.load_imatrix(in_file.c_str())) {
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) {
std::vector<std::string> files;
size_t pos = 0;
while (true) {
auto new_pos = combine_files.find(',', pos);
if (new_pos != std::string::npos) {
files.emplace_back(combine_files.substr(pos, new_pos - pos));
pos = new_pos + 1;
} else {
files.emplace_back(combine_files.substr(pos));
break;
}
}
if (files.size() < 2) {
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
return 1;
}
printf("Combining the following %d files\n", int(files.size()));
for (auto& file : files) {
printf(" %s\n", file.c_str());
if (!g_collector.load_imatrix(file.c_str(), true)) {
fprintf(stderr, "Failed to load %s\n", file.c_str());
return 1; return 1;
} }
} }
if (params.in_files.size() > 1) {
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix(); g_collector.save_imatrix();
return 0;
}
if (!prev_result_file.empty()) {
if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
return 1;
}
} }
llama_backend_init(); llama_backend_init();
@ -651,6 +570,7 @@ int main(int argc, char ** argv) {
// init // init
llama_model * model; llama_model * model;
llama_context * ctx; llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) { if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__); fprintf(stderr, "%s : failed to init\n", __func__);
@ -669,8 +589,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); if (!compute_imatrix(ctx, params)) {
if (!OK) {
return 1; return 1;
} }
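With the old ad-hoc `--combine`/`--continue-from` flags gone, merging previously saved imatrices now goes through the common `--in-file` option: every given file is loaded into the collector, and when more than one is supplied the merged result is written out immediately. An illustrative invocation (file names are placeholders; `-o` is assumed to map to `params.out_file`):

    ./imatrix --in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat -o imatrix-combined.dat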

View file

@ -6,52 +6,22 @@ import re
import sys import sys
from typing import Any, Dict, List, Set, Tuple, Union from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
if not separator_rule: def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
if min_items == 0 and max_items == 1: if min_items == 0 and max_items == 1:
return f'{item_rule}?' return f'{item_rule}?'
elif min_items == 1 and max_items is None:
if not separator_rule:
if min_items == 1 and max_items is None:
return f'{item_rule}+' return f'{item_rule}+'
elif min_items == 0 and max_items is None:
result = '' return f'{item_rule}*'
if min_items > 0:
if item_rule_is_literal and separator_rule is None:
result = '"' + (item_rule[1:-1] * min_items) + '"'
else: else:
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items) return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
def opt_repetitions(up_to_n, prefix_with_sep=False): result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
''' return f'({result})?' if min_items == 0 else result
- n=4, no sep: '(a (a (a (a)?)?)?)?'
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
'''
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
if up_to_n == 0:
return ''
elif up_to_n == 1:
return f'({content})?'
elif separator_rule and not prefix_with_sep:
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
else:
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
if min_items > 0 and max_items != min_items:
result += ' '
if max_items is not None:
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
else:
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
if min_items == 0 and separator_rule:
result = f'({item_rule} {item_operator}*)?'
else:
result += f'{item_operator}*'
return result
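As a quick sketch of what the simplified helper now emits, the following spot checks follow directly from the code above (bounded counts map onto GBNF `{m,n}` quantifiers; with a separator, the first item is emitted and then `(sep item)` is repeated):

    assert _build_repetition('[0-9]', 0, 15) == '[0-9]{0,15}'
    assert _build_repetition('[0-9]', 1, None) == '[0-9]+'
    assert _build_repetition('item', 2, None, separator_rule='","') == 'item ("," item)+'
    assert _build_repetition('item', 0, 3, separator_rule='","') == '(item ("," item){0,2})?'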
class BuiltinRule: class BuiltinRule:
@ -59,31 +29,29 @@ class BuiltinRule:
self.content = content self.content = content
self.deps = deps or [] self.deps = deps or []
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in # whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality? # whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?' SPACE_RULE = '" "?'
PRIMITIVE_RULES = { PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []), 'boolean' : BuiltinRule('("true" | "false") space', []),
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []), 'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []), 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []), 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []), 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []), 'null' : BuiltinRule('"null" space', []),
} }
# TODO: support "uri", "email" string formats # TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = { STRING_FORMAT_RULES = {
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
@ -333,7 +301,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id sub_rule_ids[sub] = id
sub = id sub = id
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False) seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
else: else:
literal = '' literal = ''
while i < length: while i < length:

View file

@ -624,7 +624,7 @@ string ::= "\"" (
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" ws )* "\"" ws
ws ::= ([ \t\n] ws)? ws ::= ([ \t\n] ws)?
float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
integer ::= [0-9]+""" integer ::= [0-9]+"""

View file

@ -6,10 +6,6 @@
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#include "ggml-rpc.h" #include "ggml-rpc.h"
#ifdef _WIN32 #ifdef _WIN32
# include <windows.h> # include <windows.h>
@ -83,12 +79,6 @@ static ggml_backend_t create_backend() {
if (!backend) { if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
} }
#elif GGML_USE_SYCL
fprintf(stderr, "%s: using SYCL backend\n", __func__);
backend = ggml_backend_sycl_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
}
#endif #endif
// if there aren't GPU Backends fallback to CPU backend // if there aren't GPU Backends fallback to CPU backend

View file

@ -279,7 +279,7 @@ node index.js
`id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1` `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1`
`cache_prompt`: Re-use the previously cached prompt from the last request if possible. This may avoid re-processing the prompt from scratch. Default: `false` `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation), enabling this option can cause nondeterministic results. Default: `false`
`system_prompt`: Change the system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime) `system_prompt`: Change the system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
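A minimal sketch of exercising these fields against a running server (assuming the default `/completion` endpoint on `localhost:8080`; the prompt and field values are placeholders):

    import requests

    payload = {
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 64,
        "cache_prompt": True,   # re-use the KV cache for the shared prefix
        "id_slot": -1,          # let the server pick a slot
    }
    r = requests.post("http://localhost:8080/completion", json=payload)
    print(r.json()["content"])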

View file

@ -2,57 +2,26 @@
const SPACE_RULE = '" "?'; const SPACE_RULE = '" "?';
function _buildRepetition(itemRule, minItems, maxItems, opts={}) { function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
if (minItems === 0 && maxItems === 1) {
return `${itemRule}?`;
}
const separatorRule = opts.separatorRule ?? ''; const separatorRule = opts.separatorRule ?? '';
const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
if (separatorRule === '') { if (separatorRule === '') {
if (minItems === 0 && maxItems === 1) { if (minItems === 1 && maxItems === undefined) {
return `${itemRule}?`;
} else if (minItems === 1 && maxItems === undefined) {
return `${itemRule}+`; return `${itemRule}+`;
} } else if (minItems === 0 && maxItems === undefined) {
} return `${itemRule}*`;
let result = '';
if (minItems > 0) {
if (itemRuleIsLiteral && separatorRule === '') {
result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
} else { } else {
result = Array.from({ length: minItems }, () => itemRule) return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`;
.join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
} }
} }
const optRepetitions = (upToN, prefixWithSep=false) => { const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined);
const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule; return minItems === 0 ? `(${result})?` : result;
if (upToN === 0) {
return '';
} else if (upToN === 1) {
return `(${content})?`;
} else if (separatorRule !== '' && !prefixWithSep) {
return `(${content} ${optRepetitions(upToN - 1, true)})?`;
} else {
return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
}
};
if (minItems > 0 && maxItems !== minItems) {
result += ' ';
}
if (maxItems !== undefined) {
result += optRepetitions(maxItems - minItems, minItems > 0);
} else {
const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
if (minItems === 0 && separatorRule !== '') {
result = `(${itemRule} ${itemOperator}*)?`;
} else {
result += `${itemOperator}*`;
}
}
return result;
} }
class BuiltinRule { class BuiltinRule {
@ -62,27 +31,25 @@ class BuiltinRule {
} }
} }
const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
const PRIMITIVE_RULES = { const PRIMITIVE_RULES = {
boolean : new BuiltinRule('("true" | "false") space', []), boolean : new BuiltinRule('("true" | "false") space', []),
'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []), 'decimal-part' : new BuiltinRule('[0-9]{1,16}', []),
'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []), 'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']), integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []), uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []), char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']), string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []), null : new BuiltinRule('"null" space', []),
}; };
// TODO: support "uri", "email" string formats // TODO: support "uri", "email" string formats
const STRING_FORMAT_RULES = { const STRING_FORMAT_RULES = {
'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : new BuiltinRule('date "T" time', ['date', 'time']), 'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']), 'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']), 'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),

View file

@ -648,6 +648,9 @@ struct server_context {
server_metrics metrics; server_metrics metrics;
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
~server_context() { ~server_context() {
if (ctx) { if (ctx) {
llama_free(ctx); llama_free(ctx);
@ -796,24 +799,88 @@ struct server_context {
return prompt_tokens; return prompt_tokens;
} }
server_slot * get_slot(int id) { server_slot * get_slot_by_id(int id) {
int64_t t_last = ggml_time_us();
server_slot * last_used = nullptr;
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
if (slot.id == id && slot.available()) { if (slot.id == id) {
return &slot; return &slot;
} }
}
// among all available slots, find the one that has been least recently used return nullptr;
if (slot.available() && slot.t_last_used < t_last) { }
last_used = &slot;
server_slot * get_available_slot(const std::string & prompt) {
server_slot * ret = nullptr;
// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
int max_lcp_len = 0;
float similarity = 0;
for (server_slot & slot : slots) {
// skip the slot if it is not available
if (!slot.available()) {
continue;
}
// skip the slot if it does not contain a prompt
if (!slot.prompt.is_string()) {
continue;
}
// current slot's prompt
std::string slot_prompt = slot.prompt.get<std::string>();
// length of the current slot's prompt
int slot_prompt_len = slot_prompt.size();
// length of the Longest Common Prefix between the current slot's prompt and the input prompt
int lcp_len = common_part(slot_prompt, prompt);
// fraction of the common prefix length relative to the current slot's prompt length
similarity = static_cast<float>(lcp_len) / slot_prompt_len;
// select the current slot if the criteria match
if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
max_lcp_len = lcp_len;
ret = &slot;
}
}
if (ret != nullptr) {
LOG_VERBOSE("selected slot by lcp similarity", {
{"id_slot", ret->id},
{"max_lcp_len", max_lcp_len},
{"similarity", similarity},
});
}
}
// find the slot that has been least recently used
if (ret == nullptr) {
int64_t t_last = ggml_time_us();
for (server_slot & slot : slots) {
// skip the slot if it is not available
if (!slot.available()) {
continue;
}
// select the current slot if the criteria match
if (slot.t_last_used < t_last) {
t_last = slot.t_last_used; t_last = slot.t_last_used;
ret = &slot;
} }
} }
return last_used; if (ret != nullptr) {
LOG_VERBOSE("selected slot by lru", {
{"id_slot", ret->id},
{"t_last", t_last},
});
}
}
return ret;
} }
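Read as a minimal Python sketch (not the server code itself), the selection above boils down to: prefer the available slot whose cached prompt shares the longest common prefix with the incoming prompt, provided that prefix covers more than `slot_prompt_similarity` of the slot's prompt; otherwise fall back to the least recently used available slot.

    def common_part(a, b):
        # length of the longest common prefix of two strings
        n = 0
        for x, y in zip(a, b):
            if x != y:
                break
            n += 1
        return n

    def pick_slot(slots, prompt, slot_prompt_similarity):
        # slots: list of dicts with "available", "prompt", "t_last_used"
        best, best_lcp = None, 0
        if slot_prompt_similarity != 0.0 and prompt:
            for s in slots:
                if not s["available"] or not s["prompt"]:
                    continue
                lcp = common_part(s["prompt"], prompt)
                if lcp > best_lcp and lcp / len(s["prompt"]) > slot_prompt_similarity:
                    best, best_lcp = s, lcp
        if best is None:
            candidates = [s for s in slots if s["available"]]
            if candidates:
                best = min(candidates, key=lambda s: s["t_last_used"])
        return best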
bool launch_slot_with_task(server_slot & slot, const server_task & task) { bool launch_slot_with_task(server_slot & slot, const server_task & task) {
@ -889,7 +956,7 @@ struct server_context {
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
// get prompt // get prompt
{ if (!task.infill) {
const auto & prompt = data.find("prompt"); const auto & prompt = data.find("prompt");
if (prompt == data.end()) { if (prompt == data.end()) {
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@ -1516,13 +1583,29 @@ struct server_context {
switch (task.type) { switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION: case SERVER_TASK_TYPE_COMPLETION:
{ {
server_slot * slot = get_slot(json_value(task.data, "id_slot", -1)); int id_slot = json_value(task.data, "id_slot", -1);
std::string prompt = json_value(task.data, "prompt", std::string());
server_slot * slot;
if (id_slot != -1) {
slot = get_slot_by_id(id_slot);
} else {
slot = get_available_slot(prompt);
}
if (slot == nullptr) { if (slot == nullptr) {
// if no slot is available, we defer this task for processing later // if no slot is available, we defer this task for processing later
LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
queue_tasks.defer(task); queue_tasks.defer(task);
break; break;
} }
if (!slot->available()) {
// if requested slot is unavailable, we defer this task for processing later
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
if (task.data.contains("system_prompt")) { if (task.data.contains("system_prompt")) {
std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
@ -1639,11 +1722,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_SAVE: case SERVER_TASK_TYPE_SLOT_SAVE:
{ {
int id_slot = task.data.at("id_slot"); int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break; break;
} }
if (!slot->available()) {
// if requested slot is unavailable, we defer this task for processing later
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
const size_t token_count = slot->cache_tokens.size(); const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us(); const int64_t t_start = ggml_time_us();
@ -1674,11 +1763,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_RESTORE: case SERVER_TASK_TYPE_SLOT_RESTORE:
{ {
int id_slot = task.data.at("id_slot"); int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break; break;
} }
if (!slot->available()) {
// if requested slot is unavailable, we defer this task for processing later
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
const int64_t t_start = ggml_time_us(); const int64_t t_start = ggml_time_us();
@ -1716,11 +1811,17 @@ struct server_context {
case SERVER_TASK_TYPE_SLOT_ERASE: case SERVER_TASK_TYPE_SLOT_ERASE:
{ {
int id_slot = task.data.at("id_slot"); int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break; break;
} }
if (!slot->available()) {
// if requested slot is unavailable, we defer this task for processing later
LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
queue_tasks.defer(task);
break;
}
// Erase token cache // Erase token cache
const size_t n_erased = slot->cache_tokens.size(); const size_t n_erased = slot->cache_tokens.size();
@ -2361,7 +2462,7 @@ int main(int argc, char ** argv) {
// TODO: not great to use extern vars // TODO: not great to use extern vars
server_log_json = params.log_json; server_log_json = params.log_json;
server_verbose = params.verbose; server_verbose = params.verbosity > 0;
// struct that contains llama context and inference // struct that contains llama context and inference
server_context ctx_server; server_context ctx_server;
@ -2468,6 +2569,9 @@ int main(int argc, char ** argv) {
log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
} }
// Necessary similarity of prompt for slot selection
ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
// load the model // load the model
if (!ctx_server.load_model(params)) { if (!ctx_server.load_model(params)) {
state.store(SERVER_STATE_ERROR); state.store(SERVER_STATE_ERROR);

View file

@ -253,6 +253,13 @@ static size_t common_part(const std::vector<llama_token> & a, const std::vector<
return i; return i;
} }
static size_t common_part(const std::string & a, const std::string & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
return i;
}
static bool ends_with(const std::string & str, const std::string & suffix) { static bool ends_with(const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
} }

View file

@ -9108,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
// find the sum of exps in the block // find the sum of exps in the block
tmp = warp_reduce_sum(tmp, item_ct1); tmp = warp_reduce_sum(tmp, item_ct1);
if (block_size > WARP_SIZE) { if (block_size > WARP_SIZE) {
item_ct1.barrier(sycl::access::fence_space::local_space);
if (warp_id == 0) { if (warp_id == 0) {
buf[lane_id] = 0.f; buf[lane_id] = 0.f;
} }

View file

@ -345,15 +345,12 @@ struct vk_context {
}; };
struct ggml_tensor_extra_gpu { struct ggml_tensor_extra_gpu {
bool ready;
size_t ctx_idx; size_t ctx_idx;
vk_buffer_ref buffer_gpu; vk_buffer_ref buffer_gpu;
uint64_t offset; uint64_t offset;
void reset() { void reset() {
ready = false;
ctx_idx = 0; ctx_idx = 0;
buffer_gpu.reset(); buffer_gpu.reset();
offset = 0; offset = 0;
@ -2949,7 +2946,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
vk_buffer d_X; vk_buffer d_X;
@ -2958,12 +2955,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
uint64_t y_buf_offset = 0; uint64_t y_buf_offset = 0;
if (!src0_uma) { if (!src0_uma) {
d_Qx = extra_src0->buffer_gpu.lock(); d_Qx = extra_src0->buffer_gpu.lock();
qx_buf_offset = extra_src0->offset; qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
if (!src1_uma) { if (!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qy != nullptr); GGML_ASSERT(d_Qy != nullptr);
} }
if (qx_needs_dequant) { if (qx_needs_dequant) {
@ -3114,7 +3111,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_X; vk_buffer d_X;
uint64_t x_buf_offset = 0; uint64_t x_buf_offset = 0;
@ -3122,12 +3119,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
uint64_t y_buf_offset = 0; uint64_t y_buf_offset = 0;
if(!src0_uma) { if(!src0_uma) {
d_Qx = extra_src0->buffer_gpu.lock(); d_Qx = extra_src0->buffer_gpu.lock();
qx_buf_offset = extra_src0->offset; qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
if(!src1_uma) { if(!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qy != nullptr); GGML_ASSERT(d_Qy != nullptr);
} }
if (qx_needs_dequant) { if (qx_needs_dequant) {
@ -3246,14 +3243,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
const uint64_t qx_buf_offset = extra_src0->offset; const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
if (!src1_uma) { if (!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
@ -3323,14 +3320,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
const uint64_t qx_buf_offset = extra_src0->offset; const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
if (!src1_uma) { if (!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
@ -3459,7 +3456,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_X; vk_buffer d_X;
uint64_t x_buf_offset = 0; uint64_t x_buf_offset = 0;
@ -3467,17 +3464,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
uint64_t y_buf_offset = 0; uint64_t y_buf_offset = 0;
if (!src0_uma) { if (!src0_uma) {
d_Qx = extra_src0->buffer_gpu.lock(); d_Qx = extra_src0->buffer_gpu.lock();
qx_buf_offset = extra_src0->offset; qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
if (!src1_uma) { if (!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qy != nullptr); GGML_ASSERT(d_Qy != nullptr);
} }
if (!ids_uma) { if (!ids_uma) {
d_ids = extra_ids->buffer_gpu.lock(); d_ids = extra_ids->buffer_gpu.lock();
ids_buf_offset = extra_ids->offset; ids_buf_offset = extra_ids->offset + ids->view_offs;
GGML_ASSERT(d_ids != nullptr); GGML_ASSERT(d_ids != nullptr);
} }
if (qx_needs_dequant) { if (qx_needs_dequant) {
@ -3636,7 +3633,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
const uint64_t d_sz = sizeof(float) * d_ne; const uint64_t d_sz = sizeof(float) * d_ne;
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
const uint64_t d_buf_offset = extra->offset; const uint64_t d_buf_offset = extra->offset + dst->view_offs;
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
vk_buffer d_X; vk_buffer d_X;
uint64_t x_buf_offset = 0; uint64_t x_buf_offset = 0;
@ -3644,17 +3641,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
uint64_t y_buf_offset = 0; uint64_t y_buf_offset = 0;
if(!src0_uma) { if(!src0_uma) {
d_Qx = extra_src0->buffer_gpu.lock(); d_Qx = extra_src0->buffer_gpu.lock();
qx_buf_offset = extra_src0->offset; qx_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_Qx != nullptr); GGML_ASSERT(d_Qx != nullptr);
} }
if(!src1_uma) { if(!src1_uma) {
d_Qy = extra_src1->buffer_gpu.lock(); d_Qy = extra_src1->buffer_gpu.lock();
qy_buf_offset = extra_src1->offset; qy_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Qy != nullptr); GGML_ASSERT(d_Qy != nullptr);
} }
if(!ids_uma) { if(!ids_uma) {
d_ids = extra_ids->buffer_gpu.lock(); d_ids = extra_ids->buffer_gpu.lock();
ids_buf_offset = extra_ids->offset; ids_buf_offset = extra_ids->offset + ids->view_offs;
GGML_ASSERT(d_ids != nullptr); GGML_ASSERT(d_ids != nullptr);
} }
if (qx_needs_dequant) { if (qx_needs_dequant) {
@ -3769,9 +3766,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
const vk_buffer src_buf = extra_src0->buffer_gpu.lock(); const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
const uint64_t src_offset = extra_src0->offset; const uint64_t src_offset = extra_src0->offset + src0->view_offs;
vk_buffer dst_buf = extra->buffer_gpu.lock(); vk_buffer dst_buf = extra->buffer_gpu.lock();
const uint64_t dst_offset = extra->offset; const uint64_t dst_offset = extra->offset + dst->view_offs;
std::vector<vk::BufferCopy> copies; std::vector<vk::BufferCopy> copies;
@ -4062,21 +4059,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
} }
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
if(!src0_uma) { if(!src0_uma) {
d_X = extra_src0->buffer_gpu.lock(); d_X = extra_src0->buffer_gpu.lock();
x_buf_offset = extra_src0->offset; x_buf_offset = extra_src0->offset + src0->view_offs;
GGML_ASSERT(d_X != nullptr); GGML_ASSERT(d_X != nullptr);
} }
if (use_src1 && !src1_uma) { if (use_src1 && !src1_uma) {
d_Y = extra_src1->buffer_gpu.lock(); d_Y = extra_src1->buffer_gpu.lock();
y_buf_offset = extra_src1->offset; y_buf_offset = extra_src1->offset + src1->view_offs;
GGML_ASSERT(d_Y != nullptr); GGML_ASSERT(d_Y != nullptr);
} }
if (use_src2 && !src2_uma) { if (use_src2 && !src2_uma) {
d_Z = extra_src2->buffer_gpu.lock(); d_Z = extra_src2->buffer_gpu.lock();
z_buf_offset = extra_src2->offset; z_buf_offset = extra_src2->offset + src2->view_offs;
GGML_ASSERT(d_Z != nullptr); GGML_ASSERT(d_Z != nullptr);
} }
@ -4336,7 +4333,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type); const uint32_t dst_type_size = ggml_type_size(dst->type);
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
(uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src0),
@ -5569,6 +5566,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
const ggml_tensor * src2 = node->src[2]; const ggml_tensor * src2 = node->src[2];
switch (node->op) { switch (node->op) {
// Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NONE:
return;
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) { switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
@ -5590,10 +5594,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
@ -5601,7 +5601,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
case GGML_OP_NONE:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
break; break;
@@ -5654,12 +5653,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5705,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5788,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
-    GGML_ASSERT(extra->ready);
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5812,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
-    extra->ready = false;
     return true;
 }
@@ -5943,8 +5931,10 @@ struct ggml_backend_vk_buffer_context {
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
+        if (temp_tensor_extras != nullptr) {
             delete[] temp_tensor_extras;
+        }
     }
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
@@ -5990,18 +5980,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -6014,7 +6002,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
     vk_buffer buf = extra->buffer_gpu.lock();
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6015,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
     vk_buffer buf = extra->buffer_gpu.lock();
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
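The recurring `+ tensor->view_offs` edits all implement one rule: a view now shares its parent's `extra`, so the byte address of any access is the backing allocation's offset, plus the view's displacement inside the parent, plus the caller-supplied offset. A tiny sketch of that arithmetic (hypothetical helper, values illustrative):

# A view shares its parent's buffer and extra, so every read/write must add
# the view's own displacement on top of the allocation offset.
def effective_offset(alloc_offset: int, view_offs: int, user_offset: int) -> int:
    return alloc_offset + view_offs + user_offset

# e.g. parent allocated at byte 4096, view starts 128 bytes in, caller reads at +16
assert effective_offset(4096, 128, 16) == 4240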
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6026,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
         return true;
     }
@@ -6264,7 +6252,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
     vk_buffer buf = extra->buffer_gpu.lock();
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6272,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
     vk_buffer buf = extra->buffer_gpu.lock();
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6293,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
         return true;
     }
@@ -6478,11 +6466,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const 
         //     return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
         // } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
             return true;
-            } break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6725,7 +6709,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6793,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6835,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6893,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7076,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
     float first_error_result = -1.0f;
@@ -415,6 +415,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
         MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.ATTN_NORM_2,
         MODEL_TENSOR.ATTN_OUT_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_Q_NORM,
@@ -5,6 +5,7 @@ import os
 import shutil
 import struct
 import tempfile
+from dataclasses import dataclass
 from enum import Enum, auto
 from io import BufferedWriter
 from typing import IO, Any, Sequence, Mapping
@@ -30,17 +31,36 @@ from .quants import quant_shape_from_byte_shape
 logger = logging.getLogger(__name__)
 
+@dataclass
+class TensorInfo:
+    shape: Sequence[int]
+    dtype: GGMLQuantizationType
+    nbytes: int
+    tensor: np.ndarray[Any, Any] | None = None
+
+
+@dataclass
+class GGUFValue:
+    value: Any
+    type: GGUFValueType
+
+
 class WriterState(Enum):
+    NO_FILE = auto()
     EMPTY   = auto()
     HEADER  = auto()
     KV_DATA = auto()
     TI_DATA = auto()
+    WEIGHTS = auto()
 
 
 class GGUFWriter:
-    fout: BufferedWriter
+    fout: BufferedWriter | None
+    path: os.PathLike[str] | str | None
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-    tensors: list[np.ndarray[Any, Any]]
+    tensors: dict[str, TensorInfo]
+    kv_data: dict[str, GGUFValue]
+    state: WriterState
 
     _simple_value_packing = {
         GGUFValueType.UINT8:  "B",
         GGUFValueType.INT8:   "b",
@@ -56,141 +76,140 @@ class GGUFWriter:
     }
 
     def __init__(
-        self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True,
+        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False,
         endianess: GGUFEndian = GGUFEndian.LITTLE,
     ):
-        self.fout = open(path, "wb")
+        self.fout = None
+        self.path = path
         self.arch = arch
         self.endianess = endianess
-        self.offset_tensor = 0
         self.data_alignment = GGUF_DEFAULT_ALIGNMENT
-        self.kv_data = bytearray()
-        self.kv_data_count = 0
-        self.ti_data = bytearray()
-        self.ti_data_count = 0
-        self.ti_names = set()
         self.use_temp_file = use_temp_file
         self.temp_file = None
-        self.tensors = []
+        self.tensors = dict()
+        self.kv_data = dict()
         logger.info("gguf: This GGUF file is for {0} Endian only".format(
             "Big" if self.endianess == GGUFEndian.BIG else "Little",
         ))
-        self.state = WriterState.EMPTY
+        self.state = WriterState.NO_FILE
 
         self.add_architecture()
 
-    def write_header_to_file(self) -> None:
+    def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None:
+        if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
+            # allow calling this multiple times as long as the path is the same
+            return
+        if self.state is not WriterState.NO_FILE:
+            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
+
+        if path is not None:
+            self.path = path
+
+        if self.path is not None:
+            if self.fout is not None:
+                self.fout.close()
+            self.fout = open(self.path, "wb")
+            self.state = WriterState.EMPTY
+
+    def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None:
+        self.open_output_file(path)
+
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
 
         self._write_packed("<I", GGUF_MAGIC, skip_pack_prefix = True)
         self._write_packed("I", GGUF_VERSION)
-        self._write_packed("Q", self.ti_data_count)
-        self._write_packed("Q", self.kv_data_count)
+        self._write_packed("Q", len(self.tensors))
+        self._write_packed("Q", len(self.kv_data))
        self.flush()
         self.state = WriterState.HEADER
 
     def write_kv_data_to_file(self) -> None:
         if self.state is not WriterState.HEADER:
             raise ValueError(f'Expected output file to contain the header, got {self.state}')
+        assert self.fout is not None
 
-        self.fout.write(self.kv_data)
+        kv_data = bytearray()
+
+        for key, val in self.kv_data.items():
+            kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
+            kv_data += self._pack_val(val.value, val.type, add_vtype=True)
+
+        self.fout.write(kv_data)
         self.flush()
         self.state = WriterState.KV_DATA
 
     def write_ti_data_to_file(self) -> None:
         if self.state is not WriterState.KV_DATA:
             raise ValueError(f'Expected output file to contain KV data, got {self.state}')
+        assert self.fout is not None
 
-        self.fout.write(self.ti_data)
+        ti_data = bytearray()
+        offset_tensor = 0
+
+        for name, ti in self.tensors.items():
+            ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
+            n_dims = len(ti.shape)
+            ti_data += self._pack("I", n_dims)
+            for i in range(n_dims):
+                ti_data += self._pack("Q", ti.shape[n_dims - 1 - i])
+            ti_data += self._pack("I", ti.dtype)
+            ti_data += self._pack("Q", offset_tensor)
+            offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
+
+        self.fout.write(ti_data)
         self.flush()
         self.state = WriterState.TI_DATA
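The new write_ti_data_to_file computes tensor offsets on the fly instead of accumulating self.offset_tensor at add time. The alignment arithmetic can be sketched standalone; the round-up body of ggml_pad is an assumption here, since it lies outside this hunk:

# Sketch of the on-the-fly offset assignment above.
GGUF_DEFAULT_ALIGNMENT = 32  # assumed default, matching gguf's constant name

def ggml_pad(x: int, n: int) -> int:
    return ((x + n - 1) // n) * n  # assumed: round x up to a multiple of n

offsets = {}
offset_tensor = 0
for name, nbytes in [("token_embd.weight", 1000), ("blk.0.attn_q.weight", 50)]:
    offsets[name] = offset_tensor
    offset_tensor += ggml_pad(nbytes, GGUF_DEFAULT_ALIGNMENT)

assert offsets["blk.0.attn_q.weight"] == 1024  # 1000 rounded up to the next 32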
-    def add_key(self, key: str) -> None:
-        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
+    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
+        if key in self.kv_data:
+            raise ValueError(f'Duplicated key name {key!r}')
+
+        self.kv_data[key] = GGUFValue(value=val, type=vtype)
 
     def add_uint8(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT8)
+        self.add_key_value(key,val, GGUFValueType.UINT8)
 
     def add_int8(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT8)
+        self.add_key_value(key, val, GGUFValueType.INT8)
 
     def add_uint16(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT16)
+        self.add_key_value(key, val, GGUFValueType.UINT16)
 
     def add_int16(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT16)
+        self.add_key_value(key, val, GGUFValueType.INT16)
 
     def add_uint32(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT32)
+        self.add_key_value(key, val, GGUFValueType.UINT32)
 
     def add_int32(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT32)
+        self.add_key_value(key, val, GGUFValueType.INT32)
 
     def add_float32(self, key: str, val: float) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.FLOAT32)
+        self.add_key_value(key, val, GGUFValueType.FLOAT32)
 
     def add_uint64(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.UINT64)
 
     def add_int64(self, key: str, val: int) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.INT64)
+        self.add_key_value(key, val, GGUFValueType.INT64)
 
     def add_float64(self, key: str, val: float) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.FLOAT64)
+        self.add_key_value(key, val, GGUFValueType.FLOAT64)
 
     def add_bool(self, key: str, val: bool) -> None:
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.BOOL)
+        self.add_key_value(key, val, GGUFValueType.BOOL)
 
     def add_string(self, key: str, val: str) -> None:
         if not val:
             return
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.STRING)
+        self.add_key_value(key, val, GGUFValueType.STRING)
 
     def add_array(self, key: str, val: Sequence[Any]) -> None:
         if not isinstance(val, Sequence):
             raise ValueError("Value must be a sequence for array type")
-        self.add_key(key)
-        self.add_val(val, GGUFValueType.ARRAY)
-
-    def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None:
-        if vtype is None:
-            vtype = GGUFValueType.get_type(val)
-
-        if add_vtype:
-            self.kv_data += self._pack("I", vtype)
-            self.kv_data_count += 1
-
-        pack_fmt = self._simple_value_packing.get(vtype)
-        if pack_fmt is not None:
-            self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
-        elif vtype == GGUFValueType.STRING:
-            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
-            self.kv_data += self._pack("Q", len(encoded_val))
-            self.kv_data += encoded_val
-        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
-            ltype = GGUFValueType.get_type(val[0])
-            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
-                raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += self._pack("I", ltype)
-            self.kv_data += self._pack("Q", len(val))
-            for item in val:
-                self.add_val(item, add_vtype=False)
-        else:
-            raise ValueError("Invalid GGUF metadata value type or value")
+        self.add_key_value(key, val, GGUFValueType.ARRAY)
 
     @staticmethod
     def ggml_pad(x: int, n: int) -> int:
@@ -200,16 +219,12 @@ class GGUFWriter:
         self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
         tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
     ) -> None:
-        if self.state is not WriterState.EMPTY:
-            raise ValueError(f'Expected output file to be empty, got {self.state}')
+        if self.state is not WriterState.NO_FILE:
+            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
 
-        if name in self.ti_names:
-            raise ValueError(f'Duplicated tensor name {name}')
-        self.ti_names.add(name)
+        if name in self.tensors:
+            raise ValueError(f'Duplicated tensor name {name!r}')
 
-        encoded_name = name.encode("utf-8")
-        self.ti_data += self._pack("Q", len(encoded_name))
-        self.ti_data += encoded_name
         if raw_dtype is None:
             if tensor_dtype == np.float16:
                 dtype = GGMLQuantizationType.F16
@@ -231,14 +246,8 @@ class GGUFWriter:
             dtype = raw_dtype
             if tensor_dtype == np.uint8:
                 tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
-        n_dims = len(tensor_shape)
-        self.ti_data += self._pack("I", n_dims)
-        for i in range(n_dims):
-            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
-        self.ti_data += self._pack("I", dtype)
-        self.ti_data += self._pack("Q", self.offset_tensor)
-        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
-        self.ti_data_count += 1
+
+        self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
 
     def add_tensor(
         self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
@@ -252,10 +261,10 @@ class GGUFWriter:
             self.temp_file = fp
 
         shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
-        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)
 
         if self.temp_file is None:
-            self.tensors.append(tensor)
+            self.tensors[name].tensor = tensor
             return
 
         tensor.tofile(self.temp_file)
@@ -267,8 +276,9 @@ class GGUFWriter:
             fp.write(bytes([0] * pad))
 
     def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
-        if self.state is not WriterState.TI_DATA:
-            raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
+        if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
+            raise ValueError(f'Expected output file to contain tensor info or weights, got {self.state}')
+        assert self.fout is not None
 
         if self.endianess == GGUFEndian.BIG:
             tensor.byteswap(inplace=True)
@@ -276,50 +286,51 @@ class GGUFWriter:
         tensor.tofile(self.fout)
         self.write_padding(self.fout, tensor.nbytes)
 
+        self.state = WriterState.WEIGHTS
+
     def write_tensors_to_file(self, *, progress: bool = False) -> None:
         self.write_ti_data_to_file()
+        assert self.fout is not None
 
         self.write_padding(self.fout, self.fout.tell())
 
         if self.temp_file is None:
-            self.tensors.reverse() # to pop from the "beginning" in constant time
+            bar = None
 
             if progress:
                 from tqdm import tqdm
 
-                total_bytes = sum(t.nbytes for t in self.tensors)
+                total_bytes = sum(t.nbytes for t in self.tensors.values())
 
                 bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
 
-                while True:
-                    try:
-                        tensor = self.tensors.pop()
-                    except IndexError:
-                        break
-                    tensor.tofile(self.fout)
-                    bar.update(tensor.nbytes)
-                    self.write_padding(self.fout, tensor.nbytes)
-                return
-            while True:
-                try:
-                    tensor = self.tensors.pop()
-                except IndexError:
-                    break
-                tensor.tofile(self.fout)
-                self.write_padding(self.fout, tensor.nbytes)
-            return
-
-        self.temp_file.seek(0)
-
-        shutil.copyfileobj(self.temp_file, self.fout)
-        self.flush()
-        self.temp_file.close()
+            # relying on the fact that Python dicts preserve insertion order (since 3.7)
+            for ti in self.tensors.values():
+                assert ti.tensor is not None  # can only iterate once over the tensors
+                assert ti.tensor.nbytes == ti.nbytes
+                ti.tensor.tofile(self.fout)
+                if bar is not None:
+                    bar.update(ti.nbytes)
+                self.write_padding(self.fout, ti.nbytes)
+                ti.tensor = None
+        else:
+            self.temp_file.seek(0)
+
+            shutil.copyfileobj(self.temp_file, self.fout)
+            self.flush()
+            self.temp_file.close()
+
+        self.state = WriterState.WEIGHTS
 
     def flush(self) -> None:
+        assert self.fout is not None
         self.fout.flush()
 
     def close(self) -> None:
+        if self.fout is not None:
             self.fout.close()
+            self.fout = None
 
     def add_architecture(self) -> None:
         self.add_string(Keys.General.ARCHITECTURE, self.arch)
@@ -449,7 +460,7 @@ class GGUFWriter:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
 
-    def add_rope_scaling_attn_factors(self, value: Sequence[float]) -> None:
+    def add_rope_scaling_attn_factors(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
 
     def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
@@ -571,5 +582,32 @@ class GGUFWriter:
         pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
         return struct.pack(f'{pack_prefix}{fmt}', value)
 
+    def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
+        kv_data = bytearray()
+
+        if add_vtype:
+            kv_data += self._pack("I", vtype)
+
+        pack_fmt = self._simple_value_packing.get(vtype)
+        if pack_fmt is not None:
+            kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
+        elif vtype == GGUFValueType.STRING:
+            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
+            kv_data += self._pack("Q", len(encoded_val))
+            kv_data += encoded_val
+        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
+            ltype = GGUFValueType.get_type(val[0])
+            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
+                raise ValueError("All items in a GGUF array should be of the same type")
+            kv_data += self._pack("I", ltype)
+            kv_data += self._pack("Q", len(val))
+            for item in val:
+                kv_data += self._pack_val(item, ltype, add_vtype=False)
+        else:
+            raise ValueError("Invalid GGUF metadata value type or value")
+
+        return kv_data
+
     def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None:
+        assert self.fout is not None
         self.fout.write(self._pack(fmt, value, skip_pack_prefix))
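Taken together, the refactor turns GGUFWriter into a buffered builder: keys and tensors accumulate in dicts, and nothing touches disk until a path is supplied. A minimal usage sketch of that flow (file name and KV key are illustrative):

# Deferred-open flow: no file exists until write_header_to_file() is called.
import numpy as np
from gguf import GGUFWriter  # the gguf-py package this diff modifies

writer = GGUFWriter(path=None, arch="llama")    # nothing opened yet
writer.add_uint32("example.block_count", 32)    # buffered in writer.kv_data
writer.add_tensor("token_embd.weight", np.zeros((4, 4), dtype=np.float32))

writer.write_header_to_file(path="model.gguf")  # file is created here
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)     # needs tqdm when progress=True
writer.close()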
@@ -102,6 +102,7 @@ class TensorNameMap:
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn",      # falcon40b
+            "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
         ),
 
         # Attention query-key-value
@@ -311,6 +312,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.c_proj",           # starcoder2
             "encoder.layer.{bid}.mlp.wo",              # jina-bert-v2
             "model.layers.{bid}.residual_mlp.w2",      # arctic
+            "encoder.layer.{bid}.mlp.down_layer",      # jina-bert-v2
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -350,6 +352,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm2",                 # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
             "encoder.layer.{bid}.mlp.layernorm",          # jina-bert-v2
+            "encoder.layer.{bid}.layer_norm_2"            # jina-v2-code
         ),
 
         MODEL_TENSOR.SSM_IN: (
@@ -101,8 +101,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
             logger.debug(f'Copying {field.name}')
 
             if val.value is not None:
-                writer.add_key(field.name)
-                writer.add_val(val.value, val.type)
+                writer.add_key_value(field.name, val.value, val.type)
 
     if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
         logger.debug('Adding chat template(s)')
@@ -111,8 +110,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
     for key, val in new_metadata.items():
         logger.debug(f'Adding {key}: "{val.value}" {val.description}')
-        writer.add_key(key)
-        writer.add_val(val.value, val.type)
+        writer.add_key_value(key, val.value, val.type)
 
     total_bytes = 0
@@ -3671,7 +3671,7 @@ Current version: 145
 const XTTS_ID = 1000;
 const ALLTALK_ID = 1001;
 const HD_RES_PX = 512;
-const NO_HD_RES_PX = 256;
+const NO_HD_RES_PX = 320;
 
 //all configurable globals
 var perfdata = null; //if it's null, we are not connected
@@ -12691,6 +12691,29 @@ Current version: 145
 //original source https://github.com/kdavis-mozilla/vad.js Copyright (c) 2015, Kelly Daviss
 let VAD=function(t){for(var e in this.options={fftSize:512,bufferLen:512,voice_stop:function(){},voice_start:function(){},smoothingTimeConstant:.99,energy_offset:1e-8,energy_threshold_ratio_pos:2,energy_threshold_ratio_neg:.5,energy_integration:1,filter:[{f:200,v:0},{f:2e3,v:1}],source:null,context:null},t)t.hasOwnProperty(e)&&(this.options[e]=t[e]);if(!this.options.source)throw Error("The options must specify a MediaStreamAudioSourceNode.");this.options.context=this.options.source.context,this.hertzPerBin=this.options.context.sampleRate/this.options.fftSize,this.iterationFrequency=this.options.context.sampleRate/this.options.bufferLen,this.iterationPeriod=1/this.iterationFrequency,this.setFilter=function(t){this.filter=[];for(var e=0,i=this.options.fftSize/2;e<i;e++){this.filter[e]=0;for(var s=0,o=t.length;s<o;s++)if(e*this.hertzPerBin<t[s].f){this.filter[e]=t[s].v;break}}},this.setFilter(this.options.filter),this.ready={},this.vadState=!1,this.energy_offset=this.options.energy_offset,this.energy_threshold_pos=this.energy_offset*this.options.energy_threshold_ratio_pos,this.energy_threshold_neg=this.energy_offset*this.options.energy_threshold_ratio_neg,this.voiceTrend=0,this.voiceTrendMax=10,this.voiceTrendMin=-10,this.voiceTrendStart=5,this.voiceTrendEnd=-5,this.analyser=this.options.context.createAnalyser(),this.analyser.smoothingTimeConstant=this.options.smoothingTimeConstant,this.analyser.fftSize=this.options.fftSize,this.floatFrequencyData=new Float32Array(this.analyser.frequencyBinCount),this.floatFrequencyDataLinear=new Float32Array(this.floatFrequencyData.length),this.options.source.connect(this.analyser),this.scriptProcessorNode=this.options.context.createScriptProcessor(this.options.bufferLen,1,1),this.scriptProcessorNode.connect(this.options.context.destination);var i=this;this.scriptProcessorNode.onaudioprocess=function(t){i.analyser.getFloatFrequencyData(i.floatFrequencyData),i.update(),i.monitor()},this.options.source.connect(this.scriptProcessorNode),this.update=function(){for(var t=this.floatFrequencyData,e=0,i=t.length;e<i;e++)this.floatFrequencyDataLinear[e]=Math.pow(10,t[e]/10);this.ready={}},this.getEnergy=function(){if(this.ready.energy)return this.energy;for(var t=0,e=this.floatFrequencyDataLinear,i=0,s=e.length;i<s;i++)t+=this.filter[i]*e[i]*e[i];return this.energy=t,this.ready.energy=!0,t},this.monitor=function(){var t=this.getEnergy()-this.energy_offset;t>this.energy_threshold_pos?this.voiceTrend=this.voiceTrend+1>this.voiceTrendMax?this.voiceTrendMax:this.voiceTrend+1:t<-this.energy_threshold_neg?this.voiceTrend=this.voiceTrend-1<this.voiceTrendMin?this.voiceTrendMin:this.voiceTrend-1:this.voiceTrend>0?this.voiceTrend--:this.voiceTrend<0&&this.voiceTrend++;var e=!1,i=!1;this.voiceTrend>this.voiceTrendStart?e=!0:this.voiceTrend<this.voiceTrendEnd&&(i=!0);var s=t*this.iterationPeriod*this.options.energy_integration;return s>0||!i?this.energy_offset+=s:this.energy_offset+=10*s,this.energy_offset=this.energy_offset<0?0:this.energy_offset,this.energy_threshold_pos=this.energy_offset*this.options.energy_threshold_ratio_pos,this.energy_threshold_neg=this.energy_offset*this.options.energy_threshold_ratio_neg,e&&!this.vadState&&(this.vadState=!0,this.options.voice_start()),i&&this.vadState&&(this.vadState=!1,this.options.voice_stop()),t}};
+//resample audio freq (to 16khz)
+function resampleAudioBuffer(buffer, resampledRate, callback) {
+    const originalSampleRate = buffer.sampleRate;
+    const channels = buffer.numberOfChannels;
+    const length = buffer.length;
+    const ratio = originalSampleRate / resampledRate;
+    const newLength = Math.round(length / ratio);
+    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
+    const resampledBuffer = audioContext.createBuffer(channels, newLength, resampledRate);
+    const offlineContext = new OfflineAudioContext(channels, newLength, resampledRate);
+    const source = offlineContext.createBufferSource();
+    source.buffer = buffer;
+    source.connect(offlineContext.destination);
+    const startRendering = offlineContext.startRendering();
+    startRendering.then((renderedBuffer) => {
+        callback(renderedBuffer);
+    }).catch((error) => {
+        console.error('Resample Err:', error);
+    });
+    source.start();
+    return resampledBuffer;
+}
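The new helper delegates the actual conversion to an OfflineAudioContext rendered at the target rate; only the output length is computed by hand. For intuition, the same length math with a naive linear-interpolation resampler (illustration only, not the code above; the browser's resampler is higher quality):

# Naive linear resampler showing new_length = round(length / (orig/target)).
def resample(samples: list[float], original_rate: int, target_rate: int) -> list[float]:
    ratio = original_rate / target_rate
    new_length = round(len(samples) / ratio)
    out = []
    for i in range(new_length):
        pos = i * ratio                      # position in the source signal
        j = int(pos)
        frac = pos - j
        a = samples[min(j, len(samples) - 1)]
        b = samples[min(j + 1, len(samples) - 1)]
        out.append(a * (1.0 - frac) + b * frac)
    return out

assert len(resample([0.0] * 48000, 48000, 16000)) == 16000  # 48 kHz -> 16 kHz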
 let audioBufferToWavBlob = function (audioBuffer) {
     let writeWavString = function (view, offset, string) {
         for (let i = 0; i < string.length; i++) {
@@ -12738,7 +12761,7 @@ Current version: 145
     reader.onloadend = function() {
         let arrayBuffer = reader.result;
         window.AudioContext = window.AudioContext || window.webkitAudioContext;
-        let audioContext = new AudioContext({ sampleRate: 16000 });
+        let audioContext = new AudioContext();
         audioContext.decodeAudioData(arrayBuffer, (buffer)=>{
             onDone(buffer);
         },(err)=>
@@ -12778,20 +12801,24 @@ Current version: 145
     if(preaudioblobs.length<2)
     {
         audioBlobToDecodedAudioBuffer(completeRecording,(buffer)=>{
-            let wavblob = audioBufferToWavBlob(buffer);
+            resampleAudioBuffer(buffer,16000,(rsBuffer)=>{
+                let wavblob = audioBufferToWavBlob(rsBuffer);
                 audiodatareader.readAsDataURL(wavblob);
+            });
         });
     } else {
         audioBlobToDecodedAudioBuffer(completeRecording,(buffer)=>{
             audioBlobToDecodedAudioBuffer(preaudioblobs[0],(buffer2)=>{
                 audioBlobToDecodedAudioBuffer(preaudioblobs[1],(buffer3)=>{
                     let prefix = concatenateAudioBuffers(buffer2,buffer3);
                     let finalbuf = concatenateAudioBuffers(prefix,buffer);
-                    let wavblob = audioBufferToWavBlob(finalbuf);
+                    resampleAudioBuffer(finalbuf,16000,(rsBuffer)=>{
+                        let wavblob = audioBufferToWavBlob(rsBuffer);
                         audiodatareader.readAsDataURL(wavblob);
+                    });
                 });
             });
         });
     }
 
     audiodatareader.onloadend = function () {
@@ -12825,7 +12852,7 @@ Current version: 145
     voicerecorder = new MediaRecorder(stream);
     voicerecorder.addEventListener('dataavailable', onRecordingReady);
     window.AudioContext = window.AudioContext || window.webkitAudioContext;
-    let audioContext = new AudioContext({ sampleRate: 16000 });
+    let audioContext = new AudioContext();
     let source = audioContext.createMediaStreamSource(stream);
     let options = {
         source: source,
@@ -728,6 +728,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
         { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
         { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+        { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
         { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
         { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
         { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -4715,8 +4716,7 @@ static void llm_load_vocab(
             LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
             LLAMA_LOG_WARN("%s:                                             \n", __func__);
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (
-                tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3" ||
@@ -4743,7 +4743,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -5634,6 +5635,9 @@ static bool llm_load_tensors(
                 layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                 layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd});
+                layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                 layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
@@ -8597,6 +8601,11 @@ struct llm_build_context {
                 // attention layer norm
                 cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
+
+                if (model.layers[il].attn_norm_2 != nullptr) {
+                    cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                    cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+                }
+
                 struct ggml_tensor * ffn_inp = cur;
                 cb(ffn_inp, "ffn_inp", il);
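The jina-v2-code path above inserts an optional second attention norm that first re-adds the layer input. In outline (hypothetical helpers standing in for ggml_add and llm_build_norm, values illustrative):

# Shape of the optional second norm: residual add, then normalize.
def attn_block_output(cur, layer_input, attn_norm_2=None):
    if attn_norm_2 is not None:
        cur = [c + x for c, x in zip(cur, layer_input)]  # re-add the layer input
        cur = attn_norm_2(cur)                           # second layer norm
    return cur  # becomes ffn_inp

out = attn_block_output([1.0, 2.0], [0.5, 0.5], attn_norm_2=lambda v: [x / max(v) for x in v])
assert out == [0.6, 1.0]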
@@ -13940,7 +13949,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t                chr) {
     bool found            = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
@@ -13949,6 +13958,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13966,7 +13979,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8      partial_utf8) {
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
     uint32_t partial_value = partial_utf8.value;
@@ -13996,6 +14009,9 @@ static bool llama_grammar_match_partial_char(
                 return is_positive_char;
             }
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
         } else {
             // exact char match, e.g. [a] or "a"
             if (low <= pos->value && pos->value <= high) {
@@ -14056,6 +14072,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
             if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
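The new LLAMA_GRETYPE_CHAR_ANY element lets "." in a GBNF grammar match any character; it counts as a positive element and consumes a single element (pos += 1) rather than a range pair. A simplified sketch of the matcher (code points only; the real code also walks CHAR_ALT chains and handles partial UTF-8):

# Simplified llama_grammar_match_char over (kind, value) elements.
CHAR, CHAR_RNG_UPPER, CHAR_ANY = "char", "rng_upper", "any"

def match_char(elements, chr_code: int) -> bool:
    found = False
    for i, (kind, value) in enumerate(elements):
        if kind == CHAR_RNG_UPPER:      # pairs with the preceding CHAR: [a-z]
            found = found or (elements[i - 1][1] <= chr_code <= value)
        elif kind == CHAR_ANY:          # "." matches anything
            found = True
        elif kind == CHAR:              # exact char, e.g. "a"
            found = found or chr_code == value
    return found

assert match_char([(CHAR_ANY, 0)], ord("x"))
assert match_char([(CHAR, ord("a")), (CHAR_RNG_UPPER, ord("z"))], ord("m"))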
@@ -15543,6 +15560,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
             qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
         }
     }
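The quantizer now rejects importance matrices containing NaN or infinity up front, instead of letting them poison the error-weighted quantization later. The same check in standalone form (tensor name and weights are illustrative):

import math

def validate_imatrix(imatrix_data: dict[str, list[float]]) -> None:
    for name, weights in imatrix_data.items():
        for f in weights:
            if not math.isfinite(f):  # rejects nan, +inf and -inf alike
                raise RuntimeError(f"imatrix contains non-finite value {f} in {name!r}")

validate_imatrix({"blk.0.attn_q.weight": [0.5, 1.25, 3.0]})  # passes silently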
@@ -365,6 +365,9 @@ extern "C" {
         // modifies a preceding LLAMA_GRETYPE_CHAR or
         // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         LLAMA_GRETYPE_CHAR_ALT       = 6,
+
+        // any character (.)
+        LLAMA_GRETYPE_CHAR_ANY       = 7,
     };
 
     typedef struct llama_grammar_element {