Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 09:04:36 +00:00)
Merge commit 'df270ef745' into concedo_experimental
# Conflicts:
#	Makefile
#	common/CMakeLists.txt
#	common/common.h
#	common/sampling.cpp
#	common/sampling.h
#	examples/infill/infill.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/quantize-stats/quantize-stats.cpp
#	examples/server/server.cpp
#	include/llama.h
#	src/llama-sampling.cpp
#	src/llama-sampling.h
#	src/llama.cpp
#	tests/test-grammar-integration.cpp
#	tests/test-grammar-parser.cpp
#	tests/test-json-schema-to-grammar.cpp
#	tests/test-llama-grammar.cpp
#	tests/test-sampling.cpp
commit 12fd16bfd4
86 changed files with 3406 additions and 7795 deletions
.gitignore (vendored, 1 change)
@@ -89,7 +89,6 @@ poetry.toml
ggml-metal-merged.metal

# Test binaries
tests/test-grammar-parser
/tests/test-llama-grammar
tests/test-double-float
tests/test-grad0
CMakeLists.txt
@@ -471,8 +471,6 @@ add_library(common2
|
|||
common/common.h
|
||||
common/sampling.cpp
|
||||
common/sampling.h
|
||||
common/grammar-parser.h
|
||||
common/grammar-parser.cpp
|
||||
examples/llava/llava.cpp
|
||||
examples/llava/llava.h
|
||||
examples/llava/clip.cpp
|
||||
Makefile (8 changes)
@@ -87,9 +87,9 @@ endif
CUBLASLD_FLAGS =
CUBLAS_OBJS =

OBJS_FULL += ggml-alloc.o ggml-aarch64.o ggml-quants.o unicode.o unicode-data.o sgemm.o common.o sampling.o grammar-parser.o
OBJS_SIMPLE += ggml-alloc.o ggml-aarch64.o ggml-quants_noavx2.o unicode.o unicode-data.o sgemm_noavx2.o common.o sampling.o grammar-parser.o
OBJS_FAILSAFE += ggml-alloc.o ggml-aarch64.o ggml-quants_failsafe.o unicode.o unicode-data.o sgemm_failsafe.o common.o sampling.o grammar-parser.o
OBJS_FULL += ggml-alloc.o ggml-aarch64.o ggml-quants.o unicode.o unicode-data.o sgemm.o common.o sampling.o
OBJS_SIMPLE += ggml-alloc.o ggml-aarch64.o ggml-quants_noavx2.o unicode.o unicode-data.o sgemm_noavx2.o common.o sampling.o
OBJS_FAILSAFE += ggml-alloc.o ggml-aarch64.o ggml-quants_failsafe.o unicode.o unicode-data.o sgemm_failsafe.o common.o sampling.o

# OS specific
# TODO: support Windows
@@ -545,8 +545,6 @@ sampling.o: common/sampling.cpp common/common.h common/sampling.h common/log.h
	$(CXX) $(CXXFLAGS) -c $< -o $@
console.o: common/console.cpp common/console.h
	$(CXX) $(CXXFLAGS) -c $< -o $@
grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
	$(CXX) $(CXXFLAGS) -c $< -o $@
expose.o: expose.cpp expose.h
	$(CXX) $(CXXFLAGS) -c $< -o $@
common/common.cpp
@@ -354,16 +354,15 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
|
|||
}
|
||||
|
||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
const std::string arg_prefix = "--";
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
std::string arg = argv[i];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
|
||||
bool invalid_param = false;
|
||||
if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
|
||||
throw std::invalid_argument("error: unknown argument: " + arg);
|
||||
}
|
||||
|
@@ -387,11 +386,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
get_env("HF_TOKEN", params.hf_token);
|
||||
}
|
||||
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
if (params.escape) {
|
||||
string_process_escapes(params.prompt);
|
||||
string_process_escapes(params.input_prefix);
|
||||
string_process_escapes(params.input_suffix);
|
||||
string_process_escapes(sparams.cfg_negative_prompt);
|
||||
for (auto & antiprompt : params.antiprompt) {
|
||||
string_process_escapes(antiprompt);
|
||||
}
|
||||
|
@@ -402,6 +402,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
if (sparams.seed == LLAMA_DEFAULT_SEED) {
|
||||
sparams.seed = time(NULL);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -527,12 +531,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
|
|||
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
|
||||
const char split_delim = ',';
|
||||
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
if (arg == "-s" || arg == "--seed") {
|
||||
CHECK_ARG
|
||||
// TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
|
||||
params.seed = std::stoul(argv[i]);
|
||||
sparams.seed = std::stoul(argv[i]);
|
||||
return true;
|
||||
}
|
||||
|
@@ -843,12 +845,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
if (arg == "--samplers") {
|
||||
CHECK_ARG
|
||||
const auto sampler_names = string_split(argv[i], ';');
|
||||
sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
|
||||
sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--sampling-seq") {
|
||||
CHECK_ARG
|
||||
sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
|
||||
sparams.samplers = gpt_sampler_types_from_chars(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--top-p") {
|
||||
|
@@ -874,7 +876,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
}
|
||||
if (arg == "--typical") {
|
||||
CHECK_ARG
|
||||
sparams.typical_p = std::stof(argv[i]);
|
||||
sparams.typ_p = std::stof(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--repeat-last-n") {
|
||||
|
@@ -923,30 +925,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
sparams.mirostat_tau = std::stof(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cfg-negative-prompt") {
|
||||
CHECK_ARG
|
||||
sparams.cfg_negative_prompt = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cfg-negative-prompt-file") {
|
||||
CHECK_ARG
|
||||
std::ifstream file(argv[i]);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
|
||||
if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
|
||||
sparams.cfg_negative_prompt.pop_back();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "--cfg-scale") {
|
||||
CHECK_ARG
|
||||
sparams.cfg_scale = std::stof(argv[i]);
|
||||
return true;
|
||||
}
|
||||
if (arg == "-b" || arg == "--batch-size") {
|
||||
CHECK_ARG
|
||||
params.n_batch = std::stoi(argv[i]);
|
||||
|
@@ -1356,7 +1334,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
return true;
|
||||
}
|
||||
if (arg == "--ignore-eos") {
|
||||
params.ignore_eos = true;
|
||||
sparams.ignore_eos = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--penalize-nl") {
|
||||
|
@@ -1371,7 +1349,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
std::string value_str;
|
||||
try {
|
||||
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
||||
sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||
sparams.logit_bias.push_back({key, bias});
|
||||
}
|
||||
else {
|
||||
throw std::exception();
|
||||
|
@@ -1726,13 +1705,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||
#endif
|
||||
|
||||
void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
const llama_sampling_params & sparams = params.sparams;
|
||||
const auto & sparams = params.sparams;
|
||||
|
||||
std::string sampler_type_chars;
|
||||
std::string sampler_type_names;
|
||||
for (const auto sampler_type : sparams.samplers_sequence) {
|
||||
sampler_type_chars += static_cast<char>(sampler_type);
|
||||
sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
|
||||
for (const auto & sampler : sparams.samplers) {
|
||||
sampler_type_chars += gpt_sampler_type_to_chr(sampler);
|
||||
sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
|
||||
}
|
||||
sampler_type_names.pop_back();
|
||||
|
||||
|
@@ -1767,7 +1746,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
|
||||
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
||||
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
||||
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
|
||||
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
||||
|
@@ -1847,18 +1825,19 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
|
||||
|
||||
options.push_back({ "sampling" });
|
||||
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", sparams.seed });
|
||||
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
|
||||
"(default: %s)", sampler_type_names.c_str() });
|
||||
options.push_back({ "*", " --sampling-seq SEQUENCE",
|
||||
"simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
|
||||
options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
|
||||
options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
|
||||
options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
|
||||
options.push_back({ "*", " --temp T", "temperature (default: %.1f)", (double)sparams.temp });
|
||||
options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
|
||||
options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
|
||||
options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
|
||||
options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
|
||||
options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
|
||||
options.push_back({ "*", " --top-p P", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
|
||||
options.push_back({ "*", " --min-p P", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
|
||||
options.push_back({ "*", " --tfs P", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
|
||||
options.push_back({ "*", " --typical P", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typ_p });
|
||||
options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
|
||||
options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
|
||||
options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
|
||||
|
@@ -1873,11 +1852,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
|
||||
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
||||
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
|
||||
options.push_back({ "main", " --cfg-negative-prompt PROMPT",
|
||||
"negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
|
||||
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
|
||||
"negative prompt file to use for guidance" });
|
||||
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
|
||||
options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
|
||||
"set custom jinja chat template (default: template taken from model's metadata)\n"
|
||||
"if suffix/prefix are specified, template will be disabled\n"
|
||||
|
@@ -2529,8 +2503,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|||
llama_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||
}
|
||||
|
||||
if (params.ignore_eos) {
|
||||
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
|
||||
fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||
params.sparams.ignore_eos = false;
|
||||
}
|
||||
|
||||
if (params.warmup) {
|
||||
|
@@ -2559,7 +2534,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|||
}
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_synchronize(lctx);
|
||||
llama_reset_timings(lctx);
|
||||
llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
}
|
||||
|
||||
iparams.model = model;
|
||||
|
@@ -2638,7 +2613,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|||
cparams.n_threads = params.cpuparams.n_threads;
|
||||
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
||||
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
||||
cparams.seed = params.seed;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embeddings = params.embedding;
|
||||
cparams.rope_scaling_type = params.rope_scaling_type;
|
||||
|
@@ -3524,7 +3498,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
|||
|
||||
void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
const llama_sampling_params & sparams = params.sparams;
|
||||
const auto & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
||||
|
@@ -3575,8 +3549,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||
|
||||
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
||||
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
||||
yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
||||
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
||||
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||
|
@@ -3587,10 +3559,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||
|
||||
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
|
||||
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||
fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
|
||||
|
||||
yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
||||
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
||||
|
@@ -3601,11 +3570,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
||||
|
||||
fprintf(stream, "logit_bias:\n");
|
||||
for (std::pair<llama_token, float> lb : sparams.logit_bias) {
|
||||
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
||||
continue;
|
||||
}
|
||||
fprintf(stream, " %d: %f", lb.first, lb.second);
|
||||
for (const auto & logit_bias : sparams.logit_bias) {
|
||||
fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
|
||||
}
|
||||
|
||||
fprintf(stream, "lora:\n");
|
||||
|
@@ -3658,7 +3624,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||
|
||||
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
||||
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
||||
fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
|
||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
||||
|
@@ -3672,7 +3637,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
||||
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
||||
}
common/common.h
@@ -144,8 +144,8 @@ struct gpt_params {
|
|||
float dynatemp_range = 0.0f; // enables DynaTemp if greater than 0. dynatemp_min = temperature - dt_range, dynatemp_max = temperature + dt_range
|
||||
float dynatemp_exponent = 1.0f;
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
struct gpt_sampler_params sparams;
|
||||
|
||||
std::string model = ""; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
|
@@ -209,7 +209,6 @@ struct gpt_params {
|
|||
bool flash_attn = false; // flash attention
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool ignore_eos = false; // ignore generated EOS tokens
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
common/grammar-parser.cpp (file removed)
@@ -1,539 +0,0 @@
|
|||
#include "grammar-parser.h"
|
||||
#include <cstdint>
|
||||
#include <cwchar>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <stdexcept>
|
||||
#include <exception>
|
||||
|
||||
namespace grammar_parser {
|
||||
// NOTE: assumes valid utf8 (but checks for overrun)
|
||||
// copied from llama.cpp
|
||||
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t first_byte = static_cast<uint8_t>(*src);
|
||||
uint8_t highbits = first_byte >> 4;
|
||||
int len = lookup[highbits];
|
||||
uint8_t mask = (1 << (8 - len)) - 1;
|
||||
uint32_t value = first_byte & mask;
|
||||
const char * end = src + len; // may overrun!
|
||||
const char * pos = src + 1;
|
||||
for ( ; pos < end && *pos; pos++) {
|
||||
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
||||
}
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
|
||||
return next_id;
|
||||
}
|
||||
|
||||
static void add_rule(
|
||||
parse_state & state,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule) {
|
||||
if (state.rules.size() <= rule_id) {
|
||||
state.rules.resize(rule_id + 1);
|
||||
}
|
||||
state.rules[rule_id] = rule;
|
||||
}
|
||||
|
||||
static bool is_digit_char(char c) {
|
||||
return '0' <= c && c <= '9';
|
||||
}
|
||||
|
||||
static bool is_word_char(char c) {
|
||||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
const char * pos = src;
|
||||
const char * end = src + size;
|
||||
uint32_t value = 0;
|
||||
for ( ; pos < end && *pos; pos++) {
|
||||
value <<= 4;
|
||||
char c = *pos;
|
||||
if ('a' <= c && c <= 'f') {
|
||||
value += c - 'a' + 10;
|
||||
} else if ('A' <= c && c <= 'F') {
|
||||
value += c - 'A' + 10;
|
||||
} else if ('0' <= c && c <= '9') {
|
||||
value += c - '0';
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (pos != end) {
|
||||
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
|
||||
}
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
static const char * parse_space(const char * src, bool newline_ok) {
|
||||
const char * pos = src;
|
||||
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
|
||||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
|
||||
if (*pos == '#') {
|
||||
while (*pos && *pos != '\r' && *pos != '\n') {
|
||||
pos++;
|
||||
}
|
||||
} else {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_name(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_word_char(*pos)) {
|
||||
pos++;
|
||||
}
|
||||
if (pos == src) {
|
||||
throw std::runtime_error(std::string("expecting name at ") + src);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_int(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_digit_char(*pos)) {
|
||||
pos++;
|
||||
}
|
||||
if (pos == src) {
|
||||
throw std::runtime_error(std::string("expecting integer at ") + src);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
if (*src == '\\') {
|
||||
switch (src[1]) {
|
||||
case 'x': return parse_hex(src + 2, 2);
|
||||
case 'u': return parse_hex(src + 2, 4);
|
||||
case 'U': return parse_hex(src + 2, 8);
|
||||
case 't': return std::make_pair('\t', src + 2);
|
||||
case 'r': return std::make_pair('\r', src + 2);
|
||||
case 'n': return std::make_pair('\n', src + 2);
|
||||
case '\\':
|
||||
case '"':
|
||||
case '[':
|
||||
case ']':
|
||||
return std::make_pair(src[1], src + 2);
|
||||
default:
|
||||
throw std::runtime_error(std::string("unknown escape at ") + src);
|
||||
}
|
||||
} else if (*src) {
|
||||
return decode_utf8(src);
|
||||
}
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
|
||||
const char * parse_alternates(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
uint32_t rule_id,
|
||||
bool is_nested);
|
||||
|
||||
static const char * parse_sequence(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
std::vector<llama_grammar_element> & out_elements,
|
||||
bool is_nested) {
|
||||
size_t last_sym_start = out_elements.size();
|
||||
const char * pos = src;
|
||||
|
||||
auto handle_repetitions = [&](int min_times, int max_times) {
|
||||
|
||||
if (last_sym_start == out_elements.size()) {
|
||||
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
|
||||
}
|
||||
|
||||
// apply transformation to previous symbol (last_sym_start to end) according to
|
||||
// the following rewrite rules:
|
||||
// S{m,n} --> S S S (m times) S'(n-m)
|
||||
// S'(x) ::= S S'(x-1) |
|
||||
// (... n-m definitions of these S' rules ...)
|
||||
// S'(1) ::= S |
|
||||
// S{m,} --> S S S (m times) S'
|
||||
// S' ::= S S' |
|
||||
// S* --> S{0,}
|
||||
// --> S' ::= S S' |
|
||||
// S+ --> S{1,}
|
||||
// --> S S'
|
||||
// S' ::= S S' |
|
||||
// S? --> S{0,1}
|
||||
// --> S'
|
||||
// S' ::= S |
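// Worked example (illustrative addition, not part of the original source),
// applying the rewrite rules above to a{2,4} with S = a, m = 2, n = 4:
//   a{2,4} --> a a a'(2)
//   a'(2)  ::= a a'(1) |
//   a'(1)  ::= a |
// i.e. two required copies of "a" followed by up to two optional ones.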
|
||||
|
||||
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
|
||||
if (min_times == 0) {
|
||||
out_elements.resize(last_sym_start);
|
||||
} else {
|
||||
// Repeat the previous elements (min_times - 1) times
|
||||
for (int i = 1; i < min_times; i++) {
|
||||
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t last_rec_rule_id = 0;
|
||||
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
|
||||
|
||||
std::vector<llama_grammar_element> rec_rule(previous_elements);
|
||||
for (int i = 0; i < n_opt; i++) {
|
||||
rec_rule.resize(previous_elements.size());
|
||||
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
|
||||
if (i > 0 || max_times < 0) {
|
||||
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
|
||||
}
|
||||
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||
add_rule(state, rec_rule_id, rec_rule);
|
||||
last_rec_rule_id = rec_rule_id;
|
||||
}
|
||||
if (n_opt > 0) {
|
||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
||||
}
|
||||
};
|
||||
|
||||
while (*pos) {
|
||||
if (*pos == '"') { // literal string
|
||||
pos++;
|
||||
last_sym_start = out_elements.size();
|
||||
while (*pos != '"') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '[') { // char range(s)
|
||||
pos++;
|
||||
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
|
||||
if (*pos == '^') {
|
||||
pos++;
|
||||
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
||||
}
|
||||
last_sym_start = out_elements.size();
|
||||
while (*pos != ']') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
enum llama_gretype type = last_sym_start < out_elements.size()
|
||||
? LLAMA_GRETYPE_CHAR_ALT
|
||||
: start_type;
|
||||
|
||||
out_elements.push_back({type, char_pair.first});
|
||||
if (pos[0] == '-' && pos[1] != ']') {
|
||||
if (!pos[1]) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto endchar_pair = parse_char(pos + 1);
|
||||
pos = endchar_pair.second;
|
||||
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
|
||||
}
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (is_word_char(*pos)) { // rule reference
|
||||
const char * name_end = parse_name(pos);
|
||||
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
|
||||
pos = parse_space(name_end, is_nested);
|
||||
last_sym_start = out_elements.size();
|
||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
||||
} else if (*pos == '(') { // grouping
|
||||
// parse nested alternates into synthesized rule
|
||||
pos = parse_space(pos + 1, true);
|
||||
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
|
||||
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
|
||||
last_sym_start = out_elements.size();
|
||||
// output reference to synthesized rule
|
||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
||||
if (*pos != ')') {
|
||||
throw std::runtime_error(std::string("expecting ')' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '.') { // any char
|
||||
last_sym_start = out_elements.size();
|
||||
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '*') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, -1);
|
||||
} else if (*pos == '+') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(1, -1);
|
||||
} else if (*pos == '?') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, 1);
|
||||
} else if (*pos == '{') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (!is_digit_char(*pos)) {
|
||||
throw std::runtime_error(std::string("expecting an int at ") + pos);
|
||||
}
|
||||
const char * int_end = parse_int(pos);
|
||||
int min_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
|
||||
int max_times = -1;
|
||||
|
||||
if (*pos == '}') {
|
||||
max_times = min_times;
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == ',') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (is_digit_char(*pos)) {
|
||||
const char * int_end = parse_int(pos);
|
||||
max_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
}
|
||||
|
||||
if (*pos != '}') {
|
||||
throw std::runtime_error(std::string("expecting '}' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else {
|
||||
throw std::runtime_error(std::string("expecting ',' at ") + pos);
|
||||
}
|
||||
handle_repetitions(min_times, max_times);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
const char * parse_alternates(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
uint32_t rule_id,
|
||||
bool is_nested) {
|
||||
std::vector<llama_grammar_element> rule;
|
||||
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
|
||||
while (*pos == '|') {
|
||||
rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||
pos = parse_space(pos + 1, true);
|
||||
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
|
||||
}
|
||||
rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||
add_rule(state, rule_id, rule);
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_rule(parse_state & state, const char * src) {
|
||||
const char * name_end = parse_name(src);
|
||||
const char * pos = parse_space(name_end, false);
|
||||
size_t name_len = name_end - src;
|
||||
uint32_t rule_id = get_symbol_id(state, src, name_len);
|
||||
const std::string name(src, name_len);
|
||||
|
||||
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
|
||||
throw std::runtime_error(std::string("expecting ::= at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 3, true);
|
||||
|
||||
pos = parse_alternates(state, pos, name, rule_id, false);
|
||||
|
||||
if (*pos == '\r') {
|
||||
pos += pos[1] == '\n' ? 2 : 1;
|
||||
} else if (*pos == '\n') {
|
||||
pos++;
|
||||
} else if (*pos) {
|
||||
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
|
||||
}
|
||||
return parse_space(pos, true);
|
||||
}
|
||||
|
||||
parse_state parse(const char * src) {
|
||||
try {
|
||||
parse_state state;
|
||||
const char * pos = parse_space(src, true);
|
||||
while (*pos) {
|
||||
pos = parse_rule(state, pos);
|
||||
}
|
||||
// Validate the state to ensure that all rules are defined
|
||||
for (const auto & rule : state.rules) {
|
||||
if (rule.empty()) {
|
||||
throw std::runtime_error("Undefined rule");
|
||||
}
|
||||
for (const auto & elem : rule) {
|
||||
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
|
||||
// Ensure that the rule at that location exists
|
||||
if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
|
||||
// Get the name of the rule that is missing
|
||||
for (const auto & kv : state.symbol_ids) {
|
||||
if (kv.second == elem.value) {
|
||||
throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return state;
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
|
||||
return parse_state();
|
||||
}
|
||||
}
|
||||
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
} else {
|
||||
// cop out of encoding UTF-8
|
||||
fprintf(file, "<U+%04X>", c);
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_char_element(llama_grammar_element elem) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_CHAR: return true;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: return true;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: return true;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: return true;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||
for (auto elem : rule) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
||||
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
|
||||
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
|
||||
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
||||
}
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
case LLAMA_GRETYPE_ALT:
|
||||
case LLAMA_GRETYPE_RULE_REF:
|
||||
fprintf(file, "(%u) ", elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, "(\"");
|
||||
print_grammar_char(file, elem.value);
|
||||
fprintf(file, "\") ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
static void print_rule(
|
||||
FILE * file,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule,
|
||||
const std::map<uint32_t, std::string> & symbol_id_names) {
|
||||
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
|
||||
throw std::runtime_error(
|
||||
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
|
||||
}
|
||||
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
|
||||
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
|
||||
llama_grammar_element elem = rule[i];
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
throw std::runtime_error(
|
||||
"unexpected end of rule: " + std::to_string(rule_id) + "," +
|
||||
std::to_string(i));
|
||||
case LLAMA_GRETYPE_ALT:
|
||||
fprintf(file, "| ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_RULE_REF:
|
||||
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
fprintf(file, "[");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
fprintf(file, "[^");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
if (i == 0 || !is_char_element(rule[i - 1])) {
|
||||
throw std::runtime_error(
|
||||
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
|
||||
std::to_string(rule_id) + "," + std::to_string(i));
|
||||
}
|
||||
fprintf(file, "-");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
if (i == 0 || !is_char_element(rule[i - 1])) {
|
||||
throw std::runtime_error(
|
||||
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
|
||||
std::to_string(rule_id) + "," + std::to_string(i));
|
||||
}
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, ".");
|
||||
break;
|
||||
}
|
||||
if (is_char_element(elem)) {
|
||||
switch (rule[i + 1].type) {
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
break;
|
||||
default:
|
||||
fprintf(file, "] ");
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
void print_grammar(FILE * file, const parse_state & state) {
|
||||
try {
|
||||
std::map<uint32_t, std::string> symbol_id_names;
|
||||
for (const auto & kv : state.symbol_ids) {
|
||||
symbol_id_names[kv.second] = kv.first;
|
||||
}
|
||||
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
||||
// fprintf(file, "%zu: ", i);
|
||||
// print_rule_binary(file, state.rules[i]);
|
||||
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
|
||||
// fprintf(file, "\n");
|
||||
}
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> parse_state::c_rules() {
|
||||
std::vector<const llama_grammar_element *> ret;
|
||||
ret.reserve(rules.size());
|
||||
for (const auto & rule : rules) {
|
||||
ret.push_back(rule.data());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
common/grammar-parser.h (file removed)
@@ -1,29 +0,0 @@
|
|||
// Implements a parser for an extended Backus-Naur form (BNF), producing the
|
||||
// binary context-free grammar format specified by llama.h. Supports character
|
||||
// ranges, grouping, and repetition operators. As an example, a grammar for
|
||||
// arithmetic might look like:
|
||||
//
|
||||
// root ::= expr
|
||||
// expr ::= term ([-+*/] term)*
|
||||
// term ::= num | "(" space expr ")" space
|
||||
// num ::= [0-9]+ space
|
||||
// space ::= [ \t\n]*
|
||||
|
||||
#pragma once
|
||||
#include "llama.h"
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
namespace grammar_parser {
|
||||
struct parse_state {
|
||||
std::map<std::string, uint32_t> symbol_ids;
|
||||
std::vector<std::vector<llama_grammar_element>> rules;
|
||||
|
||||
std::vector<const llama_grammar_element *> c_rules();
|
||||
};
|
||||
|
||||
parse_state parse(const char * src);
|
||||
void print_grammar(FILE * file, const parse_state & state);
|
||||
}
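// Illustrative sketch (added for clarity, not part of this commit): how the
// grammar_parser API declared above was typically used before its removal.
// Assumes llama.h and the old common/grammar-parser.h are on the include path;
// the GBNF string and function name are made-up examples.
#include "grammar-parser.h"
#include <cstdio>
#include <vector>

static void demo_grammar_parser() {
    const char * gbnf = "root ::= [0-9]+";            // trivial digit grammar
    grammar_parser::parse_state state = grammar_parser::parse(gbnf);
    if (state.rules.empty()) {
        return;                                       // parse errors leave the state empty
    }
    grammar_parser::print_grammar(stdout, state);     // dump the parsed rules
    std::vector<const llama_grammar_element *> rules = state.c_rules();
    // rules.data(), rules.size() and state.symbol_ids.at("root") are what the
    // old common/sampling.cpp (below) passed to llama_grammar_init().
    (void) rules;
}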
common/sampling.cpp
@@ -1,395 +1,116 @@
|
|||
#define LLAMA_API_INTERNAL
|
||||
#include "sampling.h"
|
||||
#include <random>
|
||||
|
||||
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
|
||||
struct llama_sampling_context * result = new llama_sampling_context();
|
||||
#include "common.h"
|
||||
|
||||
result->params = params;
|
||||
result->grammar = nullptr;
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
template<typename T>
|
||||
struct ring_buffer {
|
||||
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
||||
|
||||
// if there is a grammar, parse it
|
||||
if (!params.grammar.empty()) {
|
||||
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
|
||||
// will be empty (default) if there are parse errors
|
||||
if (result->parsed_grammar.rules.empty()) {
|
||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||
delete result;
|
||||
return nullptr;
|
||||
T & front() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[first];
|
||||
}
|
||||
|
||||
// Ensure that there is a "root" node.
|
||||
if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
|
||||
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
|
||||
delete result;
|
||||
return nullptr;
|
||||
const T & front() const {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[first];
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
||||
|
||||
struct llama_grammar * grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||
if (grammar == nullptr) {
|
||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||
T & back() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
result->grammar = grammar;
|
||||
return data[pos];
|
||||
}
|
||||
|
||||
result->prev.resize(params.n_prev);
|
||||
const T & back() const {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[pos];
|
||||
}
|
||||
|
||||
result->n_valid = 0;
|
||||
void push_back(const T & value) {
|
||||
if (sz == capacity) {
|
||||
// advance the start when buffer is full
|
||||
first = (first + 1) % capacity;
|
||||
} else {
|
||||
sz++;
|
||||
}
|
||||
data[pos] = value;
|
||||
pos = (pos + 1) % capacity;
|
||||
}
|
||||
|
||||
llama_sampling_set_rng_seed(result, params.seed);
|
||||
T pop_front() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
T value = data[first];
|
||||
first = (first + 1) % capacity;
|
||||
sz--;
|
||||
return value;
|
||||
}
|
||||
|
||||
const T & rat(size_t i) const {
|
||||
if (i >= sz) {
|
||||
throw std::runtime_error("ring buffer: index out of bounds");
|
||||
}
|
||||
return data[(first + sz - i - 1) % capacity];
|
||||
}
|
||||
|
||||
std::vector<T> to_vector() const {
|
||||
std::vector<T> result;
|
||||
result.reserve(sz);
|
||||
for (size_t i = 0; i < sz; i++) {
|
||||
result.push_back(data[(first + i) % capacity]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void llama_sampling_free(struct llama_sampling_context * ctx) {
|
||||
if (ctx->grammar != NULL) {
|
||||
llama_grammar_free(ctx->grammar);
|
||||
void clear() {
|
||||
// here only reset the status of the buffer
|
||||
sz = 0;
|
||||
first = 0;
|
||||
pos = 0;
|
||||
}
|
||||
|
||||
delete ctx;
|
||||
bool empty() const {
|
||||
return sz == 0;
|
||||
}
|
||||
|
||||
void llama_sampling_reset(llama_sampling_context * ctx) {
|
||||
if (ctx->grammar != NULL) {
|
||||
llama_grammar_free(ctx->grammar);
|
||||
ctx->grammar = NULL;
|
||||
size_t size() const {
|
||||
return sz;
|
||||
}
|
||||
|
||||
if (!ctx->parsed_grammar.rules.empty()) {
|
||||
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
||||
|
||||
struct llama_grammar * grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
||||
if (grammar == nullptr) {
|
||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||
}
|
||||
ctx->grammar = grammar;
|
||||
}
|
||||
|
||||
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||
ctx->cur.clear();
|
||||
ctx->n_valid = 0;
|
||||
}
|
||||
|
||||
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
|
||||
if (seed == LLAMA_DEFAULT_SEED) {
|
||||
seed = std::random_device{}();
|
||||
}
|
||||
ctx->rng.seed(seed);
|
||||
}
|
||||
|
||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
|
||||
if (dst->grammar) {
|
||||
llama_grammar_free(dst->grammar);
|
||||
dst->grammar = nullptr;
|
||||
}
|
||||
|
||||
if (src->grammar) {
|
||||
dst->grammar = llama_grammar_copy(src->grammar);
|
||||
}
|
||||
|
||||
dst->prev = src->prev;
|
||||
}
|
||||
|
||||
llama_token llama_sampling_last(llama_sampling_context * ctx) {
|
||||
return ctx->prev.back();
|
||||
}
|
||||
|
||||
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
|
||||
const int size = ctx_sampling->prev.size();
|
||||
|
||||
n = std::min(n, size);
|
||||
|
||||
std::string result;
|
||||
|
||||
for (int i = size - n; i < size; i++) {
|
||||
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_sampling_print(const llama_sampling_params & params) {
|
||||
char result[1024];
|
||||
|
||||
snprintf(result, sizeof(result),
|
||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
|
||||
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
|
||||
params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
||||
|
||||
return std::string(result);
|
||||
}
|
||||
|
||||
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||
std::string result = "CFG -> Penalties ";
|
||||
if (params.mirostat == 0) {
|
||||
for (auto sampler_type : params.samplers_sequence) {
|
||||
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
|
||||
if (!sampler_type_name.empty()) {
|
||||
result += "-> " + sampler_type_name + " ";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result += "-> mirostat ";
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
|
||||
switch (sampler_type) {
|
||||
case llama_sampler_type::TOP_K: return "top_k";
|
||||
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||
case llama_sampler_type::TOP_P: return "top_p";
|
||||
case llama_sampler_type::MIN_P: return "min_p";
|
||||
case llama_sampler_type::TEMPERATURE: return "temperature";
|
||||
default : return "";
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
|
||||
{"top_k", llama_sampler_type::TOP_K},
|
||||
{"top_p", llama_sampler_type::TOP_P},
|
||||
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||
{"min_p", llama_sampler_type::MIN_P},
|
||||
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||
{"temperature", llama_sampler_type::TEMPERATURE}
|
||||
size_t capacity = 0;
|
||||
size_t sz = 0;
|
||||
size_t first = 0;
|
||||
size_t pos = 0;
|
||||
std::vector<T> data;
|
||||
};
|
||||
|
||||
// since samplers names are written multiple ways
|
||||
// make it ready for both system names and input names
|
||||
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
|
||||
{"top-k", llama_sampler_type::TOP_K},
|
||||
{"top-p", llama_sampler_type::TOP_P},
|
||||
{"nucleus", llama_sampler_type::TOP_P},
|
||||
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||
{"typical", llama_sampler_type::TYPICAL_P},
|
||||
{"min-p", llama_sampler_type::MIN_P},
|
||||
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||
{"tfs", llama_sampler_type::TFS_Z},
|
||||
{"temp", llama_sampler_type::TEMPERATURE}
|
||||
};
|
||||
struct gpt_sampler {
|
||||
gpt_sampler_params params;
|
||||
|
||||
std::vector<llama_sampler_type> sampler_types;
|
||||
sampler_types.reserve(names.size());
|
||||
for (const auto & name : names)
|
||||
{
|
||||
auto sampler_item = sampler_canonical_name_map.find(name);
|
||||
if (sampler_item != sampler_canonical_name_map.end())
|
||||
{
|
||||
sampler_types.push_back(sampler_item->second);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (allow_alt_names)
|
||||
{
|
||||
sampler_item = sampler_alt_name_map.find(name);
|
||||
if (sampler_item != sampler_alt_name_map.end())
|
||||
{
|
||||
sampler_types.push_back(sampler_item->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return sampler_types;
|
||||
}
|
||||
struct llama_sampler * grmr;
|
||||
struct llama_sampler * chain;
|
||||
|
||||
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
|
||||
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||
{'k', llama_sampler_type::TOP_K},
|
||||
{'p', llama_sampler_type::TOP_P},
|
||||
{'y', llama_sampler_type::TYPICAL_P},
|
||||
{'m', llama_sampler_type::MIN_P},
|
||||
{'f', llama_sampler_type::TFS_Z},
|
||||
{'t', llama_sampler_type::TEMPERATURE}
|
||||
};
|
||||
ring_buffer<llama_token> prev;
|
||||
|
||||
std::vector<llama_sampler_type> sampler_types;
|
||||
sampler_types.reserve(names_string.size());
|
||||
for (const auto & c : names_string) {
|
||||
const auto sampler_item = sampler_name_map.find(c);
|
||||
if (sampler_item != sampler_name_map.end()) {
|
||||
sampler_types.push_back(sampler_item->second);
|
||||
}
|
||||
}
|
||||
return sampler_types;
|
||||
}
|
||||
std::vector<llama_token_data> cur;
|
||||
|
||||
// no reasons to expose this function in header
|
||||
static void sampler_queue(
|
||||
struct llama_context * ctx_main,
|
||||
const llama_sampling_params & params,
|
||||
llama_token_data_array & cur_p,
|
||||
size_t min_keep) {
|
||||
const float temp = params.temp;
|
||||
const float dynatemp_range = params.dynatemp_range;
|
||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||
const int32_t top_k = params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float min_p = params.min_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
||||
llama_token_data_array cur_p;
|
||||
|
||||
for (auto sampler_type : samplers_sequence) {
|
||||
switch (sampler_type) {
|
||||
case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
||||
case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
||||
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||
case llama_sampler_type::TEMPERATURE:
|
||||
if (dynatemp_range > 0) {
|
||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent, 0);
|
||||
} else {
|
||||
llama_sample_temp(ctx_main, &cur_p, temp, 0);
|
||||
}
|
||||
break;
|
||||
default : break;
|
||||
}
|
||||
}
|
||||
}
|
||||
void set_logits(struct llama_context * ctx, int idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
static llama_token llama_sampling_sample_impl(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
const int idx,
|
||||
bool is_resampling) {
|
||||
const llama_sampling_params & params = ctx_sampling->params;
|
||||
|
||||
const float temp = params.temp;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
|
||||
std::vector<float> original_logits;
|
||||
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
|
||||
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||
GGML_ASSERT(!original_logits.empty());
|
||||
}
|
||||
llama_token id = 0;
|
||||
|
||||
if (temp < 0.0) {
|
||||
// greedy sampling, with probs
|
||||
llama_sample_softmax(ctx_main, &cur_p);
|
||||
id = cur_p.data[0].id;
|
||||
} else if (temp == 0.0) {
|
||||
// greedy sampling, no probs
|
||||
id = llama_sample_token_greedy(ctx_main, &cur_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temp(ctx_main, &cur_p, temp, 0);
|
||||
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
llama_sample_temp(ctx_main, &cur_p, temp, 0);
|
||||
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
|
||||
} else {
|
||||
// temperature sampling
|
||||
size_t min_keep = std::max(1, params.min_keep);
|
||||
|
||||
sampler_queue(ctx_main, params, cur_p, min_keep);
|
||||
|
||||
id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
|
||||
|
||||
//{
|
||||
// const int n_top = 10;
|
||||
// LOG("top %d candidates:\n", n_top);
|
||||
|
||||
// for (int i = 0; i < n_top; i++) {
|
||||
// const llama_token id = cur_p.data[i].id;
|
||||
// (void)id; // To avoid a warning that id is unused when logging is disabled.
|
||||
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
|
||||
// }
|
||||
//}
|
||||
|
||||
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||
// Get a pointer to the logits
|
||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||
|
||||
// Create an array with a single token data element for the sampled id
|
||||
llama_token_data single_token_data = {id, logits[id], 0.0f};
|
||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
|
||||
|
||||
// Apply grammar constraints to the single token
|
||||
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
|
||||
|
||||
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
|
||||
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||
|
||||
// If the token is not valid according to the grammar, perform resampling
|
||||
if (!is_valid) {
|
||||
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
|
||||
// Restore logits from the copy
|
||||
std::copy(original_logits.begin(), original_logits.end(), logits);
|
||||
|
||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
|
||||
}
|
||||
}
|
||||
|
||||
ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
static llama_token_data_array llama_sampling_prepare_impl(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
const int idx,
|
||||
bool apply_grammar,
|
||||
std::vector<float> * original_logits) {
|
||||
const llama_sampling_params & params = ctx_sampling->params;
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
|
||||
const float penalty_repeat = params.penalty_repeat;
|
||||
const float penalty_freq = params.penalty_freq;
|
||||
const float penalty_present = params.penalty_present;
|
||||
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
auto & prev = ctx_sampling->prev;
|
||||
auto & cur = ctx_sampling->cur;
|
||||
|
||||
// Get a pointer to the logits
|
||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||
|
||||
if (ctx_sampling->grammar != NULL && !apply_grammar) {
|
||||
GGML_ASSERT(original_logits != NULL);
|
||||
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
|
||||
*original_logits = {logits, logits + n_vocab};
|
||||
}
|
||||
|
||||
// apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
if (ctx_cfg) {
|
||||
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
|
||||
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
|
||||
}
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
|
||||
cur.resize(n_vocab);
|
||||
|
||||
|
@ -397,64 +118,326 @@ static llama_token_data_array llama_sampling_prepare_impl(
|
|||
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
||||
cur_p = { cur.data(), cur.size(), -1, false };
|
||||
}
|
||||
};
|
||||
|
||||
// apply penalties
|
||||
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
|
||||
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
|
||||
if (penalty_tokens_used_size) {
|
||||
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
||||
std::string gpt_sampler_params::print() const {
|
||||
char result[1024];
|
||||
|
||||
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
||||
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
|
||||
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
|
||||
snprintf(result, sizeof(result),
|
||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
||||
top_k, tfs_z, top_p, min_p, typ_p, temp,
|
||||
mirostat, mirostat_eta, mirostat_tau);
|
||||
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
|
||||
cur_p.data[idx].logit = nl_logit;
|
||||
return std::string(result);
|
||||
}
|
||||
|
||||
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
|
||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||
|
||||
lparams.no_perf = false; // TODO: control via params
|
||||
|
||||
auto * result = new gpt_sampler {
|
||||
/* .params = */ params,
|
||||
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
||||
/* .chain = */ llama_sampler_chain_init(lparams),
|
||||
/* .prev = */ ring_buffer<llama_token>(params.n_prev),
|
||||
/* .cur = */ {},
|
||||
/* .cur_p = */ {},
|
||||
};
|
||||
|
||||
llama_sampler_chain_add(result->chain,
|
||||
llama_sampler_init_logit_bias(
|
||||
llama_n_vocab(model),
|
||||
params.logit_bias.size(),
|
||||
params.logit_bias.data()));
|
||||
|
||||
llama_sampler_chain_add(result->chain,
|
||||
llama_sampler_init_penalties(
|
||||
llama_n_vocab (model),
|
||||
llama_token_eos(model),
|
||||
llama_token_nl (model),
|
||||
params.penalty_last_n,
|
||||
params.penalty_repeat,
|
||||
params.penalty_freq,
|
||||
params.penalty_present,
|
||||
params.penalize_nl,
|
||||
params.ignore_eos));
|
||||
|
||||
if (params.temp > 0.0f) {
|
||||
if (params.mirostat == 0) {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case GPT_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case GPT_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case GPT_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case GPT_SAMPLER_TYPE_TFS_Z:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||
break;
|
||||
case GPT_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case GPT_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
} else if (params.mirostat == 1) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||
} else if (params.mirostat == 2) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
||||
} else {
|
||||
GGML_ASSERT(false && "unknown mirostat version");
|
||||
}
|
||||
} else {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void gpt_sampler_free(struct gpt_sampler * gsmpl) {
|
||||
if (gsmpl) {
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
}
|
||||
}
|
||||
|
||||
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
|
||||
llama_sampler_accept(gsmpl->chain, token);
|
||||
|
||||
gsmpl->prev.push_back(token);
|
||||
}
|
||||
|
||||
void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
|
||||
llama_sampler_reset(gsmpl->grmr);
|
||||
|
||||
llama_sampler_reset(gsmpl->chain);
|
||||
}
|
||||
|
||||
struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
|
||||
return new gpt_sampler {
|
||||
/* .params = */ gsmpl->params,
|
||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||
/* .prev = */ gsmpl->prev,
|
||||
/* .cur = */ gsmpl->cur,
|
||||
/* .cur_p = */ gsmpl->cur_p,
|
||||
};
|
||||
}
|
||||
|
||||
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
|
||||
// TODO: measure grammar performance
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
}
|
||||
if (ctx) {
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
}
|
||||
}
|
||||
|
||||
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
auto & chain = gsmpl->chain;
|
||||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
||||
|
||||
if (grammar_first) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
||||
|
||||
const llama_token id = cur_p.data[cur_p.selected].id;
|
||||
|
||||
if (grammar_first) {
|
||||
return id;
|
||||
}
|
||||
|
||||
// check if the sampled token fits the grammar
|
||||
{
|
||||
llama_token_data single_token_data = { id, 1.0f, 0.0f };
|
||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
||||
|
||||
llama_sampler_apply(grmr, &single_token_data_array);
|
||||
|
||||
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||
if (is_valid) {
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
||||
// resampling:
|
||||
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
|
||||
|
||||
return cur_p.data[cur_p.selected].id;
|
||||
}
|
||||
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
|
||||
return &gsmpl->cur_p;
|
||||
}
|
||||
|
||||
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
|
||||
return gsmpl->prev.rat(0);
|
||||
}
|
||||
|
||||
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
|
||||
std::string result = "\tlogits ";
|
||||
|
||||
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
||||
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
|
||||
n = std::min(n, (int) gsmpl->prev.size());
|
||||
|
||||
if (n <= 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string result;
|
||||
result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
|
||||
|
||||
for (int i = n - 1; i >= 0; i--) {
|
||||
const llama_token id = gsmpl->prev.rat(i);
|
||||
|
||||
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
|
||||
|
||||
result += llama_token_to_piece(ctx_main, id);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
|
||||
switch (cnstr) {
|
||||
case GPT_SAMPLER_TYPE_TOP_K: return 'k';
|
||||
case GPT_SAMPLER_TYPE_TFS_Z: return 'f';
|
||||
case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||
case GPT_SAMPLER_TYPE_TOP_P: return 'p';
|
||||
case GPT_SAMPLER_TYPE_MIN_P: return 'm';
|
||||
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||
default : return '?';
|
||||
}
|
||||
}
|
||||
|
||||
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
|
||||
switch (cnstr) {
|
||||
case GPT_SAMPLER_TYPE_TOP_K: return "top_k";
|
||||
case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z";
|
||||
case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
||||
case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||
case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||
default : return "";
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||
std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
|
||||
{ "top_k", GPT_SAMPLER_TYPE_TOP_K },
|
||||
{ "top_p", GPT_SAMPLER_TYPE_TOP_P },
|
||||
{ "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min_p", GPT_SAMPLER_TYPE_MIN_P },
|
||||
{ "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
|
||||
{ "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
|
||||
};
|
||||
|
||||
// since samplers names are written multiple ways
|
||||
// make it ready for both system names and input names
|
||||
std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
|
||||
{ "top-k", GPT_SAMPLER_TYPE_TOP_K },
|
||||
{ "top-p", GPT_SAMPLER_TYPE_TOP_P },
|
||||
{ "nucleus", GPT_SAMPLER_TYPE_TOP_P },
|
||||
{ "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typical", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typ", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min-p", GPT_SAMPLER_TYPE_MIN_P },
|
||||
{ "tfs-z", GPT_SAMPLER_TYPE_TFS_Z },
|
||||
{ "tfs", GPT_SAMPLER_TYPE_TFS_Z },
|
||||
{ "temp", GPT_SAMPLER_TYPE_TEMPERATURE },
|
||||
};
|
||||
|
||||
std::vector<gpt_sampler_type> samplers;
|
||||
samplers.reserve(names.size());
|
||||
|
||||
for (const auto & name : names) {
|
||||
auto sampler = sampler_canonical_name_map.find(name);
|
||||
if (sampler != sampler_canonical_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// apply grammar checks before sampling logic
|
||||
if (apply_grammar && ctx_sampling->grammar != NULL) {
|
||||
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
|
||||
return samplers;
|
||||
}
|
||||
|
||||
return cur_p;
|
||||
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
|
||||
std::unordered_map<char, gpt_sampler_type> sampler_name_map {
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
|
||||
};
|
||||
|
||||
std::vector<gpt_sampler_type> samplers;
|
||||
samplers.reserve(chars.size());
|
||||
|
||||
for (const auto & c : chars) {
|
||||
const auto sampler = sampler_name_map.find(c);
|
||||
if (sampler != sampler_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
}
|
||||
}
|
||||
|
||||
llama_token llama_sampling_sample(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
const int idx) {
|
||||
// Call the implementation function with is_resampling set to false by default
|
||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
|
||||
}
|
||||
|
||||
llama_token_data_array llama_sampling_prepare(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
const int idx,
|
||||
bool apply_grammar,
|
||||
std::vector<float> * original_logits) {
|
||||
return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
|
||||
}
|
||||
|
||||
void llama_sampling_accept(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
llama_token id,
|
||||
bool apply_grammar) {
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(id);
|
||||
|
||||
if (ctx_sampling->grammar != NULL && apply_grammar) {
|
||||
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
|
||||
}
|
||||
return samplers;
|
||||
}
|
||||
|
|
|
@ -2,25 +2,23 @@
|
|||
|
||||
#include "llama.h"
|
||||
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
// sampler types
|
||||
enum class llama_sampler_type : char {
|
||||
TOP_K = 'k',
|
||||
TOP_P = 'p',
|
||||
MIN_P = 'm',
|
||||
TFS_Z = 'f',
|
||||
TYPICAL_P = 'y',
|
||||
TEMPERATURE = 't'
|
||||
enum gpt_sampler_type {
|
||||
GPT_SAMPLER_TYPE_NONE = 0,
|
||||
GPT_SAMPLER_TYPE_TOP_K = 1,
|
||||
GPT_SAMPLER_TYPE_TOP_P = 2,
|
||||
GPT_SAMPLER_TYPE_MIN_P = 3,
|
||||
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||
};
|
||||
|
||||
// sampling parameters
|
||||
typedef struct llama_sampling_params {
|
||||
struct gpt_sampler_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
|
@ -28,9 +26,8 @@ typedef struct llama_sampling_params {
|
|||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float smoothing_factor = 0.00f; // 0.00 = disabled
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
|
@ -41,121 +38,94 @@ typedef struct llama_sampling_params {
|
|||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
|
||||
bool ignore_eos = false;
|
||||
|
||||
std::vector<llama_sampler_type> samplers_sequence = {
|
||||
llama_sampler_type::TOP_K,
|
||||
llama_sampler_type::TFS_Z,
|
||||
llama_sampler_type::TYPICAL_P,
|
||||
llama_sampler_type::TOP_P,
|
||||
llama_sampler_type::MIN_P,
|
||||
llama_sampler_type::TEMPERATURE
|
||||
std::vector<enum gpt_sampler_type> samplers = {
|
||||
GPT_SAMPLER_TYPE_TOP_K,
|
||||
GPT_SAMPLER_TYPE_TFS_Z,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P,
|
||||
GPT_SAMPLER_TYPE_TOP_P,
|
||||
GPT_SAMPLER_TYPE_MIN_P,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE
|
||||
};
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
// Classifier-Free Guidance
|
||||
// https://arxiv.org/abs/2306.17806
|
||||
std::string cfg_negative_prompt; // string to help guidance
|
||||
float cfg_scale = 1.f; // how strong is guidance
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
|
||||
std::vector<llama_token> penalty_prompt_tokens;
|
||||
bool use_penalty_prompt_tokens = false;
|
||||
} llama_sampling_params;
|
||||
|
||||
// general sampler context
|
||||
// TODO: move to llama.h
|
||||
struct llama_sampling_context {
|
||||
// parameters that will be used for sampling
|
||||
llama_sampling_params params;
|
||||
|
||||
// mirostat sampler state
|
||||
float mirostat_mu;
|
||||
|
||||
llama_grammar * grammar;
|
||||
|
||||
// internal
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> prev;
|
||||
std::vector<llama_token_data> cur;
|
||||
size_t n_valid; // Number of correct top tokens with correct probabilities.
|
||||
|
||||
std::mt19937 rng;
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Create a new sampling context instance.
|
||||
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
|
||||
|
||||
void llama_sampling_free(struct llama_sampling_context * ctx);
|
||||
|
||||
// Reset the sampler context
|
||||
// - clear prev tokens
|
||||
// - reset grammar
|
||||
void llama_sampling_reset(llama_sampling_context * ctx);
|
||||
|
||||
// Set the sampler seed
|
||||
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
|
||||
|
||||
// Copy the sampler context
|
||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
|
||||
|
||||
// Get the last sampled token
|
||||
llama_token llama_sampling_last(llama_sampling_context * ctx);
|
||||
|
||||
// Get a string representation of the last sampled tokens
|
||||
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
|
||||
|
||||
// Print sampling parameters into a string
|
||||
std::string llama_sampling_print(const llama_sampling_params & params);
|
||||
|
||||
// Print sampling order into a string
|
||||
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
||||
|
||||
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
|
||||
|
||||
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
|
||||
|
||||
// this is a common sampling function used across the examples for convenience
|
||||
// it can serve as a starting point for implementing your own sampling function
|
||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||
// llama_sampling_reset when a sequence ends
|
||||
// gpt_sampler extends llama_sampler with additional functionality:
|
||||
//
|
||||
// required:
|
||||
// - ctx_main: context to use for sampling
|
||||
// - ctx_sampling: sampling-specific context
|
||||
// - grammar support
|
||||
// - custom sampler logic based on the parameters
|
||||
// - history of the last accepted tokens
|
||||
// - performance metrics
|
||||
//
|
||||
// optional:
|
||||
// - ctx_cfg: context to use for classifier-free guidance
|
||||
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
||||
// The goal is to have a common implementation of the sampling logic shared across the examples.
|
||||
// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
|
||||
// complex (top-k, top-p, etc).
|
||||
//
|
||||
// returns:
|
||||
// - token: sampled token
|
||||
// - candidates: vector of candidate tokens
|
||||
// Another example is related to the grammar. In general, the grammar constraints applied on the full
|
||||
// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
|
||||
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
|
||||
// grammar constraints are applied to the full vocabulary and the token is resampled.
|
||||
//
|
||||
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
|
||||
// be moved into the core llama library.
|
||||
//
|
||||
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
|
||||
// This can be used to access the probabilities of the rest of the non-sampled tokens.
|
||||
//
|
||||
// TODO: measure grammar performance
|
||||
//
|
||||
llama_token llama_sampling_sample(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
int idx = -1);
|
||||
|
||||
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
|
||||
llama_token_data_array llama_sampling_prepare(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
int idx = 0,
|
||||
bool apply_grammar = true,
|
||||
std::vector<float> * original_logits = nullptr);
|
||||
struct gpt_sampler;
|
||||
|
||||
void llama_sampling_accept(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
llama_token id,
|
||||
bool apply_grammar);
|
||||
// llama_sampler API overloads
|
||||
|
||||
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
|
||||
|
||||
void gpt_sampler_free(struct gpt_sampler * gsmpl);
|
||||
|
||||
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
|
||||
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
|
||||
void gpt_sampler_reset (struct gpt_sampler * gsmpl);
|
||||
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
|
||||
|
||||
// arguments can be nullptr to skip printing
|
||||
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
|
||||
|
||||
// extended sampling implementation:
|
||||
//
|
||||
// - set logits
|
||||
// - apply the configured sampler chain
|
||||
// - check if the token fits the grammar (if any)
|
||||
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
|
||||
//
|
||||
// if grammar_first is true, the grammar is applied before the samplers (slower)
|
||||
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
|
||||
//
|
||||
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
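
Taken together, the declarations above suggest a simple generation loop. The sketch below is illustrative only: it assumes the model and context are already set up, that the prompt has already been decoded so logits are available, and that the 4-argument `llama_batch_get_one` helper of this revision is used to feed tokens back; it is not code from this commit.

```cpp
#include "sampling.h"   // pulls in common.h and llama.h

// minimal sketch: generate up to n_predict tokens with a gpt_sampler
static void generate_sketch(llama_model * model, llama_context * ctx, int n_past, int n_predict) {
    gpt_sampler_params sparams;   // library defaults (top_p 0.95, temp 0.80, ...)
    sparams.temp = 0.7f;          // override whatever is needed

    gpt_sampler * smpl = gpt_sampler_init(model, sparams);

    for (int i = 0; i < n_predict; ++i) {
        // sample from the logits of the last decoded position
        llama_token id = gpt_sampler_sample(smpl, ctx, /* idx */ -1);

        if (llama_token_is_eog(model, id)) {
            break;
        }

        // record the token in the sampler history and the grammar (if any)
        gpt_sampler_accept(smpl, id, /* accept_grammar */ true);

        // decode the token so the next iteration sees fresh logits
        // (assumes the 4-argument llama_batch_get_one of this revision)
        llama_decode(ctx, llama_batch_get_one(&id, 1, n_past++, 0));
    }

    gpt_sampler_free(smpl);
}
```
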
|
||||
|
||||
// helpers
|
||||
|
||||
// access the internal list of current candidate tokens
|
||||
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
|
||||
|
||||
// get the last accepted token
|
||||
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
|
||||
|
||||
// print the sampler chain into a string
|
||||
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
|
||||
|
||||
// get a string representation of the last accepted tokens
|
||||
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
|
||||
|
||||
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
|
||||
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
|
||||
|
||||
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
|
||||
|
|
|
@ -1,60 +0,0 @@
|
|||
# llama.cpp/example/batched-bench
|
||||
|
||||
Benchmark the batched decoding performance of `llama.cpp`
|
||||
|
||||
## Usage
|
||||
|
||||
There are 2 modes of operation:
|
||||
|
||||
- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
|
||||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||
|
||||
```bash
|
||||
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
||||
|
||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
||||
|
||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
||||
|
||||
# custom set of batches
|
||||
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
||||
```
|
||||
|
||||
## Sample results
|
||||
|
||||
- `PP` - prompt tokens per batch
|
||||
- `TG` - generated tokens per batch
|
||||
- `B` - number of batches
|
||||
- `N_KV` - required KV cache size
|
||||
- `T_PP` - prompt processing time (i.e. time to first token)
|
||||
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
|
||||
- `T_TG` - time to generate all batches
|
||||
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
|
||||
- `T` - total time
|
||||
- `S` - total speed (i.e. all tokens / total time)
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
|
||||
| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
|
||||
| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
|
||||
| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
|
||||
| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
|
||||
| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
|
||||
| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
|
||||
| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
|
||||
| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
|
||||
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
|
||||
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
|
||||
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
|
||||
|
||||
### JSONL output
|
||||
|
||||
Pass `--output-format jsonl` to output JSONL instead of Markdown, à la
|
||||
|
||||
```json lines
|
||||
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
|
||||
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
|
||||
```
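
For reference, the derived columns follow directly from the definitions listed earlier. The small stand-alone sketch below redoes the arithmetic, using the PP=128, TG=128, B=4 row as illustrative input (the variable names are mine, not the tool's):

```cpp
// batched-bench column arithmetic (illustrative values from the PP=128, TG=128, B=4 row)
#include <cstdio>

int main() {
    const int    pp = 128, tg = 128, b = 4;   // prompt tokens, generated tokens, batches
    const bool   pp_shared = false;           // true when run with -pps
    const double t_pp = 0.373, t_tg = 6.878;  // measured times in seconds

    const int    n_kv = pp_shared ? pp + b * tg : b * (pp + tg);  // required KV cache size
    const double s_pp = (pp_shared ? pp : b * pp) / t_pp;         // prompt processing speed
    const double s_tg = (b * tg) / t_tg;                          // text generation speed
    const double t    = t_pp + t_tg;                              // total time
    const double s    = n_kv / t;                                 // total speed: all tokens / total time

    printf("N_KV=%d  S_PP=%.2f t/s  S_TG=%.2f t/s  T=%.3f s  S=%.2f t/s\n", n_kv, s_pp, s_tg, t, s);
    return 0;
}
```
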
|
|
@ -210,7 +210,8 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
This is a swift clone of `examples/batched`.
|
||||
|
||||
$ `make`
|
||||
$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
|
|
@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
|
|||
print("Failed to load model")
|
||||
exit(1)
|
||||
}
|
||||
|
||||
defer {
|
||||
llama_free_model(model)
|
||||
}
|
||||
|
@ -37,7 +36,6 @@ var tokens = tokenize(text: prompt, add_bos: true)
|
|||
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
|
||||
|
||||
var context_params = llama_context_default_params()
|
||||
context_params.seed = 1234
|
||||
context_params.n_ctx = n_kv_req
|
||||
context_params.n_batch = UInt32(max(n_len, n_parallel))
|
||||
context_params.n_threads = 8
|
||||
|
@ -48,11 +46,26 @@ guard context != nil else {
|
|||
print("Failed to initialize context")
|
||||
exit(1)
|
||||
}
|
||||
|
||||
defer {
|
||||
llama_free(context)
|
||||
}
|
||||
|
||||
var sparams = llama_sampler_chain_default_params()
|
||||
|
||||
let smpl = llama_sampler_chain_init(sparams)
|
||||
guard smpl != nil else {
|
||||
print("Failed to initialize sampling")
|
||||
exit(1)
|
||||
}
|
||||
defer {
|
||||
llama_sampler_free(smpl)
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
|
||||
|
||||
let n_ctx = llama_n_ctx(context)
|
||||
|
||||
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
|
||||
|
@ -125,32 +138,9 @@ while n_cur <= n_len {
|
|||
continue
|
||||
}
|
||||
|
||||
var n_vocab = llama_n_vocab(model)
|
||||
var logits = llama_get_logits_ith(context, i_batch[i])
|
||||
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
|
||||
|
||||
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
|
||||
|
||||
for token_id in 0 ..< n_vocab {
|
||||
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
||||
}
|
||||
|
||||
var candidates_p: llama_token_data_array = .init(
|
||||
data: &candidates,
|
||||
size: candidates.count,
|
||||
sorted: false
|
||||
)
|
||||
|
||||
let top_k: Int32 = 40
|
||||
let top_p: Float = 0.9
|
||||
let temp: Float = 0.4
|
||||
|
||||
llama_sample_top_k(context, &candidates_p, top_k, 1)
|
||||
llama_sample_top_p(context, &candidates_p, top_p, 1)
|
||||
llama_sample_temp(context, &candidates_p, temp)
|
||||
|
||||
let new_token_id = llama_sample_token(context, &candidates_p)
|
||||
|
||||
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
llama_sampler_accept(smpl, new_token_id)
|
||||
|
||||
// is it an end of stream? -> mark the stream as finished
|
||||
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
||||
|
@ -210,9 +200,10 @@ if n_parallel > 1 {
|
|||
|
||||
let t_main_end = ggml_time_us()
|
||||
|
||||
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
|
||||
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
|
||||
|
||||
llama_print_timings(context)
|
||||
llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
|
||||
llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let utf8Count = text.utf8.count
|
||||
|
|
|
@ -1,44 +0,0 @@
|
|||
# llama.cpp/example/batched
|
||||
|
||||
The example demonstrates batched generation from a given prompt
|
||||
|
||||
```bash
|
||||
./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
|
||||
|
||||
...
|
||||
|
||||
main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
|
||||
|
||||
Hello my name is
|
||||
|
||||
main: generating 4 sequences ...
|
||||
|
||||
main: stream 0 finished
|
||||
main: stream 1 finished
|
||||
main: stream 2 finished
|
||||
main: stream 3 finished
|
||||
|
||||
sequence 0:
|
||||
|
||||
Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b
|
||||
|
||||
sequence 1:
|
||||
|
||||
Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between
|
||||
|
||||
sequence 2:
|
||||
|
||||
Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am
|
||||
|
||||
sequence 3:
|
||||
|
||||
Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. I am very playful and
|
||||
|
||||
main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s
|
||||
|
||||
llama_print_timings: load time = 587.00 ms
|
||||
llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second)
|
||||
llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second)
|
||||
llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
|
||||
llama_print_timings: total time = 4156.04 ms
|
||||
```
|
|
@ -2,7 +2,6 @@
|
|||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -65,6 +64,15 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
|
@ -164,29 +172,9 @@ int main(int argc, char ** argv) {
|
|||
continue;
|
||||
}
|
||||
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
const int top_k = 40;
|
||||
const float top_p = 0.9f;
|
||||
const float temp = 0.4f;
|
||||
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temp (ctx, &candidates_p, temp);
|
||||
|
||||
const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
|
||||
|
||||
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
llama_sampler_accept(smpl, new_token_id);
|
||||
|
||||
// is it an end of generation? -> mark the stream as finished
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||
|
@ -244,12 +232,15 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
|
|
|
@ -1,28 +0,0 @@
|
|||
## Convert llama2.c model to ggml
|
||||
|
||||
This example reads weights from the [llama2.c](https://github.com/karpathy/llama2.c) project and saves them in a ggml-compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
|
||||
|
||||
To convert a model, first download the model files from the [llama2.c](https://github.com/karpathy/llama2.c) repository, then build this project:
|
||||
|
||||
`$ make -j`
|
||||
|
||||
After successful compilation, the following usage options are available:
|
||||
```
|
||||
usage: ./llama-convert-llama2c-to-ggml [options]
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
|
||||
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
|
||||
--llama2c-output-model FNAME   model path to save the converted llama2.c model (default 'ak_llama_model.bin')
|
||||
```
|
||||
|
||||
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
|
||||
|
||||
`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
|
||||
|
||||
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
|
||||
|
||||
Now you can use the model with a command like:
|
||||
|
||||
`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
|
|
@ -1,45 +0,0 @@
|
|||
# cvector-generator
|
||||
|
||||
This example demonstrates how to generate a control vector using gguf models.
|
||||
|
||||
Related PRs:
|
||||
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
|
||||
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
|
||||
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
|
||||
|
||||
## Examples
|
||||
|
||||
```sh
|
||||
# CPU only
|
||||
./cvector-generator -m ./llama-3.Q4_K_M.gguf
|
||||
|
||||
# With GPU
|
||||
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
|
||||
|
||||
# With advanced options
|
||||
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
|
||||
|
||||
# Using mean value instead of PCA
|
||||
./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
|
||||
|
||||
# To see help message
|
||||
./cvector-generator -h
|
||||
# Then, have a look at "cvector" section
|
||||
```
|
||||
|
||||
## Tips and tricks
|
||||
|
||||
If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
|
||||
|
||||
```
|
||||
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
|
||||
<|im_start|>system\nYou are in a very good mood today<|im_end|>
|
||||
```
|
||||
|
||||
Example to use output file with `llama-cli`:
|
||||
|
||||
(Tip: the control vector works better when applied to layers higher than 10.)
|
||||
|
||||
```sh
|
||||
./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
|
||||
```
|
|
@ -1,49 +0,0 @@
|
|||
# Migration notice for binary filenames
|
||||
|
||||
> [!IMPORTANT]
|
||||
> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
|
||||
|
||||
This migration was important, but it is a breaking change that may not always be immediately obvious to users.
|
||||
|
||||
Please update all scripts and workflows to use the new binary names.
|
||||
|
||||
| Old Filename | New Filename |
|
||||
| ---- | ---- |
|
||||
| main | llama-cli |
|
||||
| server | llama-server |
|
||||
| llama-bench | llama-bench |
|
||||
| embedding | llama-embedding |
|
||||
| quantize | llama-quantize |
|
||||
| tokenize | llama-tokenize |
|
||||
| export-lora | llama-export-lora |
|
||||
| libllava.a | libllava.a |
|
||||
| baby-llama | llama-baby-llama |
|
||||
| batched | llama-batched |
|
||||
| batched-bench | llama-batched-bench |
|
||||
| benchmark-matmult | llama-benchmark-matmult |
|
||||
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
|
||||
| eval-callback | llama-eval-callback |
|
||||
| gbnf-validator | llama-gbnf-validator |
|
||||
| gguf | llama-gguf |
|
||||
| gguf-split | llama-gguf-split |
|
||||
| gritlm | llama-gritlm |
|
||||
| imatrix | llama-imatrix |
|
||||
| infill | llama-infill |
|
||||
| llava-cli | llama-llava-cli |
|
||||
| lookahead | llama-lookahead |
|
||||
| lookup | llama-lookup |
|
||||
| lookup-create | llama-lookup-create |
|
||||
| lookup-merge | llama-lookup-merge |
|
||||
| lookup-stats | llama-lookup-stats |
|
||||
| parallel | llama-parallel |
|
||||
| passkey | llama-passkey |
|
||||
| perplexity | llama-perplexity |
|
||||
| q8dot | llama-q8dot |
|
||||
| quantize-stats | llama-quantize-stats |
|
||||
| retrieval | llama-retrieval |
|
||||
| save-load-state | llama-save-load-state |
|
||||
| simple | llama-simple |
|
||||
| speculative | llama-speculative |
|
||||
| vdot | llama-vdot |
|
||||
| tests/test-c.o | tests/test-c.o |
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
# llama.cpp/example/embedding
|
||||
|
||||
This example demonstrates how to generate a high-dimensional embedding vector for a given text with llama.cpp.
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
```powershell
|
||||
llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null
|
||||
```
|
||||
|
||||
The above command will output space-separated float values.
|
||||
|
||||
## extra parameters
|
||||
### --embd-normalize $integer$
|
||||
| $integer$ | description | formula |
|
||||
|-----------|---------------------|---------|
|
||||
| $-1$ | none |
|
||||
| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
|
||||
| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$
|
||||
| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
|
||||
| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
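
As a rough sketch of what these normalization modes compute (plain C++ following the formulas above; the function name and structure are illustrative, not the example's actual code):

```cpp
// apply the --embd-normalize modes to a raw embedding vector
#include <algorithm>
#include <cmath>
#include <vector>

static void embd_normalize_sketch(std::vector<float> & emb, int norm) {
    if (norm < 0) return;                       // -1: none, leave raw values untouched

    double denom = 0.0;
    if (norm == 0) {                            // max absolute int16: largest |x_i| maps to 32760
        for (float x : emb) denom = std::max(denom, (double) std::fabs(x));
        denom /= 32760.0;
    } else if (norm == 1) {                     // taxicab: divide by sum of |x_i|
        for (float x : emb) denom += std::fabs(x);
    } else if (norm == 2) {                     // euclidean (default): divide by sqrt(sum x_i^2)
        for (float x : emb) denom += (double) x * x;
        denom = std::sqrt(denom);
    } else {                                    // p-norm
        for (float x : emb) denom += std::pow(std::fabs(x), norm);
        denom = std::pow(denom, 1.0 / norm);
    }

    if (denom == 0.0) return;
    for (float & x : emb) x = (float) (x / denom);
}
```
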
|
||||
|
||||
### --embd-output-format $'string'$
|
||||
| $'string'$ | description | |
|
||||
|------------|------------------------------|--|
|
||||
| '' | space-separated float values | (default)
|
||||
| 'array' | single embeddings | $[[x_1,...,x_n]]$
|
||||
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
|
||||
| 'json' | openai style |
|
||||
| 'json+' | add cosine similarity matrix |
|
||||
|
||||
### --embd-separator $"string"$
|
||||
| $"string"$ | |
|
||||
|--------------|-|
|
||||
| "\n" | (default)
|
||||
| "<#embSep#>" | for exemple
|
||||
| "<#sep#>" | other exemple
|
||||
|
||||
## examples
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
```powershell
|
||||
llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||
```
|
|
@ -91,13 +91,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
@ -314,8 +308,10 @@ int main(int argc, char ** argv) {
|
|||
if (notArray) fprintf(stdout, "\n}\n");
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
// clean up
|
||||
llama_print_timings(ctx);
|
||||
llama_batch_free(batch);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
|
|
@ -1,95 +0,0 @@
|
|||
# llama.cpp/examples/eval-callback
|
||||
|
||||
A simple example which demonstrates how to use a callback during inference.
|
||||
It simply prints all operations and tensor data to the console.
|
||||
|
||||
Usage:
|
||||
|
||||
```shell
|
||||
llama-eval-callback \
|
||||
--hf-repo ggml-org/models \
|
||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||
--model phi-2-q4_0.gguf \
|
||||
--prompt hello \
|
||||
--seed 42 \
|
||||
-ngl 33
|
||||
```
|
||||
|
||||
Will print:
|
||||
|
||||
```shell
|
||||
llm_load_tensors: offloaded 33/33 layers to GPU
|
||||
...
|
||||
llama_new_context_with_model: n_ctx = 512
|
||||
...
|
||||
llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
|
||||
llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
|
||||
llama_new_context_with_model: graph nodes = 1225
|
||||
llama_new_context_with_model: graph splits = 2
|
||||
ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -0.0181, 0.0272, 0.0272, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -0.6989, 1.0636, 1.0636, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -0.1800, 0.2817, 0.2632, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -0.1863, 0.2970, 0.2604, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1238, 1.2876, -1.8086, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
],
|
||||
]
|
||||
ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
[ -0.3608, 0.5076, -1.8866, ...],
|
||||
[ 1.7643, 0.0273, -2.1065, ...],
|
||||
...
|
||||
],
|
||||
]
|
||||
ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
|
||||
[
|
||||
[
|
||||
[ -1.1135, 1.4604, -1.9226, ...],
|
||||
[ -0.3608, 0.5076, -1.8866, ...],
|
||||
[ 1.7643, 0.0273, -2.1065, ...],
|
||||
...
|
||||
],
|
||||
]
|
||||
```
|
|
@ -151,8 +151,6 @@ int main(int argc, char ** argv) {
|
|||
|
||||
print_build_info();
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
|
@ -183,7 +181,8 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
|
|
@ -1,33 +0,0 @@
|
|||
# export-lora
|
||||
|
||||
Apply LORA adapters to base model and export the resulting model.
|
||||
|
||||
```
|
||||
usage: llama-export-lora [options]
|
||||
|
||||
options:
|
||||
-m, --model model path from which to load base model (default '')
|
||||
--lora FNAME path to LoRA adapter (can be repeated to use multiple adapters)
|
||||
--lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)
|
||||
-t, --threads N number of threads to use during computation (default: 4)
|
||||
-o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf')
|
||||
```
|
||||
|
||||
For example:
|
||||
|
||||
```bash
|
||||
./bin/llama-export-lora \
|
||||
-m open-llama-3b-v2.gguf \
|
||||
-o open-llama-3b-v2-english2tokipona-chat.gguf \
|
||||
--lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
|
||||
```
|
||||
|
||||
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
|
||||
|
||||
```bash
|
||||
./bin/llama-export-lora \
|
||||
-m your_base_model.gguf \
|
||||
-o your_merged_model.gguf \
|
||||
--lora-scaled lora_task_A.gguf 0.5 \
|
||||
--lora-scaled lora_task_B.gguf 0.5
|
||||
```
|
|
@ -1,9 +1,5 @@
|
|||
#define LLAMA_API_INTERNAL
|
||||
|
||||
#include "grammar-parser.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "unicode.h"
|
||||
#include "llama-grammar.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
@ -12,29 +8,28 @@
|
|||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
|
||||
auto decoded = decode_utf8(input_str, {});
|
||||
const auto & code_points = decoded.first;
|
||||
static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
|
||||
const auto cpts = unicode_cpts_from_utf8(input_str);
|
||||
|
||||
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
|
||||
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
|
||||
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
|
||||
|
||||
size_t pos = 0;
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
|
||||
for (const auto & cpt : cpts) {
|
||||
const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
|
||||
|
||||
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
|
||||
llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
|
||||
|
||||
if (cur_stacks.empty()) {
|
||||
if (stacks_cur.empty()) {
|
||||
error_pos = pos;
|
||||
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
|
||||
cur_stacks = prev_stacks;
|
||||
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
|
||||
stacks_cur = stacks_prev;
|
||||
return false;
|
||||
}
|
||||
++pos;
|
||||
}
|
||||
|
||||
for (const auto & stack : cur_stacks) {
|
||||
for (const auto & stack : stacks_cur) {
|
||||
if (stack.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
@ -85,27 +80,7 @@ int main(int argc, char** argv) {
|
|||
grammar_str = buffer.str();
|
||||
}
|
||||
|
||||
// Parse the GBNF grammar
|
||||
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
|
||||
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
fprintf(stdout, "%s: failed to parse grammar\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Ensure that there is a "root" node.
|
||||
if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
|
||||
fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
|
||||
// Create the LLAMA grammar
|
||||
auto grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
|
||||
if (grammar == nullptr) {
|
||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||
}
|
||||
|
@ -122,7 +97,7 @@ int main(int argc, char** argv) {
|
|||
// Validate the input string against the grammar
|
||||
size_t error_pos;
|
||||
std::string error_msg;
|
||||
bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
|
||||
bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
|
||||
|
||||
if (is_valid) {
|
||||
fprintf(stdout, "Input string is valid according to the grammar.\n");
|
||||
|
@ -131,7 +106,7 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
|
||||
// Clean up
|
||||
llama_grammar_free(grammar);
|
||||
llama_grammar_free_impl(grammar);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,206 +0,0 @@
|
|||
|
||||
# llama-gguf-hash
|
||||
|
||||
CLI to hash GGUF files to detect differences at the per-model and per-tensor level.
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--help`: display help message
|
||||
- `--xxh64`: use xxhash 64-bit hash mode (default)
|
||||
- `--sha1`: use sha1
|
||||
- `--uuid`: use uuid
|
||||
- `--sha256`: use sha256
|
||||
- `--all`: use all hash types
|
||||
- `--no-layer`: exclude per layer hash
|
||||
- `--uuid`: generate UUIDv5 ID
|
||||
- `-c`, `--check <manifest>`: verify against a manifest
|
||||
|
||||
## About
|
||||
|
||||
While most POSIX systems already have hash checking programs like sha256sum, they
are designed to check entire files. This is not ideal for our purpose if we want
to check the consistency of the tensor data even when the metadata content of the
GGUF KV store has been updated.
|
||||
|
||||
This program is designed to hash a GGUF tensor payload on a 'per tensor layer'
basis in addition to producing an 'entire tensor model' hash. The intent is that
the hash of the entire model can be checked first; if any inconsistency is
detected, the per-tensor hashes can then be used to narrow down the specific
tensor layer that is affected.
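As a rough, standalone sketch of that narrowing-down idea (this is not how `llama-gguf-hash` is implemented; the entries are just the xxh64 values from the example manifest further below), a check could look like this:

```cpp
#include <cstdio>
#include <map>
#include <string>

// Toy sketch of the narrowing-down idea: if the whole-model hash disagrees,
// walk the per-tensor entries to find which layer changed.
int main() {
    std::map<std::string, std::string> expected = {
        {"test.gguf",          "5a54d3aad816f302"},
        {"test.gguf:tensor_0", "f66e9cd66a4396a0"},
        {"test.gguf:tensor_1", "7d3a1f9ac04d0537"},
    };
    std::map<std::string, std::string> actual = expected;
    actual["test.gguf"]          = "0000000000000000"; // pretend the file changed...
    actual["test.gguf:tensor_1"] = "0000000000000000"; // ...because tensor_1 changed

    if (actual["test.gguf"] == expected["test.gguf"]) {
        printf("whole-model hash matches, nothing to check\n");
        return 0;
    }
    for (const auto & kv : expected) {
        if (kv.first != "test.gguf" && actual[kv.first] != kv.second) {
            printf("mismatch narrowed down to %s\n", kv.first.c_str());
        }
    }
    return 0;
}
```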
|
||||
|
||||
For Maintainers:
|
||||
- Detection of tensor inconsistency during development and automated tests
|
||||
- This is served by xxh64 which is fast
|
||||
- This is also served by having per-tensor-layer hashes to assist in narrowing down
  the location of the faulty tensor layer
|
||||
- This is also served by sha1 which is much slower but more widely supported
|
||||
|
||||
For Model Creators:
|
||||
- Optional consistent UUID generation based on model tensor content
|
||||
- This is served by UUIDv5 which is useful for database keys
|
||||
- llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
|
||||
- Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
|
||||
|
||||
For Model Users:
|
||||
- Assurance of tensor layer integrity even if metadata was updated
|
||||
- This is served by sha256 which is still considered very secure as of 2024
|
||||
|
||||
### Design Note
|
||||
|
||||
- The default behavior of this program if no arguments are provided is to hash
  using xxhash's xxh64 mode, because it is very fast and is primarily targeted
  towards maintainers who may want to use this in automated tests.
- xxhash supports xxh32 and xxh128 for 32-bit and 128-bit hashes respectively;
  however, we picked 64-bit xxhash because most computers are 64-bit as of 2024 and
  thus have a better affinity for calculating a hash that is 64 bits in size.
|
||||
|
||||
## Compile Example
|
||||
|
||||
```bash
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
|
||||
make -C build clean
|
||||
make -C build llama-gguf-hash VERBOSE=1
|
||||
./build/bin/llama-gguf-hash test.gguf
|
||||
./build/bin/llama-gguf-hash --xxh64 test.gguf
|
||||
./build/bin/llama-gguf-hash --sha1 test.gguf
|
||||
./build/bin/llama-gguf-hash --uuid test.gguf
|
||||
./build/bin/llama-gguf-hash --sha256 test.gguf
|
||||
```
|
||||
|
||||
## Generation and Verification Example
|
||||
|
||||
To generate a manifest we may use this command:
|
||||
|
||||
```bash
|
||||
./llama-gguf-hash --all test.gguf > test.gguf.manifest
|
||||
```
|
||||
|
||||
This will generate a manifest like the one below, which contains multiple hash types as well as per-tensor-layer hashes.
(UUID is excluded, as it is an ID rather than a hash.)
|
||||
|
||||
```bash
|
||||
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0
|
||||
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0
|
||||
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0
|
||||
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1
|
||||
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1
|
||||
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1
|
||||
xxh64 a0af5d700049693b test.gguf:tensor_2
|
||||
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2
|
||||
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2
|
||||
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3
|
||||
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3
|
||||
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3
|
||||
xxh64 1257733306b7992d test.gguf:tensor_4
|
||||
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4
|
||||
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4
|
||||
xxh64 d238d16ba4711e58 test.gguf:tensor_5
|
||||
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5
|
||||
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5
|
||||
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6
|
||||
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6
|
||||
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6
|
||||
xxh64 c22021c29854f093 test.gguf:tensor_7
|
||||
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7
|
||||
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7
|
||||
xxh64 936df61f5d64261f test.gguf:tensor_8
|
||||
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8
|
||||
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8
|
||||
xxh64 93fd20c64421c081 test.gguf:tensor_9
|
||||
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9
|
||||
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9
|
||||
xxh64 5a54d3aad816f302 test.gguf
|
||||
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf
|
||||
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf
|
||||
```
|
||||
|
||||
We can then use the normal check command, which will by default check for the highest-security-strength hash and verify against that:
|
||||
|
||||
```bash
|
||||
$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
|
||||
manifest test.gguf.manifest sha256 sha1 xxh64
|
||||
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
|
||||
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
|
||||
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
|
||||
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
|
||||
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
|
||||
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
|
||||
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
|
||||
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
|
||||
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
|
||||
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
|
||||
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
|
||||
|
||||
Verification results for test.gguf.manifest - Success
|
||||
```
|
||||
|
||||
Or we may explicitly ask for a faster hash like:
|
||||
|
||||
```bash
|
||||
$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
|
||||
manifest test.gguf.manifest sha256 sha1 xxh64
|
||||
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
|
||||
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
|
||||
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
|
||||
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
|
||||
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
|
||||
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
|
||||
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
|
||||
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
|
||||
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
|
||||
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
|
||||
xxh64 5a54d3aad816f302 test.gguf - Ok
|
||||
|
||||
Verification results for test.gguf.manifest - Success
|
||||
```
|
||||
|
||||
Or we may simply want to check that all of the hashes are valid:
|
||||
|
||||
```bash
|
||||
$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest
|
||||
manifest test.gguf.manifest sha256 sha1 xxh64
|
||||
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
|
||||
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok
|
||||
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
|
||||
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
|
||||
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok
|
||||
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
|
||||
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
|
||||
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok
|
||||
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
|
||||
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
|
||||
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok
|
||||
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
|
||||
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
|
||||
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok
|
||||
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
|
||||
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
|
||||
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok
|
||||
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
|
||||
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
|
||||
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok
|
||||
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
|
||||
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
|
||||
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok
|
||||
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
|
||||
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
|
||||
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok
|
||||
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
|
||||
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
|
||||
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok
|
||||
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
|
||||
xxh64 5a54d3aad816f302 test.gguf - Ok
|
||||
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok
|
||||
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
|
||||
|
||||
Verification results for test.gguf.manifest - Success
|
||||
```
|
||||
|
||||
|
||||
## Crypto/Hash Libraries Used
|
||||
|
||||
These micro C library dependencies were installed via the [clib C package manager](https://github.com/clibs):
|
||||
|
||||
- https://github.com/Cyan4973/xxHash
|
||||
- https://github.com/clibs/sha1/
|
||||
- https://github.com/jb55/sha256.c
|
|
@ -1,10 +0,0 @@
|
|||
## GGUF split Example
|
||||
|
||||
CLI to split / merge GGUF files.
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--split`: split a GGUF into multiple GGUFs; this is the default operation.
- `--split-max-size`: max size per split in `M` or `G`, e.g. `500M` or `2G`.
- `--split-max-tensors`: maximum number of tensors in each split (default: 128).
- `--merge`: merge multiple GGUFs into a single GGUF.
|
|
@ -1,62 +0,0 @@
|
|||
## Generative Representational Instruction Tuning (GRIT) Example
|
||||
[gritlm] is a model that can produce embeddings as well as "normal" text
generation, depending on the instructions in the prompt.
|
||||
|
||||
* Paper: https://arxiv.org/pdf/2402.09906.pdf
|
||||
|
||||
### Retrieval-Augmented Generation (RAG) use case
|
||||
One use case for `gritlm` is RAG. Recall how RAG works: we take the documents
that we want to use as context, to ground the large language model (LLM), and we
create token embeddings for them. We then store these token embeddings in a
vector database.

When we perform a query, i.e. prompt the LLM, we first create token embeddings
for the query and then search the vector database to retrieve the most similar
vectors, and return those documents so they can be passed to the LLM as context.
The query and the context are then passed to the LLM, which would normally have
to _again_ create token embeddings for the query. But because `gritlm` is used,
the first query's embeddings can be cached, and this second embedding step does
not have to be performed at all (see the retrieval sketch below).
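Below is a minimal, standalone sketch of the retrieval step described above, with made-up 3-dimensional embeddings; in the actual example the vectors come from the `encode()` function, and a real vector database would replace the in-memory list:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// cosine similarity between two embedding vectors
static float cosine_sim(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f, na = 0.0f, nb = 0.0f;
    for (size_t i = 0; i < a.size(); ++i) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    return dot / (std::sqrt(na) * std::sqrt(nb));
}

int main() {
    // made-up embeddings; a real setup would hold many more, much larger vectors
    const std::vector<std::vector<float>> docs  = {{0.9f, 0.1f, 0.0f}, {0.1f, 0.8f, 0.2f}};
    const std::vector<float>              query = {0.8f, 0.2f, 0.1f};

    size_t best = 0;
    float  best_sim = -1.0f;
    for (size_t i = 0; i < docs.size(); ++i) {
        const float sim = cosine_sim(query, docs[i]);
        printf("doc %zu: cosine similarity %.3f\n", i, sim);
        if (sim > best_sim) { best_sim = sim; best = i; }
    }
    printf("passing doc %zu to the LLM as context\n", best);
    return 0;
}
```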
|
||||
|
||||
### Running the example
|
||||
Download a Grit model:
|
||||
```console
|
||||
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
|
||||
```
|
||||
|
||||
Run the example using the downloaded model:
|
||||
```console
|
||||
$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
|
||||
|
||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
|
||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
|
||||
Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
|
||||
Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
|
||||
|
||||
Oh, brave adventurer, who dared to climb
|
||||
The lofty peak of Mt. Fuji in the night,
|
||||
When shadows lurk and ghosts do roam,
|
||||
And darkness reigns, a fearsome sight.
|
||||
|
||||
Thou didst set out, with heart aglow,
|
||||
To conquer this mountain, so high,
|
||||
And reach the summit, where the stars do glow,
|
||||
And the moon shines bright, up in the sky.
|
||||
|
||||
Through the mist and fog, thou didst press on,
|
||||
With steadfast courage, and a steadfast will,
|
||||
Through the darkness, thou didst not be gone,
|
||||
But didst climb on, with a steadfast skill.
|
||||
|
||||
At last, thou didst reach the summit's crest,
|
||||
And gazed upon the world below,
|
||||
And saw the beauty of the night's best,
|
||||
And felt the peace, that only nature knows.
|
||||
|
||||
Oh, brave adventurer, who dared to climb
|
||||
The lofty peak of Mt. Fuji in the night,
|
||||
Thou art a hero, in the eyes of all,
|
||||
For thou didst conquer this mountain, so bright.
|
||||
```
|
||||
|
||||
[gritlm]: https://github.com/ContextualAI/gritlm
|
|
@ -9,7 +9,7 @@
|
|||
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
|
||||
std::vector<std::vector<float>> result;
|
||||
|
||||
const llama_model * mdl = llama_get_model(ctx);
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
|
||||
|
@ -18,16 +18,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||
|
||||
const std::string input_string = instruction + sentences[i];
|
||||
|
||||
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
|
||||
std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
|
||||
|
||||
const int32_t n_toks = inputs.size();
|
||||
|
||||
// GritLM seems to have EOS = ""
|
||||
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
|
||||
// inputs.push_back(llama_token_eos(mdl));
|
||||
// inputs.push_back(llama_token_eos(model));
|
||||
|
||||
// we want to ignore instruction tokens for mean pooling
|
||||
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
|
||||
const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
|
||||
|
||||
#ifdef GRIT_DEBUG
|
||||
// debug tokens - should be matching as referenced in the GritLM sample
|
||||
|
@ -51,7 +51,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||
llama_decode(ctx, batch);
|
||||
|
||||
// get embedding dimensions
|
||||
uint64_t n_embd = llama_n_embd(mdl);
|
||||
uint64_t n_embd = llama_n_embd(model);
|
||||
|
||||
// allocate embedding output
|
||||
std::vector<float> emb_unorm(n_embd, 0.0f);
|
||||
|
@ -92,11 +92,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||
return result;
|
||||
}
|
||||
|
||||
static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
|
||||
static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
|
||||
std::string result;
|
||||
|
||||
const llama_model * mdl = llama_get_model(ctx);
|
||||
llama_token eos_token = llama_token_eos(mdl);
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
llama_token eos_token = llama_token_eos(model);
|
||||
|
||||
llama_kv_cache_clear(ctx);
|
||||
llama_set_embeddings(ctx, false);
|
||||
|
@ -104,28 +104,25 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
|
|||
|
||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
|
||||
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
|
||||
std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
|
||||
int32_t i_current_token = 0;
|
||||
|
||||
while (true) {
|
||||
llama_batch_clear(bat);
|
||||
auto n_inputs = (int32_t)inputs.size();
|
||||
{
|
||||
const int32_t n_inputs = inputs.size();
|
||||
|
||||
for (int32_t i = 0; i < n_inputs; i++) {
|
||||
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
||||
}
|
||||
}
|
||||
inputs.clear();
|
||||
|
||||
llama_decode(ctx, bat);
|
||||
auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
|
||||
|
||||
auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
|
||||
auto n_candidates = (int32_t)candidates.size();
|
||||
for (int32_t token = 0; token < n_candidates; token++) {
|
||||
candidates[token] = llama_token_data{ token, logits[token], 0.0f };
|
||||
}
|
||||
auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
|
||||
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
|
||||
llama_sampler_accept(smpl, token);
|
||||
|
||||
llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
if (token == eos_token) {
|
||||
break;
|
||||
}
|
||||
|
@ -167,10 +164,18 @@ int main(int argc, char * argv[]) {
|
|||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
|
||||
// create generation context
|
||||
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
|
||||
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
|
||||
sparams.no_perf = false;
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||
|
||||
// ### Embedding/Representation ###
|
||||
// samples taken from: https://github.com/ContextualAI/gritlm#basic
|
||||
|
@ -191,7 +196,7 @@ int main(int argc, char * argv[]) {
|
|||
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
|
||||
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
|
||||
|
||||
const int n_embd = llama_n_embd(mdl);
|
||||
const int n_embd = llama_n_embd(model);
|
||||
|
||||
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
|
||||
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
|
||||
|
@ -208,11 +213,12 @@ int main(int argc, char * argv[]) {
|
|||
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
|
||||
{
|
||||
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
|
||||
std::string response = generate(ctx, prompt, true);
|
||||
std::string response = generate(ctx, smpl, prompt, true);
|
||||
}
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
llama_free_model(mdl);
|
||||
llama_free_model(model);
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
# llama.cpp/examples/imatrix
|
||||
|
||||
Compute an importance matrix for a model and a given text dataset. It can be used during quantization to enhance the quality of the quantized models.
|
||||
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
./llama-imatrix \
|
||||
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
|
||||
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
|
||||
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
|
||||
```
|
||||
|
||||
Here `-m` with a model name and `-f` with a file containing training data (e.g. `wiki.train.raw`) are mandatory.
|
||||
The parameters in square brackets are optional and have the following meaning:
|
||||
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
|
||||
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
|
||||
* `--output-frequency` specifies how often the results computed so far are saved to disk. Default is 10 (i.e., every 10 chunks).
|
||||
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
|
||||
* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
|
||||
|
||||
For faster computation, make sure to use GPU offloading via the `-ngl` argument.
|
||||
|
||||
## Example
|
||||
|
||||
```bash
|
||||
GGML_CUDA=1 make -j
|
||||
|
||||
# generate importance matrix (imatrix.dat)
|
||||
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||
|
||||
# use the imatrix to perform a Q4_K_M quantization
|
||||
./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||
```
|
|
@ -639,7 +639,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
g_collector.save_imatrix();
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
|
|
@ -1,47 +0,0 @@
|
|||
# llama.cpp/example/infill
|
||||
|
||||
This example shows how to use infill mode with Code Llama models that support it.
|
||||
Currently the 7B and 13B models support infill mode.
|
||||
|
||||
Infill supports most of the options available in the main example.
|
||||
|
||||
For further information, have a look at the main README.md in llama.cpp/example/main/README.md.
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
||||
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle), as some models prefer this; see the ordering sketch after this list.
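The sketch below illustrates what that ordering means, using hypothetical plain-string markers (`<PRE>`, `<SUF>`, `<MID>`) in place of the model's special prefix/suffix/middle tokens; the real `infill` example performs the same assembly on token ids:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// toy illustration of Prefix/Suffix/Middle (default) vs Suffix/Prefix/Middle (--spm-infill)
int main() {
    const bool spm_infill = false; // set to true to mimic --spm-infill

    const std::vector<std::string> pfx = {"<PRE>", "def helloworld():\n    print(\"hell"};
    const std::vector<std::string> sfx = {"<SUF>", "\n    print(\"goodbye world\")\n    "};

    std::vector<std::string> prompt      = spm_infill ? sfx : pfx;
    const std::vector<std::string> & end = spm_infill ? pfx : sfx;
    prompt.insert(prompt.end(), end.begin(), end.end());
    prompt.push_back("<MID>"); // the model generates the missing middle after this marker

    for (const auto & piece : prompt) {
        printf("%s", piece.c_str());
    }
    printf("\n");
    return 0;
}
```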
|
||||
|
||||
## Input Prompts
|
||||
|
||||
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
|
||||
|
||||
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
|
||||
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
|
||||
## Interaction
|
||||
|
||||
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive` or `--interactive-first`.
|
||||
|
||||
### Interaction Options
|
||||
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real-time code suggestions from the model.
|
||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
||||
- `--color`: Enable colorized output to visually differentiate between prompts, user input, and generated text.
|
||||
|
||||
### Example
|
||||
|
||||
Download a model that supports infill, for example CodeLlama:
|
||||
```console
|
||||
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
|
||||
```
|
||||
|
||||
```bash
|
||||
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||
```
|
|
@ -1,656 +0,0 @@
|
|||
#include "common.h"
|
||||
|
||||
#include "console.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static gpt_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
|
||||
static bool is_interacting = false;
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const std::vector<llama_token> & input_tokens, const std::string & output,
|
||||
const std::vector<llama_token> & output_tokens
|
||||
) {
|
||||
if (params.logdir.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = string_get_sortable_timestamp();
|
||||
|
||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(logfile, "binary: infill\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "# Generation Results #\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
llama_print_timings(*g_ctx);
|
||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("infill", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.logits_all) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 0.0) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||
|
||||
model = llama_init.model;
|
||||
ctx = llama_init.context;
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_add_bos_token(model);
|
||||
GGML_ASSERT(!llama_add_eos_token(model));
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> embd_end;
|
||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
GGML_ASSERT(llama_token_prefix(model) >= 0);
|
||||
GGML_ASSERT(llama_token_suffix(model) >= 0);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_token_middle(model);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
|
||||
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_TEE("'\n");
|
||||
}
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_TEE("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
LOG_TEE("\n##### Infill mode #####\n\n");
|
||||
if (params.infill) {
|
||||
printf("\n************\n");
|
||||
printf("no need to specify '--infill', always running infill\n");
|
||||
printf("************\n\n");
|
||||
}
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
if (params.multiline_input) {
|
||||
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n";
|
||||
} else {
|
||||
control_message = " - Press Return to return control to LLaMA.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_TEE("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_TEE( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool input_echo = true;
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
input_echo = true;
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
printf("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// deal with eot token in infill mode
|
||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||
if (is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n");
|
||||
console::set_display(console::user_input);
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
bool another_line=true;
|
||||
// set a new prefix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line, if so we use the old input
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_prefix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// set a new suffix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_suffix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
if (params.escape) {
|
||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||
string_process_escapes(params.input_prefix);
|
||||
string_process_escapes(params.input_suffix);
|
||||
}
|
||||
|
||||
// tokenize new prefix and suffix
|
||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
n_consumed = 0;
|
||||
// LOG_TEE("took new input\n");
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of generation tokens in interactive mode
|
||||
else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
console::set_display(console::user_input);
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||
LOG("waiting for user input\n");
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
printf("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
printf("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << llama_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
llama_sampling_reset(ctx_sampling);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
LOG_TEE("Log end\n");
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,21 +0,0 @@
|
|||
# llama.cpp/example/jeopardy
|
||||
|
||||
This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
|
||||
|
||||
The jeopardy test can be used to compare the factual knowledge of different models to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
|
||||
|
||||
|
||||
Step 1: Open jeopardy.sh and modify the following:
|
||||
```
|
||||
MODEL=(path to your model)
|
||||
MODEL_NAME=(name of your model)
|
||||
prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
|
||||
opts=(add -instruct here if needed for your model, or anything else you want to test out)
|
||||
```
|
||||
Step 2: Run `jeopardy.sh` from the llama.cpp folder
|
||||
|
||||
Step 3: Repeat steps 1 and 2 until you have all the results you need.
|
||||
|
||||
Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
|
||||
|
||||
Note: The Human bar is based on the full, original 100 sample questions. If you modify the question count or the questions, it will no longer be valid.
|
|
@ -1,305 +0,0 @@
|
|||
# llama.cpp/examples/llama-bench
|
||||
|
||||
Performance testing tool for llama.cpp.
|
||||
|
||||
## Table of contents
|
||||
|
||||
1. [Syntax](#syntax)
|
||||
2. [Examples](#examples)
|
||||
1. [Text generation with different models](#text-generation-with-different-models)
|
||||
2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes)
|
||||
3. [Different numbers of threads](#different-numbers-of-threads)
|
||||
4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu)
|
||||
3. [Output formats](#output-formats)
|
||||
1. [Markdown](#markdown)
|
||||
2. [CSV](#csv)
|
||||
3. [JSON](#json)
|
||||
4. [JSONL](#jsonl)
|
||||
5. [SQL](#sql)
|
||||
|
||||
## Syntax
|
||||
|
||||
```
|
||||
usage: ./llama-bench [options]
|
||||
|
||||
options:
|
||||
-h, --help
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-pg <pp,tg> (default: )
|
||||
-b, --batch-size <n> (default: 2048)
|
||||
-ub, --ubatch-size <n> (default: 512)
|
||||
-ctk, --cache-type-k <t> (default: f16)
|
||||
-ctv, --cache-type-v <t> (default: f16)
|
||||
-t, --threads <n> (default: 8)
|
||||
-C, --cpu-mask <hex,hex> (default: 0x0)
|
||||
--cpu-strict <0|1> (default: 0)
|
||||
--poll <0...100> (default: 50)
|
||||
-ngl, --n-gpu-layers <n> (default: 99)
|
||||
-rpc, --rpc <rpc_servers> (default: )
|
||||
-sm, --split-mode <none|layer|row> (default: layer)
|
||||
-mg, --main-gpu <i> (default: 0)
|
||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||
-fa, --flash-attn <0|1> (default: 0)
|
||||
-mmp, --mmap <0|1> (default: 1)
|
||||
--numa <distribute|isolate|numactl> (default: disabled)
|
||||
-embd, --embeddings <0|1> (default: 0)
|
||||
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
||||
-r, --repetitions <n> (default: 5)
|
||||
--prio <0|1|2|3> (default: 0)
|
||||
--delay <0...N> (seconds) (default: 0)
|
||||
-o, --output <csv|json|jsonl|md|sql> (default: md)
|
||||
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
|
||||
-v, --verbose (default: 0)
|
||||
|
||||
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
||||
```
|
||||
|
||||
llama-bench can perform three types of tests:
|
||||
|
||||
- Prompt processing (pp): processing a prompt in batches (`-p`)
|
||||
- Text generation (tg): generating a sequence of tokens (`-n`)
|
||||
- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
|
||||
|
||||
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
|
||||
|
||||
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
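As a reference for how such a cell is obtained, here is a minimal sketch (with made-up t/s samples) that computes the average and sample standard deviation over the `-r` repetitions:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// toy sketch: turn the per-repetition t/s samples into "avg ± stddev"
int main() {
    const std::vector<double> ts = {132.4, 131.9, 132.0, 132.8, 131.8}; // hypothetical samples (-r 5)

    double sum = 0.0;
    for (double v : ts) sum += v;
    const double mean = sum / ts.size();

    double sq = 0.0;
    for (double v : ts) sq += (v - mean) * (v - mean);
    const double stdev = std::sqrt(sq / (ts.size() - 1)); // sample standard deviation

    printf("%.2f ± %.2f t/s\n", mean, stdev);
    return 0;
}
```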
|
||||
|
||||
For a description of the other options, see the [main example](../main/README.md).
|
||||
|
||||
Note:
|
||||
|
||||
- When using the SYCL backend, there may be hang issues in some cases. Please set `--mmp 0`.
|
||||
|
||||
## Examples
|
||||
|
||||
### Text generation with different models
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
|
||||
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
|
||||
|
||||
### Prompt processing with different batch sizes
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | n_batch | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
|
||||
|
||||
### Different numbers of threads
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
|
||||
```
|
||||
|
||||
| model | size | params | backend | threads | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 |
|
||||
|
||||
### Different numbers of layers offloaded to the GPU
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -ngl 10,20,30,31,32,33,34,35
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
|
||||
|
||||
## Output formats
|
||||
|
||||
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
|
||||
|
||||
### Markdown
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o md
|
||||
```
|
||||
|
||||
| model | size | params | backend | ngl | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
|
||||
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
|
||||
|
||||
### CSV
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o csv
|
||||
```
|
||||
|
||||
```csv
|
||||
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
|
||||
```
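If you prefer to keep working with the CSV output, one way to load it into SQLite is the `.import` dot command. This is only a sketch; it assumes a `sqlite3` build recent enough to support `--csv`, and the database and table names are arbitrary:

```sh
$ ./llama-bench -o csv > results.csv
$ sqlite3 llama-bench.db ".import --csv results.csv results"
```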
|
||||
|
||||
### JSON
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o json
|
||||
```
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"opencl": false,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
|
||||
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
|
||||
"model_filename": "models/7B/ggml-model-q4_0.gguf",
|
||||
"model_type": "llama 7B mostly Q4_0",
|
||||
"model_size": 3825065984,
|
||||
"model_n_params": 6738415616,
|
||||
"n_batch": 512,
|
||||
"n_threads": 16,
|
||||
"f16_kv": true,
|
||||
"n_gpu_layers": 99,
|
||||
"main_gpu": 0,
|
||||
"mul_mat_q": true,
|
||||
"tensor_split": "0.00",
|
||||
"n_prompt": 512,
|
||||
"n_gen": 0,
|
||||
"test_time": "2023-09-23T12:09:57Z",
|
||||
"avg_ns": 212365953,
|
||||
"stddev_ns": 985423,
|
||||
"avg_ts": 2410.974041,
|
||||
"stddev_ts": 11.163766,
|
||||
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
|
||||
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
|
||||
},
|
||||
{
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"opencl": false,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
|
||||
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
|
||||
"model_filename": "models/7B/ggml-model-q4_0.gguf",
|
||||
"model_type": "llama 7B mostly Q4_0",
|
||||
"model_size": 3825065984,
|
||||
"model_n_params": 6738415616,
|
||||
"n_batch": 512,
|
||||
"n_threads": 16,
|
||||
"f16_kv": true,
|
||||
"n_gpu_layers": 99,
|
||||
"main_gpu": 0,
|
||||
"mul_mat_q": true,
|
||||
"tensor_split": "0.00",
|
||||
"n_prompt": 0,
|
||||
"n_gen": 128,
|
||||
"test_time": "2023-09-23T12:09:59Z",
|
||||
"avg_ns": 977425219,
|
||||
"stddev_ns": 9268593,
|
||||
"avg_ts": 130.965708,
|
||||
"stddev_ts": 1.238924,
|
||||
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
|
||||
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
|
||||
### JSONL
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o jsonl
|
||||
```
|
||||
|
||||
```json lines
|
||||
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
|
||||
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
|
||||
```
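Since every line is a self-contained JSON object, the JSONL output combines well with line-oriented tools such as `jq` (assuming it is installed). For example, to print only a few fields per test:

```sh
$ ./llama-bench -o jsonl | jq '{n_prompt, n_gen, avg_ts, stddev_ts}'
```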
|
||||
|
||||
|
||||
### SQL
|
||||
|
||||
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
|
||||
|
||||
```sh
|
||||
$ ./llama-bench -o sql
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS test (
|
||||
build_commit TEXT,
|
||||
build_number INTEGER,
|
||||
cuda INTEGER,
|
||||
opencl INTEGER,
|
||||
metal INTEGER,
|
||||
gpu_blas INTEGER,
|
||||
blas INTEGER,
|
||||
cpu_info TEXT,
|
||||
gpu_info TEXT,
|
||||
model_filename TEXT,
|
||||
model_type TEXT,
|
||||
model_size INTEGER,
|
||||
model_n_params INTEGER,
|
||||
n_batch INTEGER,
|
||||
n_threads INTEGER,
|
||||
f16_kv INTEGER,
|
||||
n_gpu_layers INTEGER,
|
||||
main_gpu INTEGER,
|
||||
mul_mat_q INTEGER,
|
||||
tensor_split TEXT,
|
||||
n_prompt INTEGER,
|
||||
n_gen INTEGER,
|
||||
test_time TEXT,
|
||||
avg_ns INTEGER,
|
||||
stddev_ns INTEGER,
|
||||
avg_ts REAL,
|
||||
stddev_ts REAL
|
||||
);
|
||||
|
||||
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
|
||||
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
|
||||
```
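For example, the generated statements can be piped straight into the `sqlite3` command line tool (the database file name is arbitrary):

```sh
$ ./llama-bench -o sql | sqlite3 llama-bench.db
```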
|
|
@ -120,7 +120,7 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
|
|||
LOGi("Using %d threads", n_threads);
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.seed = 1234;
|
||||
|
||||
ctx_params.n_ctx = 2048;
|
||||
ctx_params.n_threads = n_threads;
|
||||
ctx_params.n_threads_batch = n_threads;
|
||||
|
@ -380,11 +380,13 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
|||
JNIEnv * env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
jlong sampling_pointer,
|
||||
jlong batch_pointer,
|
||||
jint n_len,
|
||||
jobject intvar_ncur
|
||||
) {
|
||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||
const auto sampling = reinterpret_cast<llama_sampler *>(sampling_pointer);
|
||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||
const auto model = llama_get_model(context);
|
||||
|
||||
|
@ -392,20 +394,10 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
|||
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
|
||||
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
|
||||
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// sample the most likely token
|
||||
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
|
||||
const auto new_token_id = llama_sampler_sample(sampling, context, batch->n_tokens - 1);
|
||||
|
||||
llama_sampler_accept(sampling, new_token_id);
|
||||
|
||||
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
|
|
|
@ -1,12 +0,0 @@
|
|||
# llama.cpp/examples/llama.swiftui
|
||||
|
||||
Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
|
||||
point for more advanced projects.
|
||||
|
||||
For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
|
||||
|
||||

|
||||
|
||||
Video demonstration:
|
||||
|
||||
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
|
|
@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
|
|||
actor LlamaContext {
|
||||
private var model: OpaquePointer
|
||||
private var context: OpaquePointer
|
||||
private var sampling: UnsafeMutablePointer<llama_sampler>
|
||||
private var batch: llama_batch
|
||||
private var tokens_list: [llama_token]
|
||||
var is_done: Bool = false
|
||||
|
@ -42,9 +43,15 @@ actor LlamaContext {
|
|||
self.tokens_list = []
|
||||
self.batch = llama_batch_init(512, 0, 1)
|
||||
self.temporary_invalid_cchars = []
|
||||
let sparams = llama_sampler_chain_default_params()
|
||||
self.sampling = llama_sampler_chain_init(sparams)
|
||||
llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
|
||||
llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
|
||||
llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
|
||||
}
|
||||
|
||||
deinit {
|
||||
llama_sampler_free(sampling)
|
||||
llama_batch_free(batch)
|
||||
llama_free(context)
|
||||
llama_free_model(model)
|
||||
|
@ -69,7 +76,6 @@ actor LlamaContext {
|
|||
print("Using \(n_threads) threads")
|
||||
|
||||
var ctx_params = llama_context_default_params()
|
||||
ctx_params.seed = 1234
|
||||
ctx_params.n_ctx = 2048
|
||||
ctx_params.n_threads = Int32(n_threads)
|
||||
ctx_params.n_threads_batch = Int32(n_threads)
|
||||
|
@ -144,20 +150,9 @@ actor LlamaContext {
|
|||
func completion_loop() -> String {
|
||||
var new_token_id: llama_token = 0
|
||||
|
||||
let n_vocab = llama_n_vocab(model)
|
||||
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
|
||||
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
|
||||
|
||||
var candidates = Array<llama_token_data>()
|
||||
candidates.reserveCapacity(Int(n_vocab))
|
||||
|
||||
for token_id in 0..<n_vocab {
|
||||
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
||||
}
|
||||
candidates.withUnsafeMutableBufferPointer() { buffer in
|
||||
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
|
||||
|
||||
new_token_id = llama_sample_token_greedy(context, &candidates_p)
|
||||
}
|
||||
llama_sampler_accept(sampling, new_token_id)
|
||||
|
||||
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
||||
print("\n")
|
||||
|
|
|
@ -1,377 +0,0 @@
|
|||
# MobileVLM
|
||||
|
||||
Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.
|
||||
|
||||
For more information, please refer to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM).
|
||||
|
||||
The implementation is based on llava and is compatible with both llava and MobileVLM. The usage is basically the same as llava.
|
||||
|
||||
Note: the overall model inference process is the same for both **MobileVLM** and **MobileVLM_V2**, but the model conversion process differs slightly. The differing conversion step is therefore shown below using **MobileVLM-1.7B** as an example.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
|
||||
--image path/to/an/image.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
|
||||
```
|
||||
|
||||
## Model conversion
|
||||
|
||||
1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
|
||||
|
||||
```sh
|
||||
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
|
||||
|
||||
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
```
|
||||
|
||||
2. Use `llava_surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
--projector-type ldp
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
--projector-type ldpv2
|
||||
```
|
||||
|
||||
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
5. Use `llama-quantize` to convert the LLaMA part's data type from `fp16` to `q4_k`:
|
||||
```sh
|
||||
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
|
||||
|
||||
## Android compile and run
|
||||
### compile
|
||||
Refer to `examples/llava/android/build_64.sh`:
|
||||
```sh
|
||||
mkdir examples/llava/android/build_64
|
||||
cd examples/llava/android/build_64
|
||||
../build_64.sh
|
||||
```
|
||||
### run on Android
|
||||
Refer to `android/adb_run.sh` and modify the resources' `name` and `path` as needed; a sketch of such a run is shown below.
|
||||
|
||||
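The following is only a sketch; the file names and the `build_64/bin` binary location are assumptions, so adapt them to your own build and model files:

```sh
# push the model files, a test image and the binary to the device
adb push ggml-model-q4_k.gguf /data/local/tmp/
adb push mmproj-model-f16.gguf /data/local/tmp/
adb push demo.jpg /data/local/tmp/
adb push build_64/bin/llama-llava-cli /data/local/tmp/
adb shell chmod +x /data/local/tmp/llama-llava-cli
# open a shell on the device and run a command like the ones in the cases below
adb shell
```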
## Some results on Android with a `Snapdragon 888` chip
|
||||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
--image /data/local/tmp/demo.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
|
||||
Susan Wise Bauer
|
||||
llama_print_timings: load time = 23574.72 ms
|
||||
llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
|
||||
llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
|
||||
llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
|
||||
llama_print_timings: total time = 34731.93 ms
|
||||
```
|
||||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
--image /data/local/tmp/cat.jpeg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
|
||||
The image depicts a cat sitting in the grass near some tall green plants.
|
||||
llama_print_timings: load time = 23257.32 ms
|
||||
llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
|
||||
llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
|
||||
llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
|
||||
llama_print_timings: total time = 34570.79 ms
|
||||
```
|
||||
|
||||
|
||||
## Some results on Android with a `Snapdragon 778G` chip
|
||||
### MobileVLM-1.7B case
|
||||
#### llava-cli release-b2005
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
--image /data/local/tmp/many_llamas.jpeg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 18728.52 ms by CLIP ( 130.06 ms per image patch)
|
||||
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||
user_prompt: \nWhat's that? ASSISTANT:
|
||||
|
||||
A group of llamas are standing in a green pasture.
|
||||
|
||||
llama_print_timings: load time = 20357.33 ms
|
||||
llama_print_timings: sample time = 2.96 ms / 14 runs ( 0.21 ms per token, 4734.53 tokens per second)
|
||||
llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 ms per token, 23.52 tokens per second)
|
||||
llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second)
|
||||
llama_print_timings: total time = 28038.34 ms / 205 tokens
|
||||
```
|
||||
#### llava-cli latest-version
|
||||
**input**
|
||||
|
||||
Just the same as above.
|
||||
|
||||
**output** (seems to be much slower)
|
||||
```sh
|
||||
encode_image_with_clip: image embedding created: 144 tokens
|
||||
|
||||
encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch)
|
||||
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||
user_prompt: \nWhat's that? ASSISTANT:
|
||||
|
||||
It is a group of sheep standing together in a grass field.
|
||||
|
||||
llama_print_timings: load time = 818120.91 ms
|
||||
llama_print_timings: sample time = 3.44 ms / 14 runs ( 0.25 ms per token, 4067.40 tokens per second)
|
||||
llama_print_timings: prompt eval time = 529274.69 ms / 191 tokens ( 2771.07 ms per token, 0.36 tokens per second)
|
||||
llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 ms per token, 0.30 tokens per second)
|
||||
llama_print_timings: total time = 865441.76 ms / 204 tokens
|
||||
```
|
||||
### MobileVLM_V2-1.7B case
|
||||
#### llava-cli release-b2005
|
||||
**input**
|
||||
|
||||
Just the same as above.
|
||||
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 20609.61 ms by CLIP ( 143.12 ms per image patch)
|
||||
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||
user_prompt: \nWhat's that? ASSISTANT:
|
||||
|
||||
This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting.
|
||||
|
||||
The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama
|
||||
|
||||
llama_print_timings: load time = 22406.77 ms
|
||||
llama_print_timings: sample time = 49.26 ms / 186 runs ( 0.26 ms per token, 3776.27 tokens per second)
|
||||
llama_print_timings: prompt eval time = 9044.54 ms / 191 tokens ( 47.35 ms per token, 21.12 tokens per second)
|
||||
llama_print_timings: eval time = 14497.49 ms / 186 runs ( 77.94 ms per token, 12.83 tokens per second)
|
||||
llama_print_timings: total time = 44411.01 ms / 377 tokens
|
||||
```
|
||||
|
||||
## Orin compile and run
|
||||
### compile
|
||||
```sh
|
||||
make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
|
||||
```
|
||||
### run on Orin
|
||||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
./llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
--image /data/local/tmp/demo.jpeg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" \
|
||||
--n-gpu-layers 999
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
|
||||
encode_image_with_clip: image encoded in 296.62 ms by CLIP ( 2.06 ms per image patch)
|
||||
|
||||
Susan Wise Bauer
|
||||
|
||||
llama_print_timings: load time = 1067.64 ms
|
||||
llama_print_timings: sample time = 1.53 ms / 6 runs ( 0.25 ms per token, 3934.43 tokens per second)
|
||||
llama_print_timings: prompt eval time = 306.84 ms / 246 tokens ( 1.25 ms per token, 801.72 tokens per second)
|
||||
llama_print_timings: eval time = 91.50 ms / 6 runs ( 15.25 ms per token, 65.58 tokens per second)
|
||||
llama_print_timings: total time = 1352.63 ms / 252 tokens
|
||||
```
|
||||
|
||||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
./llama-llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
|
||||
--n-gpu-layers 999
|
||||
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 302.15 ms by CLIP ( 2.10 ms per image patch)
|
||||
|
||||
The image features a cat lying in the grass.
|
||||
|
||||
llama_print_timings: load time = 1057.07 ms
|
||||
llama_print_timings: sample time = 3.27 ms / 11 runs ( 0.30 ms per token, 3360.83 tokens per second)
|
||||
llama_print_timings: prompt eval time = 213.60 ms / 232 tokens ( 0.92 ms per token, 1086.14 tokens per second)
|
||||
llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 ms per token, 66.01 tokens per second)
|
||||
llama_print_timings: total time = 1365.47 ms / 243 tokens
|
||||
```
|
||||
|
||||
## Running on Intel(R) Core(TM) i7-10750H
|
||||
### Operating system
|
||||
Ubuntu 22.04
|
||||
### compile
|
||||
```sh
|
||||
make -j32
|
||||
```
|
||||
### MobileVLM-1.7B case
|
||||
**input**
|
||||
```sh
|
||||
-m /path/to/ggml-model-q4_k.gguf \
|
||||
--mmproj /path/to/mmproj-model-f16.gguf \
|
||||
--image /path/to/many_llamas.jpeg
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image embedding created: 144 tokens
|
||||
|
||||
encode_image_with_clip: image encoded in 2730.94 ms by CLIP ( 18.96 ms per image patch)
|
||||
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||
user_prompt: \nWhat's that?ASSISTANT:
|
||||
|
||||
A group of llamas are walking together in a field.
|
||||
|
||||
llama_print_timings: load time = 5506.60 ms
|
||||
llama_print_timings: sample time = 0.44 ms / 13 runs ( 0.03 ms per token, 29545.45 tokens per second)
|
||||
llama_print_timings: prompt eval time = 2031.58 ms / 190 tokens ( 10.69 ms per token, 93.52 tokens per second)
|
||||
llama_print_timings: eval time = 438.92 ms / 12 runs ( 36.58 ms per token, 27.34 tokens per second)
|
||||
llama_print_timings: total time = 5990.25 ms / 202 tokens
|
||||
```
|
||||
|
||||
### MobileVLM_V2-1.7B case
|
||||
**input**
|
||||
|
||||
Just the same as above.
|
||||
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image embedding created: 144 tokens
|
||||
|
||||
encode_image_with_clip: image encoded in 3223.89 ms by CLIP ( 22.39 ms per image patch)
|
||||
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||
user_prompt: \nWhat's that?ASSISTANT:
|
||||
|
||||
The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order.
|
||||
|
||||
The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment.
|
||||
|
||||
The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park.
|
||||
|
||||
llama_print_timings: load time = 6642.61 ms
|
||||
llama_print_timings: sample time = 8.15 ms / 223 runs ( 0.04 ms per token, 27358.61 tokens per second)
|
||||
llama_print_timings: prompt eval time = 2475.07 ms / 190 tokens ( 13.03 ms per token, 76.77 tokens per second)
|
||||
llama_print_timings: eval time = 8760.60 ms / 222 runs ( 39.46 ms per token, 25.34 tokens per second)
|
||||
llama_print_timings: total time = 15513.95 ms / 412 tokens
|
||||
```
|
||||
|
||||
## Running on Intel(R) Core(TM) Ultra 7 115H
|
||||
### Operating system
|
||||
Windows 11
|
||||
### compile
|
||||
```sh
|
||||
make -j32
|
||||
```
|
||||
### MobileVLM-1.7B case
|
||||
**input**
|
||||
```sh
|
||||
-m /path/to/ggml-model-q4_k.gguf \
|
||||
--mmproj /path/to/tmp/mmproj-model-f16.gguf \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 4902.81 ms by CLIP ( 34.05 ms per image patch)
|
||||
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||
user_prompt: \nWhat's that? ASSISTANT:
|
||||
|
||||
The image features a group of brown and white llamas standing in a grassy field.
|
||||
|
||||
llama_print_timings: load time = 7441.06 ms
|
||||
llama_print_timings: sample time = 0.72 ms / 19 runs ( 0.04 ms per token, 26279.39 tokens per second)
|
||||
llama_print_timings: prompt eval time = 2090.71 ms / 191 tokens ( 10.95 ms per token, 91.36 tokens per second)
|
||||
llama_print_timings: eval time = 512.35 ms / 18 runs ( 28.46 ms per token, 35.13 tokens per second)
|
||||
llama_print_timings: total time = 7987.23 ms / 209 tokens
|
||||
```
|
||||
|
||||
### MobileVLM_V2-1.7B case
|
||||
**input**
|
||||
|
||||
Just the same as above.
|
||||
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 4682.44 ms by CLIP ( 32.52 ms per image patch)
|
||||
system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
|
||||
user_prompt: \nWhat's that? ASSISTANT:
|
||||
|
||||
This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One
|
||||
of them, possibly the first in the line, has its back turned, perhaps observing something in the distance.
|
||||
|
||||
The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer.
|
||||
|
||||
The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in
|
||||
front is not visible, indicating that it might not be the main focus of the photo.
|
||||
|
||||
The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves.
|
||||
|
||||
|
||||
llama_print_timings: load time = 7015.35 ms
|
||||
llama_print_timings: sample time = 10.61 ms / 256 runs ( 0.04 ms per token, 24119.09 tokens per second)
|
||||
llama_print_timings: prompt eval time = 2052.45 ms / 191 tokens ( 10.75 ms per token, 93.06 tokens per second)
|
||||
llama_print_timings: eval time = 7259.43 ms / 255 runs ( 28.47 ms per token, 35.13 tokens per second)
|
||||
llama_print_timings: total time = 14371.19 ms / 446 tokens
|
||||
```
|
||||
|
||||
## TODO
|
||||
|
||||
- [x] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
|
||||
- [ ] Optimize LDP projector performance
|
||||
|
||||
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
|
||||
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
|
||||
- [x] run MobileVLM on `Jetson Orin`
|
||||
- [ ] Support more model variants, such as `MobileVLM-3B`.
|
||||
|
||||
|
||||
## Contributors
|
||||
```sh
|
||||
zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77
|
||||
```
|
|
@ -1,139 +0,0 @@
|
|||
# LLaVA
|
||||
|
||||
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
|
||||
as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
|
||||
|
||||
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
|
||||
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
|
||||
models are available.
|
||||
For llava-1.6, a variety of prepared gguf models are available as well: [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)
|
||||
|
||||
After the API is confirmed, more models will be supported / uploaded.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
|
||||
```
|
||||
|
||||
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
|
||||
**note**: For GPU offloading, make sure to use the `-ngl` flag just like usual.
|
||||
|
||||
## LLaVA 1.5
|
||||
|
||||
1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
|
||||
|
||||
```sh
|
||||
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
|
||||
|
||||
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
```
|
||||
|
||||
2. Install the required Python packages:
|
||||
|
||||
```sh
|
||||
pip install -r examples/llava/requirements.txt
|
||||
```
|
||||
|
||||
3. Use `llava_surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
|
||||
|
||||
## LLaVA 1.6 gguf conversion
|
||||
1) First clone a LLaVA 1.6 model:
|
||||
```console
|
||||
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
|
||||
```
|
||||
|
||||
2) Install the required Python packages:
|
||||
|
||||
```sh
|
||||
pip install -r examples/llava/requirements.txt
|
||||
```
|
||||
|
||||
3) Use `llava_surgery_v2.py`, which also supports llava-1.5 variants, in both pytorch and safetensor formats:
|
||||
```console
|
||||
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
```
|
||||
- You will find a `llava.projector` and a `llava.clip` file in your model directory.
|
||||
|
||||
4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
|
||||
```console
|
||||
mkdir vit
|
||||
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
|
||||
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
|
||||
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
|
||||
```
|
||||
|
||||
5) Create the visual gguf model:
|
||||
```console
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
```
|
||||
- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP.
|
||||
|
||||
6) Then convert the model to gguf format:
|
||||
```console
|
||||
python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
|
||||
```
|
||||
|
||||
7) And finally we can run the llava cli using the 1.6 model version:
|
||||
```console
|
||||
./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
|
||||
```
|
||||
|
||||
**note**: llava-1.6 needs more context than llava-1.5; at least 3000 is needed (just run it at `-c 4096`).
|
||||
**note**: llava-1.6 greatly benefits from batched prompt processing (the defaults work).
|
||||
|
||||
## llava-cli templating and llava-1.6 prompting
|
||||
|
||||
llava-1.5 models all use the same vicuna prompt; here you can just add your image question, e.g. `-p "Provide a full description."`.
|
||||
For llava-1.5 models which are not vicuna-based (Mistral and Yi), you need to adapt the system prompt as well as the user prompt; for this purpose, llava-cli has a basic templating system:
|
||||
|
||||
**For Mistral and using llava-cli binary:**
|
||||
Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
|
||||
The Mistral template for llava-1.6 appears to use no system prompt and a USER/ASSISTANT role structure.
|
||||
|
||||
**For the 34B this should work:**
|
||||
Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n`
|
||||
|
||||
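Putting this together, a hypothetical full invocation for a Mistral-based llava-1.6 model (the model path is only an example, and the `--mmproj` file comes from the conversion steps above) could look like:

```sh
./llama-llava-cli -m ../llava-v1.6-mistral-7b/ggml-model-f16.gguf \
    --mmproj vit/mmproj-model-f16.gguf \
    --image some-image.jpg -c 4096 --temp 0.1 \
    -p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"
```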
|
||||
## How to know if you are running in llava-1.5 or llava-1.6 mode
|
||||
|
||||
When running llava-cli, you will see visual information right before the prompt is processed:
|
||||
|
||||
**Llava-1.5:**
|
||||
`encode_image_with_clip: image embedding created: 576 tokens`
|
||||
|
||||
**Llava-1.6 (anything above 576):**
|
||||
`encode_image_with_clip: image embedding created: 2880 tokens`
|
||||
|
||||
|
||||
Alternatively, just note how many "tokens" have been used for your prompt; llava-1.6 will also show 1000+ tokens.
|
||||
|
||||
|
||||
|
||||
|
||||
## TODO
|
||||
|
||||
- [x] Support non-CPU backend for the image encoding part.
|
||||
- [ ] Support different sampling methods.
|
||||
- [ ] Support more model variants.
|
|
@ -40,11 +40,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
|
|||
return true;
|
||||
}
|
||||
|
||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
||||
static const char * sample(struct gpt_sampler * smpl,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
||||
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
|
||||
gpt_sampler_accept(smpl, id, true);
|
||||
static std::string ret;
|
||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
||||
ret = "</s>";
|
||||
|
@ -191,15 +191,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
|
||||
LOG_TEE("\n");
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||
if (!ctx_sampling) {
|
||||
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||
if (!smpl) {
|
||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::string response = "";
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
|
@ -211,7 +211,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
fflush(stdout);
|
||||
}
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
gpt_sampler_free(smpl);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
@ -310,7 +310,7 @@ int main(int argc, char ** argv) {
|
|||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_print_timings(ctx_llava->ctx_llama);
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
|
|||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_print_timings(ctx_llava->ctx_llama);
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
|
|
@ -163,11 +163,11 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
|||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
}
|
||||
|
||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
||||
static const char * sample(struct gpt_sampler * smpl,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
||||
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
|
||||
gpt_sampler_accept(smpl, id, true);
|
||||
static std::string ret;
|
||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
||||
ret = "</s>";
|
||||
|
@ -214,7 +214,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
|
|||
return ctx_llava;
|
||||
}
|
||||
|
||||
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
||||
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
||||
std::string user_prompt = prompt;
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (!is_first) {
|
||||
|
@ -238,13 +238,13 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
|
|||
|
||||
LOG_TEE("\n");
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||
return ctx_sampling;
|
||||
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||
return smpl;
|
||||
}
|
||||
|
||||
static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
|
||||
static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
|
||||
|
||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||
return tmp;
|
||||
}
|
||||
|
||||
|
@ -278,12 +278,12 @@ int main(int argc, char ** argv) {
|
|||
if (!params.prompt.empty()) {
|
||||
LOG_TEE("<user>%s\n", params.prompt.c_str());
|
||||
LOG_TEE("<assistant>");
|
||||
auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true);
|
||||
auto smpl = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response = "";
|
||||
bool have_tmp = false;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
|
||||
auto tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0){
|
||||
if(!have_tmp)continue;
|
||||
|
@ -296,18 +296,18 @@ int main(int argc, char ** argv) {
|
|||
|
||||
fflush(stdout);
|
||||
}
|
||||
llama_sampling_free(ctx_sampling);
|
||||
gpt_sampler_free(smpl);
|
||||
}else {
|
||||
while (true) {
|
||||
LOG_TEE("<user>");
|
||||
std::string prompt;
|
||||
std::getline(std::cin, prompt);
|
||||
LOG_TEE("<assistant>");
|
||||
auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
auto smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response = "";
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
|
||||
auto tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
|
@ -315,11 +315,11 @@ int main(int argc, char ** argv) {
|
|||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
fflush(stdout);
|
||||
}
|
||||
llama_sampling_free(ctx_sampling);
|
||||
gpt_sampler_free(smpl);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
llama_print_timings(ctx_llava->ctx_llama);
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
|
|
@ -1,7 +0,0 @@
|
|||
# llama.cpp/examples/lookahead
|
||||
|
||||
Demonstration of lookahead decoding technique:
|
||||
|
||||
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||
|
||||
More info: https://github.com/ggerganov/llama.cpp/pull/4207
|
|
@ -1,7 +1,6 @@
|
|||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -118,7 +117,7 @@ int main(int argc, char ** argv) {
|
|||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||
struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
|
||||
|
||||
// verification n-grams
|
||||
std::vector<ngram_data> ngrams_cur(G);
|
||||
|
@ -159,9 +158,9 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// sample first token
|
||||
{
|
||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
|
||||
id = gpt_sampler_sample(smpl, ctx, 0);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
gpt_sampler_accept(smpl, id, true);
|
||||
|
||||
{
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
|
@ -284,9 +283,9 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// sample the next token
|
||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
|
||||
id = gpt_sampler_sample(smpl, ctx, i_batch);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
gpt_sampler_accept(smpl, id, true);
|
||||
|
||||
// print
|
||||
{
|
||||
|
@ -361,7 +360,7 @@ int main(int argc, char ** argv) {
|
|||
if (v == 0) {
|
||||
// sample from the last level
|
||||
for (int i = 0; i < W; i++) {
|
||||
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||
tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < W; i++) {
|
||||
|
@ -468,10 +467,12 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
gpt_perf_print(ctx, smpl);
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
|
||||
llama_kv_cache_view_free(&kvc_view);
|
||||
llama_sampling_free(ctx_sampling);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
|
|
@ -1,12 +0,0 @@
|
|||
# llama.cpp/examples/lookup
|
||||
|
||||
Demonstration of Prompt Lookup Decoding
|
||||
|
||||
https://github.com/apoorvumang/prompt-lookup-decoding
|
||||
|
||||
The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft`. The first two determine the size of the ngrams to search for in the prompt for a match. The latter specifies how many subsequent tokens to draft if a match is found.
|
||||
|
||||
More info:
|
||||
|
||||
https://github.com/ggerganov/llama.cpp/pull/4484
|
||||
https://github.com/ggerganov/llama.cpp/issues/4226
|
|
@ -3,13 +3,11 @@
|
|||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
@ -106,7 +104,7 @@ int main(int argc, char ** argv){
|
|||
|
||||
bool has_eos = false;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||
struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
|
||||
|
||||
std::vector<llama_token> draft;
|
||||
|
||||
|
@ -130,9 +128,9 @@ int main(int argc, char ** argv){
|
|||
int i_dft = 0;
|
||||
while (true) {
|
||||
// sample from the target model
|
||||
llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
|
||||
llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
gpt_sampler_accept(smpl, id, true);
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
|
||||
|
@ -240,10 +238,12 @@ int main(int argc, char ** argv){
|
|||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_TEE("\ntarget:\n");
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\ntarget:\n\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
llama_batch_free(batch_tgt);
|
||||
|
||||
llama_free(ctx);
|
||||
|
|
|
@ -1,33 +0,0 @@
|
|||
# llama.cpp/example/main-cmake-pkg
|
||||
|
||||
This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
|
||||
|
||||
## Building
|
||||
|
||||
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
|
||||
|
||||
### Considerations
|
||||
|
||||
When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
|
||||
|
||||
### Build llama.cpp and install to C:\LlamaCPP directory
|
||||
|
||||
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
|
||||
|
||||
```cmd
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix C:/LlamaCPP
|
||||
```
|
||||
|
||||
### Build llama-cli-cmake-pkg
|
||||
|
||||
|
||||
```cmd
|
||||
cd ..\examples\main-cmake-pkg
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix C:/MyLlamaApp
|
||||
```
|
|
@ -1,319 +0,0 @@
|
|||
# llama.cpp/examples/main
|
||||
|
||||
This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Quick Start](#quick-start)
|
||||
2. [Common Options](#common-options)
|
||||
3. [Input Prompts](#input-prompts)
|
||||
4. [Interaction](#interaction)
|
||||
5. [Context Management](#context-management)
|
||||
6. [Generation Flags](#generation-flags)
|
||||
7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
|
||||
8. [Additional Options](#additional-options)
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
First, we will need to download a model. In these examples, we will use the Gemma model from the ggml-org repo on Hugging Face.
|
||||
[https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)
|
||||
|
||||
Once downloaded, place your model in the models folder in llama.cpp.
|
||||
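If you prefer the command line, a minimal download sketch (assuming `curl` is available and that you run it from the llama.cpp root) would be:

```bash
# Download the Gemma GGUF into the models folder used by the examples below
curl -L -o models/gemma-1.1-7b-it.Q4_K_M.gguf \
  "https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true"
```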
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
##### Input prompt (One-and-done)
|
||||
|
||||
```bash
|
||||
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
|
||||
```
|
||||
##### Conversation mode (Allow for continuous interaction with the model)
|
||||
|
||||
```bash
|
||||
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
|
||||
```
|
||||
|
||||
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
```bash
|
||||
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
||||
```
|
||||
|
||||
### Windows:
|
||||
|
||||
##### Input prompt (One-and-done)
|
||||
```powershell
|
||||
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
|
||||
```
|
||||
##### Conversation mode (Allow for continuous interaction with the model)
|
||||
|
||||
```powershell
|
||||
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
|
||||
```
|
||||
|
||||
##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
|
||||
```powershell
|
||||
llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
||||
```
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set).
|
||||
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote HTTP URL from which to download the file (e.g. [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). LLaMA models were built with a context of 2048 to 8192 tokens; a larger context will provide better results for longer input/inference.
|
||||
- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
|
||||
- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
|
||||
- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
|
||||
## Input Prompts
|
||||
|
||||
The `llama-cli` program provides several ways to interact with the LLaMA models using input prompts:
|
||||
|
||||
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
|
||||
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
|
||||
## Interaction
|
||||
|
||||
The `llama-cli` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
|
||||
|
||||
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
|
||||
|
||||
### Interaction Options
|
||||
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
|
||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
||||
- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false)
|
||||
- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
|
||||
|
||||
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
||||
|
||||
### Reverse Prompts
|
||||
|
||||
Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:
|
||||
|
||||
- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.
|
||||
|
||||
To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.
|
||||
|
||||
### In-Prefix
|
||||
|
||||
The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./llama-cli -r "User:" --in-prefix " "
|
||||
```
|
||||
|
||||
### In-Suffix
|
||||
|
||||
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
||||
```
|
||||
When the `--in-prefix` or `--in-suffix` options are enabled, the chat template (`--chat-template`) is disabled.
|
||||
|
||||
### Chat templates
|
||||
|
||||
`--chat-template JINJA_TEMPLATE`: This option sets a custom Jinja chat template. It accepts a string, not a file name. Default: template taken from the model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When the `--in-prefix` or `--in-suffix` options are enabled, the chat template (`--chat-template`) is disabled.
|
||||
|
||||
Example usage: `--chat-template gemma`
|
||||
|
||||
## Context Management
|
||||
|
||||
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
|
||||
|
||||
### Context Size
|
||||
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference.
|
||||
|
||||
### Extended Context Size
|
||||
|
||||
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting `--ctx-size` to 32768 (32k) and `--rope-scale` to 8, as in the sketch below.
|
||||
|
||||
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
|
||||
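As a rough sketch of that 4k to 32k case (the model filename is a placeholder, not a file shipped with llama.cpp):

```bash
# 32k fine-tune of a 4k base model: context of 32768 with a linear RoPE scale of 8
./llama-cli -m models/my-32k-finetune.Q4_K_M.gguf -c 32768 --rope-scale 8 --prompt "Once upon a time"
```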
|
||||
### Keep Prompt
|
||||
|
||||
The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
|
||||
|
||||
- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
||||
|
||||
By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
|
||||
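For instance, a minimal sketch combining the two options (the prompt file name is an assumption):

```bash
# Keep the entire initial prompt when the 4096-token context overflows
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -c 4096 --keep -1 --file prompt.txt
```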
|
||||
## Generation Flags
|
||||
|
||||
The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case.
|
||||
|
||||
### Number of Tokens to Predict
|
||||
|
||||
- `-n N, --predict N`: Set the number of tokens to predict when generating text (default: -1, -1 = infinity, -2 = until context filled)
|
||||
|
||||
The `--predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
|
||||
|
||||
A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.
|
||||
|
||||
If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.
|
||||
|
||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
|
||||
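A small sketch of the two non-default modes discussed above (model path as in the Quick Start):

```bash
# Stop as soon as the context is full, without the re-evaluation pause
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" -n -2

# Never stop on an EOS token; generation only ends when you interrupt it
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" -n -1 --ignore-eos
```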
|
||||
### Temperature
|
||||
|
||||
- `--temp N`: Adjust the randomness of the generated text (default: 0.8).
|
||||
|
||||
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||
|
||||
Example usage: `--temp 0`
|
||||
|
||||
### Repeat Penalty
|
||||
|
||||
- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.0, 1.0 = disabled).
|
||||
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
|
||||
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
|
||||
|
||||
The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.
|
||||
|
||||
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
|
||||
|
||||
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
|
||||
|
||||
Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
|
||||
|
||||
### Top-K Sampling
|
||||
|
||||
- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).
|
||||
|
||||
Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
|
||||
|
||||
Example usage: `--top-k 30`
|
||||
|
||||
### Top-P Sampling
|
||||
|
||||
- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
||||
|
||||
Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
|
||||
|
||||
Example usage: `--top-p 0.95`
|
||||
|
||||
### Min-P Sampling
|
||||
|
||||
- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.1).
|
||||
|
||||
The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
|
||||
|
||||
Example usage: `--min-p 0.05`
|
||||
|
||||
### Tail-Free Sampling (TFS)
|
||||
|
||||
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
|
||||
|
||||
Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.
|
||||
|
||||
Example usage: `--tfs 0.95`
|
||||
|
||||
### Locally Typical Sampling
|
||||
|
||||
- `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
|
||||
|
||||
Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling.
|
||||
|
||||
Example usage: `--typical 0.9`
|
||||
|
||||
### Mirostat Sampling
|
||||
|
||||
- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
|
||||
- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
|
||||
- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
|
||||
|
||||
Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).
|
||||
|
||||
The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
|
||||
|
||||
The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
|
||||
|
||||
Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
|
||||
|
||||
### Logit Bias
|
||||
|
||||
- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
|
||||
|
||||
The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated.
|
||||
|
||||
For example, use `--logit-bias 15043+1` to increase the likelihood of the token 'Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf` ensures that the token `Hello` is never produced.
|
||||
|
||||
A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.)
|
||||
|
||||
Example usage: `--logit-bias 29905-inf`
|
||||
|
||||
### RNG Seed
|
||||
|
||||
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
|
||||
|
||||
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
|
||||
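For example, a sketch of a reproducible run (any fixed non-negative seed works; 42 is arbitrary):

```bash
# Two invocations with the same seed, input, and settings should produce the same text
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" -n 64 -s 42
```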
|
||||
## Performance Tuning and Memory Options
|
||||
|
||||
These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case.
|
||||
|
||||
### Number of Threads
|
||||
|
||||
- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. In some systems, it is beneficial to use a higher number of threads during batch processing than during generation. If not specified, the number of threads used for batch processing will be the same as the number of threads used for generation.
|
||||
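As a sketch, on a hypothetical machine with 8 physical cores and 16 hardware threads you might try:

```bash
# 8 generation threads (physical cores), 16 threads for prompt/batch processing
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" -t 8 -tb 16
```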
|
||||
### Mlock
|
||||
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
|
||||
|
||||
### No Memory Mapping
|
||||
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
|
||||
|
||||
### NUMA support
|
||||
|
||||
- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
|
||||
- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
|
||||
- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allows arbitrary core usage patterns, for example a map that uses all the cores on one NUMA node and just enough cores on a second node to saturate the inter-node memory bus.
|
||||
|
||||
These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
|
||||
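A hedged sketch of a fresh benchmark-style run on a multi-socket Linux machine:

```bash
# Drop the page cache first so the NUMA-aware mmap faulting actually matters (Linux, as root)
sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'

# Spread threads and memory accesses across all NUMA nodes
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" --numa distribute
```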
|
||||
### Memory Float 32
|
||||
|
||||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
|
||||
|
||||
### Batch Size
|
||||
|
||||
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
|
||||
|
||||
- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
|
||||
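A sketch of lowering both values on a CPU-only build without BLAS, so that prompt processing progress is visible (the prompt file name is an assumption):

```bash
# Small logical and physical batch sizes: the prompt is evaluated in small steps
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --file prompt.txt -b 8 -ub 8
```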
|
||||
### Prompt Caching
|
||||
|
||||
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
|
||||
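A sketch under the assumption that `prompt.txt` holds a long prompt and `session.bin` is a cache file name of your choosing:

```bash
# The first run evaluates the prompt and writes the cache; later runs with the same
# prompt and cache file start up much faster
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --file prompt.txt --prompt-cache session.bin -n 128
```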
|
||||
### Grammars & JSON schemas
|
||||
|
||||
- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
|
||||
|
||||
- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
|
||||
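For example, a hedged sketch (assuming the repository's `grammars/json.gbnf` file is available in your working directory):

```bash
# Constrain the model to emit valid JSON
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --grammar-file grammars/json.gbnf \
  --prompt "List three fruits as a JSON array:" -n 128
```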
|
||||
### Quantization
|
||||
|
||||
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
|
||||
|
||||
## Additional Options
|
||||
|
||||
These options provide extra functionality and customization when running the LLaMA models:
|
||||
|
||||
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
|
||||
- `--verbose-prompt`: Print the prompt before generating text.
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
- `-hfr URL, --hf-repo URL`: The URL of the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
|
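A sketch using the repository and file from the Quick Start section:

```bash
# Download (if not already cached) and run directly from the Hugging Face repo
./llama-cli -hfr ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF -hff gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
```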
|
@ -34,6 +34,7 @@
|
|||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static gpt_sampler ** g_smpl;
|
||||
static gpt_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
|
@ -93,7 +94,7 @@ static void write_logfile(
|
|||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
llama_perf_dump_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
|
@ -106,7 +107,7 @@ static void sigint_handler(int signo) {
|
|||
} else {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
llama_print_timings(*g_ctx);
|
||||
gpt_perf_print(*g_ctx, *g_smpl);
|
||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||
_exit(130);
|
||||
}
|
||||
|
@ -122,8 +123,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
|
|||
|
||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
||||
llama_chat_msg new_msg{role, content};
|
||||
auto formatted = llama_chat_format_single(
|
||||
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||
chat_msgs.push_back({role, content});
|
||||
LOG("formatted: %s\n", formatted.c_str());
|
||||
return formatted;
|
||||
|
@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("main", "log"));
|
||||
|
@ -184,27 +184,23 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context * ctx_guidance = NULL;
|
||||
llama_model * model = nullptr;
|
||||
llama_context * ctx = nullptr;
|
||||
gpt_sampler * smpl = nullptr;
|
||||
|
||||
std::vector<llama_chat_msg> chat_msgs;
|
||||
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
g_smpl = &smpl;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
|
@ -212,10 +208,6 @@ int main(int argc, char ** argv) {
|
|||
|
||||
model = llama_init.model;
|
||||
ctx = llama_init.context;
|
||||
if (sparams.cfg_scale > 1.f) {
|
||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||
|
@ -252,9 +244,6 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
|
||||
if (ctx_guidance) {
|
||||
llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
@ -338,24 +327,6 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
std::vector<llama_token> guidance_inp;
|
||||
int guidance_offset = 0;
|
||||
int original_prompt_len = 0;
|
||||
if (ctx_guidance) {
|
||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
|
||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||
|
||||
original_prompt_len = original_inp.size();
|
||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
|
||||
LOG("guidance_offset: %s", log_tostr(guidance_offset));
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
|
@ -422,15 +393,6 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (ctx_guidance) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > add_bos) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
|
@ -496,8 +458,15 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
|
||||
|
||||
smpl = gpt_sampler_init(model, sparams);
|
||||
if (!smpl) {
|
||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
|
||||
LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
// group-attention state
|
||||
|
@ -544,7 +513,6 @@ int main(int argc, char ** argv) {
|
|||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
int n_session_consumed = 0;
|
||||
int n_past_guidance = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
|
@ -556,7 +524,6 @@ int main(int argc, char ** argv) {
|
|||
display = params.display_prompt;
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
// tokenized antiprompts
|
||||
std::vector<std::vector<llama_token>> antiprompt_ids;
|
||||
|
@ -566,12 +533,6 @@ int main(int argc, char ** argv) {
|
|||
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
|
||||
}
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
if (!ctx_sampling) {
|
||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
int enc_input_size = embd_inp.size();
|
||||
llama_token * enc_input_buf = embd_inp.data();
|
||||
|
@ -613,7 +574,7 @@ int main(int argc, char ** argv) {
|
|||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
|
||||
if (n_past + (int) embd.size() >= n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
|
@ -630,11 +591,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
n_past -= n_discard;
|
||||
|
||||
if (ctx_guidance) {
|
||||
n_past_guidance -= n_discard;
|
||||
}
|
||||
|
||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||
LOG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
|
||||
|
@ -687,46 +644,6 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
if (ctx_guidance) {
|
||||
int input_size = 0;
|
||||
llama_token * input_buf = NULL;
|
||||
|
||||
if (n_past_guidance < (int) guidance_inp.size()) {
|
||||
// Guidance context should have the same data with these modifications:
|
||||
//
|
||||
// * Replace the initial prompt
|
||||
// * Shift everything by guidance_offset
|
||||
embd_guidance = guidance_inp;
|
||||
if (embd.begin() + original_prompt_len < embd.end()) {
|
||||
embd_guidance.insert(
|
||||
embd_guidance.end(),
|
||||
embd.begin() + original_prompt_len,
|
||||
embd.end()
|
||||
);
|
||||
}
|
||||
|
||||
input_buf = embd_guidance.data();
|
||||
input_size = embd_guidance.size();
|
||||
|
||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
|
||||
} else {
|
||||
input_buf = embd.data();
|
||||
input_size = embd.size();
|
||||
}
|
||||
|
||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||
int n_eval = std::min(input_size - i, params.n_batch);
|
||||
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past_guidance += n_eval;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
|
@ -756,7 +673,6 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
embd.clear();
|
||||
embd_guidance.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
// optionally save the session on first sample (for faster prompt loading next time)
|
||||
|
@ -767,11 +683,11 @@ int main(int argc, char ** argv) {
|
|||
LOG("saved session to %s\n", path_session.c_str());
|
||||
}
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
|
||||
gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
|
@ -790,7 +706,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
|
||||
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
|
@ -833,7 +749,7 @@ int main(int argc, char ** argv) {
|
|||
// check for reverse prompt in the last n_prev tokens
|
||||
if (!params.antiprompt.empty()) {
|
||||
const int n_prev = 32;
|
||||
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
|
||||
const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
|
||||
|
||||
is_antiprompt = false;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
|
@ -855,7 +771,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// check for reverse prompt using special tokens
|
||||
llama_token last_token = llama_sampling_last(ctx_sampling);
|
||||
llama_token last_token = gpt_sampler_last(smpl);
|
||||
for (std::vector<llama_token> ids : antiprompt_ids) {
|
||||
if (ids.size() == 1 && last_token == ids[0]) {
|
||||
if (params.interactive) {
|
||||
|
@ -872,7 +788,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// deal with end of generation tokens in interactive mode
|
||||
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
||||
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||
LOG("found an EOG token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
@ -893,7 +809,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation) {
|
||||
auto id = llama_sampling_last(ctx_sampling);
|
||||
const auto id = gpt_sampler_last(smpl);
|
||||
assistant_ss << llama_token_to_piece(ctx, id, false);
|
||||
}
|
||||
|
||||
|
@ -989,7 +905,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
llama_sampling_reset(ctx_sampling);
|
||||
gpt_sampler_reset(smpl);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
|
@ -1014,14 +930,15 @@ int main(int argc, char ** argv) {
|
|||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
gpt_perf_print(ctx, smpl);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
if (ctx_guidance) { llama_free(ctx_guidance); }
|
||||
gpt_sampler_free(smpl);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
# llama.cpp/example/parallel
|
||||
|
||||
Simplified simulation of serving incoming requests in parallel
|
|
@ -52,8 +52,8 @@ static std::vector<std::string> k_prompts = {
|
|||
|
||||
struct client {
|
||||
~client() {
|
||||
if (ctx_sampling) {
|
||||
llama_sampling_free(ctx_sampling);
|
||||
if (smpl) {
|
||||
gpt_sampler_free(smpl);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -74,7 +74,7 @@ struct client {
|
|||
std::string prompt;
|
||||
std::string response;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = nullptr;
|
||||
struct gpt_sampler * smpl = nullptr;
|
||||
};
|
||||
|
||||
static void print_date_time() {
|
||||
|
@ -163,7 +163,7 @@ int main(int argc, char ** argv) {
|
|||
for (size_t i = 0; i < clients.size(); ++i) {
|
||||
auto & client = clients[i];
|
||||
client.id = i;
|
||||
client.ctx_sampling = llama_sampling_init(params.sparams);
|
||||
client.smpl = gpt_sampler_init(model, params.sparams);
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens_system;
|
||||
|
@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
|
|||
client.prompt = client.input + "\nAssistant:";
|
||||
client.response = "";
|
||||
|
||||
llama_sampling_reset(client.ctx_sampling);
|
||||
gpt_sampler_reset(client.smpl);
|
||||
|
||||
// do not prepend BOS because we have a system prompt!
|
||||
std::vector<llama_token> tokens_prompt;
|
||||
|
@ -343,9 +343,9 @@ int main(int argc, char ** argv) {
|
|||
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
||||
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
||||
|
||||
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
|
||||
const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
|
||||
|
||||
llama_sampling_accept(client.ctx_sampling, ctx, id, true);
|
||||
gpt_sampler_accept(client.smpl, id, true);
|
||||
|
||||
if (client.n_decoded == 1) {
|
||||
// start measuring generation time after the first token to make sure all concurrent clients
|
||||
|
@ -415,7 +415,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
LOG_TEE("\n");
|
||||
|
||||
llama_print_timings(ctx);
|
||||
// TODO: print sampling/grammar timings for all clients
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
|
|
@ -1,15 +0,0 @@
|
|||
# llama.cpp/example/passkey
|
||||
|
||||
A passkey retrieval task is an evaluation method used to measure a language
|
||||
model's ability to recall information from long contexts.
|
||||
|
||||
See the following PRs for more info:
|
||||
|
||||
- https://github.com/ggerganov/llama.cpp/pull/3856
|
||||
- https://github.com/ggerganov/llama.cpp/pull/4810
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
make -j && ./llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
|
||||
```
|
|
@ -26,8 +26,6 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
|
||||
|
||||
int n_junk = params.n_junk;
|
||||
int n_keep = params.n_keep;
|
||||
int n_grp = params.grp_attn_n;
|
||||
|
@ -80,12 +78,17 @@ int main(int argc, char ** argv) {
|
|||
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> tokens_list;
|
||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
@ -217,20 +220,9 @@ int main(int argc, char ** argv) {
|
|||
while (n_cur <= n_len) {
|
||||
// sample the next token
|
||||
{
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// sample the most likely token
|
||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
llama_sampler_accept(smpl, new_token_id);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
|
@ -267,10 +259,13 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
|
|
|
@ -1,193 +0,0 @@
|
|||
# Perplexity
|
||||
|
||||
The `perplexity` example can be used to calculate the so-called perplexity value of a language model over a given text corpus.
|
||||
Perplexity measures how well the model can predict the next token, with lower values being better.
|
||||
Note that perplexity is **not** directly comparable between models, especially if they use different tokenizers.
|
||||
Also note that finetunes typically result in a higher perplexity value even though the human-rated quality of outputs increases.
|
||||
|
||||
Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
|
||||
The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
|
||||
When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
|
||||
llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.
|
||||
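As a sketch of a typical run (the `llama-perplexity` binary name, the model file, and the Wikitext-2 path are assumptions based on a default build and the download script):

```bash
# Mean perplexity and uncertainty over the Wikitext-2 test set
./llama-perplexity -m models/llama-2-7b.Q4_K_M.gguf -f wikitext-2-raw/wiki.test.raw
```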
|
||||
By default only the mean perplexity value and the corresponding uncertainty is calculated.
|
||||
The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per token and then applying error propagation.
|
||||
|
||||
More statistics can be obtained by recording the logits from the FP16 version of a model.
|
||||
To do this, supply `perplexity` with `--kl-divergence-base path/to/logit/binary/file.kld`.
|
||||
The program will then record all logits and save them to the provided path in binary format.
|
||||
**The logit file will be very large, 11 GiB for LLaMA 2 or 37 GiB for LLaMA 3 when using the Wikitext-2 test set.**
|
||||
Once you have the file, supply `perplexity` with the quantized model, the logits file via `--kl-divergence-base`,
|
||||
and finally the `--kl-divergence` argument to indicate that the program should calculate the so-called Kullback-Leibler divergence.
|
||||
This is a measure of how similar the FP16 and the quantized logit distributions are, with a value of 0 indicating that the distributions are the same.
|
||||
The uncertainty on the mean KL divergence is calculated by assuming the KL divergence per token follows a Gaussian distribution.
|
||||
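A hedged two-step sketch of this workflow (binary, model, and file names are placeholders):

```bash
# Step 1: run the FP16 model and record its logits to a binary file
./llama-perplexity -m models/model-f16.gguf -f wikitext-2-raw/wiki.test.raw \
    --kl-divergence-base model-f16.kld

# Step 2: evaluate a quantized model against the recorded logits
./llama-perplexity -m models/model-q4_K_M.gguf -f wikitext-2-raw/wiki.test.raw \
    --kl-divergence-base model-f16.kld --kl-divergence
```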
|
||||
In addition to the KL divergence the following statistics are calculated with `--kl-divergence`:
|
||||
|
||||
* Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed; it is 0 if the logit distributions are the same.
|
||||
* Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated.
|
||||
* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
|
||||
* Pearson correlation coefficient of the "correct" token probabilities between models.
|
||||
* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
|
||||
* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated by assuming that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 .
|
||||
* Same top p: Percentage of how often the token was assigned the highest probability by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
|
||||
|
||||
## LLaMA 3 8b Scoreboard
|
||||
|
||||
| Revision | f364eb6f |
|
||||
|:---------|:-------------------|
|
||||
| Backend | CUDA |
|
||||
| CPU | AMD Epyc 7742 |
|
||||
| GPU | 1x NVIDIA RTX 4090 |
|
||||
|
||||
Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
|
||||
The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
|
||||
Note: the FP16 logits used for the calculation of all metrics other than perplexity are stored in a binary file between runs.
|
||||
In order to save space this file does **not** contain the exact same FP32 logits but instead casts them to 16 bit unsigned integers (with some scaling).
|
||||
So the "f16" results are to be understood as the difference resulting only from this downcast.
|
||||
|
||||
| Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp |
|
||||
|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------|
|
||||
| f16 | None | 14.97 | 6.233160 ± 0.037828 | 0.001524 ± 0.000755 | 0.000551 ± 0.000002 | 0.001 ± 0.002 % | 0.787 ± 0.004 % |
|
||||
| q8_0 | None | 7.96 | 6.234284 ± 0.037878 | 0.002650 ± 0.001006 | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % |
|
||||
| q6_K | None | 6.14 | 6.253382 ± 0.038078 | 0.021748 ± 0.001852 | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % |
|
||||
| q5_K_M | None | 5.33 | 6.288607 ± 0.038338 | 0.056974 ± 0.002598 | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % |
|
||||
| q5_K_S | None | 5.21 | 6.336598 ± 0.038755 | 0.104964 ± 0.003331 | 0.016595 ± 0.000122 | -0.223 ± 0.010 % | 3.918 ± 0.036 % |
|
||||
| q5_1 | None | 5.65 | 6.337857 ± 0.038677 | 0.106223 ± 0.003476 | 0.018045 ± 0.000139 | -0.287 ± 0.011 % | 4.123 ± 0.039 % |
|
||||
| q5_0 | None | 5.21 | 6.363224 ± 0.038861 | 0.131591 ± 0.003894 | 0.022239 ± 0.000166 | -0.416 ± 0.012 % | 4.634 ± 0.043 % |
|
||||
| q4_K_M | WT 10m | 4.58 | 6.382937 ± 0.039055 | 0.151303 ± 0.004429 | 0.028152 ± 0.000240 | -0.389 ± 0.014 % | 5.251 ± 0.049 % |
|
||||
| q4_K_M | None | 4.58 | 6.407115 ± 0.039119 | 0.175482 ± 0.004620 | 0.031273 ± 0.000238 | -0.596 ± 0.014 % | 5.519 ± 0.050 % |
|
||||
| q4_K_S | WT 10m | 4.37 | 6.409697 ± 0.039189 | 0.178064 ± 0.004744 | 0.031951 ± 0.000259 | -0.531 ± 0.015 % | 5.645 ± 0.051 % |
|
||||
| iq4_NL | WT 10m | 4.35 | 6.455593 ± 0.039630 | 0.223959 ± 0.005201 | 0.035742 ± 0.000288 | -0.590 ± 0.016 % | 5.998 ± 0.054 % |
|
||||
| iq4_XS | WT 10m | 4.14 | 6.459705 ± 0.039595 | 0.228071 ± 0.005207 | 0.036334 ± 0.000284 | -0.668 ± 0.016 % | 6.044 ± 0.054 % |
|
||||
| q4_K_S | None | 4.37 | 6.500529 ± 0.039778 | 0.268895 ± 0.005638 | 0.043136 ± 0.000314 | -0.927 ± 0.017 % | 6.562 ± 0.055 % |
|
||||
| q4_1 | None | 4.78 | 6.682737 ± 0.041285 | 0.451103 ± 0.008030 | 0.071683 ± 0.000505 | -0.927 ± 0.017 % | 8.512 ± 0.063 % |
|
||||
| q4_0 | None | 4.34 | 6.700147 ± 0.041226 | 0.468514 ± 0.007951 | 0.071940 ± 0.000491 | -1.588 ± 0.022 % | 8.434 ± 0.061 % |
|
||||
| q3_K_L | WT 10m | 4.03 | 6.671223 ± 0.041427 | 0.439590 ± 0.008154 | 0.073077 ± 0.000529 | -0.940 ± 0.023 % | 8.662 ± 0.064 % |
|
||||
| q3_K_M | WT 10m | 3.74 | 6.734255 ± 0.041838 | 0.502622 ± 0.008901 | 0.084358 ± 0.000588 | -1.198 ± 0.024 % | 9.292 ± 0.065 % |
|
||||
| q3_K_L | None | 4.03 | 6.787876 ± 0.042104 | 0.556242 ± 0.009171 | 0.087176 ± 0.000614 | -1.532 ± 0.025 % | 9.432 ± 0.067 % |
|
||||
| q3_K_M | None | 3.74 | 6.888498 ± 0.042669 | 0.656864 ± 0.010071 | 0.101913 ± 0.000677 | -1.990 ± 0.026 % | 10.203 ± 0.068 % |
|
||||
| iq3_M | WT 10m | 3.53 | 6.898327 ± 0.041643 | 0.666694 ± 0.009449 | 0.102534 ± 0.000663 | -3.178 ± 0.026 % | 10.513 ± 0.066 % |
|
||||
| iq3_S | WT 10m | 3.42 | 6.965501 ± 0.042406 | 0.733867 ± 0.010245 | 0.111278 ± 0.000710 | -3.066 ± 0.027 % | 10.845 ± 0.068 % |
|
||||
| iq3_XS | WT 10m | 3.28 | 7.163043 ± 0.043772 | 0.931409 ± 0.012084 | 0.138693 ± 0.000857 | -3.667 ± 0.031 % | 12.148 ± 0.070 % |
|
||||
| iq3_XXS | WT 10m | 3.05 | 7.458436 ± 0.046404 | 1.226803 ± 0.015234 | 0.183625 ± 0.001042 | -3.918 ± 0.035 % | 13.836 ± 0.074 % |
|
||||
| q3_K_S | WT 10m | 3.41 | 7.602878 ± 0.046848 | 1.371244 ± 0.015688 | 0.199821 ± 0.001008 | -5.046 ± 0.037 % | 14.980 ± 0.070 % |
|
||||
| q3_K_S | None | 3.41 | 7.863786 ± 0.048885 | 1.632152 ± 0.017733 | 0.228217 ± 0.001079 | -5.604 ± 0.038 % | 15.541 ± 0.070 % |
|
||||
| iq2_M | WT 10m | 2.74 | 8.600799 ± 0.055124 | 2.369166 ± 0.025244 | 0.325989 ± 0.00160 | -6.463 ± 0.046 % | 18.519 ± 0.080 % |
|
||||
| q2_K | WT 10k | 2.96 | 8.652290 ± 0.055572 | 2.420657 ± 0.025587 | 0.331393 ± 0.001562 | -6.606 ± 0.046 % | 18.790 ± 0.078 % |
|
||||
| q2_K | WT 100k | 2.96 | 8.641993 ± 0.055406 | 2.410359 ± 0.025495 | 0.331672 ± 0.001569 | -6.628 ± 0.047 % | 18.856 ± 0.078 % |
|
||||
| q2_K | WT 10m | 2.96 | 8.647825 ± 0.055610 | 2.416191 ± 0.025683 | 0.332223 ± 0.001572 | -6.500 ± 0.047 % | 18.881 ± 0.078 % |
|
||||
| q2_K | WT 1m | 2.96 | 8.674365 ± 0.055743 | 2.442732 ± 0.025843 | 0.335308 ± 0.001576 | -6.634 ± 0.047 % | 19.009 ± 0.079 % |
|
||||
| q2_K | WT 1k | 2.96 | 8.682605 ± 0.055916 | 2.450972 ± 0.026069 | 0.337093 ± 0.001596 | -6.596 ± 0.047 % | 18.977 ± 0.079 % |
|
||||
| q2_K_S | WT 10m | 2.96 | 9.323778 ± 0.061551 | 3.092145 ± 0.031914 | 0.403360 ± 0.001787 | -7.131 ± 0.049 % | 20.050 ± 0.081 % |
|
||||
| q2_K_S | WT 1m | 2.96 | 9.329321 ± 0.061378 | 3.097688 ± 0.031816 | 0.403590 ± 0.001797 | -7.289 ± 0.049 % | 20.123 ± 0.081 % |
|
||||
| q2_K_S | WT 100k | 2.96 | 9.362973 ± 0.061740 | 3.131339 ± 0.032169 | 0.408367 ± 0.001802 | -7.198 ± 0.050 % | 20.132 ± 0.081 % |
|
||||
| q2_K_S | WT 10k | 2.96 | 9.376479 ± 0.062045 | 3.144846 ± 0.032464 | 0.408662 ± 0.001819 | -7.141 ± 0.050 % | 20.120 ± 0.081 % |
|
||||
| q2_K_S | WT 1k | 2.96 | 9.415200 ± 0.062475 | 3.183567 ± 0.032993 | 0.415865 ± 0.001846 | -7.153 ± 0.050 % | 20.311 ± 0.082 % |
|
||||
| iq2_S | WT 10m | 2.56 | 9.650781 ± 0.063209 | 3.419148 ± 0.034017 | 0.439197 ± 0.001976 | -8.319 ± 0.052 % | 21.491 ± 0.083 % |
|
||||
| q2_K | None | 2.96 | 9.751568 ± 0.063312 | 3.519934 ± 0.033863 | 0.445132 ± 0.001835 | -9.123 ± 0.051 % | 21.421 ± 0.079 % |
|
||||
| iq2_XS | WT 10m | 2.43 | 10.761424 ± 0.071056 | 4.529791 ± 0.042229 | 0.546290 ± 0.002133 | -10.576 ± 0.056 % | 23.872 ± 0.082 % |
|
||||
| iq2_XXS | WT 10m | 2.24 | 14.091782 ± 0.098396 | 7.860148 ± 0.070752 | 0.812022 ± 0.002741 | -14.363 ± 0.065 % | 28.576 ± 0.084 % |
|
||||
| iq1_M | WT 10m | 2.01 | 25.493722 ± 0.177903 | 19.262089 ± 0.152396 | 1.393084 ± 0.003529 | -24.672 ± 0.077 % | 38.287 ± 0.084 % |
|
||||
| iq1_S | WT 1m | 1.88 | 58.097760 ± 0.438604 | 51.866126 ± 0.416604 | 2.211278 ± 0.004688 | -32.471 ± 0.087 % | 46.418 ± 0.085 % |
|
||||
| iq1_S | WT 1k | 1.88 | 58.267851 ± 0.446208 | 52.036218 ± 0.424373 | 2.214858 ± 0.004778 | -31.880 ± 0.089 % | 46.330 ± 0.086 % |
|
||||
| iq1_S | WT 100k | 1.88 | 58.581498 ± 0.453145 | 52.349864 ± 0.431360 | 2.220834 ± 0.004818 | -32.261 ± 0.089 % | 46.002 ± 0.086 % |
|
||||
| iq1_S | WT 10m | 1.88 | 60.694593 ± 0.471290 | 54.462959 ± 0.449644 | 2.254554 ± 0.004868 | -31.973 ± 0.088 % | 46.271 ± 0.086 % |
|
||||
| iq1_S | WT 10k | 1.88 | 63.221324 ± 0.493077 | 56.989691 ± 0.471423 | 2.293527 ± 0.004885 | -32.261 ± 0.089 % | 46.562 ± 0.086 % |
|
||||
|
||||
There seems to be no consistent improvement from using more Wikitext tokens for the importance matrix.
|
||||
Relative to the legacy quants, the K-quants score better on mean Δp than e.g. their KL divergence would suggest.
|
||||
|
||||
## LLaMA 2 vs. LLaMA 3 Quantization comparison
|
||||
|
||||
| Revision | f364eb6f |
|
||||
|:---------|:-------------------|
|
||||
| Backend | CUDA |
|
||||
| CPU | AMD Epyc 7742 |
|
||||
| GPU | 1x NVIDIA RTX 4090 |
|
||||
|
||||
| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 |
|
||||
|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
|
||||
| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
|
||||
| Mean PPL ratio | 1.107955 ± 0.001427 | 1.564849 ± 0.004525 | 1.014242 ± 0.000432 | 1.028160 ± 0.000723 | 1.002406 ± 0.000191 | 1.003490 ± 0.000296 | 1.000689 ± 0.000107 | 1.000425 ± 0.000161 |
|
||||
| Mean ΔPPL | 0.625552 ± 0.008725 | 3.519934 ± 0.033863 | 0.082526 ± 0.002530 | 0.175482 ± 0.004620 | 0.013941 ± 0.001110 | 0.021748 ± 0.001852 | 0.003990 ± 0.000624 | 0.002650 ± 0.001006 |
|
||||
| PPL correlation | 97.36% | 89.62% | 99.71% | 99.34% | 99.94% | 99.88% | 99.98% | 99.96% |
|
||||
| Mean KLD | 0.108903 ± 0.000645 | 0.445132 ± 0.001835 | 0.012686 ± 0.000079 | 0.031273 ± 0.000238 | 0.002098 ± 0.000014 | 0.005452 ± 0.000035 | 0.000369 ± 0.000007 | 0.001355 ± 0.000006 |
|
||||
| Mean Δp | -2.710 ± 0.023 % | -9.123 ± 0.051 % | -0.416 ± 0.008 % | -0.596 ± 0.014 % | -0.035 ± 0.003 % | -0.007 ± 0.006 % | -0.005 ± 0.002 % | -0.019 ± 0.003 % |
|
||||
| Maximum Δp | 85.136% | 94.268% | 45.209% | 95.054% | 23.593% | 53.601% | 43.925% | 28.734% |
|
||||
| 99.9% Δp | 37.184% | 50.003% | 17.461% | 27.084% | 7.798% | 13.613% | 3.387% | 6.402% |
|
||||
| 99.0% Δp | 18.131% | 25.875% | 7.798% | 12.084% | 3.838% | 6.407% | 1.867% | 3.544% |
|
||||
| Median Δp | -0.391% | -2.476% | -0.026% | -0.024% | -0.001% | 0.000% | -0.000% | -0.000% |
|
||||
| 1.0% Δp | -39.762% | -87.173% | -11.433% | -19.567% | -4.222% | -6.767% | -1.862% | -3.698% |
|
||||
| 0.1% Δp | -79.002% | -98.897% | -26.433% | -56.054% | -9.091% | -16.584% | -3.252% | -6.579% |
|
||||
| Minimum Δp | -99.915% | -99.965% | -83.383% | -98.699% | -43.142% | -68.487% | -9.343% | -24.301% |
|
||||
| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % |
|
||||
| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % |
|
||||
|
||||
## LLaMA 3 BF16 vs. FP16 comparison
|
||||
|
||||
| Revision | 83330d8c |
|
||||
|:---------|:--------------|
|
||||
| Backend | CPU |
|
||||
| CPU | AMD Epyc 7742 |
|
||||
| GPU | N/A |
|
||||
|
||||
Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
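For reference, the headline metrics are per-token quantities computed from the two softmaxed distributions. The sketch below is only illustrative: it assumes `p_base` and `p_q` are the probability vectors over the vocabulary for one position from the base and compared model, and it assumes Δp denotes the change in probability assigned to the observed token (the tool's exact definitions may differ in detail).

```cpp
#include <cmath>
#include <vector>

// KL divergence of p_q from p_base for one token position.
// Averaged over all positions, this corresponds to "Mean KLD".
static double kl_divergence(const std::vector<double> & p_base, const std::vector<double> & p_q) {
    double kld = 0.0;
    for (size_t i = 0; i < p_base.size(); i++) {
        if (p_base[i] > 0.0 && p_q[i] > 0.0) { // softmax probabilities are positive anyway
            kld += p_base[i] * std::log(p_base[i] / p_q[i]);
        }
    }
    return kld;
}

// Change in the probability assigned to the token that actually occurs next.
// The mean over positions gives "Mean Δp"; the root mean square gives "RMS Δp".
static double delta_p(const std::vector<double> & p_base, const std::vector<double> & p_q, int observed_token) {
    return p_q[observed_token] - p_base[observed_token];
}
```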
|
||||
|
||||
| Metric | Value |
|
||||
|--------------------------------|--------------------------|
|
||||
| Mean PPL(Q) | 6.227711 ± 0.037833 |
|
||||
| Mean PPL(base) | 6.225194 ± 0.037771 |
|
||||
| Cor(ln(PPL(Q)), ln(PPL(base))) | 99.990% |
|
||||
| Mean ln(PPL(Q)/PPL(base)) | 0.000404 ± 0.000086 |
|
||||
| Mean PPL(Q)/PPL(base) | 1.000404 ± 0.000086 |
|
||||
| Mean PPL(Q)-PPL(base) | 0.002517 ± 0.000536 |
|
||||
| Mean KLD | 0.00002515 ± 0.00000020 |
|
||||
| Maximum KLD | 0.012206 |
|
||||
| 99.9% KLD | 0.000799 |
|
||||
| 99.0% KLD | 0.000222 |
|
||||
| Median KLD | 0.000013 |
|
||||
| 10.0% KLD | -0.000002 |
|
||||
| 5.0% KLD | -0.000008 |
|
||||
| 1.0% KLD | -0.000023 |
|
||||
| Minimum KLD | -0.000059 |
|
||||
| Mean Δp | -0.0000745 ± 0.0003952 % |
|
||||
| Maximum Δp | 4.186% |
|
||||
| 99.9% Δp | 1.049% |
|
||||
| 99.0% Δp | 0.439% |
|
||||
| 95.0% Δp | 0.207% |
|
||||
| 90.0% Δp | 0.125% |
|
||||
| 75.0% Δp | 0.029% |
|
||||
| Median Δp | 0.000% |
|
||||
| 25.0% Δp | -0.030% |
|
||||
| 10.0% Δp | -0.126% |
|
||||
| 5.0% Δp | -0.207% |
|
||||
| 1.0% Δp | -0.434% |
|
||||
| 0.1% Δp | -1.016% |
|
||||
| Minimum Δp | -4.672% |
|
||||
| RMS Δp | 0.150 ± 0.001 % |
|
||||
| Same top p | 99.739 ± 0.013 % |
|
||||
|
||||
## Old Numbers
|
||||
|
||||
<details>
|
||||
<summary>Llama 2 70B Scoreboard</summary>
|
||||
|
||||
| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
|
||||
|--------------|------------------|------------|---------------|
|
||||
| Q4_0 | 36.20 | 3.5550 | 3.61% |
|
||||
| Q4_1 | 40.20 | 3.5125 | 2.37% |
|
||||
| Q5_0 | 44.20 | 3.4744 | 1.26% |
|
||||
| Q2_K | 27.27 | 3.7339 | 8.82% |
|
||||
| Q3_K_S | 27.86 | 3.7019 | 7.89% |
|
||||
| Q3_K_M | 30.83 | 3.5932 | 4.72% |
|
||||
| Q3_K_L | 33.67 | 3.5617 | 3.80% |
|
||||
| Q4_K_S | 36.39 | 3.4852 | 1.57% |
|
||||
| Q4_K_M | 38.54 | 3.4725 | 1.20% |
|
||||
| Q5_K_S | 44.20 | 3.4483 | 0.50% |
|
||||
| Q5_K_M | 45.41 | 3.4451 | 0.40% |
|
||||
| Q6_K | 52.70 | 3.4367 | 0.16% |
|
||||
| fp16 | 128.5 | 3.4313 | - |
|
||||
|
||||
</details>
|
|
@ -77,7 +77,7 @@ static void write_logfile(
|
|||
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
||||
yaml_dump_vector_float(logfile, "probs", results.probs);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
llama_perf_dump_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
|
@ -2008,13 +2008,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
@ -2055,7 +2049,8 @@ int main(int argc, char ** argv) {
|
|||
results = perplexity(ctx, params, n_ctx);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
write_logfile(ctx, params, model, results);
|
||||
|
||||
llama_free(ctx);
|
||||
|
|
|
@ -1,425 +0,0 @@
|
|||
#define LLAMA_API_INTERNAL
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
struct quantize_stats_params {
|
||||
std::string model = DEFAULT_MODEL_PATH;
|
||||
bool verbose = false;
|
||||
bool per_layer_stats = false;
|
||||
bool print_histogram = false;
|
||||
bool reference = false;
|
||||
std::vector<std::string> include_layers;
|
||||
std::vector<std::string> exclude_layers;
|
||||
std::vector<enum ggml_type> include_types;
|
||||
};
|
||||
|
||||
constexpr size_t HISTOGRAM_BUCKETS = 150;
|
||||
constexpr double HISTOGRAM_RANGE = 0.03;
|
||||
|
||||
struct error_stats {
|
||||
size_t num_samples;
|
||||
double total_error;
|
||||
double max_error;
|
||||
uint64_t error_histogram[HISTOGRAM_BUCKETS];
|
||||
};
|
||||
|
||||
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
quantize_stats_params params;
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -r, --reference\n");
|
||||
fprintf(stderr, " use reference implementation (default: false)\n");
|
||||
fprintf(stderr, " -v, --verbose\n");
|
||||
fprintf(stderr, " verbose output (default: false)\n");
|
||||
fprintf(stderr, " -p, --per-layer-stats\n");
|
||||
fprintf(stderr, " print stats per layer (default: false)\n");
|
||||
fprintf(stderr, " --histogram\n");
|
||||
fprintf(stderr, " print error histogram (default: false)\n");
|
||||
fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
|
||||
fprintf(stderr, " only test layers matching pattern\n");
|
||||
fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
|
||||
fprintf(stderr, " exclude layers matching pattern\n");
|
||||
fprintf(stderr, " -t TYPE, --type TYPE\n");
|
||||
fprintf(stderr, " only test given type (q4_0, q4_1)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Check if a layer is included/excluded by command line
|
||||
static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
|
||||
for (const auto& excluded : params.exclude_layers) {
|
||||
if (std::regex_search(layer, std::regex(excluded))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
for (const auto& included : params.include_layers) {
|
||||
if (std::regex_search(layer, std::regex(included))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return params.include_layers.empty();
|
||||
}
|
||||
|
||||
// Update error statistics given vectors with the before/after result of quantization
|
||||
static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||
for (int64_t i = 0; i < nelements; i++) {
|
||||
double diff = input[i] - output[i];
|
||||
stats.total_error += diff * diff;
|
||||
stats.max_error = fmax(fabs(diff), stats.max_error);
|
||||
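// bucket |diff| into one of HISTOGRAM_BUCKETS equal-width bins covering [0, HISTOGRAM_RANGE), clamping overflow to the last bin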
stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
|
||||
}
|
||||
stats.num_samples += nelements;
|
||||
}
|
||||
|
||||
static void combine_error_stats(error_stats & into, const error_stats & from) {
|
||||
into.num_samples += from.num_samples;
|
||||
into.total_error += from.total_error;
|
||||
if (from.max_error > into.max_error) into.max_error = from.max_error;
|
||||
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
|
||||
}
|
||||
|
||||
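// estimate a quantile of |error| from the histogram: return the upper edge of the first bucket where the cumulative count reaches the requested fraction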
static double find_quantile(const error_stats & stats, double quantile) {
|
||||
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
|
||||
|
||||
double accum = 0;
|
||||
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
|
||||
accum += stats.error_histogram[i];
|
||||
if (accum >= sum*quantile) {
|
||||
return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
||||
}
|
||||
}
|
||||
return INFINITY;
|
||||
}
|
||||
|
||||
static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
||||
double median = find_quantile(stats, .5);
|
||||
double pct95 = find_quantile(stats, .95);
|
||||
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
|
||||
if (print_histogram) {
|
||||
printf("Error distribution:\n");
|
||||
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
|
||||
double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
||||
double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
|
||||
if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
|
||||
printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copied from ggml.h - verify that we can access this as a flat array
|
||||
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return
|
||||
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
||||
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
|
||||
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||
}
|
||||
|
||||
static void test_roundtrip_on_chunk(
|
||||
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
|
||||
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
||||
) {
|
||||
if (layer->type == GGML_TYPE_F16) {
|
||||
for (int i = 0; i < chunk_size; i++) {
|
||||
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
|
||||
}
|
||||
} else {
|
||||
input_scratch = ggml_get_data_f32(layer) + offset;
|
||||
}
|
||||
|
||||
if (use_reference) {
|
||||
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
|
||||
} else {
|
||||
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
|
||||
}
|
||||
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
|
||||
|
||||
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
|
||||
}
|
||||
|
||||
|
||||
// Run quantization function for a single layer and update error stats
|
||||
static void test_roundtrip_on_layer(
|
||||
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
|
||||
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
||||
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
||||
) {
|
||||
assert(tensor_is_contiguous(layer));
|
||||
error_stats layer_error {};
|
||||
uint64_t nelements = ggml_nelements(layer);
|
||||
|
||||
float* input_scratch_ptr = nullptr;
|
||||
if (layer->type == GGML_TYPE_F16) {
|
||||
if (input_scratch.size() < nelements) input_scratch.resize(nelements);
|
||||
input_scratch_ptr = input_scratch.data();
|
||||
}
|
||||
if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
|
||||
if (output_scratch.size() < nelements) output_scratch.resize(nelements);
|
||||
|
||||
if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
|
||||
int chunk_size = 32*512;
|
||||
int num_chunks = (nelements + chunk_size - 1)/chunk_size;
|
||||
|
||||
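// small tensors (or single-threaded runs) are processed in one call; otherwise worker threads pull chunk_size-element chunks via a mutex-protected counter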
if (num_chunks < 2 || max_thread < 2) {
|
||||
test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
|
||||
output_scratch.data(), print_layer_stats ? layer_error : total_error);
|
||||
} else {
|
||||
auto & stats = print_layer_stats ? layer_error : total_error;
|
||||
std::mutex mutex;
|
||||
uint64_t counter = 0;
|
||||
auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
|
||||
&quantized_scratch, &output_scratch, chunk_size] () {
|
||||
error_stats local_stats {};
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
uint64_t offset = counter; counter += chunk_size;
|
||||
if (offset >= nelements) {
|
||||
combine_error_stats(stats, local_stats);
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
|
||||
test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
|
||||
quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
|
||||
}
|
||||
};
|
||||
int nthread = std::min(num_chunks, max_thread);
|
||||
std::vector<std::thread> workers(nthread-1);
|
||||
for (auto& w : workers) w = std::thread(compute);
|
||||
compute();
|
||||
for (auto& w : workers) w.join();
|
||||
}
|
||||
|
||||
if (print_layer_stats) {
|
||||
print_error_stats(name, layer_error, false);
|
||||
combine_error_stats(total_error, layer_error);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
quantize_stats_params params;
|
||||
|
||||
// read command line
|
||||
|
||||
int max_thread = 0;
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
quantize_stats_print_usage(argc, argv);
|
||||
exit(0);
|
||||
} else if (arg == "-r" || arg == "--reference") {
|
||||
params.reference = true;
|
||||
} else if (arg == "-v") {
|
||||
params.verbose = true;
|
||||
} else if (arg == "-p" || arg == "--per-layer-stats") {
|
||||
params.per_layer_stats = true;
|
||||
} else if (arg == "--histogram") {
|
||||
params.print_histogram = true;
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "-l" || arg == "--include-layer") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.include_layers.emplace_back(argv[i]);
|
||||
} else if (arg == "-L" || arg == "--exclude-layer") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.exclude_layers.emplace_back(argv[i]);
|
||||
} else if (arg == "-t" || arg == "--type") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
int j;
|
||||
for (j = 0; j < GGML_TYPE_COUNT; ++j) {
|
||||
const auto * name = ggml_type_name((ggml_type) j);
|
||||
if (name && strcmp(argv[i], name) == 0) break;
|
||||
}
|
||||
if (j < GGML_TYPE_COUNT) {
|
||||
params.include_types.push_back((ggml_type) j);
|
||||
} else {
|
||||
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
|
||||
invalid_param = true;
|
||||
}
|
||||
} else if (arg == "-n" || arg == "--num-threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
max_thread = atoi(argv[i]);
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
quantize_stats_print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
quantize_stats_print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
// load the model
|
||||
fprintf(stderr, "Loading model\n");
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
{
|
||||
auto mparams = llama_model_default_params();
|
||||
mparams.use_mlock = false;
|
||||
|
||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto cparams = llama_context_default_params();
|
||||
cparams.n_ctx = 256;
|
||||
cparams.seed = 1;
|
||||
|
||||
ctx = llama_new_context_with_model(model, cparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto &tensors = llama_internal_get_tensor_map(ctx);
|
||||
|
||||
// check layer tensors
|
||||
int included_layers = 0;
|
||||
int64_t max_nelements = 0;
|
||||
bool is_f16 = false;
|
||||
for (const auto& kv_tensor : tensors) {
|
||||
if (!layer_included(params, kv_tensor.first)) {
|
||||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
|
||||
}
|
||||
if (kv_tensor.second->type == GGML_TYPE_F16) {
|
||||
is_f16 = true;
|
||||
} else if (kv_tensor.second->type != GGML_TYPE_F32) {
|
||||
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
|
||||
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
included_layers++;
|
||||
max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
|
||||
}
|
||||
|
||||
if (is_f16) {
|
||||
printf("note: source model is f16\n");
|
||||
}
|
||||
printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
|
||||
// allocate scratch space
|
||||
std::vector<float> input_scratch;
|
||||
std::vector<char> quantized_scratch;
|
||||
std::vector<float> output_scratch;
|
||||
|
||||
// loop through quantization types
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
const ggml_type type = (ggml_type) i;
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||
if (qfns.from_float && qfns.to_float) {
|
||||
if (params.verbose) {
|
||||
printf("testing %s ...\n", ggml_type_name(type));
|
||||
}
|
||||
|
||||
ggml_quantize_init(type);
|
||||
|
||||
error_stats global_stats {};
|
||||
|
||||
for (const auto& kv_tensor : tensors) {
|
||||
if (!layer_included(params, kv_tensor.first)) {
|
||||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf(" %s ...\n", kv_tensor.first.c_str());
|
||||
}
|
||||
std::string layer_name { ggml_type_name(type) };
|
||||
layer_name += "::" + kv_tensor.first;
|
||||
test_roundtrip_on_layer(
|
||||
layer_name,
|
||||
params.per_layer_stats,
|
||||
qfns,
|
||||
params.reference,
|
||||
kv_tensor.second,
|
||||
input_scratch,
|
||||
quantized_scratch,
|
||||
output_scratch,
|
||||
global_stats,
|
||||
max_thread
|
||||
);
|
||||
}
|
||||
|
||||
print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,69 +0,0 @@
|
|||
# llama.cpp/examples/retrieval
|
||||
|
||||
Demonstration of a simple retrieval technique based on cosine similarity
|
||||
|
||||
More info:
|
||||
https://github.com/ggerganov/llama.cpp/pull/6193
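Chunks are ranked by the cosine similarity between the query embedding and each chunk embedding. A standalone sketch of that scoring (for illustration only, not the example's actual code):

```cpp
#include <cmath>
#include <vector>

// Cosine similarity between a query embedding and a chunk embedding.
// Returns a value in [-1, 1]; higher means more similar.
static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f, na = 0.0f, nb = 0.0f;
    for (size_t i = 0; i < a.size(); i++) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    return dot / (std::sqrt(na) * std::sqrt(nb) + 1e-6f); // epsilon guards against zero-norm vectors
}
```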
|
||||
|
||||
### How to use
|
||||
|
||||
`retrieval.cpp` has parameters of its own:
|
||||
- `--context-file`: file to be embedded - state this option multiple times to embed multiple files
|
||||
- `--chunk-size`: minimum size of each text chunk to be embedded
|
||||
- `--chunk-separator`: string used to split the text into chunks (default: newline)
|
||||
|
||||
The `retrieval` example can be tested as follows:
|
||||
|
||||
```bash
|
||||
make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
|
||||
```
|
||||
|
||||
This chunks and embeds all given files and starts a loop requesting query inputs:
|
||||
|
||||
```
|
||||
Enter query:
|
||||
```
|
||||
|
||||
For each query, the top-k most similar chunks are shown along with their file name, position within the file, and original text:
|
||||
|
||||
```
|
||||
Enter query: describe the mit license
|
||||
batch_decode: n_tokens = 6, n_seq = 1
|
||||
Top 3 similar chunks:
|
||||
filename: README.md
|
||||
filepos: 119
|
||||
similarity: 0.762334
|
||||
textdata:
|
||||
png)
|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
|
||||
[Roadmap](https://github.
|
||||
--------------------
|
||||
filename: License
|
||||
filepos: 0
|
||||
similarity: 0.725146
|
||||
textdata:
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023 Georgi Gerganov
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
--------------------
|
||||
filename: README.md
|
||||
filepos: 9178
|
||||
similarity: 0.621722
|
||||
textdata:
|
||||
com/cztomsik/ava) (MIT)
|
||||
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
|
||||
- [pythops/tenere](https://github.
|
||||
--------------------
|
||||
```
|
|
@ -293,9 +293,11 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(query_batch);
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
llama_backend_free();
|
||||
|
|
|
@ -1,78 +0,0 @@
|
|||
## Overview
|
||||
|
||||
> [!IMPORTANT]
|
||||
> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
|
||||
> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
|
||||
|
||||
The `rpc-server` allows running `ggml` backend on a remote host.
|
||||
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
|
||||
This can be used for distributed LLM inference with `llama.cpp` in the following way:
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
rpcb---|TCP|srva
|
||||
rpcb---|TCP|srvb
|
||||
rpcb-.-|TCP|srvn
|
||||
subgraph hostn[Host N]
|
||||
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph hostb[Host B]
|
||||
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph hosta[Host A]
|
||||
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph host[Main Host]
|
||||
ggml[llama.cpp]---rpcb[RPC backend]
|
||||
end
|
||||
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
|
||||
```
|
||||
|
||||
Each host can run a different backend, e.g. one with CUDA and another with Metal.
|
||||
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
|
||||
|
||||
## Usage
|
||||
|
||||
On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
|
||||
For example, to build the CUDA backend with RPC support:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc-cuda
|
||||
cd build-rpc-cuda
|
||||
cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Then, start the `rpc-server` with the backend:
|
||||
|
||||
```bash
|
||||
$ bin/rpc-server -p 50052
|
||||
create_backend: using CUDA backend
|
||||
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
||||
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
|
||||
ggml_cuda_init: found 1 CUDA devices:
|
||||
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
|
||||
Starting RPC server on 0.0.0.0:50052
|
||||
```
|
||||
|
||||
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
|
||||
```bash
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
|
||||
```
|
||||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||
|
||||
|
||||
On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc
|
||||
cd build-rpc
|
||||
cmake .. -DGGML_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||
|
||||
```bash
|
||||
$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
|
||||
```
|
|
@ -4,12 +4,12 @@
|
|||
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
params.prompt = "The quick brown fox";
|
||||
params.sparams.seed = 1234;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
@ -39,6 +39,13 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
|
||||
|
||||
// tokenize prompt
|
||||
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
|
@ -65,18 +72,11 @@ int main(int argc, char ** argv) {
|
|||
printf("\nfirst run: %s", params.prompt.c_str());
|
||||
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto * logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
||||
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||
|
||||
llama_sampler_accept(smpl, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result0 += next_token_str;
|
||||
|
||||
|
@ -97,6 +97,11 @@ int main(int argc, char ** argv) {
|
|||
// make new context
|
||||
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||
|
||||
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
|
||||
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
|
||||
|
||||
printf("\nsecond run: %s", params.prompt.c_str());
|
||||
|
||||
// load state (rng, logits, embedding and kv_cache) from file
|
||||
|
@ -125,17 +130,11 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto * logits = llama_get_logits(ctx2);
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
||||
auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
||||
|
||||
llama_sampler_accept(smpl2, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result1 += next_token_str;
|
||||
|
||||
|
@ -160,6 +159,11 @@ int main(int argc, char ** argv) {
|
|||
// make new context
|
||||
auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||
|
||||
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
|
||||
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
|
||||
|
||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||
|
||||
// load state (rng, logits, embedding and kv_cache) from file
|
||||
|
@ -216,17 +220,11 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// third run with seq 1 instead of 0
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto * logits = llama_get_logits(ctx3);
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx3, &candidates_p);
|
||||
auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx3, next_token);
|
||||
|
||||
llama_sampler_accept(smpl3, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result2 += next_token_str;
|
||||
|
||||
|
@ -241,6 +239,10 @@ int main(int argc, char ** argv) {
|
|||
|
||||
printf("\n");
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_sampler_free(smpl2);
|
||||
llama_sampler_free(smpl3);
|
||||
|
||||
llama_free(ctx3);
|
||||
llama_free_model(model);
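Taken together, the hunks above replace the old per-token `llama_token_data_array` + `llama_sample_token` pattern with a sampler chain. Below is a condensed sketch of the new pattern, using only the calls that appear in this diff; it assumes a context whose prompt has already been decoded, and elides the `llama_decode` call that feeds each sampled token back in.

```cpp
#include "common.h"
#include "llama.h"

#include <cstdint>
#include <cstdio>

// condensed sketch: softmax over the logits, then draw from the distribution
static void generate_with_sampler_chain(llama_context * ctx, uint32_t seed, int n_predict) {
    auto sparams = llama_sampler_chain_default_params();
    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
    llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));

    for (int i = 0; i < n_predict; i++) {
        // sample from the most recent logits in ctx (-1) and record the choice
        const llama_token next_token = llama_sampler_sample(smpl, ctx, -1);
        llama_sampler_accept(smpl, next_token);

        printf("%s", llama_token_to_piece(ctx, next_token).c_str());

        // ... feed next_token back with llama_decode(...) before the next iteration ...
    }

    llama_sampler_free(smpl);
}
```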
|
||||
|
||||
|
|
|
@ -1,981 +0,0 @@
|
|||
# LLaMA.cpp HTTP Server
|
||||
|
||||
Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.
|
||||
|
||||
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantized models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
* Multimodal (wip)
|
||||
* Monitoring endpoints
|
||||
* Schema-constrained JSON response format
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
usage: ./llama-server [options]
|
||||
|
||||
general:
|
||||
|
||||
-h, --help, --usage print usage and exit
|
||||
--version show version and build info
|
||||
-v, --verbose print verbose information
|
||||
--verbosity N set specific verbosity level (default: 0)
|
||||
--verbose-prompt print a verbose prompt before generation (default: false)
|
||||
--no-display-prompt don't print prompt at generation (default: false)
|
||||
-co, --color colorise output to distinguish prompt and user input from generations (default: false)
|
||||
-s, --seed SEED RNG seed (default: -1, use random seed for < 0)
|
||||
-t, --threads N number of threads to use during generation (default: 8)
|
||||
-tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
|
||||
-td, --threads-draft N number of threads to use during generation (default: same as --threads)
|
||||
-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft)
|
||||
--draft N number of tokens to draft for speculative decoding (default: 5)
|
||||
-ps, --p-split N speculative decoding split probability (default: 0.1)
|
||||
-lcs, --lookup-cache-static FNAME
|
||||
path to static lookup cache to use for lookup decoding (not updated by generation)
|
||||
-lcd, --lookup-cache-dynamic FNAME
|
||||
path to dynamic lookup cache to use for lookup decoding (updated by generation)
|
||||
-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
|
||||
-n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
|
||||
-b, --batch-size N logical maximum batch size (default: 2048)
|
||||
-ub, --ubatch-size N physical maximum batch size (default: 512)
|
||||
--keep N number of tokens to keep from the initial prompt (default: 0, -1 = all)
|
||||
--chunks N max number of chunks to process (default: -1, -1 = all)
|
||||
-fa, --flash-attn enable Flash Attention (default: disabled)
|
||||
-p, --prompt PROMPT prompt to start generation with
|
||||
in conversation mode, this will be used as system prompt
|
||||
(default: '')
|
||||
-f, --file FNAME a file containing the prompt (default: none)
|
||||
--in-file FNAME an input file (repeat to specify multiple files)
|
||||
-bf, --binary-file FNAME binary file containing the prompt (default: none)
|
||||
-e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true)
|
||||
--no-escape do not process escape sequences
|
||||
-ptc, --print-token-count N print token count every N tokens (default: -1)
|
||||
--prompt-cache FNAME file to cache prompt state for faster startup (default: none)
|
||||
--prompt-cache-all if specified, saves user input and generations to cache as well
|
||||
not supported with --interactive or other interactive options
|
||||
--prompt-cache-ro if specified, uses the prompt cache but does not update it
|
||||
-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
|
||||
can be specified more than once for multiple prompts
|
||||
-sp, --special special tokens output enabled (default: false)
|
||||
-cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix
|
||||
if suffix/prefix are not specified, default chat template will be used
|
||||
(default: false)
|
||||
-i, --interactive run in interactive mode (default: false)
|
||||
-if, --interactive-first run in interactive mode and wait for input right away (default: false)
|
||||
-mli, --multiline-input allows you to write or paste multiple lines without ending each in '\'
|
||||
--in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string
|
||||
--in-prefix STRING string to prefix user inputs with (default: empty)
|
||||
--in-suffix STRING string to suffix after user inputs with (default: empty)
|
||||
--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
|
||||
|
||||
sampling:
|
||||
|
||||
--samplers SAMPLERS samplers that will be used for generation in the order, separated by ';'
|
||||
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
|
||||
--sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt)
|
||||
--ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
|
||||
--penalize-nl penalize newline tokens (default: false)
|
||||
--temp N temperature (default: 0.8)
|
||||
--top-k N top-k sampling (default: 40, 0 = disabled)
|
||||
--top-p N top-p sampling (default: 0.9, 1.0 = disabled)
|
||||
--min-p N min-p sampling (default: 0.1, 0.0 = disabled)
|
||||
--tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
|
||||
--typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
|
||||
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
|
||||
--repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
|
||||
--presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
|
||||
--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
|
||||
--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled)
|
||||
--dynatemp-exp N dynamic temperature exponent (default: 1.0)
|
||||
--mirostat N use Mirostat sampling.
|
||||
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
|
||||
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
|
||||
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1)
|
||||
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0)
|
||||
-l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
|
||||
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
|
||||
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
|
||||
--cfg-negative-prompt PROMPT
|
||||
negative prompt to use for guidance (default: '')
|
||||
--cfg-negative-prompt-file FNAME
|
||||
negative prompt file to use for guidance
|
||||
--cfg-scale N strength of guidance (default: 1.0, 1.0 = disable)
|
||||
--chat-template JINJA_TEMPLATE
|
||||
set custom jinja chat template (default: template taken from model's metadata)
|
||||
if suffix/prefix are specified, template will be disabled
|
||||
only commonly used templates are accepted:
|
||||
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||
|
||||
grammar:
|
||||
|
||||
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
|
||||
--grammar-file FNAME file to read grammar from
|
||||
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
|
||||
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
|
||||
|
||||
embedding:
|
||||
|
||||
--pooling {none,mean,cls,last}
|
||||
pooling type for embeddings, use model default if unspecified
|
||||
--attention {causal,non-causal}
|
||||
attention type for embeddings, use model default if unspecified
|
||||
|
||||
context hacking:
|
||||
|
||||
--rope-scaling {none,linear,yarn}
|
||||
RoPE frequency scaling method, defaults to linear unless specified by the model
|
||||
--rope-scale N RoPE context scaling factor, expands context by a factor of N
|
||||
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
|
||||
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
|
||||
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)
|
||||
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
|
||||
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
|
||||
--yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0)
|
||||
--yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0)
|
||||
-gan, --grp-attn-n N group-attention factor (default: 1)
|
||||
-gaw, --grp-attn-w N group-attention width (default: 512.0)
|
||||
-dkvc, --dump-kv-cache verbose print of the KV cache
|
||||
-nkvo, --no-kv-offload disable KV offload
|
||||
-ctk, --cache-type-k TYPE KV cache data type for K (default: f16)
|
||||
-ctv, --cache-type-v TYPE KV cache data type for V (default: f16)
|
||||
|
||||
perplexity:
|
||||
|
||||
--all-logits return logits for all tokens in the batch (default: false)
|
||||
--hellaswag compute HellaSwag score over random tasks from datafile supplied with -f
|
||||
--hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400)
|
||||
--winogrande compute Winogrande score over random tasks from datafile supplied with -f
|
||||
--winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0)
|
||||
--multiple-choice compute multiple choice score over random tasks from datafile supplied with -f
|
||||
--multiple-choice-tasks N
|
||||
number of tasks to use when computing the multiple choice score (default: 0)
|
||||
--kl-divergence computes KL-divergence to logits provided via --kl-divergence-base
|
||||
--ppl-stride N stride for perplexity calculation (default: 0)
|
||||
--ppl-output-type {0,1} output type for perplexity calculation (default: 0)
|
||||
|
||||
parallel:
|
||||
|
||||
-dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
|
||||
-np, --parallel N number of parallel sequences to decode (default: 1)
|
||||
-ns, --sequences N number of sequences to decode (default: 1)
|
||||
-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)
|
||||
|
||||
multi-modality:
|
||||
|
||||
--mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md
|
||||
--image FILE path to an image file. use with multimodal models. Specify multiple times for batching
|
||||
|
||||
backend:
|
||||
|
||||
--rpc SERVERS comma separated list of RPC servers
|
||||
--mlock force system to keep model in RAM rather than swapping or compressing
|
||||
--no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)
|
||||
--numa TYPE attempt optimizations that help on some NUMA systems
|
||||
- distribute: spread execution evenly over all nodes
|
||||
- isolate: only spawn threads on CPUs on the node that execution started on
|
||||
- numactl: use the CPU map provided by numactl
|
||||
if run without this previously, it is recommended to drop the system page cache before using this
|
||||
see https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
|
||||
model:
|
||||
|
||||
--check-tensors check model tensor data for invalid values (default: false)
|
||||
--override-kv KEY=TYPE:VALUE
|
||||
advanced option to override model metadata by key. may be specified multiple times.
|
||||
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
|
||||
--lora FNAME apply LoRA adapter (implies --no-mmap)
|
||||
--lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)
|
||||
--lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter
|
||||
--control-vector FNAME add a control vector
|
||||
note: this argument can be repeated to add multiple control vectors
|
||||
--control-vector-scaled FNAME SCALE
|
||||
add a control vector with user defined scaling SCALE
|
||||
note: this argument can be repeated to add multiple scaled control vectors
|
||||
--control-vector-layer-range START END
|
||||
layer range to apply the control vector(s) to, start and end inclusive
|
||||
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
|
||||
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
|
||||
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
|
||||
-mu, --model-url MODEL_URL model download url (default: unused)
|
||||
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
|
||||
-hff, --hf-file FILE Hugging Face model file (default: unused)
|
||||
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable)
|
||||
|
||||
server:
|
||||
|
||||
--host HOST ip address to listen (default: 127.0.0.1)
|
||||
--port PORT port to listen (default: 8080)
|
||||
--path PATH path to serve static files from (default: )
|
||||
--embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
|
||||
--api-key KEY API key to use for authentication (default: none)
|
||||
--api-key-file FNAME path to file containing API keys (default: none)
|
||||
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
|
||||
--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate
|
||||
--timeout N server read/write timeout in seconds (default: 600)
|
||||
--threads-http N number of threads used to process HTTP requests (default: -1)
|
||||
--system-prompt-file FNAME
|
||||
set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
|
||||
--log-format {text,json}
|
||||
log output format: json or text (default: json)
|
||||
--metrics enable prometheus compatible metrics endpoint (default: disabled)
|
||||
--no-slots disables slots monitoring endpoint (default: enabled)
|
||||
--slot-save-path PATH path to save slot kv cache (default: disabled)
|
||||
--chat-template JINJA_TEMPLATE
|
||||
set custom jinja chat template (default: template taken from model's metadata)
|
||||
only commonly used templates are accepted:
|
||||
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||
-sps, --slot-prompt-similarity SIMILARITY
|
||||
how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
|
||||
--lora-init-without-apply
|
||||
load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
|
||||
|
||||
logging:
|
||||
|
||||
--simple-io use basic IO for better compatibility in subprocesses and limited consoles
|
||||
-ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset)
|
||||
--log-test Run simple logging test
|
||||
--log-disable Disable trace logs
|
||||
--log-enable Enable trace logs
|
||||
--log-file FNAME Specify a log filename (without extension)
|
||||
--log-new Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
|
||||
--log-append Don't truncate the old log file.
|
||||
```
|
||||
|
||||
Available environment variables (if specified, these variables will override parameters specified in arguments):
|
||||
|
||||
- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
|
||||
- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
|
||||
- `LLAMA_ARG_MODEL`: equivalent to `-m`
|
||||
- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
|
||||
- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
|
||||
- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
|
||||
- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
|
||||
- `LLAMA_ARG_THREADS`: equivalent to `-t`
|
||||
- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
|
||||
- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
|
||||
- `LLAMA_ARG_BATCH`: equivalent to `-b`
|
||||
- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
|
||||
- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
|
||||
- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
|
||||
- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
|
||||
- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
|
||||
- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
|
||||
- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
|
||||
- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
|
||||
- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
|
||||
- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
|
||||
- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
|
||||
- `LLAMA_ARG_HOST`: equivalent to `--host`
|
||||
- `LLAMA_ARG_PORT`: equivalent to `--port`
|
||||
|
||||
Example usage of docker compose with environment variables:
|
||||
|
||||
```yml
|
||||
services:
|
||||
llamacpp-server:
|
||||
image: ghcr.io/ggerganov/llama.cpp:server
|
||||
ports:
|
||||
- 8080:8080
|
||||
volumes:
|
||||
- ./models:/models
|
||||
environment:
|
||||
# alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
|
||||
LLAMA_ARG_MODEL: /models/my_model.gguf
|
||||
LLAMA_ARG_CTX_SIZE: 4096
|
||||
LLAMA_ARG_N_PARALLEL: 2
|
||||
LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0
|
||||
LLAMA_ARG_PORT: 8080
|
||||
```
|
||||
|
||||
## Build
|
||||
|
||||
`llama-server` is built alongside everything else from the root of the project
|
||||
|
||||
- Using `make`:
|
||||
|
||||
```bash
|
||||
make llama-server
|
||||
```
|
||||
|
||||
- Using `CMake`:
|
||||
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release -t llama-server
|
||||
```
|
||||
|
||||
Binary is at `./build/bin/llama-server`
|
||||
|
||||
## Build with SSL
|
||||
|
||||
`llama-server` can also be built with SSL support using OpenSSL 3
|
||||
|
||||
- Using `make`:
|
||||
|
||||
```bash
|
||||
# NOTE: For non-system openssl, use the following:
|
||||
# CXXFLAGS="-I /path/to/openssl/include"
|
||||
# LDFLAGS="-L /path/to/openssl/lib"
|
||||
make LLAMA_SERVER_SSL=true llama-server
|
||||
```
|
||||
|
||||
- Using `CMake`:
|
||||
|
||||
```bash
|
||||
cmake -B build -DLLAMA_SERVER_SSL=ON
|
||||
cmake --build build --config Release -t llama-server
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
### Unix-based systems (Linux, macOS, etc.)
|
||||
|
||||
```bash
|
||||
./llama-server -m models/7B/ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
### Windows
|
||||
|
||||
```powershell
|
||||
llama-server.exe -m models\7B\ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||
You can consume the endpoints with Postman, or with Node.js using the axios library. You can visit the web front end at the same URL.
|
||||
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
|
||||
|
||||
# or, with CUDA:
|
||||
docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
|
||||
```
|
||||
|
||||
## Testing with CURL
|
||||
|
||||
Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS.
|
||||
|
||||
```sh
|
||||
curl --request POST \
|
||||
--url http://localhost:8080/completion \
|
||||
--header "Content-Type: application/json" \
|
||||
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
|
||||
```
|
||||
|
||||
## Advanced testing
|
||||
|
||||
We implemented a [server test framework](./tests/README.md) using human-readable scenarios.
|
||||
|
||||
*Before submitting an issue, please try to reproduce it with this format.*
|
||||
|
||||
## Node JS Test
|
||||
|
||||
You need to have [Node.js](https://nodejs.org/en) installed.
|
||||
|
||||
```bash
|
||||
mkdir llama-client
|
||||
cd llama-client
|
||||
```
|
||||
|
||||
Create an `index.js` file and put the following inside:
|
||||
|
||||
```javascript
|
||||
const prompt = `Building a website can be done in 10 simple steps:`;
|
||||
|
||||
async function Test() {
|
||||
let response = await fetch("http://127.0.0.1:8080/completion", {
|
||||
method: 'POST',
|
||||
body: JSON.stringify({
|
||||
prompt,
|
||||
n_predict: 512,
|
||||
})
|
||||
})
|
||||
console.log((await response.json()).content)
|
||||
}
|
||||
|
||||
Test()
|
||||
```
|
||||
|
||||
And run it:
|
||||
|
||||
```bash
|
||||
node index.js
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### GET `/health`: Returns health check result
|
||||
|
||||
**Response format**
|
||||
|
||||
- HTTP status code 503
|
||||
- Body: `{"error": {"code": 503, "message": "Loading model", "type": "unavailable_error"}}`
|
||||
- Explanation: the model is still being loaded.
|
||||
- HTTP status code 200
|
||||
- Body: `{"status": "ok" }`
|
||||
- Explanation: the model is successfully loaded and the server is ready.
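For scripting, the status code alone is enough to tell the two cases apart; a minimal sketch that waits for the model to finish loading (assumes the default `127.0.0.1:8080`):

```bash
# wait until /health returns 200 (model loaded); assumes the default 127.0.0.1:8080
until curl -sf http://127.0.0.1:8080/health > /dev/null; do
    echo "still loading..."
    sleep 1
done
echo "server is ready"
```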
|
||||
|
||||
### POST `/completion`: Given a `prompt`, it returns the predicted completion.
|
||||
|
||||
*Options:*
|
||||
|
||||
`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
|
||||
|
||||
- The prompt is a string or an array with the first element given as a string
|
||||
- The model's `tokenizer.ggml.add_bos_token` metadata is `true`
|
||||
- The system prompt is empty
|
||||
|
||||
`temperature`: Adjust the randomness of the generated text. Default: `0.8`
|
||||
|
||||
`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
|
||||
|
||||
`dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
|
||||
|
||||
`top_k`: Limit the next token selection to the K most probable tokens. Default: `40`
|
||||
|
||||
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
|
||||
|
||||
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
|
||||
|
||||
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
|
||||
|
||||
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
|
||||
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
|
||||
|
||||
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
||||
|
||||
`stop`: Specify a JSON array of stopping strings.
|
||||
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
|
||||
|
||||
`tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled.
|
||||
|
||||
`typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
|
||||
|
||||
`repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
|
||||
|
||||
`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
|
||||
|
||||
`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
|
||||
|
||||
`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
|
||||
|
||||
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
|
||||
|
||||
`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
|
||||
|
||||
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
|
||||
|
||||
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
|
||||
|
||||
`mirostat_eta`: Set the Mirostat learning rate, parameter eta. Default: `0.1`
|
||||
|
||||
`grammar`: Set grammar for grammar-based sampling. Default: no grammar
|
||||
|
||||
`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema.
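For instance, a schema like the one above can be passed straight into a completion request; a minimal sketch (host, port, prompt, and limits are placeholders):

```bash
# ask for a JSON list of exactly three strings; prompt and limits are placeholders
curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Three colors as a JSON list:", "n_predict": 64, "json_schema": {"items": {"type": "string"}, "minItems": 3, "maxItems": 3}}'
```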
|
||||
|
||||
`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
|
||||
|
||||
`ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
|
||||
|
||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
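A hedged sketch combining both forms (the token id `15043` is just the example id mentioned above and will differ per model):

```bash
# ban token id 15043 and discourage the string "Hello, World!" (ids differ per model)
curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 32, "logit_bias": [[15043, false], ["Hello, World!", -0.5]]}'
```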
|
||||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
|
||||
|
||||
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and the `id`s used to reference them in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
`id_slot`: Assign the completion task to a specific slot. If set to `-1`, the task will be assigned to an idle slot. Default: `-1`
|
||||
|
||||
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
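A sketch combining `id_slot` and `cache_prompt` so that the second request can reuse the first request's KV cache (prompts and values are illustrative):

```bash
# both requests target slot 0; the second shares a long prefix with the first,
# so only the differing suffix needs to be evaluated
curl -s http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Chapter 1: It was a dark and stormy night.", "n_predict": 64, "id_slot": 0, "cache_prompt": true}'

curl -s http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Chapter 1: It was a dark and stormy night. The wind howled", "n_predict": 64, "id_slot": 0, "cache_prompt": true}'
```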
|
||||
|
||||
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
|
||||
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
|
||||
|
||||
**Response format**
|
||||
|
||||
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
|
||||
|
||||
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
|
||||
|
||||
```json
|
||||
{
|
||||
"content": "<the token selected by the model>",
|
||||
"probs": [
|
||||
{
|
||||
"prob": float,
|
||||
"tok_str": "<most likely token>"
|
||||
},
|
||||
{
|
||||
"prob": float,
|
||||
"tok_str": "<second most likely token>"
|
||||
},
|
||||
...
|
||||
]
|
||||
},
|
||||
```
|
||||
|
||||
Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
|
||||
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
|
||||
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
|
||||
- `model`: The path to the model loaded with `-m`
|
||||
- `prompt`: The provided `prompt`
|
||||
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
|
||||
- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
|
||||
- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
|
||||
- `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
|
||||
- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
|
||||
- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
|
||||
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
|
||||
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)
|
||||
|
||||
### POST `/tokenize`: Tokenize a given text
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to tokenize.
|
||||
|
||||
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
|
||||
### POST `/detokenize`: Convert tokens to text
|
||||
|
||||
*Options:*
|
||||
|
||||
`tokens`: Set the tokens to detokenize.
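A round-trip sketch using both endpoints, assuming the `/tokenize` response carries a `tokens` array and that `jq` is installed:

```bash
# tokenize a string, then feed the token ids straight back to /detokenize
TOKENS=$(curl -s http://localhost:8080/tokenize \
    -H "Content-Type: application/json" \
    -d '{"content": "Hello world", "add_special": true}' | jq -c '.tokens')

curl -s http://localhost:8080/detokenize \
    -H "Content-Type: application/json" \
    -d "{\"tokens\": ${TOKENS}}"
```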
|
||||
|
||||
### POST `/embedding`: Generate embedding of a given text
|
||||
|
||||
The same as [the embedding example](../embedding) does.
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to process.
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and the `id`s used to reference them in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
### POST `/infill`: For code infilling.
|
||||
|
||||
Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||
|
||||
*Options:*
|
||||
|
||||
`input_prefix`: Set the prefix of the code to infill.
|
||||
|
||||
`input_suffix`: Set the suffix of the code to infill.
|
||||
|
||||
It also accepts all the options of `/completion` except `stream` and `prompt`.
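A minimal sketch of an infill request (the code fragment is purely illustrative):

```bash
# fill in the body of a small function; the fragment is purely illustrative
curl -s http://localhost:8080/infill \
    -H "Content-Type: application/json" \
    -d '{"input_prefix": "def add(a, b):\n    return ", "input_suffix": "\n\nprint(add(1, 2))\n", "n_predict": 16}'
```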
|
||||
|
||||
### GET `/props`: Return current server settings
|
||||
|
||||
**Response format**
|
||||
|
||||
```json
|
||||
{
|
||||
"assistant_name": "",
|
||||
"user_name": "",
|
||||
"default_generation_settings": { ... },
|
||||
"total_slots": 1,
|
||||
"chat_template": ""
|
||||
}
|
||||
```
|
||||
|
||||
- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
|
||||
- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
|
||||
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
|
||||
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
|
||||
- `chat_template` - the model's original Jinja2 prompt template
|
||||
|
||||
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
|
||||
|
||||
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
|
||||
|
||||
*Options:*
|
||||
|
||||
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
|
||||
|
||||
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
|
||||
|
||||
*Examples:*
|
||||
|
||||
You can use the Python `openai` library with the appropriate checkpoints:
|
||||
|
||||
```python
|
||||
import openai
|
||||
|
||||
client = openai.OpenAI(
|
||||
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
|
||||
api_key = "sk-no-key-required"
|
||||
)
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
|
||||
{"role": "user", "content": "Write a limerick about python exceptions"}
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
... or raw HTTP requests:
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Write a limerick about python exceptions"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
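The `response_format` parameter described above can be exercised the same way; a hedged sketch that constrains the reply to schema-checked JSON:

```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Name a color"}],
    "response_format": {"type": "json_object", "schema": {"type": "string", "minLength": 3, "maxLength": 40}}
}'
```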
|
||||
|
||||
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
|
||||
|
||||
*Options:*
|
||||
|
||||
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
|
||||
|
||||
*Examples:*
|
||||
|
||||
- input as string
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"input": "hello",
|
||||
"model":"GPT-4",
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
- `input` as string array
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"input": ["hello", "world"],
|
||||
"model":"GPT-4",
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
### GET `/slots`: Returns the current slots processing state
|
||||
|
||||
This endpoint can be disabled with `--no-slots`
|
||||
|
||||
If the query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.
|
||||
|
||||
**Response format**
|
||||
|
||||
Example:
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"dynatemp_exponent": 1.0,
|
||||
"dynatemp_range": 0.0,
|
||||
"frequency_penalty": 0.0,
|
||||
"grammar": "",
|
||||
"id": 0,
|
||||
"ignore_eos": false,
|
||||
"logit_bias": [],
|
||||
"min_p": 0.05000000074505806,
|
||||
"mirostat": 0,
|
||||
"mirostat_eta": 0.10000000149011612,
|
||||
"mirostat_tau": 5.0,
|
||||
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
|
||||
"n_ctx": 2048,
|
||||
"n_keep": 0,
|
||||
"n_predict": 100000,
|
||||
"n_probs": 0,
|
||||
"next_token": {
|
||||
"has_next_token": true,
|
||||
"n_remain": -1,
|
||||
"n_decoded": 0,
|
||||
"stopped_eos": false,
|
||||
"stopped_limit": false,
|
||||
"stopped_word": false,
|
||||
"stopping_word": ""
|
||||
},
|
||||
"penalize_nl": true,
|
||||
"penalty_prompt_tokens": [],
|
||||
"presence_penalty": 0.0,
|
||||
"prompt": "Say hello to llama.cpp",
|
||||
"repeat_last_n": 64,
|
||||
"repeat_penalty": 1.100000023841858,
|
||||
"samplers": [
|
||||
"top_k",
|
||||
"tfs_z",
|
||||
"typical_p",
|
||||
"top_p",
|
||||
"min_p",
|
||||
"temperature"
|
||||
],
|
||||
"seed": 42,
|
||||
"state": 1,
|
||||
"stop": [
|
||||
"\n"
|
||||
],
|
||||
"stream": false,
|
||||
"task_id": 0,
|
||||
"temperature": 0.0,
|
||||
"tfs_z": 1.0,
|
||||
"top_k": 40,
|
||||
"top_p": 0.949999988079071,
|
||||
"typical_p": 1.0,
|
||||
"use_penalty_prompt_tokens": false
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Possible values for `slot[i].state` are:
|
||||
- `0`: SLOT_STATE_IDLE
|
||||
- `1`: SLOT_STATE_PROCESSING
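A quick way to use the `fail_on_no_slot` query parameter described above for external health/load checks; a minimal sketch:

```bash
# prints 200 if a slot is free, 503 otherwise
curl -s -o /dev/null -w "%{http_code}\n" "http://localhost:8080/slots?fail_on_no_slot=1"
```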
|
||||
|
||||
### GET `/metrics`: Prometheus compatible metrics exporter
|
||||
|
||||
This endpoint is only accessible if `--metrics` is set.
|
||||
|
||||
Available metrics:
|
||||
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
|
||||
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
|
||||
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
|
||||
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
|
||||
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. `1` means 100 percent usage.
|
||||
- `llamacpp:kv_cache_tokens`: KV-cache tokens.
|
||||
- `llamacpp:requests_processing`: Number of requests processing.
|
||||
- `llamacpp:requests_deferred`: Number of requests deferred.
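A minimal sketch for spot-checking a couple of these counters (assumes the server was started with `--metrics`):

```bash
# spot-check two of the exported counters
curl -s http://localhost:8080/metrics | grep -E "llamacpp:(prompt_tokens_total|requests_processing)"
```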
|
||||
|
||||
### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
|
||||
|
||||
*Options:*
|
||||
|
||||
`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
|
||||
|
||||
**Response format**
|
||||
|
||||
```json
|
||||
{
|
||||
"id_slot": 0,
|
||||
"filename": "slot_save_file.bin",
|
||||
"n_saved": 1745,
|
||||
"n_written": 14309796,
|
||||
"timings": {
|
||||
"save_ms": 49.865
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
|
||||
|
||||
*Options:*
|
||||
|
||||
`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
|
||||
|
||||
**Response format**
|
||||
|
||||
```json
|
||||
{
|
||||
"id_slot": 0,
|
||||
"filename": "slot_save_file.bin",
|
||||
"n_restored": 1745,
|
||||
"n_read": 14309796,
|
||||
"timings": {
|
||||
"restore_ms": 42.937
|
||||
}
|
||||
}
|
||||
```
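A sketch of a save/restore round trip for slot `0` (requires the server to be started with `--slot-save-path`; the filename is a placeholder):

```bash
# save slot 0's prompt cache, then restore it into the same slot
curl -s -X POST "http://localhost:8080/slots/0?action=save" \
    -H "Content-Type: application/json" \
    -d '{"filename": "slot_save_file.bin"}'

curl -s -X POST "http://localhost:8080/slots/0?action=restore" \
    -H "Content-Type: application/json" \
    -d '{"filename": "slot_save_file.bin"}'
```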
|
||||
|
||||
### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
|
||||
|
||||
**Response format**
|
||||
|
||||
```json
|
||||
{
|
||||
"id_slot": 0,
|
||||
"n_erased": 1745
|
||||
}
|
||||
```
|
||||
|
||||
### GET `/lora-adapters`: Get list of all LoRA adapters
|
||||
|
||||
This endpoint returns the loaded LoRA adapters. You can add adapters using `--lora` when starting the server, for example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...`
|
||||
|
||||
By default, all adapters will be loaded with scale set to 1. To initialize all adapters with scale set to 0, add `--lora-init-without-apply`.
|
||||
|
||||
If an adapter is disabled, the scale will be set to 0.
|
||||
|
||||
**Response format**
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"id": 0,
|
||||
"path": "my_adapter_1.gguf",
|
||||
"scale": 0.0
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"path": "my_adapter_2.gguf",
|
||||
"scale": 0.0
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### POST `/lora-adapters`: Set list of LoRA adapters
|
||||
|
||||
To disable an adapter, either remove it from the list in the request, or set its scale to 0.
|
||||
|
||||
**Request format**
|
||||
|
||||
To know the `id` of the adapter, use GET `/lora-adapters`
|
||||
|
||||
```json
|
||||
[
|
||||
{"id": 0, "scale": 0.2},
|
||||
{"id": 1, "scale": 0.8}
|
||||
]
|
||||
```
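A sketch that applies the request above and then verifies the resulting scales:

```bash
# apply the scales above, then read them back
curl -s -X POST http://localhost:8080/lora-adapters \
    -H "Content-Type: application/json" \
    -d '[{"id": 0, "scale": 0.2}, {"id": 1, "scale": 0.8}]'

curl -s http://localhost:8080/lora-adapters
```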
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
||||
To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
|
||||
|
||||
`prompt`: Specify a context that you want all connecting clients to respect.
|
||||
|
||||
`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
|
||||
|
||||
`assistant_name`: The bot's name, which each client needs in order to generate the prompt. This must be sent to each client through the `/props` endpoint.
|
||||
|
||||
```json
|
||||
{
|
||||
"system_prompt": {
|
||||
"prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
|
||||
"anti_prompt": "User:",
|
||||
"assistant_name": "Assistant:"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
|
||||
|
||||
### Interactive mode
|
||||
|
||||
Check the sample in [chat.mjs](chat.mjs).
|
||||
Run with NodeJS version 16 or later:
|
||||
|
||||
```sh
|
||||
node chat.mjs
|
||||
```
|
||||
|
||||
Another sample in [chat.sh](chat.sh).
|
||||
Requires [bash](https://www.gnu.org/software/bash/), [curl](https://curl.se) and [jq](https://jqlang.github.io/jq/).
|
||||
Run with bash:
|
||||
|
||||
```sh
|
||||
bash chat.sh
|
||||
```
|
||||
|
||||
### OAI-like API
|
||||
|
||||
The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
|
||||
|
||||
### API errors
|
||||
|
||||
`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
|
||||
|
||||
Example of an error:
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": 401,
|
||||
"message": "Invalid API Key",
|
||||
"type": "authentication_error"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Apart from error types supported by OAI, we also have custom types that are specific to functionalities of llama.cpp:
|
||||
|
||||
**When /metrics or /slots endpoint is disabled**
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": 501,
|
||||
"message": "This server does not support metrics endpoint.",
|
||||
"type": "not_supported_error"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**When the server receives invalid grammar via */completions endpoint**
|
||||
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": 400,
|
||||
"message": "Failed to parse grammar",
|
||||
"type": "invalid_request_error"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Extending or building alternative Web Front End
|
||||
|
||||
You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
|
||||
|
||||
Read the documentation in `/completion.js` to see convenient ways to access llama.
|
||||
|
||||
A simple example is below:
|
||||
|
||||
```html
|
||||
<html>
|
||||
<body>
|
||||
<pre>
|
||||
<script type="module">
|
||||
import { llama } from '/completion.js'
|
||||
|
||||
const prompt = `### Instruction:
|
||||
Write dad jokes, each one paragraph.
|
||||
You can use html formatting if needed.
|
||||
|
||||
### Response:`
|
||||
|
||||
for await (const chunk of llama(prompt)) {
|
||||
document.write(chunk.data.content)
|
||||
}
|
||||
</script>
|
||||
</pre>
|
||||
</body>
|
||||
</html>
|
||||
```
|
|
@ -1,120 +0,0 @@
|
|||
### Server benchmark tools
|
||||
|
||||
Benchmark is using [k6](https://k6.io/).
|
||||
|
||||
##### Install k6 and sse extension
|
||||
|
||||
SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
|
||||
|
||||
Example:
|
||||
```shell
|
||||
go install go.k6.io/xk6/cmd/xk6@latest
|
||||
xk6 build master \
|
||||
--with github.com/phymbert/xk6-sse
|
||||
```
|
||||
|
||||
#### Download a dataset
|
||||
|
||||
This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).
|
||||
|
||||
```shell
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
```
|
||||
|
||||
#### Download a model
|
||||
Example for PHI-2
|
||||
|
||||
```shell
|
||||
../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
|
||||
```
|
||||
|
||||
#### Start the server
|
||||
The server must answer OAI Chat completion requests on `http://localhost:8080/v1` or according to the environment variable `SERVER_BENCH_URL`.
|
||||
|
||||
Example:
|
||||
```shell
|
||||
server --host localhost --port 8080 \
|
||||
--model ggml-model-q4_0.gguf \
|
||||
--cont-batching \
|
||||
--metrics \
|
||||
--parallel 8 \
|
||||
--batch-size 512 \
|
||||
--ctx-size 4096 \
|
||||
--log-format text \
|
||||
-ngl 33
|
||||
```
|
||||
|
||||
#### Run the benchmark
|
||||
|
||||
For 500 chat completion requests with 8 concurrent users over a maximum of 10 minutes, run:
|
||||
```shell
|
||||
./k6 run script.js --duration 10m --iterations 500 --vus 8
|
||||
```
|
||||
|
||||
The benchmark values can be overridden with:
|
||||
- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
|
||||
- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
|
||||
- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
|
||||
- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512`
|
||||
- `SERVER_BENCH_DATASET` path to the benchmark dataset file
|
||||
- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens to filter out in the dataset: default `1024`
|
||||
- `SERVER_BENCH_MAX_CONTEXT` maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens, default `2048`
|
||||
|
||||
Note: the local tokenizer is just a whitespace split of the string; the real number of tokens will differ.
|
||||
|
||||
Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
|
||||
|
||||
```shell
|
||||
SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
|
||||
```
|
||||
|
||||
To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`.
|
||||
|
||||
#### Metrics
|
||||
|
||||
The following metrics are computed from the OAI chat completion response `usage`:
|
||||
- `llamacpp_tokens_second` Trend of `usage.total_tokens / request duration`
|
||||
- `llamacpp_prompt_tokens` Trend of `usage.prompt_tokens`
|
||||
- `llamacpp_prompt_tokens_total_counter` Counter of `usage.prompt_tokens`
|
||||
- `llamacpp_completion_tokens` Trend of `usage.completion_tokens`
|
||||
- `llamacpp_completion_tokens_total_counter` Counter of `usage.completion_tokens`
|
||||
- `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
|
||||
- `llamacpp_completions_stop_rate` Rate of completions stopped by the model, i.e. if `finish_reason === 'stop'`
|
||||
|
||||
The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`.
|
||||
|
||||
K6 metrics might be compared against [server metrics](../README.md), with:
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/metrics
|
||||
```
|
||||
|
||||
### Using the CI python script
|
||||
The `bench.py` script does several steps:
|
||||
- start the server
|
||||
- set suitable variables for k6
|
||||
- run k6 script
|
||||
- extract metrics from prometheus
|
||||
|
||||
It aims to be used in the CI, but you can run it manually:
|
||||
|
||||
```shell
|
||||
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
|
||||
--runner-label local \
|
||||
--name local \
|
||||
--branch `git rev-parse --abbrev-ref HEAD` \
|
||||
--commit `git rev-parse HEAD` \
|
||||
--scenario script.js \
|
||||
--duration 5m \
|
||||
--hf-repo ggml-org/models \
|
||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||
--model-path-prefix models \
|
||||
--parallel 4 \
|
||||
-ngl 33 \
|
||||
--batch-size 2048 \
|
||||
--ubatch-size 256 \
|
||||
--ctx-size 4096 \
|
||||
--n-prompts 200 \
|
||||
--max-prompt-tokens 256 \
|
||||
--max-tokens 256
|
||||
```
|
|
@ -1,286 +0,0 @@
|
|||
|
||||
# SimpleChat
|
||||
|
||||
by Humans for All.
|
||||
|
||||
## quickstart
|
||||
|
||||
To run from the build dir
|
||||
|
||||
bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
|
||||
|
||||
Continue reading for the details.
|
||||
|
||||
## overview
|
||||
|
||||
This simple web frontend allows triggering/testing the server's /completions or /chat/completions endpoints
in a simple way, with minimal code from a common code base. In addition, it tries to allow single or
multiple independent back-and-forth chat sessions with the AI LLM model at a basic level, each with its
own system prompt.
|
||||
|
||||
This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
|
||||
or potentially as it is being generated, in a streamed manner from the server/ai-model.
|
||||
|
||||

|
||||
|
||||
The chat session is auto-saved locally as the chat progresses; later, when you
open SimpleChat, an option is provided to restore the old chat session, if a matching one exists.
|
||||
|
||||
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
||||
enough manner, in general.
|
||||
|
||||
Allows the developer/end-user to control some of the behaviour by updating gMe members from the browser's devel-tool
console. In parallel, some of the settings directly useful to the end-user can also be changed using the provided
settings ui.
|
||||
|
||||
NOTE: The current web service API doesn't expose the model context length directly, so the client logic doesn't provide
any adaptive culling of old messages, nor does it replace them with a summary of their content. However, there
is an optional sliding-window chat logic, which provides a simple-minded culling of old messages from
the chat history before sending to the AI model.
|
||||
|
||||
NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionaly stream for now.
|
||||
However if someone wants they can update the js file or equivalent member in gMe as needed.
|
||||
|
||||
NOTE: One may be able to use this to chat with openai api web-service /chat/completions endpoint, in a very
|
||||
limited / minimal way. One will need to set model, openai url and authorization bearer key in settings ui.
|
||||
|
||||
|
||||
## usage
|
||||
|
||||
One could run this web frontend directly using server itself or if anyone is thinking of adding a built in web
|
||||
frontend to configure the server over http(s) or so, then run this web frontend using something like python's
|
||||
http module.
|
||||
|
||||
### running using examples/server
|
||||
|
||||
./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
|
||||
|
||||
### running using python3's server module
|
||||
|
||||
first run examples/server
|
||||
* ./llama-server -m path/model.gguf
|
||||
|
||||
next run this web front end in examples/server/public_simplechat
|
||||
* cd ../examples/server/public_simplechat
|
||||
* python3 -m http.server PORT
|
||||
|
||||
### using the front end
|
||||
|
||||
Open this simple web front end from your local browser
|
||||
|
||||
* http://127.0.0.1:PORT/index.html
|
||||
|
||||
Once inside
|
||||
|
||||
* If you want to, you can change many of the default global settings
|
||||
* the base url (ie ip addr / domain name, port)
|
||||
* chat (default) vs completion mode
|
||||
* try trim garbage in response or not
|
||||
* amount of chat history in the context sent to server/ai-model
|
||||
* oneshot or streamed mode.
|
||||
|
||||
* In completion mode
|
||||
* one normally doesnt use a system prompt in completion mode.
|
||||
* logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
|
||||
If the model requires any prefix wrt user role messages, then the end user has to
|
||||
explicitly add the needed prefix, when they enter their chat message.
|
||||
Similarly if the model requires any prefix to trigger assistant/ai-model response,
|
||||
then the end user needs to enter the same.
|
||||
This keeps the logic simple, while still giving flexibility to the end user to
|
||||
manage any templating/tagging requirement wrt their messages to the model.
|
||||
* the logic doesnt insert newline at the begining and end wrt the prompt message generated.
|
||||
However if the chat being sent to /completions end point has more than one role's message,
|
||||
then insert newline when moving from one role's message to the next role's message, so
|
||||
that it can be clearly identified/distinguished.
|
||||
* given that /completions endpoint normally doesnt add additional chat-templating of its
|
||||
own, the above ensures that end user can create a custom single/multi message combo with
|
||||
any tags/special-tokens related chat templating to test out model handshake. Or enduser
|
||||
can use it just for normal completion related/based query.
|
||||
|
||||
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
||||
Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
|
||||
responses with a suitable system prompt.
|
||||
* if chat.add_system_begin is used
|
||||
* you cant change the system prompt, after it is has been submitted once along with user query.
|
||||
* you cant set a system prompt, after you have submitted any user query
|
||||
* if chat.add_system_anytime is used
|
||||
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
||||
* inturn the updated/changed system prompt will be inserted into the chat session.
|
||||
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
||||
|
||||
* Enter your query and either press enter or click on the submit button.
|
||||
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
||||
|
||||
* Wait for the logic to communicate with the server and get the response.
|
||||
* the user is not allowed to enter any fresh query during this time.
|
||||
* the user input box will be disabled and a working message will be shown in it.
|
||||
* if trim garbage is enabled, the logic will try to trim repeating text kind of garbage to some extent.
|
||||
|
||||
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
||||
|
||||
* Using NewChat one can start independent chat sessions.
|
||||
* two independent chat sessions are setup by default.
|
||||
|
||||
* When you want to print, switching ChatHistoryInCtxt to Full and clicking on the chat session button of
|
||||
interest, will display the full chat history till then wrt same, if you want full history for printing.
|
||||
|
||||
|
||||
## Devel note
|
||||
|
||||
### Reason behind this
|
||||
|
||||
The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
|
||||
by developers who may not be from web frontend background (so inturn may not be familiar with template /
|
||||
end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
|
||||
|
||||
And given that the idea is also to help explore/experiment for developers, some flexibility is provided
|
||||
to change behaviour easily using the devel-tools/console or provided minimal settings ui (wrt few aspects).
|
||||
Skeletal logic has been implemented to explore some of the end points and ideas/implications around them.
|
||||
|
||||
|
||||
### General
|
||||
|
||||
Me/gMe consolidates the settings which control the behaviour into one object.
|
||||
One can see the current settings, as well as change/update them using browsers devel-tool/console.
|
||||
It is attached to the document object. Some of these can also be updated using the Settings UI.
|
||||
|
||||
baseURL - the domain-name/ip-address and inturn the port to send the request.
|
||||
|
||||
bStream - control between oneshot-at-end and live-stream-as-its-generated collating and showing
|
||||
of the generated response.
|
||||
|
||||
the logic assumes that the text sent from the server follows utf-8 encoding.
|
||||
|
||||
in streaming mode - if there is any exception, the logic traps the same and tries to ensure
|
||||
that text generated till then is not lost.
|
||||
|
||||
if a very long text is being generated, which leads to no user interaction for sometime and
|
||||
inturn the machine goes into power saving mode or so, the platform may stop network connection,
|
||||
leading to exception.
|
||||
|
||||
apiEP - select between /completions and /chat/completions endpoint provided by the server/ai-model.
|
||||
|
||||
bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
|
||||
communicating with the server or only sends the latest user query/message.
|
||||
|
||||
bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
|
||||
messages that get inserted into prompt field wrt /Completion endpoint.
|
||||
|
||||
bTrimGarbage - whether garbage repeatation at the end of the generated ai response, should be
|
||||
trimmed or left as is. If enabled, it will be trimmed so that it wont be sent back as part of
|
||||
subsequent chat history. At the same time the actual trimmed text is shown to the user, once
|
||||
when it was generated, so user can check if any useful info/data was there in the response.
|
||||
|
||||
One may be able to request the ai-model to continue (wrt the last response) (if chat-history
|
||||
is enabled as part of the chat-history-in-context setting), and chances are the ai-model will
|
||||
continue starting from the trimmed part, thus allows long response to be recovered/continued
|
||||
indirectly, in many cases.
|
||||
|
||||
The histogram/freq based trimming logic is currently tuned for english language wrt its
|
||||
is-it-a-alpabetic|numeral-char regex match logic.
|
||||
|
||||
apiRequestOptions - maintains the list of options/fields to send along with api request,
|
||||
irrespective of whether /chat/completions or /completions endpoint.
|
||||
|
||||
If you want to add additional options/fields to send to the server/ai-model, and or
|
||||
modify the existing options value or remove them, for now you can update this global var
|
||||
using browser's development-tools/console.
|
||||
|
||||
For string, numeric and boolean fields in apiRequestOptions, including even those added by a
|
||||
user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
|
||||
created.
|
||||
|
||||
cache_prompt option supported by example/server is allowed to be controlled by user, so that
|
||||
any caching supported wrt system-prompt and chat history, if usable can get used. When chat
|
||||
history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
|
||||
wrt same, based on aspects related to model, positional encoding, attention mechanism etal.
|
||||
However system prompt should ideally get the benefit of caching.
|
||||
|
||||
headers - maintains the list of http headers sent when request is made to the server. By default
|
||||
Content-Type is set to application/json. Additionally Authorization entry is provided, which can
|
||||
be set if needed using the settings ui.
|
||||
|
||||
iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
|
||||
This is disabled by default. However if enabled, then in addition to latest system message, only
|
||||
the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
|
||||
from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
|
||||
only user messages after the latest system message/prompt will be considered.
|
||||
|
||||
This specified sliding window user message count also includes the latest user query.
|
||||
<0 : Send entire chat history to server
|
||||
0 : Send only the system message if any to the server
|
||||
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||
|
||||
|
||||
By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
|
||||
the implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||
some extent in a simple crude way. You may also want to control the context size enabled when the
|
||||
server loads ai-model, on the server end.
|
||||
|
||||
|
||||
Sometimes the browser may be stubborn about caching the files, so your updates to html/css/js
may not be visible. Also remember that just refreshing/reloading the page in the browser, or for that
matter clearing site data, doesn't directly override site caching in all cases. Worst case, you may
have to change the port. Or, in the browser's dev tools, you may be able to disable caching fully.
|
||||
|
||||
|
||||
Currently the server to communicate with is maintained globally and not as part of a specific
|
||||
chat session. So if one changes the server ip/url in setting, then all chat sessions will auto
|
||||
switch to this new server, when you try using those sessions.
|
||||
|
||||
|
||||
By switching between chat.add_system_begin/anytime, one can control whether one can change
|
||||
the system prompt, anytime during the conversation or only at the beginning.
|
||||
|
||||
|
||||
### Default setup
|
||||
|
||||
By default things are setup to try and make the user experience a bit better, if possible.
|
||||
However a developer when testing the server of ai-model may want to change these value.
|
||||
|
||||
Using iRecentUserMsgCnt reduce chat history context sent to the server/ai-model to be
|
||||
just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of
|
||||
full chat history. This way if there is any response with garbage/repeatation, it doesnt
|
||||
mess with things beyond the next question/request/query, in some ways. The trim garbage
|
||||
option also tries to help avoid issues with garbage in the context to an extent.
|
||||
|
||||
Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space
|
||||
available wrt next query-response. However dont forget that the server when started should
|
||||
also be started with a model context size of 1k or more, to be on safe side.
|
||||
|
||||
The /completions endpoint of examples/server doesnt take max_tokens, instead it takes the
|
||||
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
||||
to /completions endpoint handling code on server side.
|
||||
|
||||
NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
|
||||
wrt the set of fields sent to server along with the user query, to check how the model behaves
|
||||
wrt repeatations in general in the generated text response.
|
||||
|
||||
A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
|
||||
using the provided settings ui (for settings exposed through the ui).
|
||||
|
||||
|
||||
### OpenAi / Equivalent API WebService
|
||||
|
||||
One may be able to handshake with an OpenAI/equivalent API web service's /chat/completions endpoint
for minimal chatting experimentation by setting the below.
|
||||
|
||||
* the baseUrl in settings ui
|
||||
* https://api.openai.com/v1 or similar
|
||||
|
||||
* Wrt request body - gMe.apiRequestOptions
|
||||
* model (settings ui)
|
||||
* any additional fields if required in future
|
||||
|
||||
* Wrt request headers - gMe.headers
|
||||
* Authorization (available through settings ui)
|
||||
* Bearer THE_OPENAI_API_KEY
|
||||
* any additional optional header entries like "OpenAI-Organization", "OpenAI-Project" or so
|
||||
|
||||
NOTE: Not tested, as there is no free tier api testing available. However logically this might
|
||||
work.
|
||||
|
||||
|
||||
## At the end
|
||||
|
||||
Also a thank you to all open source and open model developers, who strive for the common good.
|
|
@ -4,7 +4,6 @@
|
|||
#include "json-schema-to-grammar.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
|
@ -170,11 +169,13 @@ struct server_slot {
|
|||
std::string stopping_word;
|
||||
|
||||
// sampling
|
||||
llama_token sampled;
|
||||
struct llama_sampling_params sparams;
|
||||
llama_sampling_context * ctx_sampling = nullptr;
|
||||
json json_schema;
|
||||
|
||||
struct gpt_sampler_params sparams;
|
||||
struct gpt_sampler * smpl = nullptr;
|
||||
|
||||
llama_token sampled;
|
||||
|
||||
int32_t ga_i = 0; // group-attention state
|
||||
int32_t ga_n = 1; // group-attention factor
|
||||
int32_t ga_w = 512; // group-attention width
|
||||
|
@ -652,8 +653,8 @@ struct server_context {
|
|||
|
||||
// Clear any sampling context
|
||||
for (server_slot & slot : slots) {
|
||||
if (slot.ctx_sampling != nullptr) {
|
||||
llama_sampling_free(slot.ctx_sampling);
|
||||
if (slot.smpl != nullptr) {
|
||||
gpt_sampler_free(slot.smpl);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -884,8 +885,8 @@ struct server_context {
|
|||
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
||||
slot_params default_params;
|
||||
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
|
||||
llama_sampling_params default_sparams = params.sparams;
|
||||
auto & data = task.data;
|
||||
auto default_sparams = params.sparams;
|
||||
const auto & data = task.data;
|
||||
|
||||
if (data.count("__oaicompat") != 0) {
|
||||
slot.oaicompat = true;
|
||||
|
@ -902,7 +903,7 @@ struct server_context {
|
|||
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
||||
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
||||
slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
|
||||
|
@ -924,7 +925,8 @@ struct server_context {
|
|||
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
|
||||
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
} else if (data.contains("json_schema") && !data.contains("grammar")) {
|
||||
}
|
||||
if (data.contains("json_schema") && !data.contains("grammar")) {
|
||||
try {
|
||||
auto schema = json_value(data, "json_schema", json::object());
|
||||
slot.sparams.grammar = json_schema_to_grammar(schema);
|
||||
|
@ -974,56 +976,11 @@ struct server_context {
|
|||
}
|
||||
}
|
||||
|
||||
// penalize user-provided tokens
|
||||
{
|
||||
slot.sparams.penalty_prompt_tokens.clear();
|
||||
slot.sparams.use_penalty_prompt_tokens = false;
|
||||
|
||||
const auto & penalty_prompt = data.find("penalty_prompt");
|
||||
|
||||
if (penalty_prompt != data.end()) {
|
||||
if (penalty_prompt->is_string()) {
|
||||
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
|
||||
slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false);
|
||||
|
||||
if (slot.params.n_predict > 0) {
|
||||
slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
|
||||
}
|
||||
slot.sparams.use_penalty_prompt_tokens = true;
|
||||
|
||||
LOG_VERBOSE("penalty_prompt_tokens", {
|
||||
{"id_slot", slot.id},
|
||||
{"tokens", slot.sparams.penalty_prompt_tokens},
|
||||
});
|
||||
}
|
||||
else if (penalty_prompt->is_array()) {
|
||||
const auto n_tokens = penalty_prompt->size();
|
||||
slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
|
||||
|
||||
const int n_vocab = llama_n_vocab(model);
|
||||
for (const auto & penalty_token : *penalty_prompt) {
|
||||
if (penalty_token.is_number_integer()) {
|
||||
const auto tok = penalty_token.get<llama_token>();
|
||||
if (tok >= 0 && tok < n_vocab) {
|
||||
slot.sparams.penalty_prompt_tokens.push_back(tok);
|
||||
}
|
||||
}
|
||||
}
|
||||
slot.sparams.use_penalty_prompt_tokens = true;
|
||||
|
||||
LOG_VERBOSE("penalty_prompt_tokens", {
|
||||
{"id_slot", slot.id},
|
||||
{"tokens", slot.sparams.penalty_prompt_tokens},
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
slot.sparams.logit_bias.clear();
|
||||
|
||||
if (json_value(data, "ignore_eos", false) && has_eos_token) {
|
||||
slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
|
||||
}
|
||||
|
||||
const auto & logit_bias = data.find("logit_bias");
|
||||
|
@ -1044,12 +1001,12 @@ struct server_context {
|
|||
if (el[0].is_number_integer()) {
|
||||
llama_token tok = el[0].get<llama_token>();
|
||||
if (tok >= 0 && tok < n_vocab) {
|
||||
slot.sparams.logit_bias[tok] = bias;
|
||||
slot.sparams.logit_bias.push_back({tok, bias});
|
||||
}
|
||||
} else if (el[0].is_string()) {
|
||||
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||
for (auto tok : toks) {
|
||||
slot.sparams.logit_bias[tok] = bias;
|
||||
slot.sparams.logit_bias.push_back({tok, bias});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1071,26 +1028,27 @@ struct server_context {
|
|||
}
|
||||
|
||||
{
|
||||
const auto & samplers_sequence = data.find("samplers");
|
||||
if (samplers_sequence != data.end() && samplers_sequence->is_array()) {
|
||||
const auto & samplers = data.find("samplers");
|
||||
if (samplers != data.end() && samplers->is_array()) {
|
||||
std::vector<std::string> sampler_names;
|
||||
for (const auto & sampler_name : *samplers_sequence) {
|
||||
if (sampler_name.is_string()) {
|
||||
sampler_names.emplace_back(sampler_name);
|
||||
for (const auto & name : *samplers) {
|
||||
if (name.is_string()) {
|
||||
sampler_names.emplace_back(name);
|
||||
}
|
||||
}
|
||||
slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
|
||||
slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
|
||||
} else {
|
||||
slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
|
||||
slot.sparams.samplers = default_sparams.samplers;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
if (slot.ctx_sampling != nullptr) {
|
||||
llama_sampling_free(slot.ctx_sampling);
|
||||
if (slot.smpl != nullptr) {
|
||||
gpt_sampler_free(slot.smpl);
|
||||
}
|
||||
slot.ctx_sampling = llama_sampling_init(slot.sparams);
|
||||
if (slot.ctx_sampling == nullptr) {
|
||||
|
||||
slot.smpl = gpt_sampler_init(model, slot.sparams);
|
||||
if (slot.smpl == nullptr) {
|
||||
// for now, the only error that may happen here is invalid grammar
|
||||
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
|
@ -1179,11 +1137,6 @@ struct server_context {
|
|||
slot.generated_text += token_str;
|
||||
slot.has_next_token = true;
|
||||
|
||||
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) {
|
||||
// we can change penalty_prompt_tokens because it is always created from scratch each request
|
||||
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
|
||||
}
|
||||
|
||||
// check if there is incomplete UTF-8 character at the end
|
||||
bool incomplete = false;
|
||||
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
|
||||
|
@ -1301,13 +1254,10 @@ struct server_context {
|
|||
}
|
||||
|
||||
json get_formated_generation(const server_slot & slot) const {
|
||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
||||
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||
|
||||
std::vector<std::string> samplers_sequence;
|
||||
samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
|
||||
for (const auto & sampler_type : slot.sparams.samplers_sequence) {
|
||||
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
|
||||
std::vector<std::string> samplers;
|
||||
samplers.reserve(slot.sparams.samplers.size());
|
||||
for (const auto & sampler : slot.sparams.samplers) {
|
||||
samplers.emplace_back(gpt_sampler_type_to_str(sampler));
|
||||
}
|
||||
|
||||
return json {
|
||||
|
@ -1322,13 +1272,11 @@ struct server_context {
|
|||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
{"tfs_z", slot.sparams.tfs_z},
|
||||
{"typical_p", slot.sparams.typical_p},
|
||||
{"typical_p", slot.sparams.typ_p},
|
||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||
{"repeat_penalty", slot.sparams.penalty_repeat},
|
||||
{"presence_penalty", slot.sparams.penalty_present},
|
||||
{"frequency_penalty", slot.sparams.penalty_freq},
|
||||
{"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
|
||||
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
|
||||
{"mirostat", slot.sparams.mirostat},
|
||||
{"mirostat_tau", slot.sparams.mirostat_tau},
|
||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||
|
@ -1337,13 +1285,13 @@ struct server_context {
|
|||
{"max_tokens", slot.params.n_predict}, // User configured n_predict
|
||||
{"n_keep", slot.params.n_keep},
|
||||
{"n_discard", slot.params.n_discard},
|
||||
{"ignore_eos", ignore_eos},
|
||||
{"ignore_eos", slot.sparams.ignore_eos},
|
||||
{"stream", slot.params.stream},
|
||||
{"logit_bias", slot.sparams.logit_bias},
|
||||
//{"logit_bias", slot.sparams.logit_bias},
|
||||
{"n_probs", slot.sparams.n_probs},
|
||||
{"min_keep", slot.sparams.min_keep},
|
||||
{"grammar", slot.sparams.grammar},
|
||||
{"samplers", samplers_sequence}
|
||||
{"samplers", samplers},
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -2137,7 +2085,7 @@ struct server_context {
|
|||
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
|
||||
}
|
||||
|
||||
llama_sampling_reset(slot.ctx_sampling);
|
||||
gpt_sampler_reset(slot.smpl);
|
||||
|
||||
if (!slot.params.cache_prompt) {
|
||||
slot.n_past_se = 0;
|
||||
|
@ -2150,7 +2098,7 @@ struct server_context {
|
|||
|
||||
// push the prompt into the sampling context (do not apply grammar)
|
||||
for (int i = 0; i < slot.n_past; ++i) {
|
||||
llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
|
||||
gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2203,7 +2151,7 @@ struct server_context {
|
|||
slot.n_past_se = 0;
|
||||
slot.ga_i = 0;
|
||||
// TODO: is the system prompt ever in the sampling context?
|
||||
llama_sampling_reset(slot.ctx_sampling);
|
||||
gpt_sampler_reset(slot.smpl);
|
||||
}
|
||||
|
||||
// remove the non-common part from the cache
|
||||
|
@ -2376,18 +2324,18 @@ struct server_context {
|
|||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
continue; // continue loop of slots
|
||||
} else {
|
||||
}
|
||||
|
||||
// prompt evaluated for next-token prediction
|
||||
slot.state = SLOT_STATE_GENERATING;
|
||||
}
|
||||
} else if (slot.state != SLOT_STATE_GENERATING) {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
completion_token_output result;
|
||||
const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
|
||||
const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
|
||||
|
||||
llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
|
||||
gpt_sampler_accept(slot.smpl, id, true);
|
||||
|
||||
slot.n_decoded += 1;
|
||||
if (slot.n_decoded == 1) {
|
||||
|
@ -2396,35 +2344,16 @@ struct server_context {
|
|||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
result.tok = id;
|
||||
|
||||
const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
|
||||
if (n_probs > 0) {
|
||||
const size_t n_valid = slot.ctx_sampling->n_valid;
|
||||
const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);
|
||||
|
||||
// Make sure at least n_probs top tokens are at the front of the vector:
|
||||
if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
|
||||
llama_sample_top_k(ctx, &cur_p, n_probs, 0);
|
||||
}
|
||||
|
||||
if (slot.sparams.temp == 0.0f) {
|
||||
// With greedy sampling the probabilities have possibly not been calculated.
|
||||
for (size_t i = 0; i < n_probs; ++i) {
|
||||
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
i == 0 ? 1.0f : 0.0f
|
||||
cur_p->data[i].id,
|
||||
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p.data[i].id,
|
||||
i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
// release slot because of stop condition
|
||||
|
|
|
@ -1,65 +0,0 @@
|
|||
# Server tests
|
||||
|
||||
Python-based server test scenarios using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
|
||||
and [behave](https://behave.readthedocs.io/en/latest/):
|
||||
|
||||
* [issues.feature](./features/issues.feature) Pending issues scenario
|
||||
* [parallel.feature](./features/parallel.feature) Scenarios involving multiple slots and concurrent requests
|
||||
* [security.feature](./features/security.feature) Security, CORS and API Key
|
||||
* [server.feature](./features/server.feature) Server base scenarios: completion, embedding, tokenization, etc.
|
||||
|
||||
Tests target GitHub workflow job runners with 4 vCPUs.
|
||||
|
||||
Requests are made
|
||||
using an [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html)/[asyncio](https://docs.python.org/fr/3/library/asyncio.html)
|
||||
based HTTP client.
|
||||
|
||||
Note: if the host machine's inference speed is faster than that of the GitHub runners, the parallel scenarios may fail at random.
|
||||
To mitigate this, you can increase the `n_predict` and `kv_size` values.
|
||||
|
||||
### Install dependencies
|
||||
|
||||
`pip install -r requirements.txt`
|
||||
|
||||
### Run tests
|
||||
|
||||
1. Build the server
|
||||
|
||||
```shell
|
||||
cd ../../..
|
||||
cmake -B build -DLLAMA_CURL=ON
|
||||
cmake --build build --target llama-server
|
||||
```
|
||||
|
||||
2. Start the tests: `./tests.sh`
|
||||
|
||||
Some scenario step values can be overridden with environment variables:
|
||||
|
||||
| variable | description |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------|
|
||||
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
|
||||
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
|
||||
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
|
||||
| `SERVER_LOG_FORMAT_JSON` | if set, switches server logs to JSON format |
|
||||
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
|
||||
|
||||
### Run @bug, @wip or @wrong_usage annotated scenario
|
||||
|
||||
A feature or scenario must be annotated with `@llama.cpp` to be included in the default scope.
|
||||
|
||||
- The `@bug` annotation links a scenario to a GitHub issue.
|
||||
- `@wrong_usage` scenarios show user issues that are actually expected behavior
|
||||
- `@wip` marks a scenario that is a work in progress
|
||||
- `@slow` marks heavy tests, disabled by default
|
||||
|
||||
To run a scenario annotated with `@bug`, start:
|
||||
|
||||
```shell
|
||||
DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
|
||||
```
|
||||
|
||||
After changing logic in `steps.py`, ensure that the `@bug` and `@wrong_usage` scenarios are updated.
|
||||
|
||||
```shell
|
||||
./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile"
|
||||
```
|
|
@ -1,5 +0,0 @@
|
|||
# LLaMA.cpp Server Wild Theme
|
||||
|
||||
A simple themes directory of sample "public" directories. To try any of these, add `--path` to your run, like `server --path=wild`.
|
||||
|
||||

|
|
@ -1,7 +0,0 @@
|
|||
# LLaMA.cpp Server Buttons Top Theme
|
||||
|
||||
Simple tweaks to the UI. Chat buttons are at the top of the page instead of the bottom, so you can hit Stop instead of chasing it down the page.
|
||||
|
||||
To use, simply run the server with `--path=themes/buttons_top`
|
||||
|
||||

|
|
@ -1,5 +0,0 @@
|
|||
# LLaMA.cpp Server Wild Theme
|
||||
|
||||
Simple tweaks to the UI. To use, simply run the server with `--path=themes/wild`
|
||||
|
||||

|
|
@ -1,21 +0,0 @@
|
|||
# llama.cpp/example/simple
|
||||
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
|
||||
|
||||
```bash
|
||||
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
|
||||
|
||||
...
|
||||
|
||||
main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
|
||||
|
||||
Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old
|
||||
|
||||
main: decoded 27 tokens in 2.31 s, speed: 11.68 t/s
|
||||
|
||||
llama_print_timings: load time = 579.15 ms
|
||||
llama_print_timings: sample time = 0.72 ms / 28 runs ( 0.03 ms per token, 38888.89 tokens per second)
|
||||
llama_print_timings: prompt eval time = 655.63 ms / 10 tokens ( 65.56 ms per token, 15.25 tokens per second)
|
||||
llama_print_timings: eval time = 2180.97 ms / 27 runs ( 80.78 ms per token, 12.38 tokens per second)
|
||||
llama_print_timings: total time = 2891.13 ms
|
||||
```
|
|
@ -55,6 +55,14 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
|
||||
sparams.no_perf = false;
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||
|
||||
// tokenize the prompt
|
||||
|
||||
std::vector<llama_token> tokens_list;
|
||||
|
@ -110,20 +118,9 @@ int main(int argc, char ** argv) {
|
|||
while (n_cur <= n_predict) {
|
||||
// sample the next token
|
||||
{
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// sample the most likely token
|
||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
llama_sampler_accept(smpl, new_token_id);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||
|
@ -160,12 +157,14 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
|
|
|
@ -1,9 +0,0 @@
|
|||
# llama.cpp/examples/speculative
|
||||
|
||||
Demonstration of speculative decoding and tree-based speculative decoding techniques
|
||||
|
||||
More info:
|
||||
|
||||
- https://github.com/ggerganov/llama.cpp/pull/2926
|
||||
- https://github.com/ggerganov/llama.cpp/pull/3624
|
||||
- https://github.com/ggerganov/llama.cpp/pull/5625
|
|
@ -23,7 +23,7 @@ struct seq_draft {
|
|||
std::vector<llama_token> tokens;
|
||||
std::vector<std::vector<llama_token_data>> dists;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling;
|
||||
struct gpt_sampler * smpl = nullptr;
|
||||
};
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
@ -45,10 +45,7 @@ int main(int argc, char ** argv) {
|
|||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||
const float p_split = params.p_split;
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
std::default_random_engine rng(params.seed);
|
||||
std::default_random_engine rng(params.sparams.seed);
|
||||
std::uniform_real_distribution<> u_dist;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
|
@ -181,19 +178,17 @@ int main(int argc, char ** argv) {
|
|||
// used to determine end of generation
|
||||
bool has_eos = false;
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||
// target model sampling context (reuse the llama_context's sampling instance)
|
||||
struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
|
||||
|
||||
struct llama_sampler * softmax = llama_sampler_init_softmax();
|
||||
|
||||
// draft sequence data
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
if (params.sparams.temp == 0) {
|
||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
||||
}
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
// allocate gpt_sampler for each draft sequence
|
||||
drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
|
||||
}
|
||||
|
||||
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
||||
|
@ -235,12 +230,12 @@ int main(int argc, char ** argv) {
|
|||
bool accept = false;
|
||||
if (params.sparams.temp > 0) {
|
||||
// stochastic verification
|
||||
gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
||||
|
||||
llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
|
||||
llama_sample_softmax(ctx_tgt, &dist_tgt);
|
||||
float p_tgt = 0, p_dft = 0;
|
||||
auto & dist_tgt = *gpt_sampler_get_candidates(smpl);
|
||||
|
||||
// GGML_ASSERT(dist_tgt.size() == dist_dft.size());
|
||||
float p_tgt = 0.0f;
|
||||
float p_dft = 0.0f;
|
||||
|
||||
while (active_seqs.size() > 0) {
|
||||
// randomly select a sequence to verify from active sequences
|
||||
|
@ -259,9 +254,13 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
||||
float r = u_dist(rng);
|
||||
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
|
||||
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
|
||||
|
||||
//GGML_ASSERT(dist_tgt.size <= dist_dft.size);
|
||||
|
||||
// acquire the token probabilities assigned by the draft and target models
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
||||
|
@ -280,7 +279,7 @@ int main(int argc, char ** argv) {
|
|||
accept = true;
|
||||
token_id = drafts[s].tokens[i_dft];
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
gpt_sampler_accept(smpl, token_id, true);
|
||||
|
||||
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
break;
|
||||
|
@ -291,7 +290,6 @@ int main(int argc, char ** argv) {
|
|||
// calculate residual probability
|
||||
GGML_ASSERT(dist_tgt.sorted);
|
||||
GGML_ASSERT(dist_dft.sorted);
|
||||
float sum_probs = 0.0f;
|
||||
|
||||
// sort dist by id
|
||||
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
|
||||
|
@ -301,10 +299,18 @@ int main(int argc, char ** argv) {
|
|||
return a.id < b.id;
|
||||
});
|
||||
|
||||
float sum_probs = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
if (i < dist_dft.size) {
|
||||
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
|
||||
} else {
|
||||
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p);
|
||||
}
|
||||
|
||||
sum_probs += dist_tgt.data[i].p;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||
dist_tgt.data[i].p /= sum_probs;
|
||||
}
|
||||
|
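The hunk above is the residual step of speculative (rejection) sampling: once a drafted token is rejected, the replacement is drawn from max(0, p_tgt - p_dft), renormalized. A self-contained sketch of that computation, assuming the two probability vectors are aligned by token position:

```cpp
#include <algorithm>
#include <vector>

// Compute the normalized residual distribution max(0, p_tgt - p_dft) / sum.
// Positions beyond the draft distribution keep the full target probability,
// matching the loop in the diff above.
static std::vector<float> residual_distribution(
        const std::vector<float> & p_tgt,
        const std::vector<float> & p_dft) {
    std::vector<float> res(p_tgt.size());

    float sum = 0.0f;
    for (size_t i = 0; i < p_tgt.size(); ++i) {
        const float pd = i < p_dft.size() ? p_dft[i] : 0.0f;
        res[i] = std::max(0.0f, p_tgt[i] - pd);
        sum   += res[i];
    }

    if (sum > 0.0f) {
        for (float & p : res) {
            p /= sum;
        }
    }

    return res;
}
```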
@ -334,21 +340,29 @@ int main(int argc, char ** argv) {
|
|||
// all drafted tokens were rejected
|
||||
// sample from the target model
|
||||
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
|
||||
token_id = llama_sample_token(ctx_tgt, &dist_tgt);
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
std::vector<float> probs(dist_tgt.size);
|
||||
for (size_t i = 0; i < dist_tgt.size; ++i) {
|
||||
probs[i] = dist_tgt.data[i].p;
|
||||
}
|
||||
|
||||
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||
|
||||
const int idx = dist(rng);
|
||||
|
||||
token_id = dist_tgt.data[idx].id;
|
||||
gpt_sampler_accept(smpl, token_id, true);
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
}
|
||||
} else {
|
||||
// greedy verification
|
||||
|
||||
// sample from the target model
|
||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
||||
gpt_sampler_accept(smpl, token_id, true);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
|
||||
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
|
||||
|
@ -436,7 +450,10 @@ int main(int argc, char ** argv) {
|
|||
break;
|
||||
}
|
||||
|
||||
llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
|
||||
if (drafts[0].smpl) {
|
||||
gpt_sampler_free(drafts[0].smpl);
|
||||
}
|
||||
drafts[0].smpl = gpt_sampler_clone(smpl);
|
||||
|
||||
int n_seq_cur = 1;
|
||||
int n_past_cur = n_past_dft;
|
||||
|
@ -465,20 +482,20 @@ int main(int argc, char ** argv) {
|
|||
continue;
|
||||
}
|
||||
|
||||
llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
|
||||
gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
|
||||
|
||||
const auto & cur_p = drafts[s].ctx_sampling->cur;
|
||||
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
|
||||
|
||||
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
|
||||
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
||||
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
|
||||
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
||||
}
|
||||
|
||||
std::vector<int> sa(1, s);
|
||||
|
||||
// attempt to split the branch if the probability is high enough
|
||||
for (int f = 1; f < 8; ++f) {
|
||||
if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
|
||||
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
|
||||
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
||||
|
@ -505,7 +522,10 @@ int main(int argc, char ** argv) {
|
|||
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
|
||||
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
|
||||
|
||||
llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
|
||||
if (drafts[n_seq_cur].smpl) {
|
||||
gpt_sampler_free(drafts[n_seq_cur].smpl);
|
||||
}
|
||||
drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);
|
||||
|
||||
sa.push_back(n_seq_cur);
|
||||
|
||||
|
@ -517,15 +537,15 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// add drafted token for each sequence
|
||||
for (int is = 0; is < (int) sa.size(); ++is) {
|
||||
const llama_token id = cur_p[is].id;
|
||||
const llama_token id = cur_p->data[is].id;
|
||||
|
||||
const int s = sa[is];
|
||||
|
||||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
|
||||
gpt_sampler_accept(drafts[s].smpl, id, true);
|
||||
|
||||
drafts[s].tokens.push_back(id);
|
||||
// save cur_p.data into drafts[s].dists
|
||||
drafts[s].dists.push_back(cur_p);
|
||||
drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
|
||||
|
||||
// add unique drafted tokens to the target batch
|
||||
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
||||
|
@ -595,17 +615,19 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_TEE("\ndraft:\n");
|
||||
llama_print_timings(ctx_dft);
|
||||
LOG_TEE("\ndraft:\n\n");
|
||||
// TODO: print sampling/grammar timings for all drafts
|
||||
llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
|
||||
|
||||
LOG_TEE("\ntarget:\n");
|
||||
llama_print_timings(ctx_tgt);
|
||||
LOG_TEE("\ntarget:\n\n");
|
||||
gpt_perf_print(ctx_tgt, smpl);
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
gpt_sampler_free(smpl);
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
llama_sampling_free(drafts[s].ctx_sampling);
|
||||
gpt_sampler_free(drafts[s].smpl);
|
||||
}
|
||||
|
||||
llama_sampler_free(softmax);
|
||||
llama_batch_free(batch_dft);
|
||||
|
||||
llama_free(ctx_tgt);
|
||||
|
|
|
@ -1,41 +0,0 @@
|
|||
# llama.cpp/example/sycl
|
||||
|
||||
This example program provides tools for running llama.cpp with SYCL on Intel GPUs.
|
||||
|
||||
## Tool
|
||||
|
||||
|Tool Name| Function|Status|
|
||||
|-|-|-|
|
||||
|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Supported|
|
||||
|
||||
### llama-ls-sycl-device
|
||||
|
||||
List all SYCL devices with ID, compute capability, max work group size, etc.
|
||||
|
||||
1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.
|
||||
|
||||
2. Enable the oneAPI runtime environment *(if GGML_SYCL_TARGET is set to INTEL, the default)*
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. Execute
|
||||
|
||||
```
|
||||
./build/bin/llama-ls-sycl-device
|
||||
```
|
||||
|
||||
Check the device IDs in the startup log, for example:
|
||||
|
||||
```
|
||||
found 2 SYCL devices:
|
||||
| | | | |Max | |Max |Global | |
|
||||
| | | | |compute|Max work|sub |mem | |
|
||||
|ID| Device Type| Name|Version|units |group |group|size | Driver version|
|
||||
|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|
|
||||
| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics| 1.3| 512| 1024| 32| 16225M| 1.3.29138|
|
||||
| 1| [level_zero:gpu:1]| Intel UHD Graphics 750| 1.3| 32| 512| 32| 62631M| 1.3.29138|
|
||||
|
||||
```
|
||||
|
|
@ -191,13 +191,18 @@ extern "C"
|
|||
}
|
||||
else if(file_format==FileFormat::GGML || file_format==FileFormat::GGHF || file_format==FileFormat::GGJT || file_format==FileFormat::GGJT_2 || file_format==FileFormat::GGJT_3)
|
||||
{
|
||||
printf("\n---\nIdentified as Legacy GGML model: (ver %d)\nYou are STRONGLY ENCOURAGED to obtain a newer GGUF model!\nAttempting to Load...\n---\n", file_format);
|
||||
printf("\n---\nIdentified as Legacy GGML model: (ver %d)\n======\nGGML Models are Outdated: You are STRONGLY ENCOURAGED to obtain a newer GGUF model!\n======\nAttempting to Load...\n---\n", file_format);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("\n---\nUnidentified Model Encountered: (ver %d)\n---\n", file_format);
|
||||
}
|
||||
ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta);
|
||||
if(file_format==FileFormat::GGML || file_format==FileFormat::GGHF || file_format==FileFormat::GGJT || file_format==FileFormat::GGJT_2 || file_format==FileFormat::GGJT_3)
|
||||
{
|
||||
//warn a second time
|
||||
printf("\n======\nGGML Models are Outdated: You are STRONGLY ENCOURAGED to obtain a newer GGUF model!\n======\n");
|
||||
}
|
||||
if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
return false;
|
||||
|
|
|
@ -13,7 +13,11 @@
|
|||
#include <unordered_map>
|
||||
#include "model_adapter.h"
|
||||
#include "otherarch.h"
|
||||
#include "grammar-parser.h"
|
||||
#include "llama.h"
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
//for easier compilation
|
||||
//concat source files into one file for compilation purposes
|
||||
|
@ -55,7 +59,7 @@ stop_reason last_stop_reason = stop_reason::INVALID;
|
|||
std::vector<std::string> generated_tokens;
|
||||
|
||||
llama_grammar * grammar = nullptr; //currently used grammar
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
llama_grammar_parser parsed_grammar;
|
||||
static std::string current_grammar = "";
|
||||
|
||||
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
|
||||
|
@ -399,7 +403,7 @@ static void GetOverlappingTokenSequences(const std::string& str, std::unordered_
|
|||
|
||||
llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
|
||||
{
|
||||
llama_sample_softmax(nullptr, candidates);
|
||||
llama_sampler_softmax_impl(candidates);
|
||||
std::vector<float> probs;
|
||||
probs.reserve(candidates->size);
|
||||
top_picks.clear();
|
||||
|
@ -429,7 +433,7 @@ llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng
|
|||
llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int m, float * mu)
|
||||
{
|
||||
float N = float(n_vocab);
|
||||
llama_sample_softmax(nullptr, candidates);
|
||||
llama_sampler_softmax_impl(candidates);
|
||||
// Estimate s_hat using the most probable m tokens
|
||||
float s_hat = 0.0;
|
||||
float sum_ti_bi = 0.0;
|
||||
|
@ -445,7 +449,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat
|
|||
float epsilon_hat = s_hat - 1;
|
||||
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
|
||||
// Sample the next word X using top-k sampling
|
||||
llama_sample_top_k(nullptr, candidates, int(k),1);
|
||||
llama_sampler_top_k_impl(candidates, int(k));
|
||||
llama_token X = sample_token(candidates, rng); // Compute error as the difference between observed surprise and target surprise value
|
||||
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
||||
return candidate.id == X;
|
||||
|
@ -459,7 +463,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat
|
|||
|
||||
llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float * mu)
|
||||
{
|
||||
llama_sample_softmax(nullptr, candidates);
|
||||
llama_sampler_softmax_impl(candidates);
|
||||
// Truncate the words with surprise values greater than mu
|
||||
candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
||||
return -log2f(candidate.p) > *mu;
|
||||
|
@ -470,7 +474,7 @@ llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::m
|
|||
}
|
||||
|
||||
// Normalize the probabilities of the remaining words
|
||||
llama_sample_softmax(nullptr, candidates);
|
||||
llama_sampler_softmax_impl(candidates);
|
||||
// Sample the next word X from the remaining words
|
||||
llama_token X = sample_token(candidates,rng);
|
||||
|
||||
|
@ -492,7 +496,7 @@ void sample_top_a(llama_token_data_array * candidates, float a, size_t min_keep)
|
|||
return;
|
||||
}
|
||||
|
||||
llama_sample_softmax(nullptr, candidates);
|
||||
llama_sampler_softmax_impl(candidates);
|
||||
|
||||
// Compute the cumulative probabilities
|
||||
float maxprob = candidates->data[0].p;
|
||||
|
@ -528,7 +532,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
|
|||
return;
|
||||
}
|
||||
|
||||
llama_sample_softmax(nullptr, candidates);
|
||||
llama_sampler_softmax_impl(candidates);
|
||||
|
||||
//calculate how many tokens cross the xtc threshold
|
||||
size_t last_idx = candidates->size;
|
||||
|
@ -827,12 +831,12 @@ void sample_temperature(llama_token_data_array * candidates_p, float temp, float
|
|||
{
|
||||
// Imitate greedy sampling
|
||||
temp = 0.00390625f; //cannot be zero else div0, this is 1/256
|
||||
llama_sample_temp(nullptr, candidates_p, temp, 0);
|
||||
llama_sample_top_k(nullptr, candidates_p, 1, 1); //only want first candidate
|
||||
llama_sampler_temp_impl(candidates_p, temp, 0);
|
||||
llama_sampler_top_k_impl(candidates_p, 1); //only want first candidate
|
||||
}
|
||||
else
|
||||
{
|
||||
llama_sample_temp(nullptr, candidates_p, temp, smoothing_factor);
|
||||
llama_sampler_temp_impl(candidates_p, temp, smoothing_factor);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -903,7 +907,7 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
|
|||
sample_dry(n_ctx, dry_penalty_last_n, dry_multiplier, dry_base, dry_allowed_length, dry_sequence_breakers, &candidates_p);
|
||||
|
||||
//prefilter to top 5k tokens for improved speed
|
||||
llama_sample_top_k(nullptr, &candidates_p, 5000, 1);
|
||||
llama_sampler_top_k_impl(&candidates_p, 5000);
|
||||
|
||||
if (mirostat == 1 || mirostat == 2)
|
||||
{
|
||||
|
@ -927,20 +931,20 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
|
|||
switch (sampler_order[i])
|
||||
{
|
||||
case KCPP_SAMPLER_TOP_K:
|
||||
llama_sample_top_k(nullptr, &candidates_p, top_k,1);
|
||||
llama_sampler_top_k_impl(&candidates_p, top_k);
|
||||
break;
|
||||
case KCPP_SAMPLER_TOP_A:
|
||||
sample_top_a(&candidates_p, top_a, 1);
|
||||
break;
|
||||
case KCPP_SAMPLER_TOP_P:
|
||||
llama_sample_top_p(nullptr, &candidates_p, top_p,1);
|
||||
llama_sample_min_p(nullptr, &candidates_p, min_p,1);
|
||||
llama_sampler_top_p_impl(&candidates_p, top_p, 1);
|
||||
llama_sampler_min_p_impl(&candidates_p, min_p, 1);
|
||||
break;
|
||||
case KCPP_SAMPLER_TFS:
|
||||
llama_sample_tail_free(nullptr, &candidates_p, tfs,1);
|
||||
llama_sampler_tail_free_impl(&candidates_p, tfs, 1);
|
||||
break;
|
||||
case KCPP_SAMPLER_TYP:
|
||||
llama_sample_typical(nullptr, &candidates_p, typical_p,1);
|
||||
llama_sampler_typical_impl(&candidates_p, typical_p, 1);
|
||||
break;
|
||||
case KCPP_SAMPLER_TEMP:
|
||||
if (dynatemp_range>0)
|
||||
|
@ -951,7 +955,7 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
|
|||
dynatemp_min = dynatemp_min<0?0:dynatemp_min;
|
||||
dynatemp_max = dynatemp_max<0?0:dynatemp_max;
|
||||
dynatemp_exponent = dynatemp_exponent<0?0:dynatemp_exponent;
|
||||
llama_sample_entropy(nullptr, &candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
|
||||
llama_sampler_entropy_impl(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1002,12 +1006,12 @@ static void load_grammar(const std::string & gammarstr)
|
|||
{
|
||||
if(grammar!=nullptr) //on demand free when next grammar is loaded
|
||||
{
|
||||
llama_grammar_free(grammar);
|
||||
llama_grammar_free_impl(grammar);
|
||||
grammar = nullptr;
|
||||
}
|
||||
|
||||
if (!gammarstr.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(gammarstr.c_str());
|
||||
parsed_grammar.parse(gammarstr.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
printf("\nIgnored invalid grammar sampler.");
|
||||
|
@ -1015,10 +1019,10 @@ static void load_grammar(const std::string & gammarstr)
|
|||
}
|
||||
if(debugmode==1)
|
||||
{
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
parsed_grammar.print(stderr);
|
||||
}
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
grammar = llama_grammar_init_impl(nullptr,grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1395,7 +1399,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
llama_ctx_params.n_ctx += extra_context_handle_fragmentation;
|
||||
}
|
||||
|
||||
llama_ctx_params.seed = -1;
|
||||
llama_ctx_params.offload_kqv = !inputs.low_vram;
|
||||
llama_ctx_params.logits_all = false;
|
||||
model_params.use_mmap = inputs.use_mmap;
|
||||
|
@ -2002,7 +2005,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
|||
|
||||
if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
|
||||
{
|
||||
llama_reset_timings(llama_ctx_v4);
|
||||
llama_perf_reset(llama_ctx_v4, LLAMA_PERF_TYPE_CONTEXT);
|
||||
}
|
||||
|
||||
generation_finished = false; // Set current generation status
|
||||
|
@ -2926,7 +2929,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
|||
|
||||
if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
|
||||
{
|
||||
llama_print_timings(llama_ctx_v4);
|
||||
printf("\n");
|
||||
llama_perf_print(llama_ctx_v4, LLAMA_PERF_TYPE_CONTEXT);
|
||||
}
|
||||
|
||||
time2 = timer_check();
|
||||
|
|
400
include/llama.h
400
include/llama.h
|
@ -33,12 +33,15 @@
|
|||
|
||||
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
|
||||
|
||||
// TODO: use everywhere in the implementation
|
||||
#define LLAMA_TOKEN_NULL -1
|
||||
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
|
||||
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 8
|
||||
#define LLAMA_SESSION_VERSION 9
|
||||
|
||||
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
|
||||
#define LLAMA_STATE_SEQ_VERSION 2
|
||||
|
@ -53,8 +56,10 @@ extern "C" {
|
|||
// TODO: show sample usage
|
||||
//
|
||||
|
||||
// struct llama_vocab; // TODO: add in the future
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
struct llama_sampler;
|
||||
|
||||
typedef int32_t llama_pos;
|
||||
typedef int32_t llama_token;
|
||||
|
@ -201,6 +206,7 @@ extern "C" {
|
|||
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
||||
};
|
||||
|
||||
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
float logit; // log-odds of the token
|
||||
|
@ -208,8 +214,10 @@ extern "C" {
|
|||
} llama_token_data;
|
||||
|
||||
typedef struct llama_token_data_array {
|
||||
// TODO: consider SoA
|
||||
llama_token_data * data;
|
||||
size_t size;
|
||||
int64_t selected; // this is the index in the data array (i.e. not the token id)
|
||||
bool sorted;
|
||||
} llama_token_data_array;
|
||||
|
||||
|
@ -302,7 +310,6 @@ extern "C" {
|
|||
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||
// https://github.com/ggerganov/llama.cpp/pull/7544
|
||||
struct llama_context_params {
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
uint32_t n_ctx; // text context, 0 = from model
|
||||
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
|
||||
uint32_t n_ubatch; // physical maximum batch size
|
||||
|
@ -330,11 +337,13 @@ extern "C" {
|
|||
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
|
||||
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
|
||||
// TODO: move at the end of the struct
|
||||
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
//bool no_perf; // whether to measure performance timings, TODO: implement
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
|
@ -358,56 +367,14 @@ extern "C" {
|
|||
void * kv_overrides; // pointer to vector containing overrides
|
||||
} llama_model_quantize_params;
|
||||
|
||||
// grammar types
|
||||
struct llama_grammar;
|
||||
typedef struct llama_logit_bias {
|
||||
llama_token token;
|
||||
float bias;
|
||||
} llama_logit_bias;
|
||||
|
||||
// grammar element type
|
||||
enum llama_gretype {
|
||||
// end of rule definition
|
||||
LLAMA_GRETYPE_END = 0,
|
||||
|
||||
// start of alternate definition for rule
|
||||
LLAMA_GRETYPE_ALT = 1,
|
||||
|
||||
// non-terminal element: reference to rule
|
||||
LLAMA_GRETYPE_RULE_REF = 2,
|
||||
|
||||
// terminal element: character (code point)
|
||||
LLAMA_GRETYPE_CHAR = 3,
|
||||
|
||||
// inverse char(s) ([^a], [^a-b] [^abc])
|
||||
LLAMA_GRETYPE_CHAR_NOT = 4,
|
||||
|
||||
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
|
||||
// be an inclusive range ([a-z])
|
||||
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
|
||||
|
||||
// modifies a preceding LLAMA_GRETYPE_CHAR or
|
||||
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
||||
LLAMA_GRETYPE_CHAR_ALT = 6,
|
||||
|
||||
// any character (.)
|
||||
LLAMA_GRETYPE_CHAR_ANY = 7,
|
||||
};
|
||||
|
||||
typedef struct llama_grammar_element {
|
||||
enum llama_gretype type;
|
||||
uint32_t value; // Unicode code point or rule ID
|
||||
} llama_grammar_element;
|
||||
|
||||
// performance timing information
|
||||
struct llama_timings {
|
||||
double t_start_ms;
|
||||
double t_end_ms;
|
||||
double t_load_ms;
|
||||
double t_sample_ms;
|
||||
double t_p_eval_ms;
|
||||
double t_eval_ms;
|
||||
|
||||
int32_t n_sample;
|
||||
int32_t n_p_eval;
|
||||
int32_t n_eval;
|
||||
};
|
||||
typedef struct llama_sampler_chain_params {
|
||||
bool no_perf; // whether to measure performance timings
|
||||
} llama_sampler_chain_params;
|
||||
|
||||
// used in chat template
|
||||
typedef struct llama_chat_message {
|
||||
|
@ -419,8 +386,10 @@ extern "C" {
|
|||
struct llama_lora_adapter;
|
||||
|
||||
// Helpers for getting default parameters
|
||||
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
|
||||
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
|
||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
|
||||
|
||||
// Initialize the llama + ggml backend
|
||||
|
@ -447,6 +416,7 @@ extern "C" {
|
|||
|
||||
LLAMA_API void llama_free_model(struct llama_model * model);
|
||||
|
||||
// TODO: rename to llama_init_from_model
|
||||
LLAMA_API struct llama_context * llama_new_context_with_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params);
|
||||
|
@ -462,23 +432,22 @@ extern "C" {
|
|||
LLAMA_API bool llama_supports_mlock (void);
|
||||
LLAMA_API bool llama_supports_gpu_offload(void);
|
||||
|
||||
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
||||
|
||||
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
||||
|
||||
// Get the model's RoPE frequency scaling factor
|
||||
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
||||
|
||||
|
@ -708,7 +677,7 @@ extern "C" {
|
|||
//
|
||||
|
||||
// Returns the *actual* size in bytes of the state
|
||||
// (rng, logits, embedding and kv_cache)
|
||||
// (logits, embedding and kv_cache)
|
||||
// Only use when saving the state, not when restoring it, otherwise the size may be too small.
|
||||
LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
|
||||
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
|
||||
|
@ -1011,123 +980,110 @@ extern "C" {
|
|||
int32_t length);
|
||||
|
||||
//
|
||||
// Grammar
|
||||
// Sampling API
|
||||
//
|
||||
// Sample usage:
|
||||
//
|
||||
// // prepare the sampling chain at the start
|
||||
// auto sparams = llama_sampler_chain_default_params();
|
||||
//
|
||||
// llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
//
|
||||
// llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
|
||||
// llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
|
||||
// llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8));
|
||||
//
|
||||
// // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat"
|
||||
// // this sampler will be responsible to select the actual token
|
||||
// llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));
|
||||
//
|
||||
// ...
|
||||
//
|
||||
// // decoding loop:
|
||||
// while (...) {
|
||||
// ...
|
||||
//
|
||||
// llama_decode(ctx, batch);
|
||||
//
|
||||
// // sample from the logits of the last token in the batch
|
||||
// const llama_token id = llama_sampler_sample(smpl, ctx, -1);
|
||||
//
|
||||
// // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
|
||||
// llama_sampler_accept(smpl, id);
|
||||
// ...
|
||||
// }
|
||||
//
|
||||
// llama_sampler_free(smpl);
|
||||
//
|
||||
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
|
||||
// TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
|
||||
//
|
||||
|
||||
/// Initialize a llama_grammar.
|
||||
///
|
||||
/// @param rules The rule elements of the grammar to initialize.
|
||||
/// @param n_rules The number of rules.
|
||||
/// @param start_rule_index The index of the root rule (the starting point of the grammar).
|
||||
/// @return The initialized llama_grammar or nullptr if initialization failed.
|
||||
LLAMA_API struct llama_grammar * llama_grammar_init(
|
||||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index);
|
||||
typedef void * llama_sampler_context_t;
|
||||
|
||||
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
|
||||
// user code can implement the interface below in order to create custom llama_sampler
|
||||
struct llama_sampler_i {
|
||||
const char * (*name) (const struct llama_sampler * smpl); // can be NULL
|
||||
void (*accept)( struct llama_sampler * smpl, llama_token token); // can be NULL
|
||||
void (*apply) ( struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
|
||||
void (*reset) ( struct llama_sampler * smpl); // can be NULL
|
||||
struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
|
||||
void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
|
||||
|
||||
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
|
||||
// TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
|
||||
//void (*apply_ggml) (struct llama_sampler * smpl, ...);
|
||||
};
|
||||
|
||||
/// @details Apply constraints from grammar
|
||||
LLAMA_API void llama_grammar_sample(
|
||||
const struct llama_grammar * grammar,
|
||||
const struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
LLAMA_API DEPRECATED(void llama_sample_grammar(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const struct llama_grammar * grammar),
|
||||
"use llama_grammar_sample instead");
|
||||
struct llama_sampler {
|
||||
struct llama_sampler_i * iface;
|
||||
llama_sampler_context_t ctx;
|
||||
};
|
||||
|
||||
/// @details Accepts the sampled token into the grammar
|
||||
LLAMA_API void llama_grammar_accept_token(
|
||||
struct llama_grammar * grammar,
|
||||
struct llama_context * ctx,
|
||||
llama_token token);
|
||||
// mirror of llama_sampler_i:
|
||||
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
|
||||
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
|
||||
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
|
||||
LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
|
||||
// important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
|
||||
LLAMA_API void llama_sampler_free ( struct llama_sampler * smpl);
|
||||
|
||||
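As a rough illustration of the `llama_sampler_i` interface above, here is a minimal user-defined sampler; the "mask token id 0" behaviour and all names are invented for this example and are not part of the library:

```cpp
#include <cmath>

#include "llama.h"

static const char * mask_tok0_name(const struct llama_sampler * /*smpl*/) {
    return "mask-token-0";
}

static void mask_tok0_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
    // forbid token id 0 by pushing its logit to -inf before the next sampler runs
    for (size_t i = 0; i < cur_p->size; ++i) {
        if (cur_p->data[i].id == 0) {
            cur_p->data[i].logit = -INFINITY;
        }
    }
}

// accept/reset/clone/free stay NULL: the sampler has no state and ctx == NULL,
// which the interface comments above explicitly allow.
static struct llama_sampler_i mask_tok0_iface = {
    /*.name   =*/ mask_tok0_name,
    /*.accept =*/ NULL,
    /*.apply  =*/ mask_tok0_apply,
    /*.reset  =*/ NULL,
    /*.clone  =*/ NULL,
    /*.free   =*/ NULL,
};

// usage (illustrative):
//   struct llama_sampler smpl = { &mask_tok0_iface, /*ctx =*/ NULL };
//   llama_sampler_apply(&smpl, &cur_p);
```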
//
|
||||
// Sampling functions
|
||||
//
|
||||
// llama_sampler_chain
|
||||
// a type of llama_sampler that can chain multiple samplers one after another
|
||||
|
||||
// Sets the current rng seed.
|
||||
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
|
||||
|
||||
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
||||
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
||||
LLAMA_API void llama_sample_repetition_penalties(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const llama_token * last_tokens,
|
||||
size_t penalty_last_n,
|
||||
float penalty_repeat,
|
||||
float penalty_freq,
|
||||
float penalty_present);
|
||||
// important: takes ownership of the sampler object and will free it when llama_sampler_free is called
|
||||
LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
|
||||
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
|
||||
|
||||
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
||||
/// @param logits Logits extracted from the original generation context.
|
||||
/// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
||||
/// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
||||
LLAMA_API void llama_sample_apply_guidance(
|
||||
struct llama_context * ctx,
|
||||
float * logits,
|
||||
float * logits_guidance,
|
||||
float scale);
|
||||
// available samplers:
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
|
||||
|
||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||
LLAMA_API void llama_sample_softmax(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
|
||||
|
||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API void llama_sample_top_k(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
int32_t k,
|
||||
size_t min_keep);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
|
||||
|
||||
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API void llama_sample_top_p(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float p,
|
||||
size_t min_keep);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);
|
||||
|
||||
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
|
||||
LLAMA_API void llama_sample_min_p(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float p,
|
||||
size_t min_keep);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
|
||||
|
||||
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||
LLAMA_API void llama_sample_tail_free(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float z,
|
||||
size_t min_keep);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
|
||||
|
||||
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
||||
LLAMA_API void llama_sample_typical(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float p,
|
||||
size_t min_keep);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
|
||||
|
||||
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
|
||||
LLAMA_API void llama_sample_entropy(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates_p,
|
||||
float min_temp,
|
||||
float max_temp,
|
||||
float exponent_val,
|
||||
float smoothing_factor);
|
||||
|
||||
LLAMA_API void llama_sample_temp(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
float temp,
|
||||
float smoothing_factor);
|
||||
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
|
||||
|
||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||
|
@ -1135,36 +1091,57 @@ extern "C" {
|
|||
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
|
||||
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
||||
LLAMA_API llama_token llama_sample_token_mirostat(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
|
||||
int32_t n_vocab,
|
||||
uint32_t seed,
|
||||
float tau,
|
||||
float eta,
|
||||
int32_t m,
|
||||
float * mu);
|
||||
int32_t m);
|
||||
|
||||
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
|
||||
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
|
||||
LLAMA_API llama_token llama_sample_token_mirostat_v2(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
|
||||
uint32_t seed,
|
||||
float tau,
|
||||
float eta,
|
||||
float * mu);
|
||||
float eta);
|
||||
|
||||
/// @details Selects the token with the highest probability.
|
||||
/// Does not compute the token probabilities. Use llama_sample_softmax() instead.
|
||||
LLAMA_API llama_token llama_sample_token_greedy(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
|
||||
const struct llama_model * model,
|
||||
const char * grammar_str,
|
||||
const char * grammar_root);
|
||||
|
||||
/// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
|
||||
LLAMA_API llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
|
||||
int32_t n_vocab, // llama_n_vocab()
|
||||
llama_token special_eos_id, // llama_token_eos()
|
||||
llama_token linefeed_id, // llama_token_nl()
|
||||
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat, // 1.0 = disabled
|
||||
float penalty_freq, // 0.0 = disabled
|
||||
float penalty_present, // 0.0 = disabled
|
||||
bool penalize_nl, // consider newlines as a repeatable token
|
||||
bool ignore_eos); // ignore the end-of-sequence token
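// A hedged usage sketch (the numeric values are common illustrative settings, not recommendations,
// and `model` is assumed to be a loaded llama_model):
//
//    struct llama_sampler * pen = llama_sampler_init_penalties(
//        llama_n_vocab(model), llama_token_eos(model), llama_token_nl(model),
//        /*penalty_last_n*/ 64, /*penalty_repeat*/ 1.1f,
//        /*penalty_freq*/ 0.0f, /*penalty_present*/ 0.0f,
//        /*penalize_nl*/ false, /*ignore_eos*/ false);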
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
|
||||
int32_t n_vocab,
|
||||
int32_t n_logit_bias,
|
||||
const llama_logit_bias * logit_bias);
|
||||
|
||||
// Shorthand for:
|
||||
//
|
||||
// const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
// llama_token_data_array cur_p = { ... init from logits ... };
|
||||
// llama_sampler_apply(smpl, &cur_p);
|
||||
// return cur_p.data[cur_p.selected].id;
|
||||
//
|
||||
// At this point, this is mostly a convenience function.
|
||||
//
|
||||
LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
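// A minimal end-to-end sketch, assuming the sampler-chain helpers from the same API revision
// (llama_sampler_chain_default_params, llama_sampler_chain_init, llama_sampler_chain_add,
// llama_sampler_accept) and a `ctx` whose last decode produced output logits:
//
//    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
//    llama_sampler_chain_add(chain, llama_sampler_init_mirostat_v2(/*seed*/ 1234, /*tau*/ 5.0f, /*eta*/ 0.1f));
//    llama_token tok = llama_sampler_sample(chain, ctx, /*idx*/ 0);  // idx selects which output logits to sample from
//    llama_sampler_accept(chain, tok);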
|
||||
|
||||
// TODO: extend in the future
|
||||
//LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
|
||||
|
||||
//
|
||||
// Model split
|
||||
|
@ -1180,12 +1157,6 @@ extern "C" {
|
|||
// Returns the split_prefix length.
|
||||
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
|
||||
|
||||
// Performance information
|
||||
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
||||
|
||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
||||
|
||||
// Print system information
|
||||
LLAMA_API const char * llama_print_system_info(void);
|
||||
|
||||
|
@ -1193,65 +1164,24 @@ extern "C" {
|
|||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
//
|
||||
// Performance utils
|
||||
//
|
||||
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
|
||||
//
|
||||
|
||||
enum llama_perf_type {
|
||||
LLAMA_PERF_TYPE_CONTEXT = 0,
|
||||
LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
|
||||
};
|
||||
|
||||
LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
|
||||
LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
|
||||
|
||||
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
//#ifdef LLAMA_API_INTERNAL
|
||||
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct ggml_tensor;
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
);
|
||||
|
||||
struct llama_partial_utf8 {
|
||||
uint32_t value; // bit value so far (unshifted)
|
||||
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
||||
};
|
||||
|
||||
struct llama_grammar_candidate {
|
||||
size_t index;
|
||||
const uint32_t * code_points;
|
||||
llama_partial_utf8 partial_utf8;
|
||||
};
|
||||
|
||||
using llama_grammar_rule = std::vector< llama_grammar_element>;
|
||||
using llama_grammar_stack = std::vector<const llama_grammar_element *>;
|
||||
|
||||
using llama_grammar_rules = std::vector<llama_grammar_rule>;
|
||||
using llama_grammar_stacks = std::vector<llama_grammar_stack>;
|
||||
using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
|
||||
|
||||
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
|
||||
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
|
||||
|
||||
void llama_grammar_accept(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stacks & stacks,
|
||||
const uint32_t chr,
|
||||
llama_grammar_stacks & new_stacks);
|
||||
|
||||
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stack & stack,
|
||||
const llama_grammar_candidates & candidates);
|
||||
|
||||
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
const std::string & src,
|
||||
llama_partial_utf8 partial_start);
|
||||
|
||||
// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
|
||||
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
|
||||
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
|
||||
|
||||
//#endif // LLAMA_API_INTERNAL
|
||||
|
||||
#endif // LLAMA_H
|
||||
|
|
|
@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
|
|||
-->
|
||||
|
||||
<script>
|
||||
const LITEVER = 174;
|
||||
const LITEVER = 175;
|
||||
const urlParams = new URLSearchParams(window.location.search);
|
||||
var localflag = true;
|
||||
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
|
||||
|
@ -9837,6 +9837,7 @@ Current version indicated by LITEVER below.
|
|||
document.getElementById("miro_type").value = 0;
|
||||
document.getElementById("dry_multiplier").value = 0;
|
||||
document.getElementById("xtc_probability").value = 0;
|
||||
document.getElementById("sampler_seed").value = -1;
|
||||
document.getElementById("rep_pen").value = document.getElementById("rep_pen_slide").value = found.rep_pen;
|
||||
document.getElementById("rep_pen_range").value = found.rep_pen_range;
|
||||
document.getElementById("rep_pen_slope").value = found.rep_pen_slope;
|
||||
|
@ -9925,6 +9926,7 @@ Current version indicated by LITEVER below.
|
|||
document.getElementById("miro_type").value != 0 ||
|
||||
document.getElementById("dry_multiplier").value != 0 ||
|
||||
document.getElementById("xtc_probability").value != 0 ||
|
||||
document.getElementById("sampler_seed").value != -1 ||
|
||||
document.getElementById("rep_pen").value != found.rep_pen ||
|
||||
document.getElementById("rep_pen_range").value != found.rep_pen_range ||
|
||||
document.getElementById("rep_pen_slope").value != found.rep_pen_slope ||
|
||||
|
|
2 otherarch/sdcpp/thirdparty/README.md (vendored)
|
@ -1,2 +0,0 @@
|
|||
- json.hpp library from: https://github.com/nlohmann/json
|
||||
- ZIP Library from: https://github.com/kuba--/zip
|
|
@ -1,7 +1,6 @@
|
|||
#include "common.h"
|
||||
|
||||
#include "whisper.h"
|
||||
#include "grammar-parser.h"
|
||||
#define DR_WAV_IMPLEMENTATION
|
||||
#include "dr_wav.h"
|
||||
|
||||
|
@ -219,7 +218,7 @@ struct whisper_params {
|
|||
std::vector<std::string> fname_inp = {};
|
||||
std::vector<std::string> fname_out = {};
|
||||
|
||||
grammar_parser::parse_state grammar_parsed;
|
||||
//grammar_parser::parse_state grammar_parsed;
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
|
@ -1143,28 +1142,28 @@ int main(int argc, char ** argv) {
|
|||
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
|
||||
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
|
||||
|
||||
if (!params.grammar.empty()) {
|
||||
auto & grammar = params.grammar_parsed;
|
||||
if (is_file_exist(params.grammar.c_str())) {
|
||||
// read grammar from file
|
||||
std::ifstream ifs(params.grammar.c_str());
|
||||
const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
|
||||
grammar = grammar_parser::parse(txt.c_str());
|
||||
} else {
|
||||
// read grammar from string
|
||||
grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
}
|
||||
// if (!params.grammar.empty()) {
|
||||
// auto & grammar = params.grammar_parsed;
|
||||
// if (is_file_exist(params.grammar.c_str())) {
|
||||
// // read grammar from file
|
||||
// std::ifstream ifs(params.grammar.c_str());
|
||||
// const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
|
||||
// grammar = grammar_parser::parse(txt.c_str());
|
||||
// } else {
|
||||
// // read grammar from string
|
||||
// grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// }
|
||||
|
||||
// will be empty (default) if there are parse errors
|
||||
if (grammar.rules.empty()) {
|
||||
fprintf(stderr, "error: failed to parse grammar \"%s\"\n", params.grammar.c_str());
|
||||
return 4;
|
||||
} else {
|
||||
fprintf(stderr, "%s: grammar:\n", __func__);
|
||||
grammar_parser::print_grammar(stderr, grammar);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
// // will be empty (default) if there are parse errors
|
||||
// if (grammar.rules.empty()) {
|
||||
// fprintf(stderr, "error: failed to parse grammar \"%s\"\n", params.grammar.c_str());
|
||||
// return 4;
|
||||
// } else {
|
||||
// fprintf(stderr, "%s: grammar:\n", __func__);
|
||||
// grammar_parser::print_grammar(stderr, grammar);
|
||||
// fprintf(stderr, "\n");
|
||||
// }
|
||||
// }
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
|
@ -1212,7 +1211,7 @@ int main(int argc, char ** argv) {
|
|||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
const bool use_grammar = (!params.grammar_parsed.rules.empty() && !params.grammar_rule.empty());
|
||||
const bool use_grammar = false;// (!params.grammar_parsed.rules.empty() && !params.grammar_rule.empty());
|
||||
wparams.strategy = (params.beam_size > 1 || use_grammar) ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
||||
|
||||
wparams.print_realtime = false;
|
||||
|
@ -1255,7 +1254,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

|
||||
|
||||
const auto & grammar_parsed = params.grammar_parsed;
|
||||
//const auto & grammar_parsed = params.grammar_parsed;
|
||||
// auto grammar_rules = grammar_parsed.c_rules();
|
||||
|
||||
// if (use_grammar) {
|
||||
|
|
|
@ -3,11 +3,31 @@
|
|||
#include "llama-vocab.h"
|
||||
#include "llama-sampling.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
|
||||
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
||||
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
||||
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
//
|
||||
// helpers
|
||||
//
|
||||
|
||||
// NOTE: assumes valid utf8 (but checks for overrun)
|
||||
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t first_byte = static_cast<uint8_t>(*src);
|
||||
uint8_t highbits = first_byte >> 4;
|
||||
int len = lookup[highbits];
|
||||
uint8_t mask = (1 << (8 - len)) - 1;
|
||||
uint32_t value = first_byte & mask;
|
||||
const char * end = src + len; // may overrun!
|
||||
const char * pos = src + 1;
|
||||
for ( ; pos < end && *pos; pos++) {
|
||||
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
||||
}
|
||||
return std::make_pair(value, pos);
|
||||
}
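// Worked example (for orientation only): the two-byte sequence 0xC3 0xA9 ("é") has highbits 0xC,
// so the lookup gives len = 2 and mask = 0x3F; value starts as 0xC3 & 0x3F = 0x03 and, after folding
// in the continuation byte, becomes (0x03 << 6) + (0xA9 & 0x3F) = 0xE9, i.e. code point U+00E9.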
|
||||
|
||||
static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
const std::string & src,
|
||||
llama_partial_utf8 partial_start) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
||||
|
@ -67,12 +87,510 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
|||
return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
|
||||
}
|
||||
|
||||
const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
|
||||
return grammar->rules;
|
||||
static bool is_digit_char(char c) {
|
||||
return '0' <= c && c <= '9';
|
||||
}
|
||||
|
||||
llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
|
||||
return grammar->stacks;
|
||||
static bool is_word_char(char c) {
|
||||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
const char * pos = src;
|
||||
const char * end = src + size;
|
||||
uint32_t value = 0;
|
||||
for ( ; pos < end && *pos; pos++) {
|
||||
value <<= 4;
|
||||
char c = *pos;
|
||||
if ('a' <= c && c <= 'f') {
|
||||
value += c - 'a' + 10;
|
||||
} else if ('A' <= c && c <= 'F') {
|
||||
value += c - 'A' + 10;
|
||||
} else if ('0' <= c && c <= '9') {
|
||||
value += c - '0';
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (pos != end) {
|
||||
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
|
||||
}
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
static const char * parse_space(const char * src, bool newline_ok) {
|
||||
const char * pos = src;
|
||||
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
|
||||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
|
||||
if (*pos == '#') {
|
||||
while (*pos && *pos != '\r' && *pos != '\n') {
|
||||
pos++;
|
||||
}
|
||||
} else {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_name(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_word_char(*pos)) {
|
||||
pos++;
|
||||
}
|
||||
if (pos == src) {
|
||||
throw std::runtime_error(std::string("expecting name at ") + src);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_int(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_digit_char(*pos)) {
|
||||
pos++;
|
||||
}
|
||||
if (pos == src) {
|
||||
throw std::runtime_error(std::string("expecting integer at ") + src);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
if (*src == '\\') {
|
||||
switch (src[1]) {
|
||||
case 'x': return parse_hex(src + 2, 2);
|
||||
case 'u': return parse_hex(src + 2, 4);
|
||||
case 'U': return parse_hex(src + 2, 8);
|
||||
case 't': return std::make_pair('\t', src + 2);
|
||||
case 'r': return std::make_pair('\r', src + 2);
|
||||
case 'n': return std::make_pair('\n', src + 2);
|
||||
case '\\':
|
||||
case '"':
|
||||
case '[':
|
||||
case ']':
|
||||
return std::make_pair(src[1], src + 2);
|
||||
default:
|
||||
throw std::runtime_error(std::string("unknown escape at ") + src);
|
||||
}
|
||||
} else if (*src) {
|
||||
return decode_utf8(src);
|
||||
}
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
} else {
|
||||
// cop out of encoding UTF-8
|
||||
fprintf(file, "<U+%04X>", c);
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_char_element(llama_grammar_element elem) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_CHAR: return true;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: return true;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: return true;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: return true;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
||||
for (auto elem : rule) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
||||
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
|
||||
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
|
||||
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
||||
}
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
case LLAMA_GRETYPE_ALT:
|
||||
case LLAMA_GRETYPE_RULE_REF:
|
||||
fprintf(file, "(%u) ", elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, "(\"");
|
||||
print_grammar_char(file, elem.value);
|
||||
fprintf(file, "\") ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
static void print_rule(
|
||||
FILE * file,
|
||||
uint32_t rule_id,
|
||||
const llama_grammar_rule & rule,
|
||||
const std::map<uint32_t, std::string> & symbol_id_names) {
|
||||
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
|
||||
throw std::runtime_error(
|
||||
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
|
||||
}
|
||||
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
|
||||
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
|
||||
llama_grammar_element elem = rule[i];
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
throw std::runtime_error(
|
||||
"unexpected end of rule: " + std::to_string(rule_id) + "," +
|
||||
std::to_string(i));
|
||||
case LLAMA_GRETYPE_ALT:
|
||||
fprintf(file, "| ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_RULE_REF:
|
||||
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
fprintf(file, "[");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
fprintf(file, "[^");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
if (i == 0 || !is_char_element(rule[i - 1])) {
|
||||
throw std::runtime_error(
|
||||
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
|
||||
std::to_string(rule_id) + "," + std::to_string(i));
|
||||
}
|
||||
fprintf(file, "-");
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
if (i == 0 || !is_char_element(rule[i - 1])) {
|
||||
throw std::runtime_error(
|
||||
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
|
||||
std::to_string(rule_id) + "," + std::to_string(i));
|
||||
}
|
||||
print_grammar_char(file, elem.value);
|
||||
break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, ".");
|
||||
break;
|
||||
}
|
||||
if (is_char_element(elem)) {
|
||||
switch (rule[i + 1].type) {
|
||||
case LLAMA_GRETYPE_CHAR_ALT:
|
||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
break;
|
||||
default:
|
||||
fprintf(file, "] ");
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
//
|
||||
// implementation
|
||||
//
|
||||
|
||||
uint32_t llama_grammar_parser::get_symbol_id(const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(symbol_ids.size());
|
||||
auto result = symbol_ids.emplace(std::string(src, len), next_id);
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
uint32_t llama_grammar_parser::generate_symbol_id(const std::string & base_name) {
|
||||
uint32_t next_id = static_cast<uint32_t>(symbol_ids.size());
|
||||
symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
|
||||
return next_id;
|
||||
}
|
||||
|
||||
void llama_grammar_parser::add_rule(uint32_t rule_id, const llama_grammar_rule & rule) {
|
||||
if (rules.size() <= rule_id) {
|
||||
rules.resize(rule_id + 1);
|
||||
}
|
||||
rules[rule_id] = rule;
|
||||
}
|
||||
|
||||
const char * llama_grammar_parser::parse_alternates(
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
uint32_t rule_id,
|
||||
bool is_nested) {
|
||||
llama_grammar_rule rule;
|
||||
const char * pos = parse_sequence(src, rule_name, rule, is_nested);
|
||||
while (*pos == '|') {
|
||||
rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||
pos = parse_space(pos + 1, true);
|
||||
pos = parse_sequence(pos, rule_name, rule, is_nested);
|
||||
}
|
||||
rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||
add_rule(rule_id, rule);
|
||||
return pos;
|
||||
}
|
||||
|
||||
const char * llama_grammar_parser::parse_sequence(
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
llama_grammar_rule & rule,
|
||||
bool is_nested) {
|
||||
size_t last_sym_start = rule.size();
|
||||
const char * pos = src;
|
||||
|
||||
auto handle_repetitions = [&](int min_times, int max_times) {
|
||||
|
||||
if (last_sym_start == rule.size()) {
|
||||
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
|
||||
}
|
||||
|
||||
// apply transformation to previous symbol (last_sym_start to end) according to
|
||||
// the following rewrite rules:
|
||||
// S{m,n} --> S S S (m times) S'(n-m)
|
||||
// S'(x) ::= S S'(x-1) |
|
||||
// (... n-m definitions of these S' rules ...)
|
||||
// S'(1) ::= S |
|
||||
// S{m,} --> S S S (m times) S'
|
||||
// S' ::= S S' |
|
||||
// S* --> S{0,}
|
||||
// --> S' ::= S S' |
|
||||
// S+ --> S{1,}
|
||||
// --> S S'
|
||||
// S' ::= S S' |
|
||||
// S? --> S{0,1}
|
||||
// --> S'
|
||||
// S' ::= S |
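// Concrete example (with illustrative rule names): "a"{2,4} becomes  "a" "a" a_2  where
//     a_2 ::= "a" a_1 |
//     a_1 ::= "a" |
// i.e. two mandatory copies of the item followed by up to two optional ones.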
|
||||
|
||||
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
|
||||
if (min_times == 0) {
|
||||
rule.resize(last_sym_start);
|
||||
} else {
|
||||
// Repeat the previous elements (min_times - 1) times
|
||||
for (int i = 1; i < min_times; i++) {
|
||||
rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t last_rec_rule_id = 0;
|
||||
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
|
||||
|
||||
llama_grammar_rule rec_rule(prev_rule);
|
||||
for (int i = 0; i < n_opt; i++) {
|
||||
rec_rule.resize(prev_rule.size());
|
||||
uint32_t rec_rule_id = generate_symbol_id( rule_name);
|
||||
if (i > 0 || max_times < 0) {
|
||||
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
|
||||
}
|
||||
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
||||
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
|
||||
add_rule( rec_rule_id, rec_rule);
|
||||
last_rec_rule_id = rec_rule_id;
|
||||
}
|
||||
if (n_opt > 0) {
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
||||
}
|
||||
};
|
||||
|
||||
while (*pos) {
|
||||
if (*pos == '"') { // literal string
|
||||
pos++;
|
||||
last_sym_start = rule.size();
|
||||
while (*pos != '"') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '[') { // char range(s)
|
||||
pos++;
|
||||
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
|
||||
if (*pos == '^') {
|
||||
pos++;
|
||||
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
||||
}
|
||||
last_sym_start = rule.size();
|
||||
while (*pos != ']') {
|
||||
if (!*pos) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto char_pair = parse_char(pos);
|
||||
pos = char_pair.second;
|
||||
enum llama_gretype type = last_sym_start < rule.size()
|
||||
? LLAMA_GRETYPE_CHAR_ALT
|
||||
: start_type;
|
||||
|
||||
rule.push_back({type, char_pair.first});
|
||||
if (pos[0] == '-' && pos[1] != ']') {
|
||||
if (!pos[1]) {
|
||||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
auto endchar_pair = parse_char(pos + 1);
|
||||
pos = endchar_pair.second;
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
|
||||
}
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (is_word_char(*pos)) { // rule reference
|
||||
const char * name_end = parse_name(pos);
|
||||
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
||||
pos = parse_space(name_end, is_nested);
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
||||
} else if (*pos == '(') { // grouping
|
||||
// parse nested alternates into synthesized rule
|
||||
pos = parse_space(pos + 1, true);
|
||||
uint32_t sub_rule_id = generate_symbol_id(rule_name);
|
||||
pos = parse_alternates(pos, rule_name, sub_rule_id, true);
|
||||
last_sym_start = rule.size();
|
||||
// output reference to synthesized rule
|
||||
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
||||
if (*pos != ')') {
|
||||
throw std::runtime_error(std::string("expecting ')' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '.') { // any char
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '*') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, -1);
|
||||
} else if (*pos == '+') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(1, -1);
|
||||
} else if (*pos == '?') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
handle_repetitions(0, 1);
|
||||
} else if (*pos == '{') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (!is_digit_char(*pos)) {
|
||||
throw std::runtime_error(std::string("expecting an int at ") + pos);
|
||||
}
|
||||
const char * int_end = parse_int(pos);
|
||||
int min_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
|
||||
int max_times = -1;
|
||||
|
||||
if (*pos == '}') {
|
||||
max_times = min_times;
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == ',') {
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
|
||||
if (is_digit_char(*pos)) {
|
||||
const char * int_end = parse_int(pos);
|
||||
max_times = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
}
|
||||
|
||||
if (*pos != '}') {
|
||||
throw std::runtime_error(std::string("expecting '}' at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else {
|
||||
throw std::runtime_error(std::string("expecting ',' at ") + pos);
|
||||
}
|
||||
handle_repetitions(min_times, max_times);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
const char * llama_grammar_parser::parse_rule(const char * src) {
|
||||
const char * name_end = parse_name(src);
|
||||
const char * pos = parse_space(name_end, false);
|
||||
size_t name_len = name_end - src;
|
||||
uint32_t rule_id = get_symbol_id(src, name_len);
|
||||
const std::string name(src, name_len);
|
||||
|
||||
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
|
||||
throw std::runtime_error(std::string("expecting ::= at ") + pos);
|
||||
}
|
||||
pos = parse_space(pos + 3, true);
|
||||
|
||||
pos = parse_alternates(pos, name, rule_id, false);
|
||||
|
||||
if (*pos == '\r') {
|
||||
pos += pos[1] == '\n' ? 2 : 1;
|
||||
} else if (*pos == '\n') {
|
||||
pos++;
|
||||
} else if (*pos) {
|
||||
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
|
||||
}
|
||||
return parse_space(pos, true);
|
||||
}
|
||||
|
||||
bool llama_grammar_parser::parse(const char * src) {
|
||||
try {
|
||||
const char * pos = parse_space(src, true);
|
||||
while (*pos) {
|
||||
pos = parse_rule(pos);
|
||||
}
|
||||
// Validate the state to ensure that all rules are defined
|
||||
for (const auto & rule : rules) {
|
||||
if (rule.empty()) {
|
||||
throw std::runtime_error("Undefined rule");
|
||||
}
|
||||
for (const auto & elem : rule) {
|
||||
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
|
||||
// Ensure that the rule at that location exists
|
||||
if (elem.value >= rules.size() || rules[elem.value].empty()) {
|
||||
// Get the name of the rule that is missing
|
||||
for (const auto & kv : symbol_ids) {
|
||||
if (kv.second == elem.value) {
|
||||
throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
|
||||
rules.clear();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void llama_grammar_parser::print(FILE * file) {
|
||||
try {
|
||||
std::map<uint32_t, std::string> symbol_id_names;
|
||||
for (const auto & kv : symbol_ids) {
|
||||
symbol_id_names[kv.second] = kv.first;
|
||||
}
|
||||
for (size_t i = 0, end = rules.size(); i < end; i++) {
|
||||
// fprintf(file, "%zu: ", i);
|
||||
// print_rule_binary(file, rules[i]);
|
||||
print_rule(file, uint32_t(i), rules[i], symbol_id_names);
|
||||
// fprintf(file, "\n");
|
||||
}
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
|
||||
}
|
||||
}
|
||||
|
||||
llama_grammar_stack llama_grammar_parser::c_rules() const {
|
||||
llama_grammar_stack ret;
|
||||
ret.reserve(rules.size());
|
||||
for (const auto & rule : rules) {
|
||||
ret.push_back(rule.data());
|
||||
}
|
||||
return ret;
|
||||
}
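// A minimal usage sketch of the parser (the grammar string is only an illustrative example):
//
//    llama_grammar_parser parser;
//    if (parser.parse("root ::= \"yes\" | \"no\"")) {
//        parser.print(stderr);                         // dump the parsed rules in a GBNF-like form
//        llama_grammar_stack flat = parser.c_rules();  // flat element-pointer view, as consumed by llama_grammar_init_impl
//    }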
|
||||
|
||||
// returns true iff pos points to the end of one of the definitions of a rule
|
||||
|
@ -89,7 +607,6 @@ static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos)
|
|||
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
||||
const llama_grammar_element * pos,
|
||||
const uint32_t chr) {
|
||||
|
||||
bool found = false;
|
||||
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
|
||||
|
||||
|
@ -225,36 +742,6 @@ static void llama_grammar_advance_stack(
|
|||
}
|
||||
}
|
||||
|
||||
// takes a set of possible pushdown stacks on a grammar, which are required to
|
||||
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
||||
// produces the N possible stacks if the given char is accepted at those
|
||||
// positions
|
||||
void llama_grammar_accept(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stacks & stacks,
|
||||
const uint32_t chr,
|
||||
llama_grammar_stacks & new_stacks) {
|
||||
new_stacks.clear();
|
||||
|
||||
for (const auto & stack : stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(stack.back(), chr);
|
||||
if (match.first) {
|
||||
const llama_grammar_element * pos = match.second;
|
||||
|
||||
// update top of stack to next element, if any
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
new_stack.push_back(pos);
|
||||
}
|
||||
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static llama_grammar_candidates llama_grammar_reject_candidates(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stacks & stacks,
|
||||
|
@ -270,9 +757,98 @@ static llama_grammar_candidates llama_grammar_reject_candidates(
|
|||
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
|
||||
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
|
||||
}
|
||||
|
||||
return rejects;
|
||||
}
|
||||
|
||||
static bool llama_grammar_detect_left_recursion(
|
||||
const llama_grammar_rules & rules,
|
||||
size_t rule_index,
|
||||
std::vector<bool> * rules_visited,
|
||||
std::vector<bool> * rules_in_progress,
|
||||
std::vector<bool> * rules_may_be_empty) {
|
||||
if ((*rules_in_progress)[rule_index]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = true;
|
||||
|
||||
const llama_grammar_rule & rule = rules[rule_index];
|
||||
|
||||
// First check if the rule might produce the empty string. This could be done combined with the second
|
||||
// step but it's more readable as two steps.
|
||||
bool at_rule_start = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
if (at_rule_start) {
|
||||
(*rules_may_be_empty)[rule_index] = true;
|
||||
break;
|
||||
}
|
||||
at_rule_start = true;
|
||||
} else {
|
||||
at_rule_start = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
|
||||
// be empty)
|
||||
bool recurse_into_nonterminal = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
|
||||
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
|
||||
return true;
|
||||
}
|
||||
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
recurse_into_nonterminal = true;
|
||||
} else {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = false;
|
||||
(*rules_visited)[rule_index] = true;
|
||||
|
||||
return false;
|
||||
}
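// Example of what this rejects (illustrative): "expr ::= expr \"+\" term | term" is directly
// left-recursive, and a pair like "a ::= b ..." with "b ::= a ..." is indirectly left-recursive;
// either would let a stack grow without consuming any character, so grammar construction fails instead.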
|
||||
|
||||
const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
|
||||
return grammar->rules;
|
||||
}
|
||||
|
||||
llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
|
||||
return grammar->stacks;
|
||||
}
|
||||
|
||||
void llama_grammar_accept(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stacks & stacks,
|
||||
const uint32_t chr,
|
||||
llama_grammar_stacks & stacks_new) {
|
||||
stacks_new.clear();
|
||||
stacks_new.reserve(stacks.size());
|
||||
|
||||
for (const auto & stack : stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(stack.back(), chr);
|
||||
if (match.first) {
|
||||
const llama_grammar_element * pos = match.second;
|
||||
|
||||
// update top of stack to next element, if any
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
new_stack.push_back(pos);
|
||||
}
|
||||
llama_grammar_advance_stack(rules, new_stack, stacks_new);
|
||||
}
|
||||
}
|
||||
}
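// For example (illustrative): with root ::= "ab", after accepting 'a' the single surviving stack is
// left pointing at the 'b' element; any stack whose top element cannot match 'a' is simply dropped.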
|
||||
|
||||
llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stack & stack,
|
||||
|
@ -328,63 +904,10 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
|||
return rejects;
|
||||
}
|
||||
|
||||
static bool llama_grammar_detect_left_recursion(
|
||||
const llama_grammar_rules & rules,
|
||||
size_t rule_index,
|
||||
std::vector<bool> * rules_visited,
|
||||
std::vector<bool> * rules_in_progress,
|
||||
std::vector<bool> * rules_may_be_empty) {
|
||||
if ((*rules_in_progress)[rule_index]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = true;
|
||||
|
||||
const llama_grammar_rule & rule = rules[rule_index];
|
||||
|
||||
// First check if the rule might produce the empty string. This could be done combined with the second
|
||||
// step but it's more readable as two steps.
|
||||
bool at_rule_start = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
if (at_rule_start) {
|
||||
(*rules_may_be_empty)[rule_index] = true;
|
||||
break;
|
||||
}
|
||||
at_rule_start = true;
|
||||
} else {
|
||||
at_rule_start = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
|
||||
// be empty)
|
||||
bool recurse_into_nonterminal = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
|
||||
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
|
||||
return true;
|
||||
}
|
||||
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
recurse_into_nonterminal = true;
|
||||
} else {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = false;
|
||||
(*rules_visited)[rule_index] = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// grammar - external
|
||||
//
|
||||
////////////////////
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index) {
|
||||
|
@ -438,22 +961,104 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
||||
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
||||
return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
|
||||
}
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
|
||||
llama_grammar_parser parser;
|
||||
|
||||
// if there is a grammar, parse it
|
||||
if (!parser.parse(grammar_str)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parser.rules.empty()) {
|
||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Ensure that there is a "root" node.
|
||||
if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
|
||||
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parser.c_rules());
|
||||
|
||||
const size_t n_rules = grammar_rules.size();
|
||||
const size_t start_rule_index = parser.symbol_ids.at(grammar_root);
|
||||
|
||||
const llama_grammar_element * pos;
|
||||
|
||||
// copy rule definitions into vectors
|
||||
llama_grammar_rules vec_rules(n_rules);
|
||||
for (size_t i = 0; i < n_rules; i++) {
|
||||
for (pos = grammar_rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
|
||||
vec_rules[i].push_back(*pos);
|
||||
}
|
||||
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
||||
}
|
||||
|
||||
// Check for left recursion
|
||||
std::vector<bool> rules_visited(n_rules);
|
||||
std::vector<bool> rules_in_progress(n_rules);
|
||||
std::vector<bool> rules_may_be_empty(n_rules);
|
||||
for (size_t i = 0; i < n_rules; i++) {
|
||||
if (rules_visited[i]) {
|
||||
continue;
|
||||
}
|
||||
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
|
||||
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// loop over alternates of start rule to build initial stacks
|
||||
llama_grammar_stacks stacks;
|
||||
pos = vec_rules[start_rule_index].data();
|
||||
do {
|
||||
llama_grammar_stack stack;
|
||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
// if alternate is nonempty, add to stack
|
||||
stack.push_back(pos);
|
||||
}
|
||||
llama_grammar_advance_stack(vec_rules, stack, stacks);
|
||||
while (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
// scan to end of alternate def
|
||||
pos++;
|
||||
}
|
||||
if (pos->type == LLAMA_GRETYPE_ALT) {
|
||||
// there's another alternate def of this rule to process
|
||||
pos++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (true);
|
||||
|
||||
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
||||
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, };
|
||||
}
|
||||
|
||||
void llama_grammar_free_impl(struct llama_grammar * grammar) {
|
||||
if (grammar == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
delete grammar;
|
||||
}
|
||||
|
||||
struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar) {
|
||||
llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
|
||||
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
|
||||
llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, };
|
||||
|
||||
// redirect elements in stacks to point to new rules
|
||||
for (size_t is = 0; is < result->stacks.size(); is++) {
|
||||
for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
|
||||
for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
|
||||
for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
|
||||
if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
|
||||
for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
|
||||
for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
|
||||
if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
|
||||
result->stacks[is][ie] = &result->rules[ir0][ir1];
|
||||
}
|
||||
}
|
||||
|
@ -464,14 +1069,11 @@ struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * gram
|
|||
return result;
|
||||
}
|
||||
|
||||
void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) {
|
||||
GGML_ASSERT(grammar);
|
||||
GGML_ASSERT(vocab);
|
||||
|
||||
int64_t t_start_sample_us = ggml_time_us();
|
||||
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
|
||||
GGML_ASSERT(grammar.vocab != nullptr);
|
||||
|
||||
bool allow_eog = false;
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
for (const auto & stack : grammar.stacks) {
|
||||
if (stack.empty()) {
|
||||
allow_eog = true;
|
||||
break;
|
||||
|
@ -479,40 +1081,38 @@ void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struc
|
|||
}
|
||||
|
||||
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
||||
candidates_decoded.reserve(candidates->size);
|
||||
candidates_decoded.reserve(cur_p->size);
|
||||
|
||||
llama_grammar_candidates candidates_grammar;
|
||||
candidates_grammar.reserve(candidates->size);
|
||||
candidates_grammar.reserve(cur_p->size);
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
const llama_token id = candidates->data[i].id;
|
||||
const std::string & piece = vocab->cache_token_to_piece.at(id);
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
const llama_token id = cur_p->data[i].id;
|
||||
const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
|
||||
|
||||
if (llama_token_is_eog_impl(*vocab, id)) {
|
||||
if (llama_token_is_eog_impl(*grammar.vocab, id)) {
|
||||
if (!allow_eog) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
} else if (piece.empty() || piece[0] == 0) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
|
||||
candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
||||
}
|
||||
}
|
||||
|
||||
const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
|
||||
const auto rejects = llama_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar);
|
||||
for (const auto & reject : rejects) {
|
||||
candidates->data[reject.index].logit = -INFINITY;
|
||||
cur_p->data[reject.index].logit = -INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
|
||||
GGML_ASSERT(grammar.vocab != nullptr);
|
||||
|
||||
void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
if (llama_token_is_eog_impl(*vocab, token)) {
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (llama_token_is_eog_impl(*grammar.vocab, token)) {
|
||||
for (const auto & stack : grammar.stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
}
|
||||
|
@ -520,20 +1120,19 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
const std::string & piece = vocab->cache_token_to_piece.at(token);
|
||||
const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
|
||||
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
||||
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
|
||||
llama_grammar_stacks tmp_new_stacks;
|
||||
llama_grammar_stacks stacks_new;
|
||||
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
|
||||
grammar->stacks = tmp_new_stacks;
|
||||
llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new);
|
||||
grammar.stacks = std::move(stacks_new);
|
||||
}
|
||||
|
||||
grammar->partial_utf8 = decoded.second;
|
||||
GGML_ASSERT(!grammar->stacks.empty());
|
||||
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
grammar.partial_utf8 = decoded.second;
|
||||
GGML_ASSERT(!grammar.stacks.empty());
|
||||
}
|
||||
|
|
|
@ -2,11 +2,115 @@
|
|||
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include <map>
|
||||
|
||||
struct llama_vocab;
|
||||
struct llama_sampling;
|
||||
|
||||
// grammar element type
|
||||
enum llama_gretype {
|
||||
// end of rule definition
|
||||
LLAMA_GRETYPE_END = 0,
|
||||
|
||||
// start of alternate definition for rule
|
||||
LLAMA_GRETYPE_ALT = 1,
|
||||
|
||||
// non-terminal element: reference to rule
|
||||
LLAMA_GRETYPE_RULE_REF = 2,
|
||||
|
||||
// terminal element: character (code point)
|
||||
LLAMA_GRETYPE_CHAR = 3,
|
||||
|
||||
// inverse char(s) ([^a], [^a-b] [^abc])
|
||||
LLAMA_GRETYPE_CHAR_NOT = 4,
|
||||
|
||||
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
|
||||
// be an inclusive range ([a-z])
|
||||
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
|
||||
|
||||
// modifies a preceding LLAMA_GRETYPE_CHAR or
|
||||
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
||||
LLAMA_GRETYPE_CHAR_ALT = 6,
|
||||
|
||||
// any character (.)
|
||||
LLAMA_GRETYPE_CHAR_ANY = 7,
|
||||
};
|
||||
|
||||
typedef struct llama_grammar_element {
|
||||
enum llama_gretype type;
|
||||
uint32_t value; // Unicode code point or rule ID
|
||||
} llama_grammar_element;
|
||||
|
||||
struct llama_partial_utf8 {
|
||||
uint32_t value; // bit value so far (unshifted)
|
||||
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
||||
};
|
||||
|
||||
struct llama_grammar_candidate {
|
||||
size_t index;
|
||||
const uint32_t * code_points;
|
||||
llama_partial_utf8 partial_utf8;
|
||||
};
|
||||
|
||||
using llama_grammar_rule = std::vector< llama_grammar_element>;
|
||||
using llama_grammar_stack = std::vector<const llama_grammar_element *>;
|
||||
|
||||
using llama_grammar_rules = std::vector<llama_grammar_rule>;
|
||||
using llama_grammar_stacks = std::vector<llama_grammar_stack>;
|
||||
using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
|
||||
|
||||
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
|
||||
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
|
||||
|
||||
// takes a set of possible pushdown stacks on a grammar, which are required to
|
||||
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
||||
// produces the N possible stacks if the given char is accepted at those
|
||||
// positions
|
||||
void llama_grammar_accept(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stacks & stacks,
|
||||
uint32_t chr,
|
||||
llama_grammar_stacks & stacks_new);
|
||||
|
||||
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stack & stack,
|
||||
const llama_grammar_candidates & candidates);
|
||||
|
||||
struct llama_grammar_parser {
|
||||
std::map<std::string, uint32_t> symbol_ids;
|
||||
|
||||
llama_grammar_rules rules;
|
||||
|
||||
llama_grammar_stack c_rules() const;
|
||||
|
||||
uint32_t get_symbol_id(const char * src, size_t len);
|
||||
uint32_t generate_symbol_id(const std::string & base_name);
|
||||
|
||||
void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
|
||||
|
||||
const char * parse_alternates(
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
uint32_t rule_id,
|
||||
bool is_nested);
|
||||
|
||||
const char * parse_sequence(
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
llama_grammar_rule & rule,
|
||||
bool is_nested);
|
||||
|
||||
const char * parse_rule(const char * src);
|
||||
|
||||
bool parse(const char * src);
|
||||
void print(FILE * file);
|
||||
};
|
||||
|
||||
struct llama_grammar {
|
||||
const llama_grammar_rules rules;
|
||||
// note: allow null vocab for testing (not great)
|
||||
const llama_vocab * vocab;
|
||||
|
||||
const llama_grammar_rules rules; // TODO: shared ptr
|
||||
llama_grammar_stacks stacks;
|
||||
|
||||
// buffer for partially generated UTF-8 sequence from accepted tokens
|
||||
|
@ -17,23 +121,24 @@ struct llama_grammar {
|
|||
// internal API
|
||||
//
|
||||
|
||||
// note: needed for tests (not great)
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index);
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
|
||||
|
||||
void llama_grammar_free_impl(struct llama_grammar * grammar);
|
||||
|
||||
struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar);
|
||||
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
|
||||
|
||||
void llama_grammar_sample_impl(
|
||||
const struct llama_grammar * grammar,
|
||||
const struct llama_vocab * vocab,
|
||||
const struct llama_sampling * smpl,
|
||||
llama_token_data_array * candidates);
|
||||
// TODO: move the API below as member functions of llama_grammar
|
||||
void llama_grammar_apply_impl(
|
||||
const struct llama_grammar & grammar,
|
||||
llama_token_data_array * cur_p);
|
||||
|
||||
void llama_grammar_accept_token_impl(
|
||||
struct llama_grammar * grammar,
|
||||
const struct llama_vocab * vocab,
|
||||
const struct llama_sampling * smpl,
|
||||
void llama_grammar_accept_impl(
|
||||
struct llama_grammar & grammar,
|
||||
llama_token token);
|
||||
|
|
129 src/llama-impl.h
|
@ -1,8 +1,11 @@
|
|||
#pragma once
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "llama.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
|
@ -29,6 +32,20 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
|||
// helpers
|
||||
//
|
||||
|
||||
struct time_meas {
|
||||
time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
|
||||
|
||||
~time_meas() {
|
||||
if (t_start_us >= 0) {
|
||||
t_acc += ggml_time_us() - t_start_us;
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t t_start_us;
|
||||
|
||||
int64_t & t_acc;
|
||||
};
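// A minimal usage sketch: accumulate the wall-clock time of a scope into a counter.
//
//    int64_t t_total_us = 0;
//    {
//        time_meas tm(t_total_us);  // adds the elapsed microseconds to t_total_us when the scope ends
//        do_work();                 // hypothetical placeholder for the timed work
//    }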
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
if (search.empty()) {
|
||||
return;
|
||||
|
@ -45,3 +62,113 @@ static void replace_all(std::string & s, const std::string & search, const std::
|
|||
builder.append(s, last_pos, std::string::npos);
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
);
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
template<typename T>
|
||||
struct ring_buffer {
|
||||
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
||||
|
||||
T & front() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[first];
|
||||
}
|
||||
|
||||
const T & front() const {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[first];
|
||||
}
|
||||
|
||||
T & back() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[pos];
|
||||
}
|
||||
|
||||
const T & back() const {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[pos];
|
||||
}
|
||||
|
||||
void push_back(const T & value) {
|
||||
if (sz == capacity) {
|
||||
// advance the start when buffer is full
|
||||
first = (first + 1) % capacity;
|
||||
} else {
|
||||
sz++;
|
||||
}
|
||||
data[pos] = value;
|
||||
pos = (pos + 1) % capacity;
|
||||
}
|
||||
|
||||
T pop_front() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
T value = data[first];
|
||||
first = (first + 1) % capacity;
|
||||
sz--;
|
||||
return value;
|
||||
}
|
||||
|
||||
//T & operator[](size_t i) {
|
||||
// if (i >= sz) {
|
||||
// throw std::runtime_error("ring buffer: index out of bounds");
|
||||
// }
|
||||
// return data[(first + i) % capacity];
|
||||
//}
|
||||
|
||||
//const T & at(size_t i) const {
|
||||
// if (i >= sz) {
|
||||
// throw std::runtime_error("ring buffer: index out of bounds");
|
||||
// }
|
||||
// return data[(first + i) % capacity];
|
||||
//}
|
||||
|
||||
const T & rat(size_t i) const {
|
||||
if (i >= sz) {
|
||||
throw std::runtime_error("ring buffer: index out of bounds");
|
||||
}
|
||||
return data[(first + sz - i - 1) % capacity];
|
||||
}
|
||||
|
||||
std::vector<T> to_vector() const {
|
||||
std::vector<T> result;
|
||||
result.reserve(sz);
|
||||
for (size_t i = 0; i < sz; i++) {
|
||||
result.push_back(data[(first + i) % capacity]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void clear() {
|
||||
// here only reset the status of the buffer
|
||||
sz = 0;
|
||||
first = 0;
|
||||
pos = 0;
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return sz == 0;
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return sz;
|
||||
}
|
||||
|
||||
size_t capacity = 0;
|
||||
size_t sz = 0;
|
||||
size_t first = 0;
|
||||
size_t pos = 0;
|
||||
std::vector<T> data;
|
||||
};
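// A minimal usage sketch: a capacity-3 buffer that discards the oldest element once full.
//
//    ring_buffer<int> rb(3);
//    rb.push_back(1); rb.push_back(2); rb.push_back(3);
//    rb.push_back(4);   // buffer is full, so the oldest element (1) is overwritten
//    // now rb.front() == 2, rb.rat(0) == 4 (most recently pushed), rb.size() == 3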
|
||||
|
|
File diff suppressed because it is too large
|
@@ -1,56 +1,39 @@
#pragma once

#include "llama-impl.h"
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?

struct llama_sampling {
    llama_sampling(int32_t n_vocab) : n_vocab(n_vocab) {}
#include "llama-grammar.h"

    std::mt19937 rng;
#include <unordered_map>

    int32_t n_vocab = 0;
struct llama_vocab;
struct llama_grammar;

    mutable int64_t t_sample_us = 0;
    mutable int32_t n_sample = 0;
// sampler chain

    void reset_timings() const {
        t_sample_us = 0;
        n_sample = 0;
    }
struct llama_sampler_chain {
    llama_sampler_chain_params params;

    std::vector<struct llama_sampler *> samplers;

    // timing

    mutable int64_t t_sample_us;

    mutable int32_t n_sample;
};

//
// internal API
//
using llama_token_cnt = std::unordered_map<llama_token, int>;

void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed);

void llama_sample_softmax_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates);
void llama_sample_top_k_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
void llama_sample_top_p_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
void llama_sample_min_p_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep);
void llama_sample_typical_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
void llama_sample_entropy_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val, float smoothing_factor);
void llama_sample_temp_impl     (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp, float smoothing_factor);

void llama_sample_repetition_penalties_impl(
        struct llama_sampling * smpl,
        llama_token_data_array * candidates,
        const llama_token * last_tokens,
        size_t penalty_last_n,
// TODO: tmp exposed until test-sampling is fixed
void llama_sampler_penalties_impl(
        llama_token_data_array * cur_p,
        const llama_token_cnt & token_count,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present);

void llama_sample_apply_guidance_impl(
        struct llama_sampling * smpl,
        float * logits,
        float * logits_guidance,
        float scale);

llama_token llama_sample_token_mirostat_impl   (struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
llama_token llama_sample_token_greedy_impl     (struct llama_sampling * smpl, llama_token_data_array * candidates);
llama_token llama_sample_token_with_rng_impl   (struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng);
llama_token llama_sample_token_impl            (struct llama_sampling * smpl, llama_token_data_array * candidates);

struct llama_sampler * llama_sampler_init_grammar_impl(
        const struct llama_vocab & vocab,
        const char * grammar_str,
        const char * grammar_root);
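As a hedged illustration of the new internal penalties helper declared above, the sketch below shows one way a caller might build the llama_token_cnt frequency map from recently generated tokens. The helper name count_recent_tokens and the recent-token vector are hypothetical; only the typedef and the llama_sampler_penalties_impl signature come from the header.

#include <algorithm>
#include <cstddef>
#include <unordered_map>
#include <vector>

// hypothetical helper: count the last `penalty_last_n` tokens so that the
// repetition penalty and the frequency/presence penalties can share one map
static llama_token_cnt count_recent_tokens(const std::vector<llama_token> & recent, size_t penalty_last_n) {
    llama_token_cnt token_count;
    const size_t n = std::min(penalty_last_n, recent.size());
    for (size_t i = 0; i < n; ++i) {
        token_count[recent[recent.size() - 1 - i]]++;
    }
    return token_count;
}

// hypothetical call site, with cur_p prepared elsewhere:
//   llama_sampler_penalties_impl(cur_p, count_recent_tokens(recent, 64), 1.1f, 0.0f, 0.0f);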
@@ -18,6 +18,8 @@ struct llama_vocab {
        tattr attr;
    };

    uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab

    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

@@ -62,8 +64,6 @@ struct llama_vocab {
    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
};

const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);

//
// internal API
//

@@ -76,6 +76,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
        bool add_special,
        bool parse_special = false);

// TODO: move the API below as member functions of llama_vocab
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);

const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
353
src/llama.cpp
@@ -3192,7 +3192,6 @@ struct llama_sbatch {
struct llama_context {
    llama_context(const llama_model & model)
        : model(model)
        , sampling(llama_n_vocab(&model))
        , t_start_us(model.t_start_us)
        , t_load_us(model.t_load_us) {}

@@ -3209,7 +3208,6 @@ struct llama_context {
    const struct llama_model & model;

    struct llama_cparams cparams;
    struct llama_sampling sampling;
    struct llama_sbatch sbatch;
    struct llama_kv_cache kv_self;
    struct llama_control_vector cvec;

@@ -3230,16 +3228,16 @@ struct llama_context {

    bool has_evaluated_once = false;

    int64_t t_start_us;
    int64_t t_load_us;
    int64_t t_p_eval_us = 0;
    int64_t t_eval_us = 0;
    mutable int64_t t_start_us;
    mutable int64_t t_load_us;
    mutable int64_t t_p_eval_us = 0;
    mutable int64_t t_eval_us = 0;

    int64_t t_compute_start_us = 0;
    int64_t n_queued_tokens = 0;
    mutable int64_t t_compute_start_us = 0;
    mutable int64_t n_queued_tokens = 0;

    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    int32_t n_eval   = 0; // number of eval calls
    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    mutable int32_t n_eval   = 0; // number of eval calls

    // host buffer for the model output (logits and embeddings)
    ggml_backend_buffer_t buf_output = nullptr;
@@ -6291,6 +6289,7 @@ static void llm_load_vocab(

    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);

    vocab.n_vocab = n_vocab;
    vocab.id_to_token.resize(n_vocab);

    for (uint32_t i = 0; i < n_vocab; i++) {

@@ -17968,7 +17967,6 @@ struct llama_model_params llama_model_default_params() {

struct llama_context_params llama_context_default_params() {
    struct llama_context_params result = {
        /*.seed =*/ LLAMA_DEFAULT_SEED,
        /*.n_ctx =*/ 512,
        /*.n_batch =*/ 2048,
        /*.n_ubatch =*/ 512,
@@ -18001,6 +17999,14 @@ struct llama_context_params llama_context_default_params() {
    return result;
}

struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
    };

    return result;
}

struct llama_model_quantize_params llama_model_quantize_default_params() {
    struct llama_model_quantize_params result = {
        /*.nthread =*/ 0,

@@ -18240,10 +18246,6 @@ struct llama_context * llama_new_context_with_model(
        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
    }

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }

    LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
    LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
    LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);

@@ -18254,8 +18256,8 @@ struct llama_context * llama_new_context_with_model(
    ctx->abort_callback      = params.abort_callback;
    ctx->abort_callback_data = params.abort_callback_data;

    ctx->sampling.rng = std::mt19937(params.seed);
    ctx->logits_all = params.logits_all;

    // build worst-case graph for encoder if a model contains encoder
    ctx->is_encoding = llama_model_has_encoder(model);
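For context, a hedged sketch of how the new llama_sampler_chain_default_params helper is meant to be combined with the sampler-chain API from this refactor. llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_greedy and llama_sampler_free are assumed to exist in the upstream public header and are not part of the hunks shown here.

// build a minimal sampler chain with performance counters enabled
struct llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
sparams.no_perf = false; // keep the t_sample_us / n_sample accounting

struct llama_sampler * chain = llama_sampler_chain_init(sparams);   // assumed upstream helper
llama_sampler_chain_add(chain, llama_sampler_init_greedy());        // assumed upstream helper

// ... sample with the chain ...

llama_sampler_free(chain); // assumed upstream helper; releases the attached samplers as well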
@@ -18535,14 +18537,6 @@ void llama_free(struct llama_context * ctx) {
    delete ctx;
}

const struct llama_model * llama_get_model(const struct llama_context * ctx) {
    return &ctx->model;
}

const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
    return &ctx->model.vocab;
}

uint32_t llama_n_ctx(const struct llama_context * ctx) {
    return ctx->cparams.n_ctx;
}

@@ -18563,6 +18557,30 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
    return model->vocab.type;
}

int32_t llama_n_vocab(const struct llama_model * model) {
    return model->hparams.n_vocab;
}

int32_t llama_n_ctx_train(const struct llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_n_layer(const struct llama_model * model) {
    return model->hparams.n_layer;
}

const struct llama_model * llama_get_model(const struct llama_context * ctx) {
    return &ctx->model;
}

enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
    return ctx->cparams.pooling_type;
}

enum llama_rope_type llama_rope_type(const struct llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
@@ -18626,26 +18644,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
    return LLAMA_ROPE_TYPE_NONE;
}

enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
    return ctx->cparams.pooling_type;
}

int32_t llama_n_vocab(const struct llama_model * model) {
    return model->hparams.n_vocab;
}

int32_t llama_n_ctx_train(const struct llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_n_layer(const struct llama_model * model) {
    return model->hparams.n_layer;
}

float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}
@@ -19062,14 +19060,14 @@ struct llama_data_write {
        // TODO: add more model-specific info which should prevent loading the session file if not identical
    }

    void write_rng(const std::mt19937 & rng) {
        std::ostringstream rng_ss;
        rng_ss << rng;
    //void write_rng(const std::mt19937 & rng) {
    //    std::ostringstream rng_ss;
    //    rng_ss << rng;

        const std::string & rng_str = rng_ss.str();
    //    const std::string & rng_str = rng_ss.str();

        write_string(rng_str);
    }
    //    write_string(rng_str);
    //}

    void write_output_ids(struct llama_context * ctx) {
        llama_output_reorder(ctx);
@@ -19289,17 +19287,17 @@ struct llama_data_read {
        // TODO: add more info which needs to be identical but which is not verified otherwise
    }

    void read_rng(std::mt19937 & rng) {
        std::string rng_str;
        read_string(rng_str);
    //void read_rng(std::mt19937 & rng) {
    //    std::string rng_str;
    //    read_string(rng_str);

        std::istringstream rng_ss(rng_str);
        rng_ss >> rng;
    //    std::istringstream rng_ss(rng_str);
    //    rng_ss >> rng;

        if (rng_ss.fail()) {
            throw std::runtime_error("failed to load RNG state");
        }
    }
    //    if (rng_ss.fail()) {
    //        throw std::runtime_error("failed to load RNG state");
    //    }
    //}

    void read_output_ids(struct llama_context * ctx) {
        std::vector<int32_t> output_pos;
@@ -19729,8 +19727,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da
    data_ctx.write_model_info(ctx);

    data_ctx.write_rng(ctx->sampling.rng);

    // copy outputs
    data_ctx.write_output_ids(ctx);
    data_ctx.write_logits(ctx);

@@ -19768,9 +19764,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da
    data_ctx.read_model_info(ctx);

    // set rng
    data_ctx.read_rng(ctx->sampling.rng);

    // set outputs
    data_ctx.read_output_ids(ctx);
    data_ctx.read_logits(ctx);
@@ -20183,8 +20176,9 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
        GGML_ABORT("fatal error");
#endif
#else
        return nullptr;
#endif
    }
}

@@ -20232,8 +20226,9 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
        GGML_ABORT("fatal error");
#endif
#else
        return nullptr;
#endif
    }
}
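Since the two hunks above make llama_get_logits_ith and llama_get_embeddings_ith return nullptr on an invalid index in release builds instead of aborting, callers compiled with NDEBUG should check the result. A minimal sketch, where ctx and the index i come from the caller:

float * logits = llama_get_logits_ith(ctx, i);
if (logits == nullptr) {
    // invalid index, or no output was computed for this position;
    // in a release build this no longer aborts, so handle it here
    return;
}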
@@ -20666,124 +20661,18 @@ int32_t llama_chat_apply_template(
    return res;
}

//
// grammar
//

struct llama_grammar * llama_grammar_init(
        const llama_grammar_element ** rules,
        size_t n_rules,
        size_t start_rule_index) {
    return llama_grammar_init_impl(rules, n_rules, start_rule_index);
}

void llama_grammar_free(struct llama_grammar * grammar) {
    llama_grammar_free_impl(grammar);
}

struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
    return llama_grammar_copy_impl(grammar);
}

void llama_grammar_sample(
        const struct llama_grammar * grammar,
        const struct llama_context * ctx,
        llama_token_data_array * candidates) {
    llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
}

void llama_sample_grammar(
        struct llama_context * ctx,
        llama_token_data_array * candidates,
        const struct llama_grammar * grammar) {
    llama_grammar_sample(grammar, ctx, candidates);
}

void llama_grammar_accept_token(
        struct llama_grammar * grammar,
        struct llama_context * ctx,
        llama_token token) {
    llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token);
}

//
// sampling
//

void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
    llama_set_rng_seed_impl(&ctx->sampling, seed);
// TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
    return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
}

void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
    llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates);
}

void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
    llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
}

void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
}

void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
}

void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
    llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
}

void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
}

void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) {
    llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val, smoothing_factor);
}

void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float smoothing_factor) {
    llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp, smoothing_factor);
}

void llama_sample_repetition_penalties(
        struct llama_context * ctx,
        llama_token_data_array * candidates,
        const llama_token * last_tokens,
        size_t penalty_last_n,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present) {
    llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
}

void llama_sample_apply_guidance(
        struct llama_context * ctx,
        float * logits,
        float * logits_guidance,
        float scale) {
    llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
}

llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
    return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
}

llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
    return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
}

llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
    return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
}

llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
}

llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
}
//
// model split
//

int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
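A hedged sketch of the replacement entry point above: building a grammar-constrained sampler from a GBNF string. The grammar text is illustrative, model comes from the caller, and llama_sampler_free is assumed from the upstream API.

// constrain generation to the strings "yes" or "no"
static const char * k_grammar = "root ::= \"yes\" | \"no\"";

struct llama_sampler * grmr = llama_sampler_init_grammar(model, k_grammar, "root");
if (grmr == nullptr) {
    // the grammar failed to parse
}

// ... add it to a sampler chain or apply it directly, then release it ...
llama_sampler_free(grmr); // assumed upstream helper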
@@ -20809,45 +20698,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
    return 0;
}

struct llama_timings llama_get_timings(struct llama_context * ctx) {
    struct llama_timings result = {
        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
        /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,

        /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
        /*.n_eval   =*/ std::max(1, ctx->n_eval),
    };

    return result;
}

void llama_print_timings(struct llama_context * ctx) {
    const llama_timings timings = llama_get_timings(ctx);

    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
    LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
}

void llama_reset_timings(struct llama_context * ctx) {
    ctx->t_start_us  = ggml_time_us();
    ctx->t_eval_us   = ctx->n_eval = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;

    ctx->sampling.reset_timings();
}

const char * llama_print_system_info(void) {
    static std::string s;
@@ -20876,7 +20726,68 @@ const char * llama_print_system_info(void) {
    return s.c_str();
}

void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
void llama_perf_print(const void * ctx, enum llama_perf_type type) {
    switch (type) {
        case LLAMA_PERF_TYPE_CONTEXT:
            {
                const auto * p = (const struct llama_context *) ctx;

                const double t_start_ms  = 1e-3 * p->t_start_us;
                const double t_end_ms    = 1.00 * ggml_time_ms();
                const double t_load_ms   = 1e-3 * p->t_load_us;
                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
                const double t_eval_ms   = 1e-3 * p->t_eval_us;

                const int32_t n_p_eval = std::max(0, p->n_p_eval);
                const int32_t n_eval   = std::max(1, p->n_eval);

                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
                LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
            } break;
        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
            {
                const auto * smpl = (const struct llama_sampler *) ctx;
                const auto * p    = (const struct llama_sampler_chain *) smpl->ctx;

                const double t_sampler_ms = 1e-3 * p->t_sample_us;

                const int32_t n_sampler = std::max(0, p->n_sample);

                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
            } break;
        default:
            GGML_ABORT("invalid perf type");
    }
}

void llama_perf_reset(void * ctx, enum llama_perf_type type) {
    switch (type) {
        case LLAMA_PERF_TYPE_CONTEXT:
            {
                auto * p = (struct llama_context *) ctx;

                p->t_start_us  = ggml_time_us();
                p->t_eval_us   = p->n_eval = 0;
                p->t_p_eval_us = p->n_p_eval = 0;
            } break;
        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
            {
                auto * smpl = (struct llama_sampler *) ctx;
                auto * p    = (struct llama_sampler_chain *) smpl->ctx;

                p->t_sample_us = p->n_sample = 0;
            } break;
        default:
            GGML_ABORT("invalid perf type");
    }
}

void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
    fprintf(stream, "\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "# Timings #\n");
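Based on the two functions added above, a minimal usage sketch of the new perf API; ctx and the sampler chain smpl are assumed to have been created elsewhere.

// report and then clear the per-context timings
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);

// report and then clear the sampler-chain timings
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
llama_perf_reset(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);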
@@ -20887,21 +20798,15 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
    fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
    fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
            1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
    fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
    fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
    fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->sampling.n_sample);
    fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
    fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
    fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
    fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
    fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
            1.0e6 * ctx->n_eval / ctx->t_eval_us);
    fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
    fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
            1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
}

// For internal test use