diff --git a/.gitignore b/.gitignore
index 4f17b7861..4d4f4336e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ gcovr-report/
 build*/
 out/
 tmp/
+autogen-*.md
 
 models/*
 models-mnt
diff --git a/common/common.cpp b/common/common.cpp
index 29b9b283b..166c7a1de 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -26,6 +26,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include <climits>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -78,41 +79,6 @@
 
 using json = nlohmann::ordered_json;
 
-//
-// Environment variable utils
-//
-
-template<typename T>
-static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
-get_env(std::string name, T & target) {
-    char * value = std::getenv(name.c_str());
-    target = value ? std::string(value) : target;
-}
-
-template<typename T>
-static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
-get_env(std::string name, T & target) {
-    char * value = std::getenv(name.c_str());
-    target = value ? std::stoi(value) : target;
-}
-
-template<typename T>
-static typename std::enable_if<std::is_floating_point<T>::value, void>::type
-get_env(std::string name, T & target) {
-    char * value = std::getenv(name.c_str());
-    target = value ? std::stof(value) : target;
-}
-
-template<typename T>
-static typename std::enable_if<std::is_same<T, bool>::value, void>::type
-get_env(std::string name, T & target) {
-    char * value = std::getenv(name.c_str());
-    if (value) {
-        std::string val(value);
-        target = val == "1" || val == "true";
-    }
-}
-
 //
 // CPU utils
 //
@@ -307,7 +273,33 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 // CLI argument parsing
 //
 
-void gpt_params_handle_model_default(gpt_params & params) {
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+static std::string format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+static void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (params.hf_file.empty()) {
@@ -353,7 +345,47 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
     }
 }
 
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options) {
+    std::string arg;
+    const std::string arg_prefix = "--";
+    gpt_sampler_params & sparams = params.sparams;
+
+    std::unordered_map<std::string, llama_arg *> arg_to_options;
+    for (auto & opt : options) {
+        for (const auto & arg : opt.args) {
+            arg_to_options[arg] = &opt;
+        }
+    }
+
+    // handle environment variables
+    for (auto & opt : options) {
+        std::string value;
+        if (opt.get_value_from_env(value)) {
+            try {
+                if (opt.handler_void && (value == "1" || value == "true")) {
+                    opt.handler_void(params);
+                }
+                if (opt.handler_int) {
+                    opt.handler_int(params, std::stoi(value));
+                }
+                if (opt.handler_string) {
+                    opt.handler_string(params, value);
+                    continue;
+                }
+            } catch (std::exception & e) {
+                throw std::invalid_argument(format(
+                    "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
+            }
+        }
+    }
+
+    // handle command line arguments
+    auto check_arg = [&](int i) {
+        if (i+1 >= argc) {
+            throw std::invalid_argument("expected value for argument");
+        }
+    };
+
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";
 
@@ -361,13 +393,43 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
-        bool invalid_param = false;
-        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
-            throw std::invalid_argument("error: unknown argument: " + arg);
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
         }
-        if (invalid_param) {
-            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        auto opt = *arg_to_options[arg];
+        if (opt.has_value_from_env()) {
+            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
+        }
+        try {
+            if (opt.handler_void) {
+                opt.handler_void(params);
+                continue;
+            }
+
+            // arg with single value
+            check_arg(i);
+            std::string val = argv[++i];
+            if (opt.handler_int) {
+                opt.handler_int(params, std::stoi(val));
+                continue;
+            }
+            if (opt.handler_string) {
+                opt.handler_string(params, val);
+                continue;
+            }
+
+            // arg with 2 values
+            check_arg(i);
+            std::string val2 = argv[++i];
+            if (opt.handler_str_str) {
+                opt.handler_str_str(params, val, val2);
+                continue;
+            }
+        } catch (std::exception & e) {
+            throw std::invalid_argument(format(
+                "error while handling argument \"%s\": %s\n\n"
+                "usage:\n%s\n\nto show complete usage, run with -h",
+                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
         }
     }
 
@@ -382,12 +444,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 
     gpt_params_handle_model_default(params);
 
-    if (params.hf_token.empty()) {
-        get_env("HF_TOKEN", params.hf_token);
-    }
-
-    auto & sparams = params.sparams;
-
     if (params.escape) {
         string_process_escapes(params.prompt);
         string_process_escapes(params.input_prefix);
@@ -409,41 +465,21 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
-void gpt_params_parse_from_env(gpt_params & params) {
-    // we only care about server-related params for now
-    get_env("LLAMA_ARG_MODEL",            params.model);
-    get_env("LLAMA_ARG_MODEL_URL",        params.model_url);
-    get_env("LLAMA_ARG_MODEL_ALIAS",      params.model_alias);
-    get_env("LLAMA_ARG_HF_REPO",          params.hf_repo);
-    get_env("LLAMA_ARG_HF_FILE",          params.hf_file);
-    get_env("LLAMA_ARG_THREADS",          params.cpuparams.n_threads);
-    get_env("LLAMA_ARG_CTX_SIZE",         params.n_ctx);
-    get_env("LLAMA_ARG_N_PARALLEL",       params.n_parallel);
-    get_env("LLAMA_ARG_BATCH",            params.n_batch);
-    get_env("LLAMA_ARG_UBATCH",           params.n_ubatch);
-    get_env("LLAMA_ARG_N_GPU_LAYERS",     params.n_gpu_layers);
-    get_env("LLAMA_ARG_THREADS_HTTP",     params.n_threads_http);
-    get_env("LLAMA_ARG_CHAT_TEMPLATE",    params.chat_template);
-    get_env("LLAMA_ARG_N_PREDICT",        params.n_predict);
-    get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
-    get_env("LLAMA_ARG_ENDPOINT_SLOTS",   params.endpoint_slots);
-    get_env("LLAMA_ARG_EMBEDDINGS",       params.embedding);
-    get_env("LLAMA_ARG_FLASH_ATTN",       params.flash_attn);
-    get_env("LLAMA_ARG_DEFRAG_THOLD",     params.defrag_thold);
-    get_env("LLAMA_ARG_CONT_BATCHING",    params.cont_batching);
-    get_env("LLAMA_ARG_HOST",             params.hostname);
-    get_env("LLAMA_ARG_PORT",             params.port);
-}
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options) {
     const auto params_org = params; // the example can modify the default params
 
     try {
-        if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
+        if (!gpt_params_parse_ex(argc, argv, params, options)) {
             params = params_org;
-            params.usage = true;
             return false;
         }
+        if (params.usage) {
+            gpt_params_print_usage(params, options);
+            if (params.print_usage) {
+                params.print_usage(argc, argv);
+            }
+            exit(0);
+        }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         params = params_org;
@@ -526,1558 +562,1741 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
     return true;
 }
 
-#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
-
-bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    const char split_delim = ',';
-
-    auto & sparams = params.sparams;
-
-    if (arg == "-s" || arg == "--seed") {
-        CHECK_ARG
-        sparams.seed = std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "-t" || arg == "--threads") {
-        CHECK_ARG
-        params.cpuparams.n_threads = std::stoi(argv[i]);
-        if (params.cpuparams.n_threads <= 0) {
-            params.cpuparams.n_threads = std::thread::hardware_concurrency();
-        }
-        return true;
-    }
-    if (arg == "-C" || arg == "--cpu-mask") {
-        CHECK_ARG
-        std::string mask = argv[i];
-        params.cpuparams.mask_valid = true;
-        invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
-        return true;
-    }
-    if (arg == "-Cr" || arg == "--cpu-range") {
-        CHECK_ARG
-        std::string range = argv[i];
-        params.cpuparams.mask_valid = true;
-        invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
-        return true;
-    }
-    if (arg == "--prio") {
-        CHECK_ARG
-        params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "--cpu-strict") {
-        CHECK_ARG
-        params.cpuparams.strict_cpu = std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "--poll") {
-        CHECK_ARG
-        params.cpuparams.poll = std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "-tb" || arg == "--threads-batch") {
-        CHECK_ARG
-        params.cpuparams_batch.n_threads = std::stoi(argv[i]);
-        if (params.cpuparams_batch.n_threads <= 0) {
-            params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
-        }
-        return true;
-    }
-    if (arg == "-Cb" || arg == "--cpu-mask-batch") {
-        CHECK_ARG
-        std::string mask = argv[i];
-        params.cpuparams_batch.mask_valid = true;
-        invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
-        return true;
-    }
-    if (arg == "-Crb" || arg == "--cpu-range_batch") {
-        CHECK_ARG
-        std::string range = argv[i];
-        params.cpuparams_batch.mask_valid = true;
-        invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
-        return true;
-    }
-    if (arg == "--prio-batch") {
-        CHECK_ARG
-        params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "--cpu-strict-batch") {
-        params.cpuparams_batch.strict_cpu = true;
-        return true;
-    }
-    if (arg == "--poll-batch") {
-        CHECK_ARG
-        params.cpuparams_batch.poll = std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "-td" || arg == "--threads-draft") {
-        CHECK_ARG
-        params.draft_cpuparams.n_threads = std::stoi(argv[i]);
-        if (params.draft_cpuparams.n_threads <= 0) {
-            params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
-        }
-        return true;
-    }
-        if (arg == "-Cd" || arg == "--cpu-mask-draft") {
-        CHECK_ARG
-        std::string mask = argv[i];
-        params.draft_cpuparams.mask_valid = true;
-        invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
-        return true;
-    }
-    if (arg == "-Crd" || arg == "--cpu-range-draft") {
-        CHECK_ARG
-        std::string range = argv[i];
-        params.draft_cpuparams.mask_valid = true;
-        invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
-        return true;
-    }
-    if (arg == "--prio-draft") {
-        CHECK_ARG
-        params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "--cpu-strict-draft") {
-        params.draft_cpuparams.strict_cpu = true;
-        return true;
-    }
-    if (arg == "--poll-draft") {
-        CHECK_ARG
-        params.draft_cpuparams.poll = std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "-tbd" || arg == "--threads-batch-draft") {
-        CHECK_ARG
-        params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
-        if (params.draft_cpuparams_batch.n_threads <= 0) {
-            params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
-        }
-        return true;
-    }
-    if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
-        CHECK_ARG
-        std::string range = argv[i];
-        params.draft_cpuparams_batch.mask_valid = true;
-        invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
-        return true;
-    }
-    if (arg == "--prio-batch-draft") {
-        CHECK_ARG
-        params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "--cpu-strict-batch-draft") {
-        params.draft_cpuparams_batch.strict_cpu = true;
-        return true;
-    }
-    if (arg == "--poll-batch-draft") {
-        CHECK_ARG
-        params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
-        return true;
-    }
-    if (arg == "-p" || arg == "--prompt") {
-        CHECK_ARG
-        params.prompt = argv[i];
-        return true;
-    }
-    if (arg == "-e" || arg == "--escape") {
-        params.escape = true;
-        return true;
-    }
-    if (arg == "--no-escape") {
-        params.escape = false;
-        return true;
-    }
-    if (arg == "--prompt-cache") {
-        CHECK_ARG
-        params.path_prompt_cache = argv[i];
-        return true;
-    }
-    if (arg == "--prompt-cache-all") {
-        params.prompt_cache_all = true;
-        return true;
-    }
-    if (arg == "--prompt-cache-ro") {
-        params.prompt_cache_ro = true;
-        return true;
-    }
-    if (arg == "-bf" || arg == "--binary-file") {
-        CHECK_ARG
-        std::ifstream file(argv[i], std::ios::binary);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        // store the external file name in params
-        params.prompt_file = argv[i];
-        std::ostringstream ss;
-        ss << file.rdbuf();
-        params.prompt = ss.str();
-        fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
-        return true;
-    }
-    if (arg == "-f" || arg == "--file") {
-        CHECK_ARG
-        std::ifstream file(argv[i]);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        // store the external file name in params
-        params.prompt_file = argv[i];
-        std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-        if (!params.prompt.empty() && params.prompt.back() == '\n') {
-            params.prompt.pop_back();
-        }
-        return true;
-    }
-    if (arg == "--in-file") {
-        CHECK_ARG
-        std::ifstream file(argv[i]);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        params.in_files.push_back(argv[i]);
-        return true;
-    }
-    if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
-        CHECK_ARG
-        params.n_predict = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--top-k") {
-        CHECK_ARG
-        sparams.top_k = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "-c" || arg == "--ctx-size") {
-        CHECK_ARG
-        params.n_ctx = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--grp-attn-n" || arg == "-gan") {
-        CHECK_ARG
-        params.grp_attn_n = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--grp-attn-w" || arg == "-gaw") {
-        CHECK_ARG
-        params.grp_attn_w = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--rope-freq-base") {
-        CHECK_ARG
-        params.rope_freq_base = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--rope-freq-scale") {
-        CHECK_ARG
-        params.rope_freq_scale = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--rope-scaling") {
-        CHECK_ARG
-        std::string value(argv[i]);
-        /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-        else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-        else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
-        else { invalid_param = true; }
-        return true;
-    }
-    if (arg == "--rope-scale") {
-        CHECK_ARG
-        params.rope_freq_scale = 1.0f / std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--yarn-orig-ctx") {
-        CHECK_ARG
-        params.yarn_orig_ctx = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--yarn-ext-factor") {
-        CHECK_ARG
-        params.yarn_ext_factor = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--yarn-attn-factor") {
-        CHECK_ARG
-        params.yarn_attn_factor = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--yarn-beta-fast") {
-        CHECK_ARG
-        params.yarn_beta_fast = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--yarn-beta-slow") {
-        CHECK_ARG
-        params.yarn_beta_slow = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--pooling") {
-        CHECK_ARG
-        std::string value(argv[i]);
-        /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
-        else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
-        else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
-        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
-        else { invalid_param = true; }
-        return true;
-    }
-    if (arg == "--attention") {
-        CHECK_ARG
-        std::string value(argv[i]);
-        /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
-        else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
-        else { invalid_param = true; }
-        return true;
-    }
-    if (arg == "--defrag-thold" || arg == "-dt") {
-        CHECK_ARG
-        params.defrag_thold = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--samplers") {
-        CHECK_ARG
-        const auto sampler_names = string_split(argv[i], ';');
-        sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
-        return true;
-    }
-    if (arg == "--sampling-seq") {
-        CHECK_ARG
-        sparams.samplers = gpt_sampler_types_from_chars(argv[i]);
-        return true;
-    }
-    if (arg == "--top-p") {
-        CHECK_ARG
-        sparams.top_p = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--min-p") {
-        CHECK_ARG
-        sparams.min_p = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--temp") {
-        CHECK_ARG
-        sparams.temp = std::stof(argv[i]);
-        sparams.temp = std::max(sparams.temp, 0.0f);
-        return true;
-    }
-    if (arg == "--tfs") {
-        CHECK_ARG
-        sparams.tfs_z = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--typical") {
-        CHECK_ARG
-        sparams.typ_p = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--repeat-last-n") {
-        CHECK_ARG
-        sparams.penalty_last_n = std::stoi(argv[i]);
-        sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
-        return true;
-    }
-    if (arg == "--repeat-penalty") {
-        CHECK_ARG
-        sparams.penalty_repeat = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--frequency-penalty") {
-        CHECK_ARG
-        sparams.penalty_freq = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--presence-penalty") {
-        CHECK_ARG
-        sparams.penalty_present = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--dynatemp-range") {
-        CHECK_ARG
-        sparams.dynatemp_range = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--dynatemp-exp") {
-        CHECK_ARG
-        sparams.dynatemp_exponent = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--mirostat") {
-        CHECK_ARG
-        sparams.mirostat = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--mirostat-lr") {
-        CHECK_ARG
-        sparams.mirostat_eta = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "--mirostat-ent") {
-        CHECK_ARG
-        sparams.mirostat_tau = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "-b" || arg == "--batch-size") {
-        CHECK_ARG
-        params.n_batch = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "-ub" || arg == "--ubatch-size") {
-        CHECK_ARG
-        params.n_ubatch = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--keep") {
-        CHECK_ARG
-        params.n_keep = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--draft") {
-        CHECK_ARG
-        params.n_draft = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--chunks") {
-        CHECK_ARG
-        params.n_chunks = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "-np" || arg == "--parallel") {
-        CHECK_ARG
-        params.n_parallel = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "-ns" || arg == "--sequences") {
-        CHECK_ARG
-        params.n_sequences = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--p-split" || arg == "-ps") {
-        CHECK_ARG
-        params.p_split = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "-m" || arg == "--model") {
-        CHECK_ARG
-        params.model = argv[i];
-        return true;
-    }
-    if (arg == "-md" || arg == "--model-draft") {
-        CHECK_ARG
-        params.model_draft = argv[i];
-        return true;
-    }
-    if (arg == "-a" || arg == "--alias") {
-        CHECK_ARG
-        params.model_alias = argv[i];
-        return true;
-    }
-    if (arg == "-mu" || arg == "--model-url") {
-        CHECK_ARG
-        params.model_url = argv[i];
-        return true;
-    }
-    if (arg == "-hft" || arg == "--hf-token") {
-        if (++i >= argc) {
-          invalid_param = true;
-          return true;
-        }
-        params.hf_token = argv[i];
-        return true;
-    }
-    if (arg == "-hfr" || arg == "--hf-repo") {
-        CHECK_ARG
-        params.hf_repo = argv[i];
-        return true;
-    }
-    if (arg == "-hff" || arg == "--hf-file") {
-        CHECK_ARG
-        params.hf_file = argv[i];
-        return true;
-    }
-    if (arg == "--lora") {
-        CHECK_ARG
-        params.lora_adapters.push_back({
-            std::string(argv[i]),
-            1.0,
-        });
-        return true;
-    }
-    if (arg == "--lora-scaled") {
-        CHECK_ARG
-        std::string lora_adapter = argv[i];
-        CHECK_ARG
-        params.lora_adapters.push_back({
-            lora_adapter,
-            std::stof(argv[i]),
-        });
-        return true;
-    }
-    if (arg == "--lora-init-without-apply") {
-        params.lora_init_without_apply = true;
-        return true;
-    }
-    if (arg == "--control-vector") {
-        CHECK_ARG
-        params.control_vectors.push_back({ 1.0f, argv[i], });
-        return true;
-    }
-    if (arg == "--control-vector-scaled") {
-        CHECK_ARG
-        const char* fname = argv[i];
-        CHECK_ARG
-        params.control_vectors.push_back({ std::stof(argv[i]), fname, });
-        return true;
-    }
-    if (arg == "--control-vector-layer-range") {
-        CHECK_ARG
-        params.control_vector_layer_start = std::stoi(argv[i]);
-        CHECK_ARG
-        params.control_vector_layer_end = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--mmproj") {
-        CHECK_ARG
-        params.mmproj = argv[i];
-        return true;
-    }
-    if (arg == "--image") {
-        CHECK_ARG
-        params.image.emplace_back(argv[i]);
-        return true;
-    }
-    if (arg == "-i" || arg == "--interactive") {
-        params.interactive = true;
-        return true;
-    }
-    if (arg == "-sp" || arg == "--special") {
-        params.special = true;
-        return true;
-    }
-    if (arg == "--embedding" || arg == "--embeddings") {
-        params.embedding = true;
-        return true;
-    }
-    if (arg == "--embd-normalize") {
-        CHECK_ARG
-        params.embd_normalize = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--embd-output-format") {
-        CHECK_ARG
-        params.embd_out = argv[i];
-        return true;
-    }
-    if (arg == "--embd-separator") {
-        CHECK_ARG
-        params.embd_sep = argv[i];
-        return true;
-    }
-    if (arg == "-if" || arg == "--interactive-first") {
-        params.interactive_first = true;
-        return true;
-    }
-    if (arg == "-cnv" || arg == "--conversation") {
-        params.conversation = true;
-        return true;
-    }
-    if (arg == "--infill") {
-        params.infill = true;
-        return true;
-    }
-    if (arg == "-dkvc" || arg == "--dump-kv-cache") {
-        params.dump_kv_cache = true;
-        return true;
-    }
-    if (arg == "-nkvo" || arg == "--no-kv-offload") {
-        params.no_kv_offload = true;
-        return true;
-    }
-    if (arg == "-ctk" || arg == "--cache-type-k") {
-        params.cache_type_k = argv[++i];
-        return true;
-    }
-    if (arg == "-ctv" || arg == "--cache-type-v") {
-        params.cache_type_v = argv[++i];
-        return true;
-    }
-    if (arg == "-mli" || arg == "--multiline-input") {
-        params.multiline_input = true;
-        return true;
-    }
-    if (arg == "--simple-io") {
-        params.simple_io = true;
-        return true;
-    }
-    if (arg == "-cb" || arg == "--cont-batching") {
-        params.cont_batching = true;
-        return true;
-    }
-    if (arg == "-nocb" || arg == "--no-cont-batching") {
-        params.cont_batching = false;
-        return true;
-    }
-    if (arg == "-fa" || arg == "--flash-attn") {
-        params.flash_attn = true;
-        return true;
-    }
-    if (arg == "-co" || arg == "--color") {
-        params.use_color = true;
-        return true;
-    }
-    if (arg == "--mlock") {
-        params.use_mlock = true;
-        return true;
-    }
-    if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
-        CHECK_ARG
-        params.n_gpu_layers = std::stoi(argv[i]);
-        if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-        }
-        return true;
-    }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
-        CHECK_ARG
-        params.n_gpu_layers_draft = std::stoi(argv[i]);
-        if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-        }
-        return true;
-    }
-    if (arg == "--main-gpu" || arg == "-mg") {
-        CHECK_ARG
-        params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
-        return true;
-    }
-    if (arg == "--split-mode" || arg == "-sm") {
-        CHECK_ARG
-        std::string arg_next = argv[i];
-        if (arg_next == "none") {
-            params.split_mode = LLAMA_SPLIT_MODE_NONE;
-        }
-        else if (arg_next == "layer") {
-            params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-        }
-        else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-            fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-            exit(1);
-#endif // GGML_USE_SYCL
-            params.split_mode = LLAMA_SPLIT_MODE_ROW;
-        }
-        else {
-            invalid_param = true;
-            return true;
-        }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
-        return true;
-    }
-    if (arg == "--tensor-split" || arg == "-ts") {
-        CHECK_ARG
-        std::string arg_next = argv[i];
-
-        // split string by , and /
-        const std::regex regex{ R"([,/]+)" };
-        std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
-        std::vector<std::string> split_arg{ it, {} };
-        if (split_arg.size() >= llama_max_devices()) {
-            invalid_param = true;
-            return true;
-        }
-        for (size_t i = 0; i < llama_max_devices(); ++i) {
-            if (i < split_arg.size()) {
-                params.tensor_split[i] = std::stof(split_arg[i]);
-            }
-            else {
-                params.tensor_split[i] = 0.0f;
-            }
-        }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
-        return true;
-    }
-#ifdef GGML_USE_RPC
-    if (arg == "--rpc") {
-        CHECK_ARG
-        params.rpc_servers = argv[i];
-        return true;
-    }
-#endif
-    if (arg == "--no-mmap") {
-        params.use_mmap = false;
-        return true;
-    }
-    if (arg == "--numa") {
-        CHECK_ARG
-        std::string value(argv[i]);
-        /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-        else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-        else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
-        else { invalid_param = true; }
-        return true;
-    }
-    if (arg == "-v" || arg == "--verbose") {
-        params.verbosity = 1;
-        return true;
-    }
-    if (arg == "--verbosity") {
-        CHECK_ARG
-        params.verbosity = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--verbose-prompt") {
-        params.verbose_prompt = true;
-        return true;
-    }
-    if (arg == "--no-display-prompt") {
-        params.display_prompt = false;
-        return true;
-    }
-    if (arg == "-r" || arg == "--reverse-prompt") {
-        CHECK_ARG
-        params.antiprompt.emplace_back(argv[i]);
-        return true;
-    }
-    if (arg == "-ld" || arg == "--logdir") {
-        CHECK_ARG
-        params.logdir = argv[i];
-
-        if (params.logdir.back() != DIRECTORY_SEPARATOR) {
-            params.logdir += DIRECTORY_SEPARATOR;
-        }
-        return true;
-    }
-    if (arg == "-lcs" || arg == "--lookup-cache-static") {
-        CHECK_ARG
-        params.lookup_cache_static = argv[i];
-        return true;
-    }
-    if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
-        CHECK_ARG
-        params.lookup_cache_dynamic = argv[i];
-        return true;
-    }
-    if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
-        CHECK_ARG
-        params.logits_file = argv[i];
-        return true;
-    }
-    if (arg == "--perplexity" || arg == "--all-logits") {
-        params.logits_all = true;
-        return true;
-    }
-    if (arg == "--ppl-stride") {
-        CHECK_ARG
-        params.ppl_stride = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--ppl-output-type") {
-        CHECK_ARG
-        params.ppl_output_type = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "-ptc" || arg == "--print-token-count") {
-        CHECK_ARG
-        params.n_print = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--check-tensors") {
-        params.check_tensors = true;
-        return true;
-    }
-    if (arg == "--hellaswag") {
-        params.hellaswag = true;
-        return true;
-    }
-    if (arg == "--hellaswag-tasks") {
-        CHECK_ARG
-        params.hellaswag_tasks = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--winogrande") {
-        params.winogrande = true;
-        return true;
-    }
-    if (arg == "--winogrande-tasks") {
-        CHECK_ARG
-        params.winogrande_tasks = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--multiple-choice") {
-        params.multiple_choice = true;
-        return true;
-    }
-    if (arg == "--multiple-choice-tasks") {
-        CHECK_ARG
-        params.multiple_choice_tasks = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--kl-divergence") {
-        params.kl_divergence = true;
-        return true;
-    }
-    if (arg == "--ignore-eos") {
-        sparams.ignore_eos = true;
-        return true;
-    }
-    if (arg == "--penalize-nl") {
-        sparams.penalize_nl = true;
-        return true;
-    }
-    if (arg == "-l" || arg == "--logit-bias") {
-        CHECK_ARG
-        std::stringstream ss(argv[i]);
-        llama_token key;
-        char sign;
-        std::string value_str;
-        try {
-            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                sparams.logit_bias.push_back({key, bias});
-            }
-            else {
-                throw std::exception();
-            }
-        }
-        catch (const std::exception&) {
-            invalid_param = true;
-            return true;
-        }
-        return true;
-    }
-    if (arg == "-h" || arg == "--help" || arg == "--usage"  ) {
-        params.usage = true;
-        return true;
-    }
-    if (arg == "--version") {
-        fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-        fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-        exit(0);
-    }
-    if (arg == "--in-prefix-bos") {
-        params.input_prefix_bos = true;
-        params.enable_chat_template = false;
-        return true;
-    }
-    if (arg == "--in-prefix") {
-        CHECK_ARG
-        params.input_prefix = argv[i];
-        params.enable_chat_template = false;
-        return true;
-    }
-    if (arg == "--in-suffix") {
-        CHECK_ARG
-        params.input_suffix = argv[i];
-        params.enable_chat_template = false;
-        return true;
-    }
-    if (arg == "--spm-infill") {
-        params.spm_infill = true;
-        return true;
-    }
-    if (arg == "--grammar") {
-        CHECK_ARG
-        sparams.grammar = argv[i];
-        return true;
-    }
-    if (arg == "--grammar-file") {
-        CHECK_ARG
-        std::ifstream file(argv[i]);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        std::copy(
-            std::istreambuf_iterator<char>(file),
-            std::istreambuf_iterator<char>(),
-            std::back_inserter(sparams.grammar)
-        );
-        return true;
-    }
-    // if (arg == "-j" || arg == "--json-schema") {
-    //     CHECK_ARG
-    //     sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
-    //     return true;
-    // }
-    if (arg == "--override-kv") {
-        CHECK_ARG
-        if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
-            fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        return true;
-    }
-    if (arg == "--host") {
-        CHECK_ARG
-        params.hostname = argv[i];
-        return true;
-    }
-    if (arg == "--port") {
-        CHECK_ARG
-        params.port = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--path") {
-        CHECK_ARG
-        params.public_path = argv[i];
-        return true;
-    }
-    if (arg == "--api-key") {
-        CHECK_ARG
-        params.api_keys.push_back(argv[i]);
-        return true;
-    }
-    if (arg == "--api-key-file") {
-        CHECK_ARG
-        std::ifstream key_file(argv[i]);
-        if (!key_file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        std::string key;
-        while (std::getline(key_file, key)) {
-            if (!key.empty()) {
-                params.api_keys.push_back(key);
-            }
-        }
-        key_file.close();
-        return true;
-    }
-    if (arg == "--ssl-key-file") {
-        CHECK_ARG
-        params.ssl_file_key = argv[i];
-        return true;
-    }
-    if (arg == "--ssl-cert-file") {
-        CHECK_ARG
-        params.ssl_file_cert = argv[i];
-        return true;
-    }
-    if (arg == "--timeout" || arg == "-to") {
-        CHECK_ARG
-        params.timeout_read  = std::stoi(argv[i]);
-        params.timeout_write = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--threads-http") {
-        CHECK_ARG
-        params.n_threads_http = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "-spf" || arg == "--system-prompt-file") {
-        CHECK_ARG
-        std::ifstream file(argv[i]);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        std::string system_prompt;
-        std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(system_prompt)
-                );
-        params.system_prompt = system_prompt;
-        return true;
-    }
-    if (arg == "--log-format") {
-        CHECK_ARG
-        if (std::strcmp(argv[i], "json") == 0) {
-            params.log_json = true;
-        } else if (std::strcmp(argv[i], "text") == 0) {
-            params.log_json = false;
+static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
+    std::vector<std::string> result;
+    std::istringstream iss(input);
+    std::string line;
+    auto add_line = [&](const std::string& l) {
+        if (l.length() <= max_char_per_line) {
+            result.push_back(l);
         } else {
-            invalid_param = true;
-            return true;
+            std::istringstream line_stream(l);
+            std::string word, current_line;
+            while (line_stream >> word) {
+                if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
+                    if (!current_line.empty()) result.push_back(current_line);
+                    current_line = word;
+                } else {
+                    current_line += (!current_line.empty() ? " " : "") + word;
+                }
+            }
+            if (!current_line.empty()) result.push_back(current_line);
         }
-        return true;
+    };
+    while (std::getline(iss, line)) {
+        add_line(line);
     }
-    if (arg == "--no-slots") {
-        params.endpoint_slots = false;
-        return true;
-    }
-    if (arg == "--metrics") {
-        params.endpoint_metrics = true;
-        return true;
-    }
-    if (arg == "--slot-save-path") {
-        CHECK_ARG
-        params.slot_save_path = argv[i];
-        // if doesn't end with DIRECTORY_SEPARATOR, add it
-        if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
-            params.slot_save_path += DIRECTORY_SEPARATOR;
-        }
-        return true;
-    }
-    if (arg == "--chat-template") {
-        CHECK_ARG
-        if (!llama_chat_verify_template(argv[i])) {
-            fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
-            fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
-            invalid_param = true;
-            return true;
-        }
-        params.chat_template = argv[i];
-        return true;
-    }
-    if (arg == "--slot-prompt-similarity" || arg == "-sps") {
-        CHECK_ARG
-        params.slot_prompt_similarity = std::stof(argv[i]);
-        return true;
-    }
-    if (arg == "-pps") {
-        params.is_pp_shared = true;
-        return true;
-    }
-    if (arg == "-npp") {
-        CHECK_ARG
-        auto p = string_split<int>(argv[i], split_delim);
-        params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
-        return true;
-    }
-    if (arg == "-ntg") {
-        CHECK_ARG
-        auto p = string_split<int>(argv[i], split_delim);
-        params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
-        return true;
-    }
-    if (arg == "-npl") {
-        CHECK_ARG
-        auto p = string_split<int>(argv[i], split_delim);
-        params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
-        return true;
-    }
-    if (arg == "--context-file") {
-        CHECK_ARG
-        std::ifstream file(argv[i], std::ios::binary);
-        if (!file) {
-            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        params.context_files.push_back(argv[i]);
-        return true;
-    }
-    if (arg == "--chunk-size") {
-        CHECK_ARG
-        params.chunk_size = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--chunk-separator") {
-        CHECK_ARG
-        params.chunk_separator = argv[i];
-        return true;
-    }
-    if (arg == "--junk") {
-        CHECK_ARG
-        params.n_junk = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--pos") {
-        CHECK_ARG
-        params.i_pos = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "-o" || arg == "--output" || arg == "--output-file") {
-        CHECK_ARG
-        params.out_file = argv[i];
-        params.cvector_outfile = argv[i];
-        params.lora_outfile = argv[i];
-        return true;
-    }
-    if (arg == "-ofreq" || arg == "--output-frequency") {
-        CHECK_ARG
-        params.n_out_freq = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--save-frequency") {
-        CHECK_ARG
-        params.n_save_freq = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--process-output") {
-        params.process_output = true;
-        return true;
-    }
-    if (arg == "--no-ppl") {
-        params.compute_ppl = false;
-        return true;
-    }
-    if (arg == "--chunk" || arg == "--from-chunk") {
-        CHECK_ARG
-        params.i_chunk = std::stoi(argv[i]);
-        return true;
-    }
-    // cvector params
-    if (arg == "--positive-file") {
-        CHECK_ARG
-        params.cvector_positive_file = argv[i];
-        return true;
-    }
-    if (arg == "--negative-file") {
-        CHECK_ARG
-        params.cvector_negative_file = argv[i];
-        return true;
-    }
-    if (arg == "--pca-batch") {
-        CHECK_ARG
-        params.n_pca_batch = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--pca-iter") {
-        CHECK_ARG
-        params.n_pca_iterations = std::stoi(argv[i]);
-        return true;
-    }
-    if (arg == "--method") {
-        CHECK_ARG
-        std::string value(argv[i]);
-        /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
-        else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
-        else { invalid_param = true; }
-        return true;
-    }
-    if (arg == "--output-format") {
-        CHECK_ARG
-        std::string value(argv[i]);
-        /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
-        else if (value == "md") { params.batched_bench_output_jsonl = false; }
-        else { invalid_param = true; }
-        return true;
-    }
-    if (arg == "--no-warmup") {
-        params.warmup = false;
-        return true;
-    }
-#ifndef LOG_DISABLE_LOGS
-    // Parse args for logging parameters
-    if (log_param_single_parse(argv[i])) {
-        // Do nothing, log_param_single_parse automatically does it's thing
-        //  and returns if a match was found and parsed.
-        return true;
-    }
-    if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) {
-        // We have a matching known parameter requiring an argument,
-        //  now we need to check if there is anything after this argv
-        //  and flag invalid_param or parse it.
-        CHECK_ARG
-        if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) {
-            invalid_param = true;
-            return true;
-        }
-        return true;
-    }
-    // End of Parse args for logging parameters
-#endif // LOG_DISABLE_LOGS
-
-    return false;
+    return result;
 }
 
-#ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
-#endif
+std::string llama_arg::to_string() {
+    // params for printing to console
+    const static int n_leading_spaces = 40;
+    const static int n_char_per_line_help = 70; // TODO: detect this based on current console
+    std::string leading_spaces(n_leading_spaces, ' ');
 
-void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const auto & sparams = params.sparams;
+    std::ostringstream ss;
+    for (const auto arg : args) {
+        if (arg == args.front()) {
+            if (args.size() == 1) {
+                ss << arg;
+            } else {
+                // first arg is usually abbreviation, we need padding to make it more beautiful
+                auto tmp = std::string(arg) + ", ";
+                ss << format("%-7s", tmp.c_str());
+            }
+        } else {
+            ss << arg << (arg != args.back() ? ", " : "");
+        }
+    }
+    if (value_hint) ss << " " << value_hint;
+    if (value_hint_2) ss << " " << value_hint_2;
+    if (ss.tellp() > n_leading_spaces - 3) {
+        // current line is too long, add new line
+        ss << "\n" << leading_spaces;
+    } else {
+        // padding between arg and help, same line
+        ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
+    }
+    const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
+    for (const auto & line : help_lines) {
+        ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
+    }
+    return ss.str();
+}
+
+void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options) {
+    auto print_options = [](std::vector<llama_arg *> & options) {
+        for (llama_arg * opt : options) {
+            printf("%s", opt->to_string().c_str());
+        }
+    };
+
+    std::vector<llama_arg *> common_options;
+    std::vector<llama_arg *> specific_options;
+    for (auto & opt : options) {
+        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+        if (opt.in_example(params.curr_ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+    printf("----- common options -----\n\n");
+    print_options(common_options);
+    // TODO: maybe convert enum llama_example to string
+    printf("\n\n----- example-specific options -----\n\n");
+    print_options(specific_options);
+}
+
+std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex) {
+    return gpt_params_parser_init(params, ex, nullptr);
+}
+
+std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage) {
+    std::vector<llama_arg> options;
+    params.print_usage = print_usage;
+    params.curr_ex     = ex;
 
     std::string sampler_type_chars;
     std::string sampler_type_names;
-    for (const auto & sampler : sparams.samplers) {
+    for (const auto & sampler : params.sparams.samplers) {
         sampler_type_chars += gpt_sampler_type_to_chr(sampler);
         sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
     }
     sampler_type_names.pop_back();
 
-    struct option_info {
-        LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
-        option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
-            va_list args_list;
-            va_start(args_list, desc);
-            char buffer[1024];
-            vsnprintf(buffer, sizeof(buffer), desc, args_list);
-            va_end(args_list);
-            this->desc = buffer;
+
+    /**
+     * filter options by example
+     * rules:
+     * - all examples inherit options from LLAMA_EXAMPLE_COMMON
+     * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
+     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
+     */
+    auto add_opt = [&](llama_arg arg) {
+        if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
+            options.push_back(std::move(arg));
         }
-
-        option_info(const std::string & grp) : grp(grp) {}
-
-        std::string tags;
-        std::string args;
-        std::string desc;
-        std::string grp;
     };
 
-    std::vector<option_info> options;
 
-    // TODO: filter by tags
-
-    options.push_back({ "general" });
-    options.push_back({ "*",           "-h,    --help, --usage",        "print usage and exit" });
-    options.push_back({ "*",           "       --version",              "show version and build info" });
-    options.push_back({ "*",           "-v,    --verbose",              "print verbose information" });
-    options.push_back({ "*",           "       --verbosity N",          "set specific verbosity level (default: %d)", params.verbosity });
-    options.push_back({ "*",           "       --verbose-prompt",       "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
-    options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
-    options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
-    options.push_back({ "*",           "-tb,   --threads-batch N",      "number of threads to use during batch and prompt processing (default: same as --threads)" });
-    options.push_back({ "speculative", "-td,   --threads-draft N",      "number of threads to use during generation (default: same as --threads)" });
-    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
-
-#ifndef GGML_USE_OPENMP
-    // these options are available only with the internal threadpool
-    options.push_back({ "*",           "-C,    --cpu-mask M",            "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
-    options.push_back({ "*",           "-Cr,   --cpu-range lo-hi",       "range of CPUs for affinity. Complements --cpu-mask"});
-    options.push_back({ "*",           "       --cpu-strict <0|1>",      "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
-    options.push_back({ "*",           "       --priority N",            "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
-    options.push_back({ "*",           "       --poll <0...100>",        "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
-
-    options.push_back({ "*",           "-Cb,   --cpu-mask-batch M",      "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
-    options.push_back({ "*",           "-Crb,  --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
-    options.push_back({ "*",           "       --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
-    options.push_back({ "*",           "       --priority-batch N",      "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
-    options.push_back({ "*",           "       --poll-batch <0|1>",      "use polling to wait for work (default: same as --poll"});
-
-    options.push_back({ "speculative", "-Cd,   --cpu-mask-draft M",      "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
-    options.push_back({ "speculative", "-Crd,  --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
-    options.push_back({ "speculative", "       --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
-    options.push_back({ "speculative", "       --priority-draft N",      "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
-    options.push_back({ "speculative", "       --poll-draft <0|1>",      "Use polling to wait for draft model work (default: same as --poll])"});
-
-    options.push_back({ "speculative", "-Cbd,  --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
-    options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
-                                                                         "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
-    options.push_back({ "speculative", "       --cpu-strict-batch-draft <0|1>",
-                                                                         "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
-    options.push_back({ "speculative", "       --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
-    options.push_back({ "speculative", "       --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
-#endif // GGML_USE_OPENMP
-
-    options.push_back({ "speculative", "       --draft N",              "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
-    options.push_back({ "speculative", "-ps,   --p-split N",            "speculative decoding split probability (default: %.1f)", (double)params.p_split });
-    options.push_back({ "*",           "-lcs,  --lookup-cache-static FNAME",
-                                                                        "path to static lookup cache to use for lookup decoding (not updated by generation)" });
-    options.push_back({ "*",           "-lcd,  --lookup-cache-dynamic FNAME",
-                                                                        "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
-
-    options.push_back({ "*",           "-c,    --ctx-size N",           "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
-    options.push_back({ "*",           "-n,    --predict N",            "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
-    options.push_back({ "*",           "-b,    --batch-size N",         "logical maximum batch size (default: %d)", params.n_batch });
-    options.push_back({ "*",           "-ub,   --ubatch-size N",        "physical maximum batch size (default: %d)", params.n_ubatch });
-    options.push_back({ "*",           "       --keep N",               "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
-    options.push_back({ "*",           "       --chunks N",             "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
-    options.push_back({ "*",           "-fa,   --flash-attn",           "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
-    options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with\n"
-                                                                        "in conversation mode, this will be used as system prompt\n"
-                                                                        "(default: '%s')", params.prompt.c_str() });
-    options.push_back({ "*",           "-f,    --file FNAME",           "a file containing the prompt (default: none)" });
-    options.push_back({ "*",           "       --in-file FNAME",        "an input file (repeat to specify multiple files)" });
-    options.push_back({ "*",           "-bf,   --binary-file FNAME",    "binary file containing the prompt (default: none)" });
-    options.push_back({ "*",           "-e,    --escape",               "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
-    options.push_back({ "*",           "       --no-escape",            "do not process escape sequences" });
-    options.push_back({ "main",        "-ptc,  --print-token-count N",  "print token count every N tokens (default: %d)", params.n_print });
-    options.push_back({ "main",        "       --prompt-cache FNAME",   "file to cache prompt state for faster startup (default: none)" });
-    options.push_back({ "main",        "       --prompt-cache-all",     "if specified, saves user input and generations to cache as well\n"
-                                                                        "not supported with --interactive or other interactive options" });
-    options.push_back({ "main",        "       --prompt-cache-ro",      "if specified, uses the prompt cache but does not update it" });
-    options.push_back({ "main",        "-r,    --reverse-prompt PROMPT",
-                                                                        "halt generation at PROMPT, return control in interactive mode\n"
-                                                                        "can be specified more than once for multiple prompts" });
-    options.push_back({ "main",        "-sp,   --special",              "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
-    options.push_back({ "main",        "-cnv,  --conversation",         "run in conversation mode, does not print special tokens and suffix/prefix\n"
-                                                                        "if suffix/prefix are not specified, default chat template will be used\n"
-                                                                        "(default: %s)", params.conversation ? "true" : "false" });
-    options.push_back({ "main infill", "-i,    --interactive",          "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
-    options.push_back({ "main infill", "-if,   --interactive-first",    "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
-    options.push_back({ "main infill", "-mli,  --multiline-input",      "allows you to write or paste multiple lines without ending each in '\\'" });
-    options.push_back({ "main infill", "       --in-prefix-bos",        "prefix BOS to user inputs, preceding the `--in-prefix` string" });
-    options.push_back({ "main infill", "       --in-prefix STRING",     "string to prefix user inputs with (default: empty)" });
-    options.push_back({ "main infill", "       --in-suffix STRING",     "string to suffix after user inputs with (default: empty)" });
-    options.push_back({ "main",        "       --no-warmup",            "skip warming up the model with an empty run" });
-    options.push_back({ "server infill",
-                                       "       --spm-infill",           "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
-
-    options.push_back({ "sampling" });
-    options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", sparams.seed });
-    options.push_back({ "*",           "       --samplers SAMPLERS",    "samplers that will be used for generation in the order, separated by \';\'\n"
-                                                                        "(default: %s)", sampler_type_names.c_str() });
-    options.push_back({ "*",           "       --sampling-seq SEQUENCE",
-                                                                        "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
-    options.push_back({ "*",           "       --ignore-eos",           "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
-    options.push_back({ "*",           "       --penalize-nl",          "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
-    options.push_back({ "*",           "       --temp T",               "temperature (default: %.1f)", (double)sparams.temp });
-    options.push_back({ "*",           "       --top-k N",              "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
-    options.push_back({ "*",           "       --top-p P",              "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
-    options.push_back({ "*",           "       --min-p P",              "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
-    options.push_back({ "*",           "       --tfs P",                "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
-    options.push_back({ "*",           "       --typical P",            "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typ_p });
-    options.push_back({ "*",           "       --repeat-last-n N",      "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
-    options.push_back({ "*",           "       --repeat-penalty N",     "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
-    options.push_back({ "*",           "       --presence-penalty N",   "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
-    options.push_back({ "*",           "       --frequency-penalty N",  "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
-    options.push_back({ "*",           "       --dynatemp-range N",     "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
-    options.push_back({ "*",           "       --dynatemp-exp N",       "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
-    options.push_back({ "*",           "       --mirostat N",           "use Mirostat sampling.\n"
-                                                                        "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
-                                                                        "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
-    options.push_back({ "*",           "       --mirostat-lr N",        "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
-    options.push_back({ "*",           "       --mirostat-ent N",       "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
-    options.push_back({ "*",           "       -l TOKEN_ID(+/-)BIAS",   "modifies the likelihood of token appearing in the completion,\n"
-                                                                        "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
-                                                                        "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
-    options.push_back({ "main",        "       --chat-template JINJA_TEMPLATE",
-                                                                        "set custom jinja chat template (default: template taken from model's metadata)\n"
-                                                                        "if suffix/prefix are specified, template will be disabled\n"
-                                                                        "only commonly used templates are accepted:\n"
-                                                                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
-    options.push_back({ "grammar" });
-    options.push_back({ "*",           "       --grammar GRAMMAR",      "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
-    options.push_back({ "*",           "       --grammar-file FNAME",   "file to read grammar from" });
-    options.push_back({ "*",           "-j,    --json-schema SCHEMA",
-                                                                        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
-                                                                        "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
-
-    options.push_back({ "embedding" });
-    options.push_back({ "embedding",   "       --pooling {none,mean,cls,last}",
-                                                                        "pooling type for embeddings, use model default if unspecified" });
-    options.push_back({ "embedding",   "       --attention {causal,non-causal}",
-                                                                        "attention type for embeddings, use model default if unspecified" });
-
-    options.push_back({ "context hacking" });
-    options.push_back({ "*",           "       --rope-scaling {none,linear,yarn}",
-                                                                        "RoPE frequency scaling method, defaults to linear unless specified by the model" });
-    options.push_back({ "*",           "       --rope-scale N",         "RoPE context scaling factor, expands context by a factor of N" });
-    options.push_back({ "*",           "       --rope-freq-base N",     "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
-    options.push_back({ "*",           "       --rope-freq-scale N",    "RoPE frequency scaling factor, expands context by a factor of 1/N" });
-    options.push_back({ "*",           "       --yarn-orig-ctx N",      "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
-    options.push_back({ "*",           "       --yarn-ext-factor N",    "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
-    options.push_back({ "*",           "       --yarn-attn-factor N",   "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
-    options.push_back({ "*",           "       --yarn-beta-slow N",     "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
-    options.push_back({ "*",           "       --yarn-beta-fast N",     "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
-    options.push_back({ "*",           "-gan,  --grp-attn-n N",         "group-attention factor (default: %d)", params.grp_attn_n });
-    options.push_back({ "*",           "-gaw,  --grp-attn-w N",         "group-attention width (default: %.1f)", (double)params.grp_attn_w });
-    options.push_back({ "*",           "-dkvc, --dump-kv-cache",        "verbose print of the KV cache" });
-    options.push_back({ "*",           "-nkvo, --no-kv-offload",        "disable KV offload" });
-    options.push_back({ "*",           "-ctk,  --cache-type-k TYPE",    "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
-    options.push_back({ "*",           "-ctv,  --cache-type-v TYPE",    "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
-
-    options.push_back({ "perplexity" });
-    options.push_back({ "perplexity",  "       --all-logits",           "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
-    options.push_back({ "perplexity",  "       --hellaswag",            "compute HellaSwag score over random tasks from datafile supplied with -f" });
-    options.push_back({ "perplexity",  "       --hellaswag-tasks N",    "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
-    options.push_back({ "perplexity",  "       --winogrande",           "compute Winogrande score over random tasks from datafile supplied with -f" });
-    options.push_back({ "perplexity",  "       --winogrande-tasks N",   "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
-    options.push_back({ "perplexity",  "       --multiple-choice",      "compute multiple choice score over random tasks from datafile supplied with -f" });
-    options.push_back({ "perplexity",  "       --multiple-choice-tasks N",
-                                                                        "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
-    options.push_back({ "perplexity",  "       --kl-divergence",        "computes KL-divergence to logits provided via --kl-divergence-base" });
-    options.push_back({ "perplexity",  "       --ppl-stride N",         "stride for perplexity calculation (default: %d)", params.ppl_stride });
-    options.push_back({ "perplexity",  "       --ppl-output-type {0,1}",
-                                                                        "output type for perplexity calculation (default: %d)", params.ppl_output_type });
-
-    options.push_back({ "parallel" });
-    options.push_back({ "*",           "-dt,   --defrag-thold N",       "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
-    options.push_back({ "*",           "-np,   --parallel N",           "number of parallel sequences to decode (default: %d)", params.n_parallel });
-    options.push_back({ "*",           "-ns,   --sequences N",          "number of sequences to decode (default: %d)", params.n_sequences });
-    options.push_back({ "*",           "-cb,   --cont-batching",        "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
-    options.push_back({ "*",           "-nocb, --no-cont-batching",     "disable continuous batching" });
-
-    options.push_back({ "multi-modality" });
-    options.push_back({ "*",           "       --mmproj FILE",          "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
-    options.push_back({ "*",           "       --image FILE",           "path to an image file. use with multimodal models. Specify multiple times for batching" });
-
-    options.push_back({ "backend" });
+    add_opt(llama_arg(
+        {"-h", "--help", "--usage"},
+        "print usage and exit",
+        [](gpt_params & params) {
+            params.usage = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--version"},
+        "show version and build info",
+        [](gpt_params &) {
+            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            exit(0);
+        }
+    ));
+    add_opt(llama_arg(
+        {"-v", "--verbose"},
+        "print verbose information",
+        [](gpt_params & params) {
+            params.verbosity = 1;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--verbosity"}, "N",
+        format("set specific verbosity level (default: %d)", params.verbosity),
+        [](gpt_params & params, int value) {
+            params.verbosity = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--verbose-prompt"},
+        format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
+        [](gpt_params & params) {
+            params.verbose_prompt = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--no-display-prompt"},
+        format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
+        [](gpt_params & params) {
+            params.display_prompt = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"-co", "--color"},
+        format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
+        [](gpt_params & params) {
+            params.use_color = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    add_opt(llama_arg(
+        {"-s", "--seed"}, "SEED",
+        format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.seed = std::stoul(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"-t", "--threads"}, "N",
+        format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+        [](gpt_params & params, int value) {
+            params.cpuparams.n_threads = value;
+            if (params.cpuparams.n_threads <= 0) {
+                params.cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_env("LLAMA_ARG_THREADS"));
+    add_opt(llama_arg(
+        {"-tb", "--threads-batch"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads)",
+        [](gpt_params & params, int value) {
+            params.cpuparams_batch.n_threads = value;
+            if (params.cpuparams_batch.n_threads <= 0) {
+                params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"-td", "--threads-draft"}, "N",
+        "number of threads to use during generation (default: same as --threads)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams.n_threads = value;
+            if (params.draft_cpuparams.n_threads <= 0) {
+                params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-tbd", "--threads-batch-draft"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams_batch.n_threads = value;
+            if (params.draft_cpuparams_batch.n_threads <= 0) {
+                params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-C", "--cpu-mask"}, "M",
+        "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
+        [](gpt_params & params, const std::string & mask) {
+            params.cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"-Cr", "--cpu-range"}, "lo-hi",
+        "range of CPUs for affinity. Complements --cpu-mask",
+        [](gpt_params & params, const std::string & range) {
+            params.cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--cpu-strict"}, "<0|1>",
+        format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
+        [](gpt_params & params, const std::string & value) {
+            params.cpuparams.strict_cpu = std::stoul(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--prio"}, "N",
+        format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--poll"}, "<0...100>",
+        format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
+        [](gpt_params & params, const std::string & value) {
+            params.cpuparams.poll = std::stoul(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"-Cb", "--cpu-mask-batch"}, "M",
+        "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
+        [](gpt_params & params, const std::string & mask) {
+            params.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"-Crb", "--cpu-range-batch"}, "lo-hi",
+        "ranges of CPUs for affinity. Complements --cpu-mask-batch",
+        [](gpt_params & params, const std::string & range) {
+            params.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--cpu-strict-batch"}, "<0|1>",
+        "use strict CPU placement (default: same as --cpu-strict)",
+        [](gpt_params & params, int value) {
+            params.cpuparams_batch.strict_cpu = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--prio-batch"}, "N",
+        format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--poll-batch"}, "<0|1>",
+        "use polling to wait for work (default: same as --poll)",
+        [](gpt_params & params, int value) {
+            params.cpuparams_batch.poll = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-Cd", "--cpu-mask-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](gpt_params & params, const std::string & mask) {
+            params.draft_cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-Crd", "--cpu-range-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+        [](gpt_params & params, const std::string & range) {
+            params.draft_cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--cpu-strict-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--prio-draft"}, "N",
+        format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll])",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](gpt_params & params, const std::string & mask) {
+            params.draft_cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
+        [](gpt_params & params, const std::string & range) {
+            params.draft_cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--cpu-strict-batch-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams_batch.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--prio-batch-draft"}, "N",
+        format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--poll-batch-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: --poll-draft)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams_batch.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--draft"}, "N",
+        format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
+        [](gpt_params & params, int value) {
+            params.n_draft = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-ps", "--p-split"}, "N",
+        format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
+        [](gpt_params & params, const std::string & value) {
+            params.p_split = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-lcs", "--lookup-cache-static"}, "FNAME",
+        "path to static lookup cache to use for lookup decoding (not updated by generation)",
+        [](gpt_params & params, const std::string & value) {
+            params.lookup_cache_static = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
+        "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
+        [](gpt_params & params, const std::string & value) {
+            params.lookup_cache_dynamic = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-c", "--ctx-size"}, "N",
+        format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
+        [](gpt_params & params, int value) {
+            params.n_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_CTX_SIZE"));
+    add_opt(llama_arg(
+        {"-n", "--predict", "--n-predict"}, "N",
+        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        [](gpt_params & params, int value) {
+            params.n_predict = value;
+        }
+    ).set_env("LLAMA_ARG_N_PREDICT"));
+    add_opt(llama_arg(
+        {"-b", "--batch-size"}, "N",
+        format("logical maximum batch size (default: %d)", params.n_batch),
+        [](gpt_params & params, int value) {
+            params.n_batch = value;
+        }
+    ).set_env("LLAMA_ARG_BATCH"));
+    add_opt(llama_arg(
+        {"-ub", "--ubatch-size"}, "N",
+        format("physical maximum batch size (default: %d)", params.n_ubatch),
+        [](gpt_params & params, int value) {
+            params.n_ubatch = value;
+        }
+    ).set_env("LLAMA_ARG_UBATCH"));
+    add_opt(llama_arg(
+        {"--keep"}, "N",
+        format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
+        [](gpt_params & params, int value) {
+            params.n_keep = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--chunks"}, "N",
+        format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
+        [](gpt_params & params, int value) {
+            params.n_chunks = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-fa", "--flash-attn"},
+        format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.flash_attn = true;
+        }
+    ).set_env("LLAMA_ARG_FLASH_ATTN"));
+    add_opt(llama_arg(
+        {"-p", "--prompt"}, "PROMPT",
+        ex == LLAMA_EXAMPLE_MAIN
+            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
+            : "prompt to start generation with",
+        [](gpt_params & params, const std::string & value) {
+            params.prompt = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-f", "--file"}, "FNAME",
+        "a file containing the prompt (default: none)",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            // store the external file name in params
+            params.prompt_file = value;
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--in-file"}, "FNAME",
+        "an input file (repeat to specify multiple files)",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            params.in_files.push_back(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"-bf", "--binary-file"}, "FNAME",
+        "binary file containing the prompt (default: none)",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream file(value, std::ios::binary);
+            if (!file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            // store the external file name in params
+            params.prompt_file = value;
+            std::ostringstream ss;
+            ss << file.rdbuf();
+            params.prompt = ss.str();
+            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
+        }
+    ));
+    add_opt(llama_arg(
+        {"-e", "--escape"},
+        format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](gpt_params & params) {
+            params.escape = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--no-escape"},
+        "do not process escape sequences",
+        [](gpt_params & params) {
+            params.escape = false;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-ptc", "--print-token-count"}, "N",
+        format("print token count every N tokens (default: %d)", params.n_print),
+        [](gpt_params & params, int value) {
+            params.n_print = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--prompt-cache"}, "FNAME",
+        "file to cache prompt state for faster startup (default: none)",
+        [](gpt_params & params, const std::string & value) {
+            params.path_prompt_cache = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--prompt-cache-all"},
+        "if specified, saves user input and generations to cache as well\n",
+        [](gpt_params & params) {
+            params.prompt_cache_all = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--prompt-cache-ro"},
+        "if specified, uses the prompt cache but does not update it",
+        [](gpt_params & params) {
+            params.prompt_cache_ro = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"-r", "--reverse-prompt"}, "PROMPT",
+        "halt generation at PROMPT, return control in interactive mode\n",
+        [](gpt_params & params, const std::string & value) {
+            params.antiprompt.emplace_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"-sp", "--special"},
+        format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
+        [](gpt_params & params) {
+            params.special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"-cnv", "--conversation"},
+        format(
+            "run in conversation mode:\n"
+            "- does not print special tokens and suffix/prefix\n"
+            "- interactive mode is also enabled\n"
+            "(default: %s)",
+            params.conversation ? "true" : "false"
+        ),
+        [](gpt_params & params) {
+            params.conversation = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"-i", "--interactive"},
+        format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
+        [](gpt_params & params) {
+            params.interactive = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"-if", "--interactive-first"},
+        format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
+        [](gpt_params & params) {
+            params.interactive_first = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"-mli", "--multiline-input"},
+        "allows you to write or paste multiple lines without ending each in '\\'",
+        [](gpt_params & params) {
+            params.multiline_input = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--in-prefix-bos"},
+        "prefix BOS to user inputs, preceding the `--in-prefix` string",
+        [](gpt_params & params) {
+            params.input_prefix_bos = true;
+            params.enable_chat_template = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--in-prefix"}, "STRING",
+        "string to prefix user inputs with (default: empty)",
+        [](gpt_params & params, const std::string & value) {
+            params.input_prefix = value;
+            params.enable_chat_template = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--in-suffix"}, "STRING",
+        "string to suffix after user inputs with (default: empty)",
+        [](gpt_params & params, const std::string & value) {
+            params.input_suffix = value;
+            params.enable_chat_template = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--no-warmup"},
+        "skip warming up the model with an empty run",
+        [](gpt_params & params) {
+            params.warmup = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(llama_arg(
+        {"--spm-infill"},
+        format(
+            "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
+            params.spm_infill ? "enabled" : "disabled"
+        ),
+        [](gpt_params & params) {
+            params.spm_infill = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
+    add_opt(llama_arg(
+        {"--samplers"}, "SAMPLERS",
+        format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            const auto sampler_names = string_split(value, ';');
+            params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--sampling-seq"}, "SEQUENCE",
+        format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.samplers = gpt_sampler_types_from_chars(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--ignore-eos"},
+        "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
+        [](gpt_params & params) {
+            params.sparams.ignore_eos = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--penalize-nl"},
+        format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
+        [](gpt_params & params) {
+            params.sparams.penalize_nl = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--temp"}, "N",
+        format("temperature (default: %.1f)", (double)params.sparams.temp),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.temp = std::stof(value);
+            params.sparams.temp = std::max(params.sparams.temp, 0.0f);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--top-k"}, "N",
+        format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
+        [](gpt_params & params, int value) {
+            params.sparams.top_k = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--top-p"}, "N",
+        format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.top_p = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--min-p"}, "N",
+        format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.min_p = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--tfs"}, "N",
+        format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.tfs_z = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--typical"}, "N",
+        format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.typ_p = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--repeat-last-n"}, "N",
+        format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
+        [](gpt_params & params, int value) {
+            params.sparams.penalty_last_n = value;
+            params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--repeat-penalty"}, "N",
+        format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.penalty_repeat = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--presence-penalty"}, "N",
+        format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.penalty_present = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--frequency-penalty"}, "N",
+        format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.penalty_freq = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--dynatemp-range"}, "N",
+        format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.dynatemp_range = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--dynatemp-exp"}, "N",
+        format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.dynatemp_exponent = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--mirostat"}, "N",
+        format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+        "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
+        [](gpt_params & params, int value) {
+            params.sparams.mirostat = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--mirostat-lr"}, "N",
+        format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.mirostat_eta = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--mirostat-ent"}, "N",
+        format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.mirostat_tau = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
+        "modifies the likelihood of token appearing in the completion,\n"
+        "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
+        "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
+        [](gpt_params & params, const std::string & value) {
+            std::stringstream ss(value);
+            llama_token key;
+            char sign;
+            std::string value_str;
+            try {
+                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+                    const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    params.sparams.logit_bias.push_back({key, bias});
+                } else {
+                    throw std::invalid_argument("invalid input format");
+                }
+            } catch (const std::exception&) {
+                throw std::invalid_argument("invalid input format");
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--grammar"}, "GRAMMAR",
+        format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.grammar = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--grammar-file"}, "FNAME",
+        "file to read grammar from",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(params.sparams.grammar)
+            );
+        }
+    ));
+    add_opt(llama_arg(
+        {"-j", "--json-schema"}, "SCHEMA",
+        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+        [](gpt_params & params, const std::string & value) {
+            //params.sparams.grammar = json_schema_to_grammar(json::parse(value));
+        }
+    ));
+    add_opt(llama_arg(
+        {"--pooling"}, "{none,mean,cls,last}",
+        "pooling type for embeddings, use model default if unspecified",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+            else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+            else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--attention"}, "{causal,non,causal}",
+        "attention type for embeddings, use model default if unspecified",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+            else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--rope-scaling"}, "{none,linear,yarn}",
+        "RoPE frequency scaling method, defaults to linear unless specified by the model",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+            else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-scale"}, "N",
+        "RoPE context scaling factor, expands context by a factor of N",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_scale = 1.0f / std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-freq-base"}, "N",
+        "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_base = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-freq-scale"}, "N",
+        "RoPE frequency scaling factor, expands context by a factor of 1/N",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_scale = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-orig-ctx"}, "N",
+        format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
+        [](gpt_params & params, int value) {
+            params.yarn_orig_ctx = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-ext-factor"}, "N",
+        format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+        [](gpt_params & params, const std::string & value) {
+            params.yarn_ext_factor = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-attn-factor"}, "N",
+        format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+        [](gpt_params & params, const std::string & value) {
+            params.yarn_attn_factor = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-beta-slow"}, "N",
+        format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+        [](gpt_params & params, const std::string & value) {
+            params.yarn_beta_slow = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-beta-fast"}, "N",
+        format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
+        [](gpt_params & params, const std::string & value) {
+            params.yarn_beta_fast = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"-gan", "--grp-attn-n"}, "N",
+        format("group-attention factor (default: %d)", params.grp_attn_n),
+        [](gpt_params & params, int value) {
+            params.grp_attn_n = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-gaw", "--grp-attn-w"}, "N",
+        format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
+        [](gpt_params & params, int value) {
+            params.grp_attn_w = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-dkvc", "--dump-kv-cache"},
+        "verbose print of the KV cache",
+        [](gpt_params & params) {
+            params.dump_kv_cache = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-nkvo", "--no-kv-offload"},
+        "disable KV offload",
+        [](gpt_params & params) {
+            params.no_kv_offload = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-ctk", "--cache-type-k"}, "TYPE",
+        format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            // TODO: get the type right here
+            params.cache_type_k = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-ctv", "--cache-type-v"}, "TYPE",
+        format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            // TODO: get the type right here
+            params.cache_type_v = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--perplexity", "--all-logits"},
+        format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
+        [](gpt_params & params) {
+            params.logits_all = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--hellaswag"},
+        "compute HellaSwag score over random tasks from datafile supplied with -f",
+        [](gpt_params & params) {
+            params.hellaswag = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--hellaswag-tasks"}, "N",
+        format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
+        [](gpt_params & params, int value) {
+            params.hellaswag_tasks = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--winogrande"},
+        "compute Winogrande score over random tasks from datafile supplied with -f",
+        [](gpt_params & params) {
+            params.winogrande = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--winogrande-tasks"}, "N",
+        format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
+        [](gpt_params & params, int value) {
+            params.winogrande_tasks = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--multiple-choice"},
+        "compute multiple choice score over random tasks from datafile supplied with -f",
+        [](gpt_params & params) {
+            params.multiple_choice = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--multiple-choice-tasks"}, "N",
+        format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
+        [](gpt_params & params, int value) {
+            params.multiple_choice_tasks = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--kl-divergence"},
+        "computes KL-divergence to logits provided via --kl-divergence-base",
+        [](gpt_params & params) {
+            params.kl_divergence = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
+        "set logits file",
+        [](gpt_params & params, const std::string & value) {
+            params.logits_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--ppl-stride"}, "N",
+        format("stride for perplexity calculation (default: %d)", params.ppl_stride),
+        [](gpt_params & params, int value) {
+            params.ppl_stride = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"--ppl-output-type"}, "<0|1>",
+        format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
+        [](gpt_params & params, int value) {
+            params.ppl_output_type = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(llama_arg(
+        {"-dt", "--defrag-thold"}, "N",
+        format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+        [](gpt_params & params, const std::string & value) {
+            params.defrag_thold = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
+    add_opt(llama_arg(
+        {"-np", "--parallel"}, "N",
+        format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+        [](gpt_params & params, int value) {
+            params.n_parallel = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-ns", "--sequences"}, "N",
+        format("number of sequences to decode (default: %d)", params.n_sequences),
+        [](gpt_params & params, int value) {
+            params.n_sequences = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"-cb", "--cont-batching"},
+        format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.cont_batching = true;
+        }
+    ).set_env("LLAMA_ARG_CONT_BATCHING"));
+    add_opt(llama_arg(
+        {"-nocb", "--no-cont-batching"},
+        "disable continuous batching",
+        [](gpt_params & params) {
+            params.cont_batching = false;
+        }
+    ).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
+    add_opt(llama_arg(
+        {"--mmproj"}, "FILE",
+        "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        [](gpt_params & params, const std::string & value) {
+            params.mmproj = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    add_opt(llama_arg(
+        {"--image"}, "FILE",
+        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        [](gpt_params & params, const std::string & value) {
+            params.image.emplace_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
 #ifdef GGML_USE_RPC
-    options.push_back({ "*",           "       --rpc SERVERS",          "comma separated list of RPC servers" });
+    add_opt(llama_arg(
+        {"--rpc"}, "SERVERS",
+        "comma separated list of RPC servers",
+        [](gpt_params & params, const std::string & value) {
+            params.rpc_servers = value;
+        }
+    ));
 #endif
+    add_opt(llama_arg(
+        {"--mlock"},
+        "force system to keep model in RAM rather than swapping or compressing",
+        [](gpt_params & params) {
+            params.use_mlock = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--no-mmap"},
+        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
+        [](gpt_params & params) {
+            params.use_mmap = false;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--numa"}, "TYPE",
+        "attempt optimizations that help on some NUMA systems\n"
+        "- distribute: spread execution evenly over all nodes\n"
+        "- isolate: only spawn threads on CPUs on the node that execution started on\n"
+        "- numactl: use the CPU map provided by numactl\n"
+        "if run without this previously, it is recommended to drop the system page cache before using this\n"
+        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+            else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ));
+    add_opt(llama_arg(
+        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
+        "number of layers to store in VRAM",
+        [](gpt_params & params, int value) {
+            params.n_gpu_layers = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
+        }
+    ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+    add_opt(llama_arg(
+        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+        "number of layers to store in VRAM for the draft model",
+        [](gpt_params & params, int value) {
+            params.n_gpu_layers_draft = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-sm", "--split-mode"}, "{none,layer,row}",
+        "how to split the model across multiple GPUs, one of:\n"
+        "- none: use one GPU only\n"
+        "- layer (default): split layers and KV across GPUs\n"
+        "- row: split rows across GPUs",
+        [](gpt_params & params, const std::string & value) {
+            std::string arg_next = value;
+            if (arg_next == "none") {
+                params.split_mode = LLAMA_SPLIT_MODE_NONE;
+            } else if (arg_next == "layer") {
+                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+            }
+            else if (arg_next == "row") {
+#ifdef GGML_USE_SYCL
+                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
+                exit(1);
+#endif // GGML_USE_SYCL
+                params.split_mode = LLAMA_SPLIT_MODE_ROW;
+            }
+            else {
+                throw std::invalid_argument("invalid value");
+            }
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
+        }
+    ));
+    add_opt(llama_arg(
+        {"-ts", "--tensor-split"}, "N0,N1,N2,...",
+        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
+        [](gpt_params & params, const std::string & value) {
+            std::string arg_next = value;
 
-    if (llama_supports_mlock()) {
-        options.push_back({ "*",           "       --mlock",                "force system to keep model in RAM rather than swapping or compressing" });
-    }
-    if (llama_supports_mmap()) {
-        options.push_back({ "*",           "       --no-mmap",              "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
-    }
-    options.push_back({ "*",           "       --numa TYPE",            "attempt optimizations that help on some NUMA systems\n"
-                                                                        "  - distribute: spread execution evenly over all nodes\n"
-                                                                        "  - isolate: only spawn threads on CPUs on the node that execution started on\n"
-                                                                        "  - numactl: use the CPU map provided by numactl\n"
-                                                                        "if run without this previously, it is recommended to drop the system page cache before using this\n"
-                                                                        "see https://github.com/ggerganov/llama.cpp/issues/1437" });
-
-    if (llama_supports_gpu_offload()) {
-        options.push_back({ "*",           "-ngl,  --gpu-layers N",
-                                                                        "number of layers to store in VRAM" });
-        options.push_back({ "*",           "-ngld, --gpu-layers-draft N",
-                                                                        "number of layers to store in VRAM for the draft model" });
-        options.push_back({ "*",           "-sm,   --split-mode SPLIT_MODE",
-                                                                        "how to split the model across multiple GPUs, one of:\n"
-                                                                        "  - none: use one GPU only\n"
-                                                                        "  - layer (default): split layers and KV across GPUs\n"
-                                                                        "  - row: split rows across GPUs" });
-        options.push_back({ "*",           "-ts,   --tensor-split SPLIT",
-                                                                        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
-        options.push_back({ "*",           "-mg,   --main-gpu i",       "the GPU to use for the model (with split-mode = none),\n"
-                                                                        "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
-    }
-
-    options.push_back({ "model" });
-    options.push_back({ "*",           "       --check-tensors",        "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
-    options.push_back({ "*",           "       --override-kv KEY=TYPE:VALUE",
-                                                                        "advanced option to override model metadata by key. may be specified multiple times.\n"
-                                                                        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
-    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "       --control-vector FNAME", "add a control vector\n"
-                                                                        "note: this argument can be repeated to add multiple control vectors" });
-    options.push_back({ "*",           "       --control-vector-scaled FNAME SCALE",
-                                                                        "add a control vector with user defined scaling SCALE\n"
-                                                                        "note: this argument can be repeated to add multiple scaled control vectors" });
-    options.push_back({ "*",           "       --control-vector-layer-range START END",
-                                                                        "layer range to apply the control vector(s) to, start and end inclusive" });
-    options.push_back({ "*",           "-m,    --model FNAME",          "model path (default: models/$filename with filename from --hf-file\n"
-                                                                        "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
-    options.push_back({ "*",           "-md,   --model-draft FNAME",    "draft model for speculative decoding (default: unused)" });
-    options.push_back({ "*",           "-mu,   --model-url MODEL_URL",  "model download url (default: unused)" });
-    options.push_back({ "*",           "-hfr,  --hf-repo REPO",         "Hugging Face model repository (default: unused)" });
-    options.push_back({ "*",           "-hff,  --hf-file FILE",         "Hugging Face model file (default: unused)" });
-    options.push_back({ "*",           "-hft,  --hf-token TOKEN",       "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
-
-    options.push_back({ "retrieval" });
-    options.push_back({ "retrieval",   "       --context-file FNAME",   "file to load context from (repeat to specify multiple files)" });
-    options.push_back({ "retrieval",   "       --chunk-size N",         "minimum length of embedded text chunks (default: %d)", params.chunk_size });
-    options.push_back({ "retrieval",   "       --chunk-separator STRING",
-                                                                        "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
-
-    options.push_back({ "passkey" });
-    options.push_back({ "passkey",     "       --junk N",               "number of times to repeat the junk text (default: %d)", params.n_junk });
-    options.push_back({ "passkey",     "       --pos N",                "position of the passkey in the junk text (default: %d)", params.i_pos });
-
-    options.push_back({ "imatrix" });
-    options.push_back({ "imatrix",     "-o,    --output FNAME",         "output file (default: '%s')", params.out_file.c_str() });
-    options.push_back({ "imatrix",     "       --output-frequency N",   "output the imatrix every N iterations (default: %d)", params.n_out_freq });
-    options.push_back({ "imatrix",     "       --save-frequency N",     "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
-    options.push_back({ "imatrix",     "       --process-output",       "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
-    options.push_back({ "imatrix",     "       --no-ppl",               "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
-    options.push_back({ "imatrix",     "       --chunk N",              "start processing the input from chunk N (default: %d)", params.i_chunk });
-
-    options.push_back({ "bench" });
-    options.push_back({ "bench",       "-pps",                          "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
-    options.push_back({ "bench",       "-npp n0,n1,...",                "number of prompt tokens" });
-    options.push_back({ "bench",       "-ntg n0,n1,...",                "number of text generation tokens" });
-    options.push_back({ "bench",       "-npl n0,n1,...",                "number of parallel prompts" });
-
-    options.push_back({ "embedding" });
-    options.push_back({ "embedding",   "       --embd-normalize",       "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize });
-    options.push_back({ "embedding",   "       --embd-output-format",   "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" });
-    options.push_back({ "embedding",   "       --embd-separator",       "separator of embendings (default \\n) for example \"<#sep#>\"" });
-
-    options.push_back({ "server" });
-    options.push_back({ "server",      "       --host HOST",            "ip address to listen (default: %s)", params.hostname.c_str() });
-    options.push_back({ "server",      "       --port PORT",            "port to listen (default: %d)", params.port });
-    options.push_back({ "server",      "       --path PATH",            "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server",      "       --embedding(s)",         "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
-    options.push_back({ "server",      "       --api-key KEY",          "API key to use for authentication (default: none)" });
-    options.push_back({ "server",      "       --api-key-file FNAME",   "path to file containing API keys (default: none)" });
-    options.push_back({ "server",      "       --ssl-key-file FNAME",   "path to file a PEM-encoded SSL private key" });
-    options.push_back({ "server",      "       --ssl-cert-file FNAME",  "path to file a PEM-encoded SSL certificate" });
-    options.push_back({ "server",      "       --timeout N",            "server read/write timeout in seconds (default: %d)", params.timeout_read });
-    options.push_back({ "server",      "       --threads-http N",       "number of threads used to process HTTP requests (default: %d)", params.n_threads_http });
-    options.push_back({ "server",      "       --system-prompt-file FNAME",
-                                                                        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
-    options.push_back({ "server",      "       --log-format {text,json}",
-                                                                        "log output format: json or text (default: json)" });
-    options.push_back({ "server",      "       --metrics",              "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
-    options.push_back({ "server",      "       --no-slots",             "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
-    options.push_back({ "server",      "       --slot-save-path PATH",  "path to save slot kv cache (default: disabled)" });
-    options.push_back({ "server",      "       --chat-template JINJA_TEMPLATE",
-                                                                        "set custom jinja chat template (default: template taken from model's metadata)\n"
-                                                                        "only commonly used templates are accepted:\n"
-                                                                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
-    options.push_back({ "server",      "-sps,  --slot-prompt-similarity SIMILARITY",
-                                                                        "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
-    options.push_back({ "server",      "       --lora-init-without-apply",     "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                );
+            }
+            for (size_t i = 0; i < llama_max_devices(); ++i) {
+                if (i < split_arg.size()) {
+                        params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                        params.tensor_split[i] = 0.0f;
+                }
+            }
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
+        }
+    ));
+    add_opt(llama_arg(
+        {"-mg", "--main-gpu"}, "INDEX",
+        format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
+        [](gpt_params & params, int value) {
+            params.main_gpu = value;
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
+        }
+    ));
+    add_opt(llama_arg(
+        {"--check-tensors"},
+        format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
+        [](gpt_params & params) {
+            params.check_tensors = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--override-kv"}, "KEY=TYPE:VALUE",
+        "advanced option to override model metadata by key. may be specified multiple times.\n"
+        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+        [](gpt_params & params, const std::string & value) {
+            if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
+                throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str()));
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--lora"}, "FNAME",
+        "path to LoRA adapter (can be repeated to use multiple adapters)",
+        [](gpt_params & params, const std::string & value) {
+            params.lora_adapters.push_back({ std::string(value), 1.0 });
+        }
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
+    add_opt(llama_arg(
+        {"--lora-scaled"}, "FNAME", "SCALE",
+        "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
+        [](gpt_params & params, const std::string & fname, const std::string & scale) {
+            params.lora_adapters.push_back({ fname, std::stof(scale) });
+        }
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
+    add_opt(llama_arg(
+        {"--control-vector"}, "FNAME",
+        "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
+        [](gpt_params & params, const std::string & value) {
+            params.control_vectors.push_back({ 1.0f, value, });
+        }
+    ));
+    add_opt(llama_arg(
+        {"--control-vector-scaled"}, "FNAME", "SCALE",
+        "add a control vector with user defined scaling SCALE\n"
+        "note: this argument can be repeated to add multiple scaled control vectors",
+        [](gpt_params & params, const std::string & fname, const std::string & scale) {
+            params.control_vectors.push_back({ std::stof(scale), fname });
+        }
+    ));
+    add_opt(llama_arg(
+        {"--control-vector-layer-range"}, "START", "END",
+        "layer range to apply the control vector(s) to, start and end inclusive",
+        [](gpt_params & params, const std::string & start, const std::string & end) {
+            params.control_vector_layer_start = std::stoi(start);
+            params.control_vector_layer_end = std::stoi(end);
+        }
+    ));
+    add_opt(llama_arg(
+        {"-a", "--alias"}, "STRING",
+        "set alias for model name (to be used by REST API)",
+        [](gpt_params & params, const std::string & value) {
+            params.model_alias = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"-m", "--model"}, "FNAME",
+        ex == LLAMA_EXAMPLE_EXPORT_LORA
+            ? std::string("model path from which to load base model")
+            : format(
+                "model path (default: `models/$filename` with filename from `--hf-file` "
+                "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
+            ),
+        [](gpt_params & params, const std::string & value) {
+            params.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
+    add_opt(llama_arg(
+        {"-md", "--model-draft"}, "FNAME",
+        "draft model for speculative decoding (default: unused)",
+        [](gpt_params & params, const std::string & value) {
+            params.model_draft = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-mu", "--model-url"}, "MODEL_URL",
+        "model download url (default: unused)",
+        [](gpt_params & params, const std::string & value) {
+            params.model_url = value;
+        }
+    ).set_env("LLAMA_ARG_MODEL_URL"));
+    add_opt(llama_arg(
+        {"-hfr", "--hf-repo"}, "REPO",
+        "Hugging Face model repository (default: unused)",
+        [](gpt_params & params, const std::string & value) {
+            params.hf_repo = value;
+        }
+    ).set_env("LLAMA_ARG_HF_REPO"));
+    add_opt(llama_arg(
+        {"-hff", "--hf-file"}, "FILE",
+        "Hugging Face model file (default: unused)",
+        [](gpt_params & params, const std::string & value) {
+            params.hf_file = value;
+        }
+    ).set_env("LLAMA_ARG_HF_FILE"));
+    add_opt(llama_arg(
+        {"-hft", "--hf-token"}, "TOKEN",
+        "Hugging Face access token (default: value from HF_TOKEN environment variable)",
+        [](gpt_params & params, const std::string & value) {
+            params.hf_token = value;
+        }
+    ).set_env("HF_TOKEN"));
+    add_opt(llama_arg(
+        {"--context-file"}, "FNAME",
+        "file to load context from (repeat to specify multiple files)",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream file(value, std::ios::binary);
+            if (!file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            params.context_files.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+    add_opt(llama_arg(
+        {"--chunk-size"}, "N",
+        format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
+        [](gpt_params & params, int value) {
+            params.chunk_size = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+    add_opt(llama_arg(
+        {"--chunk-separator"}, "STRING",
+        format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.chunk_separator = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+    add_opt(llama_arg(
+        {"--junk"}, "N",
+        format("number of times to repeat the junk text (default: %d)", params.n_junk),
+        [](gpt_params & params, int value) {
+            params.n_junk = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    add_opt(llama_arg(
+        {"--pos"}, "N",
+        format("position of the passkey in the junk text (default: %d)", params.i_pos),
+        [](gpt_params & params, int value) {
+            params.i_pos = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    add_opt(llama_arg(
+        {"-o", "--output", "--output-file"}, "FNAME",
+        format("output file (default: '%s')",
+            ex == LLAMA_EXAMPLE_EXPORT_LORA
+                ? params.lora_outfile.c_str()
+                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
+                    ? params.cvector_outfile.c_str()
+                    : params.out_file.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.out_file = value;
+            params.cvector_outfile = value;
+            params.lora_outfile = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    add_opt(llama_arg(
+        {"-ofreq", "--output-frequency"}, "N",
+        format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
+        [](gpt_params & params, int value) {
+            params.n_out_freq = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"--save-frequency"}, "N",
+        format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
+        [](gpt_params & params, int value) {
+            params.n_save_freq = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"--process-output"},
+        format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
+        [](gpt_params & params) {
+            params.process_output = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"--no-ppl"},
+        format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](gpt_params & params) {
+            params.compute_ppl = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"--chunk", "--from-chunk"}, "N",
+        format("start processing the input from chunk N (default: %d)", params.i_chunk),
+        [](gpt_params & params, int value) {
+            params.i_chunk = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"-pps"},
+        format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
+        [](gpt_params & params) {
+            params.is_pp_shared = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"-npp"}, "n0,n1,...",
+        "number of prompt tokens",
+        [](gpt_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"-ntg"}, "n0,n1,...",
+        "number of text generation tokens",
+        [](gpt_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"-npl"}, "n0,n1,...",
+        "number of parallel prompts",
+        [](gpt_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"--embd-normalize"}, "N",
+        format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+        [](gpt_params & params, int value) {
+            params.embd_normalize = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--embd-output-format"}, "FORMAT",
+        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
+        [](gpt_params & params, const std::string & value) {
+            params.embd_out = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--embd-separator"}, "STRING",
+        "separator of embendings (default \\n) for example \"<#sep#>\"",
+        [](gpt_params & params, const std::string & value) {
+            params.embd_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--host"}, "HOST",
+        format("ip address to listen (default: %s)", params.hostname.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.hostname = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
+    add_opt(llama_arg(
+        {"--port"}, "PORT",
+        format("port to listen (default: %d)", params.port),
+        [](gpt_params & params, int value) {
+            params.port = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+    add_opt(llama_arg(
+        {"--path"}, "PATH",
+        format("path to serve static files from (default: %s)", params.public_path.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.public_path = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--embedding", "--embeddings"},
+        format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    add_opt(llama_arg(
+        {"--api-key"}, "KEY",
+        "API key to use for authentication (default: none)",
+        [](gpt_params & params, const std::string & value) {
+            params.api_keys.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
+    add_opt(llama_arg(
+        {"--api-key-file"}, "FNAME",
+        "path to file containing API keys (default: none)",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream key_file(value);
+            if (!key_file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::string key;
+            while (std::getline(key_file, key)) {
+                if (!key.empty()) {
+                        params.api_keys.push_back(key);
+                }
+            }
+            key_file.close();
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--ssl-key-file"}, "FNAME",
+        "path to file a PEM-encoded SSL private key",
+        [](gpt_params & params, const std::string & value) {
+            params.ssl_file_key = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--ssl-cert-file"}, "FNAME",
+        "path to file a PEM-encoded SSL certificate",
+        [](gpt_params & params, const std::string & value) {
+            params.ssl_file_cert = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"-to", "--timeout"}, "N",
+        format("server read/write timeout in seconds (default: %d)", params.timeout_read),
+        [](gpt_params & params, int value) {
+            params.timeout_read  = value;
+            params.timeout_write = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--threads-http"}, "N",
+        format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
+        [](gpt_params & params, int value) {
+            params.n_threads_http = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+    add_opt(llama_arg(
+        {"-spf", "--system-prompt-file"}, "FNAME",
+        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::string system_prompt;
+            std::copy(
+                        std::istreambuf_iterator<char>(file),
+                        std::istreambuf_iterator<char>(),
+                        std::back_inserter(system_prompt)
+                        );
+            params.system_prompt = system_prompt;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--log-format"}, "{text, json}",
+        "log output format: json or text (default: json)",
+        [](gpt_params & params, const std::string & value) {
+            if (value == "json") {
+                params.log_json = true;
+            } else if (value == "text") {
+                params.log_json = false;
+            } else {
+                throw std::invalid_argument("invalid value");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--metrics"},
+        format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_metrics = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
+    add_opt(llama_arg(
+        {"--no-slots"},
+        format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_slots = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
+    add_opt(llama_arg(
+        {"--slot-save-path"}, "PATH",
+        "path to save slot kv cache (default: disabled)",
+        [](gpt_params & params, const std::string & value) {
+            params.slot_save_path = value;
+            // if doesn't end with DIRECTORY_SEPARATOR, add it
+            if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+                params.slot_save_path += DIRECTORY_SEPARATOR;
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--chat-template"}, "JINJA_TEMPLATE",
+        "set custom jinja chat template (default: template taken from model's metadata)\n"
+        "if suffix/prefix are specified, template will be disabled\n"
+        "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
+        [](gpt_params & params, const std::string & value) {
+            if (!llama_chat_verify_template(value)) {
+                throw std::runtime_error(format(
+                    "error: the supplied chat template is not supported: %s\n"
+                    "note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
+                    value.c_str()
+                ));
+            }
+            params.chat_template = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    add_opt(llama_arg(
+        {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
+        format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+        [](gpt_params & params, const std::string & value) {
+            params.slot_prompt_similarity = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--lora-init-without-apply"},
+        format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.lora_init_without_apply = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--simple-io"},
+        "use basic IO for better compatibility in subprocesses and limited consoles",
+        [](gpt_params & params) {
+            params.simple_io = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    add_opt(llama_arg(
+        {"-ld", "--logdir"}, "LOGDIR",
+        "path under which to save YAML logs (no logging if unset)",
+        [](gpt_params & params, const std::string & value) {
+            params.logdir = value;
 
+            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
+                params.logdir += DIRECTORY_SEPARATOR;
+            }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--positive-file"}, "FNAME",
+        format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.cvector_positive_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(llama_arg(
+        {"--negative-file"}, "FNAME",
+        format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.cvector_negative_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(llama_arg(
+        {"--pca-batch"}, "N",
+        format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
+        [](gpt_params & params, int value) {
+            params.n_pca_batch = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(llama_arg(
+        {"--pca-iter"}, "N",
+        format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
+        [](gpt_params & params, int value) {
+            params.n_pca_iterations = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(llama_arg(
+        {"--method"}, "{pca, mean}",
+        "dimensionality reduction method to be used (default: pca)",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+            else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+    add_opt(llama_arg(
+        {"--output-format"}, "{md,jsonl}",
+        "output format for batched-bench results (default: md)",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
+            else if (value == "md") { params.batched_bench_output_jsonl = false; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
 #ifndef LOG_DISABLE_LOGS
-    options.push_back({ "logging" });
-    options.push_back({ "*",           "       --simple-io",            "use basic IO for better compatibility in subprocesses and limited consoles" });
-    options.push_back({ "*",           "-ld,   --logdir LOGDIR",        "path under which to save YAML logs (no logging if unset)" });
-    options.push_back({ "logging",     "       --log-test",             "Run simple logging test" });
-    options.push_back({ "logging",     "       --log-disable",          "Disable trace logs" });
-    options.push_back({ "logging",     "       --log-enable",           "Enable trace logs" });
-    options.push_back({ "logging",     "       --log-file FNAME",       "Specify a log filename (without extension)" });
-    options.push_back({ "logging",     "       --log-new",              "Create a separate new log file on start. "
-                                                                        "Each log file will have unique name: \"<name>.<ID>.log\"" });
-    options.push_back({ "logging",     "       --log-append",           "Don't truncate the old log file." });
+    // TODO: make this looks less weird
+    add_opt(llama_arg(
+        {"--log-test"},
+        "Log test",
+        [](gpt_params &) { log_param_single_parse("--log-test"); }
+    ));
+    add_opt(llama_arg(
+        {"--log-disable"},
+        "Log disable",
+        [](gpt_params &) { log_param_single_parse("--log-disable"); }
+    ));
+    add_opt(llama_arg(
+        {"--log-enable"},
+        "Log enable",
+        [](gpt_params &) { log_param_single_parse("--log-enable"); }
+    ));
+    add_opt(llama_arg(
+        {"--log-new"},
+        "Log new",
+        [](gpt_params &) { log_param_single_parse("--log-new"); }
+    ));
+    add_opt(llama_arg(
+        {"--log-append"},
+        "Log append",
+        [](gpt_params &) { log_param_single_parse("--log-append"); }
+    ));
+    add_opt(llama_arg(
+        {"--log-file"}, "FNAME",
+        "Log file",
+        [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); }
+    ));
 #endif // LOG_DISABLE_LOGS
 
-    options.push_back({ "cvector" });
-    options.push_back({ "cvector",     "-o,    --output FNAME",         "output file (default: '%s')", params.cvector_outfile.c_str() });
-    options.push_back({ "cvector",     "       --positive-file FNAME",  "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
-    options.push_back({ "cvector",     "       --negative-file FNAME",  "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
-    options.push_back({ "cvector",     "       --pca-batch N",          "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
-    options.push_back({ "cvector",     "       --pca-iter N",           "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
-    options.push_back({ "cvector",     "       --method {pca,mean}",    "dimensionality reduction method to be used (default: pca)" });
-
-    options.push_back({ "export-lora" });
-    options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
-    options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
-    options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
-    options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
-
-    options.push_back({ "batched-bench" });
-    options.push_back({ "batched-bench", "       --output-format {md,jsonl}", "output format for batched-bench results (default: md)" });
-
-    printf("usage: %s [options]\n", argv[0]);
-
-    for (const auto & o : options) {
-        if (!o.grp.empty()) {
-            printf("\n%s:\n\n", o.grp.c_str());
-            continue;
-        }
-        printf("  %-32s", o.args.c_str());
-        if (o.args.length() > 30) {
-            printf("\n%34s", "");
-        }
-
-        const auto desc = o.desc;
-        size_t start = 0;
-        size_t end = desc.find('\n');
-        while (end != std::string::npos) {
-            printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
-            start = end + 1;
-            end = desc.find('\n', start);
-        }
-
-        printf("%s\n", desc.substr(start).c_str());
-    }
-    printf("\n");
+    return options;
 }
 
 std::string gpt_params_get_system_info(const gpt_params & params) {
@@ -2515,10 +2734,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         llama_token bos = llama_token_bos(model);
         llama_token eos = llama_token_eos(model);
         // some models (e.g. T5) don't have a BOS token
-        if (bos != -1) {
+        if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
         }
-        tmp.push_back(eos);
+        if (eos != LLAMA_TOKEN_NULL) {
+            tmp.push_back(eos);
+        }
+        if (tmp.empty()) {
+            tmp.push_back(0);
+        }
 
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
diff --git a/common/common.h b/common/common.h
index 224528d48..480e597a8 100644
--- a/common/common.h
+++ b/common/common.h
@@ -14,8 +14,10 @@
 #include <vector>
 #include <random>
 #include <thread>
+#include <set>
 #include <unordered_map>
 #include <tuple>
+#include <functional>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -57,6 +59,25 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
 
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
@@ -73,6 +94,8 @@ struct cpu_params {
 };
 
 struct gpt_params {
+    enum llama_example curr_ex    = LLAMA_EXAMPLE_COMMON;
+
     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_predict             =    -1; // new tokens to predict
@@ -193,6 +216,7 @@ struct gpt_params {
 
     bool   kl_divergence    = false; // compute KL divergence
 
+    std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
     bool usage             = false; // print usage
     bool use_color         = false; // use color to distinguish generations and inputs
     bool special           = false; // enable special token output
@@ -214,7 +238,6 @@ struct gpt_params {
     bool use_mlock         = false; // use mlock to keep model in memory
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
-    bool infill            = false; // use infill mode
     bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload     = false; // disable KV offloading
     bool warmup            = true;  // warmup run
@@ -303,13 +326,91 @@ struct gpt_params {
     bool batched_bench_output_jsonl = false;
 };
 
-void gpt_params_parse_from_env(gpt_params & params);
-void gpt_params_handle_model_default(gpt_params & params);
+struct llama_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::vector<const char *> args;
+    const char * value_hint   = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env          = nullptr;
+    std::string help;
+    void (*handler_void)   (gpt_params & params) = nullptr;
+    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (gpt_params & params, int) = nullptr;
 
-bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(gpt_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
+        this->examples = std::move(examples);
+        return *this;
+    }
+
+    llama_arg & set_env(const char * env) {
+        help = help + "\n(env: " + env + ")";
+        this->env = env;
+        return *this;
+    }
+
+    bool in_example(enum llama_example ex) {
+        return examples.find(ex) != examples.end();
+    }
+
+    bool get_value_from_env(std::string & output) const {
+        if (env == nullptr) return false;
+        char * value = std::getenv(env);
+        if (value) {
+            output = value;
+            return true;
+        }
+        return false;
+    }
+
+    bool has_value_from_env() const {
+        return env != nullptr && std::getenv(env);
+    }
+
+    std::string to_string();
+};
+
+// initialize list of options (arguments) that can be used by the current example
+std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
+// optionally, we can provide "print_usage" to print example usage
+std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool gpt_params_parse   (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
+
+// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
+void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
index c81b4d233..7806b77e0 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -145,7 +145,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
         /* .params = */ params,
         /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
         /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(params.n_prev),
+        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur    = */ {},
         /* .cur_p  = */ {},
     };
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index b043c74cc..f3b0c433b 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -28,9 +28,7 @@ static std::vector<int> parse_list(char * p) {
     return ret;
 }
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
     LOG_TEE("\n");
@@ -39,8 +37,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index f321f6104..f5f309022 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -6,9 +6,7 @@
 #include <string>
 #include <vector>
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
     LOG_TEE("\n");
@@ -20,8 +18,8 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index a68268388..0795175a1 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -35,9 +35,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     return ret;
 }
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     printf("\nexample usage:\n");
     printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
     printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@@ -390,8 +388,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index ebe58b13b..7a141f953 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -80,8 +80,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index aea15c864..881111ffd 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -144,8 +144,8 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 8df457e21..544e7fff6 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -391,9 +391,7 @@ struct lora_merge_ctx {
     }
 };
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     printf("\nexample usage:\n");
     printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
     printf("\nNOTE: output model is F16\n");
@@ -403,8 +401,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt
new file mode 100644
index 000000000..c94cda776
--- /dev/null
+++ b/examples/gen-docs/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-gen-docs)
+add_executable(${TARGET} gen-docs.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp
new file mode 100644
index 000000000..8b1dafd63
--- /dev/null
+++ b/examples/gen-docs/gen-docs.cpp
@@ -0,0 +1,51 @@
+#include "common.h"
+
+#include <fstream>
+#include <string>
+
+// Export usage message (-h) to markdown format
+
+static void export_md(std::string fname, llama_example ex) {
+    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
+
+    gpt_params params;
+    auto options = gpt_params_parser_init(params, ex);
+
+    file << "| Argument | Explanation |\n";
+    file << "| -------- | ----------- |\n";
+    for (auto & opt : options) {
+        file << "| `";
+        // args
+        for (const auto & arg : opt.args) {
+        if (arg == opt.args.front()) {
+                file << arg;
+                if (opt.args.size() > 1) file << ", ";
+            } else {
+                file << arg << (arg != opt.args.back() ? ", " : "");
+            }
+        }
+        // value hint
+        if (opt.value_hint) {
+            std::string md_value_hint(opt.value_hint);
+            string_replace_all(md_value_hint, "|", "\\|");
+            file << " " << md_value_hint;
+        }
+        if (opt.value_hint_2) {
+            std::string md_value_hint_2(opt.value_hint_2);
+            string_replace_all(md_value_hint_2, "|", "\\|");
+            file << " " << md_value_hint_2;
+        }
+        // help text
+        std::string md_help(opt.help);
+        string_replace_all(md_help, "\n", "<br/>");
+        string_replace_all(md_help, "|", "\\|");
+        file << "` | " << md_help << " |\n";
+    }
+}
+
+int main(int, char **) {
+    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
+    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+
+    return 0;
+}
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 4e801c69d..e1efbf573 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -154,8 +154,8 @@ static std::string gritlm_instruction(const std::string & instruction) {
 int main(int argc, char * argv[]) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 635bd9849..c310c229a 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -18,9 +18,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s \\\n"
             "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
@@ -580,8 +578,8 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     params.verbosity = 1;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
index 921793751..06ec160c2 100644
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     return env->NewStringUTF(result.str().c_str());
 }
 
-extern "C"
-JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-}
-
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
@@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
     return reinterpret_cast<jlong>(batch);
 }
 
+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+}
+
+extern "C"
+JNIEXPORT jlong JNICALL
+Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = true;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    return reinterpret_cast<jlong>(smpl);
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
+    llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
+}
+
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
@@ -380,14 +397,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
-        jlong sampling_pointer,
         jlong batch_pointer,
+        jlong sampler_pointer,
         jint n_len,
         jobject intvar_ncur
 ) {
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto sampling = reinterpret_cast<llama_sampler *>(sampling_pointer);
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    const auto batch   = reinterpret_cast<llama_batch   *>(batch_pointer);
+    const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
     const auto model = llama_get_model(context);
 
     if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
@@ -395,9 +412,9 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
 
     // sample the most likely token
-    const auto new_token_id = llama_sampler_sample(sampling, context, batch->n_tokens - 1);
+    const auto new_token_id = llama_sampler_sample(sampler, context, -1);
 
-    llama_sampler_accept(sampling, new_token_id);
+    llama_sampler_accept(sampler, new_token_id);
 
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
diff --git a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
index 6c63e54e0..cf520e459 100644
--- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -45,8 +45,10 @@ class LLamaAndroid {
     private external fun free_context(context: Long)
     private external fun backend_init(numa: Boolean)
     private external fun backend_free()
-    private external fun free_batch(batch: Long)
     private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
+    private external fun free_batch(batch: Long)
+    private external fun new_sampler(): Long
+    private external fun free_sampler(sampler: Long)
     private external fun bench_model(
         context: Long,
         model: Long,
@@ -69,6 +71,7 @@ class LLamaAndroid {
     private external fun completion_loop(
         context: Long,
         batch: Long,
+        sampler: Long,
         nLen: Int,
         ncur: IntVar
     ): String?
@@ -101,8 +104,11 @@ class LLamaAndroid {
                     val batch = new_batch(512, 0, 1)
                     if (batch == 0L) throw IllegalStateException("new_batch() failed")
 
+                    val sampler = new_sampler()
+                    if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
+
                     Log.i(tag, "Loaded model $pathToModel")
-                    threadLocalState.set(State.Loaded(model, context, batch))
+                    threadLocalState.set(State.Loaded(model, context, batch, sampler))
                 }
                 else -> throw IllegalStateException("Model already loaded")
             }
@@ -114,7 +120,7 @@ class LLamaAndroid {
             is State.Loaded -> {
                 val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                 while (ncur.value <= nlen) {
-                    val str = completion_loop(state.context, state.batch, nlen, ncur)
+                    val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
                     if (str == null) {
                         break
                     }
@@ -138,6 +144,7 @@ class LLamaAndroid {
                     free_context(state.context)
                     free_model(state.model)
                     free_batch(state.batch)
+                    free_sampler(state.sampler);
 
                     threadLocalState.set(State.Idle)
                 }
@@ -161,7 +168,7 @@ class LLamaAndroid {
 
         private sealed interface State {
             data object Idle: State
-            data class Loaded(val model: Long, val context: Long, val batch: Long): State
+            data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
         }
 
         // Enforce only one instance of Llm.
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 4d7ccc91f..5845d0106 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -112,9 +112,7 @@ struct llava_context {
     struct llama_model * model = NULL;
 };
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\n example usage:\n");
     LOG_TEE("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
@@ -280,8 +278,8 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
@@ -293,7 +291,7 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS
 
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv, {});
+        print_usage(argc, argv);
         return 1;
     }
     auto model = llava_init(&params);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 237da9429..57e7d42c5 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -253,8 +253,8 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
@@ -266,7 +266,6 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS
 
     if (params.mmproj.empty() || (params.image.empty())) {
-        gpt_params_print_usage(argc, argv, params);
         show_additional_info(argc, argv);
         return 1;
     }
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index c2e931c65..5027a483a 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -36,8 +36,8 @@ struct ngram_container {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp
index 5f04709f5..795b06c88 100644
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -13,8 +13,8 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 400f3e0b0..93299ef8b 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -15,8 +15,8 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 071400b7e..9ac7f6b47 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -12,8 +12,8 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7fe673bd6..f443360eb 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -42,6 +42,13 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting  = false;
 static bool need_insert_eot = false;
 
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    printf("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    printf("\n");
+}
+
 static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
@@ -132,9 +139,9 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 22dc97d54..2822334a5 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -102,8 +102,8 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index ff8d0302f..76d235c2c 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -6,9 +6,7 @@
 #include <string>
 #include <vector>
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
     LOG_TEE("\n");
@@ -21,8 +19,8 @@ int main(int argc, char ** argv) {
     params.n_keep = 32;
     params.i_pos  = -1;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b606e5464..420dbb813 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1968,8 +1968,8 @@ int main(int argc, char ** argv) {
     params.n_ctx = 512;
     params.logits_all = true;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
index 5d1e11c67..704f0d56b 100644
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
 
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
 
+The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
+
 *(outdated)*
 
 | Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 7eb947650..dd8a82e6e 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -4,9 +4,7 @@
 #include <algorithm>
 #include <fstream>
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
     LOG_TEE("\n");
@@ -113,8 +111,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index d86292b22..9e3fad60a 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -11,8 +11,8 @@ int main(int argc, char ** argv) {
     params.prompt = "The quick brown fox";
     params.sparams.seed = 1234;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 95f3706e1..d3dce1294 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2424,14 +2424,11 @@ int main(int argc, char ** argv) {
     // own arguments required by this example
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
-    // parse arguments from environment variables
-    gpt_params_parse_from_env(params);
-
     // TODO: not great to use extern vars
     server_log_json = params.log_json;
     server_verbose = params.verbosity > 0;
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
index 6f163ce04..e1eade6cd 100644
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -9,8 +9,11 @@ Feature: llama.cpp server
     And   a model alias bert-bge-small
     And   42 as server seed
     And   2 slots
-    And   1024 as batch size
-    And   1024 as ubatch size
+    # the bert-bge-small model has context size of 512
+    # since the generated prompts are as big as the batch size, we need to set the batch size to 512
+    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
+    And   512 as batch size
+    And   512 as ubatch size
     And   2048 KV cache size
     And   embeddings extraction
     Then  the server is starting
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 8a0ad43ad..a53cef547 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -6,9 +6,7 @@
 #include <string>
 #include <vector>
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
+static void print_usage(int, char ** argv) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
     LOG_TEE("\n");
@@ -20,8 +18,8 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 9d1c3f662..8df587fee 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -29,8 +29,8 @@ struct seq_draft {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
+    if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
 
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 9b0e32718..44b6eabd4 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -687,8 +687,8 @@ extern "C" {
 
     struct ggml_hash_set {
         size_t size;
-        ggml_bitset_t * used;
-        struct ggml_tensor ** keys;
+        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
     };
 
     // computation graph
@@ -1278,7 +1278,7 @@ extern "C" {
             size_t                nb1,
             size_t                nb2,
             size_t                nb3,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_inplace(
@@ -1288,19 +1288,19 @@ extern "C" {
             size_t                nb1,
             size_t                nb2,
             size_t                nb3,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     GGML_API struct ggml_tensor * ggml_set_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     GGML_API struct ggml_tensor * ggml_set_1d_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set_2d(
@@ -1308,7 +1308,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             size_t                nb1,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@@ -1316,7 +1316,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
             size_t                nb1,
-            size_t                offset);
+            size_t                offset); // in bytes
 
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 96b93d2d8..b5d9301a7 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        case GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
         default:
             return true;
     }
diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile
index 2b009e8f9..3290a4859 100644
--- a/ggml/src/ggml-cann/Doxyfile
+++ b/ggml/src/ggml-cann/Doxyfile
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = "llama.cpp"
+PROJECT_NAME           = "ggml"
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -44,7 +44,7 @@ PROJECT_NUMBER         =
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.
 
-PROJECT_BRIEF          = "llama inference engine"
+PROJECT_BRIEF          = "Tensor library for machine learning"
 
 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index faf301964..bdd98487f 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -29,6 +29,7 @@ bool g_mul_mat_q = false;
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/scale.cuh"
 #include "ggml-cuda/softmax.cuh"
+#include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
@@ -2184,6 +2185,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             ggml_cuda_dup(ctx, dst);
             break;
         case GGML_OP_ADD:
+        case GGML_OP_ADD1: // TODO: more efficient implementation
             ggml_cuda_op_add(ctx, dst);
             break;
         case GGML_OP_SUB:
@@ -2200,6 +2202,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(dst)) {
+                case GGML_UNARY_OP_NEG:
+                    ggml_cuda_op_neg(ctx, dst);
+                    break;
                 case GGML_UNARY_OP_GELU:
                     ggml_cuda_op_gelu(ctx, dst);
                     break;
@@ -2308,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_POOL_2D:
             ggml_cuda_op_pool2d(ctx, dst);
             break;
+        case GGML_OP_SUM:
+            ggml_cuda_op_sum(ctx, dst);
+            break;
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
@@ -2752,6 +2760,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_NEG:
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
@@ -2881,6 +2890,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
         case GGML_OP_ADD:
+        case GGML_OP_ADD1:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -2891,14 +2901,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_SIN:
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
+            return true;
         case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
+            return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_2D:
+        case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu
index a14043e70..5575a90f6 100644
--- a/ggml/src/ggml-cuda/cross-entropy-loss.cu
+++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu
@@ -1,6 +1,6 @@
 #include "common.cuh"
 #include "cross-entropy-loss.cuh"
-#include "sumrows.cuh"
+#include "sum.cuh"
 
 #include <cmath>
 #include <cstdint>
@@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
     cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
 
     // Combine results from individual blocks:
-    sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
+    sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
 }
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index f87f33b3e..f28a19d40 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -152,7 +152,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
     }                                                                       \
 
 static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[1];
+    ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];
 
@@ -227,7 +227,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     }                                                                       \
 
 static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[1];
+    ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];
 
diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu
new file mode 100644
index 000000000..0d5e953ee
--- /dev/null
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -0,0 +1,41 @@
+#include "sumrows.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+    size_t tmp_size = 0;
+    DeviceReduce::Sum(nullptr,       tmp_size, x, dst, ne, stream);
+    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+    DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
+#else
+    // Use (inefficient) sum_rows implementation as a fallback.
+    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
+    sum_rows_f32_cuda(x, dst, ne, 1, stream);
+    GGML_UNUSED(pool);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+}
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+
+    const int64_t ne = ggml_nelements(src0);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
+}
diff --git a/ggml/src/ggml-cuda/sum.cuh b/ggml/src/ggml-cuda/sum.cuh
new file mode 100644
index 000000000..8cadc3736
--- /dev/null
+++ b/ggml/src/ggml-cuda/sum.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index 89abfc21d..8ac669f94 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -1,5 +1,15 @@
 #include "unary.cuh"
 
+static __global__ void neg_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = -x[i];
+}
+
 static __global__ void gelu_f32(const float * x, float * dst, const int k) {
     const float GELU_COEF_A    = 0.044715f;
     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
     dst[i] = cosf(x[i]);
 }
 
+static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
+    neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
     gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
     cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh
index c610e996a..ed2ffc461 100644
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -1,5 +1,6 @@
 #include "common.cuh"
 
+#define CUDA_NEG_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_TANH_BLOCK_SIZE 256
@@ -12,6 +13,8 @@
 #define CUDA_SIN_BLOCK_SIZE 256
 #define CUDA_COS_BLOCK_SIZE 256
 
+void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index cf004c02c..fcf08fbdc 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
             return ctx->support_simdgroup_reduction;
         case GGML_OP_NORM:
         case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
             return true;
+        case GGML_OP_IM2COL:
+            return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_1D:
         case GGML_OP_POOL_2D:
             return false;
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 0d884f89a..4f03b01e7 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
         SYCL_CHECK(
             CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                                 look_ahead_size, *qptr)));
+        if (!ptr) {
+            fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
+            return nullptr;
+        }
+
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 
@@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     void * dev_ptr;
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, *stream)));
+    if (!dev_ptr) {
+        fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+        return nullptr;
+    }
     ggml_backend_sycl_buffer_context * ctx = new  ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
 }
@@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
         */
         SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
                                         size, *stream)));
-
+        if (!buf) {
+            char err_buf[1024];
+            snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+            throw std::runtime_error(err_buf);
+        }
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
             /*
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index f98dbb37b..52cf83426 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
 
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
@@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
     }, dryrun);
 }
 
-static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
 
@@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }
 
-static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
 
@@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }
 
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:
@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }
 
     vk_context compute_ctx;
@@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     case GGML_OP_SIN:
-        ggml_vk_sin(ctx, compute_ctx, src0, node);
+        ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
 
         break;
     case GGML_OP_COS:
-        ggml_vk_cos(ctx, compute_ctx, src0, node);
+        ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
 
         break;
     case GGML_OP_CLAMP:
@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_DIAG_MASK_INF:
@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     default:
-        return;
+        return false;
     }
 
     if (dryrun) {
-        return;
+        return false;
     }
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
     }
+    return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
         // Do staging buffer copies
         for (auto& cpy : subctx->in_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
 
-        ggml_vk_submit(subctx, ctx->fence);
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
@@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
 
+    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
+        if (first_node_in_batch) {
+            submit_node_idx = i;
         }
 
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
             }
-        }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
-        }
 #endif
-        GGML_ASSERT(ok);
+        }
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
+        }
     }
 
 #ifdef GGML_VULKAN_PERF
@@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                         return false;
                 }
             } break;
+        case GGML_OP_CONT:
         case GGML_OP_CPY:
         case GGML_OP_DUP:
             {
@@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_PAD:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 3e0e6bccd..85eea1a3a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5291,6 +5291,7 @@ struct ggml_tensor * ggml_concat(
     bool is_node = false;
 
     if (a->grad || b->grad) {
+        GGML_ABORT("fatal error"); // TODO: implement
         is_node = true;
     }
 
@@ -5412,6 +5413,7 @@ struct ggml_tensor * ggml_leaky_relu(
     bool is_node = false;
 
     if (!inplace && (a->grad)) {
+        GGML_ABORT("fatal error"); // TODO: not implemented
         is_node = true;
     }
 
@@ -5850,6 +5852,7 @@ static struct ggml_tensor * ggml_set_impl(
     // make a view of the destination
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    GGML_ASSERT(offset < (size_t)(1 << 30));
     int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 
@@ -6807,14 +6810,12 @@ struct ggml_tensor * ggml_rope_back(
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
-    GGML_ASSERT(c == NULL && "freq factors not implemented yet");
-
-    GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
     bool is_node = false;
 
     if (a->grad) {
-        is_node = false; // TODO: implement backward
+        GGML_ASSERT(false && "backwards pass not implemented");
+        is_node = false;
     }
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
@@ -6832,6 +6833,7 @@ struct ggml_tensor * ggml_rope_back(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }
@@ -7385,6 +7387,11 @@ struct ggml_tensor * ggml_argsort(
         enum ggml_sort_order  order) {
     bool is_node = false;
 
+    if (a->grad) {
+        GGML_ABORT("fatal error"); // TODO: not implemented
+        is_node = true;
+    }
+
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
 
     ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -8346,8 +8353,7 @@ static void ggml_compute_forward_dup_same_cont(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == dst->type);
 
-    const size_t nb00 = src0->nb[0];
-    const size_t nb0 = dst->nb[0];
+    const size_t nb0 = ggml_type_size(src0->type);
 
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
@@ -8361,8 +8367,8 @@ static void ggml_compute_forward_dup_same_cont(
     if (ie0 < ie1) {
         memcpy(
             ((char *)  dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * ggml_type_size(src0->type));
+            ((char *) src0->data + ie0*nb0),
+            (ie1 - ie0) * nb0);
     }
 }
 
@@ -8379,11 +8385,6 @@ static void ggml_compute_forward_dup_f16(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread
@@ -8648,11 +8649,6 @@ static void ggml_compute_forward_dup_bf16(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread
@@ -9004,11 +9000,6 @@ static void ggml_compute_forward_dup_f32(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread
@@ -9318,13 +9309,13 @@ static void ggml_compute_forward_dup_bytes(
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
     GGML_ASSERT(src0->type == dst->type);
 
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
         ggml_compute_forward_dup_same_cont(params, dst);
         return;
     }
 
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
     const size_t type_size = ggml_type_size(src0->type);
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
@@ -11015,9 +11006,6 @@ static void ggml_compute_forward_sum_f32(
         return;
     }
 
-    assert(ggml_is_scalar(dst));
-
-
     assert(ggml_is_scalar(dst));
     assert(src0->nb[0] == sizeof(float));
 
@@ -13775,7 +13763,7 @@ static void ggml_compute_forward_get_rows_q(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        assert(i01 >= 0 && i01 < ne01);
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
         dequantize_row_q(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@@ -13816,7 +13804,7 @@ static void ggml_compute_forward_get_rows_f16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        assert(i01 >= 0 && i01 < ne01);
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
         ggml_fp16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@@ -13857,7 +13845,7 @@ static void ggml_compute_forward_get_rows_bf16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        assert(i01 >= 0 && i01 < ne01);
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
         ggml_bf16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
@@ -13898,7 +13886,7 @@ static void ggml_compute_forward_get_rows_f32(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        assert(i01 >= 0 && i01 < ne01);
+        GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
         ggml_vec_cpy_f32(nc,
                 (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
@@ -18426,14 +18414,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 if (src0->grad || src1->grad) {
                     GGML_ASSERT(src0->type == tensor->type);
                     GGML_ASSERT(tensor->grad->type == tensor->type);
-                    GGML_ASSERT(tensor->grad->type == src1->grad->type);
+                    GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
 
                     tensor_grad_view = ggml_view_4d(ctx,
-                        tensor->grad,
-                        src1->grad->ne[0],
-                        src1->grad->ne[1],
-                        src1->grad->ne[2],
-                        src1->grad->ne[3],
+                        tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                         nb1, nb2, nb3, offset);
                 }
 
@@ -18502,9 +18486,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
 
                     memcpy(&offset, tensor->op_params, sizeof(offset));
 
-                    size_t nb1     = tensor->nb[1];
-                    size_t nb2     = tensor->nb[2];
-                    size_t nb3     = tensor->nb[3];
+                    size_t nb1 = tensor->nb[1];
+                    size_t nb2 = tensor->nb[2];
+                    size_t nb3 = tensor->nb[3];
 
                     if (src0->type != src0->grad->type) {
                         // gradient is typically F32, but src0 could be other type
@@ -19200,7 +19184,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
     }
 
     for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
-        if (src->visited_hash_set.keys[i]) {
+        // copy all hashset keys (tensors) that are in use
+        if (ggml_bitset_get(src->visited_hash_set.used, i)) {
             ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
         }
     }
diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp
index f0988ba7c..d0c2bb284 100644
--- a/ggml/src/llamafile/sgemm.cpp
+++ b/ggml/src/llamafile/sgemm.cpp
@@ -1006,6 +1006,10 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
     assert(nth > 0);
     assert(ith < nth);
 
+    // only enable sgemm for prompt processing
+    if (n < 2)
+        return false;
+
     if (Ctype != GGML_TYPE_F32)
         return false;
 
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index c9b6c6dec..02aed8120 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -401,9 +401,112 @@ static void GetOverlappingTokenSequences(const std::string& str, std::unordered_
     }
 }
 
+// KCPP SAMPLING FUNCTIONS
+void sample_softmax(llama_token_data_array * cur_p) {
+    GGML_ASSERT(cur_p->size > 0);
+
+    // Sort the logits in descending order
+    if (!cur_p->sorted) {
+        std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        });
+        cur_p->sorted = true;
+    }
+
+    float max_l = cur_p->data[0].logit;
+    float cum_sum = 0.0f;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float p = expf(cur_p->data[i].logit - max_l);
+        cur_p->data[i].p = p;
+        cum_sum += p;
+    }
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= cum_sum;
+    }
+}
+
+void sample_top_k(llama_token_data_array * cur_p, int32_t k) {
+    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // if (k >= (int32_t)cur_p->size) {
+    //     return;
+    // }
+
+    if (k <= 0) {
+        k = cur_p->size;
+    }
+
+    k = std::max(k, (int) 1); //min keep of 1
+    k = std::min(k, (int) cur_p->size);
+
+    // Sort scores in descending order
+    if (!cur_p->sorted) {
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        };
+        if (k <= 128) {
+            std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
+        } else {
+            constexpr int   nbuckets     = 128;
+            constexpr float bucket_low   = -10.0f;
+            constexpr float bucket_high  =  10.0f;
+            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+            constexpr float bucket_inter = -bucket_low * bucket_scale;
+
+            std::vector<int> bucket_idx(cur_p->size);
+            std::vector<int> histo(nbuckets, 0);
+
+            for (int i = 0; i < (int)cur_p->size; ++i) {
+                const float val = cur_p->data[i].logit;
+                int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+                ib = std::max(0, std::min(nbuckets-1, ib));
+                bucket_idx[i] = ib;
+                ++histo[ib];
+            }
+            int nhave = 0;
+            int ib = nbuckets - 1;
+            for ( ; ib >= 0; --ib) {
+                nhave += histo[ib];
+                if (nhave >= k) {
+                    break;
+                }
+            }
+            std::vector<llama_token_data> tmp_tokens(nhave);
+            auto * ptr = tmp_tokens.data();
+            std::vector<llama_token_data*> bucket_ptrs;
+            bucket_ptrs.reserve(nbuckets - ib);
+            for (int j = nbuckets - 1; j >= ib; --j) {
+                bucket_ptrs.push_back(ptr);
+                ptr += histo[j];
+            }
+            for (int i = 0; i < (int)cur_p->size; ++i) {
+                int j = bucket_idx[i];
+                if (j >= ib) {
+                    *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
+                }
+            }
+
+            ptr = tmp_tokens.data();
+            int ndone = 0;
+            for (int j = nbuckets-1; j > ib; --j) {
+                std::sort(ptr, ptr + histo[j], comp);
+                ptr += histo[j];
+                ndone += histo[j];
+            }
+            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
+
+            std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+
+        }
+        cur_p->sorted = true;
+    }
+    cur_p->size = k;
+}
+
 llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
 {
-    llama_sampler_softmax_impl(candidates);
+    sample_softmax(candidates);
     std::vector<float> probs;
     probs.reserve(candidates->size);
     top_picks.clear();
@@ -433,7 +536,7 @@ llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng
 llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int m, float * mu)
 {
     float N = float(n_vocab);
-    llama_sampler_softmax_impl(candidates);
+    sample_softmax(candidates);
     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
     float sum_ti_bi = 0.0;
@@ -449,7 +552,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat
     float epsilon_hat = s_hat - 1;
     float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
     // Sample the next word X using top-k sampling
-    llama_sampler_top_k_impl(candidates, int(k));
+    sample_top_k(candidates, int(k));
     llama_token X = sample_token(candidates, rng);    // Compute error as the difference between observed surprise and target surprise value
     size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
         return candidate.id == X;
@@ -463,7 +566,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat
 
 llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float * mu)
 {
-    llama_sampler_softmax_impl(candidates);
+    sample_softmax(candidates);
     // Truncate the words with surprise values greater than mu
     candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
         return -log2f(candidate.p) > *mu;
@@ -474,7 +577,7 @@ llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::m
     }
 
     // Normalize the probabilities of the remaining words
-    llama_sampler_softmax_impl(candidates);
+    sample_softmax(candidates);
     // Sample the next word X from the remaining words
     llama_token X = sample_token(candidates,rng);
 
@@ -496,7 +599,7 @@ void sample_top_a(llama_token_data_array * candidates, float a, size_t min_keep)
         return;
     }
 
-    llama_sampler_softmax_impl(candidates);
+    sample_softmax(candidates);
 
     // Compute the cumulative probabilities
     float maxprob = candidates->data[0].p;
@@ -532,7 +635,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
         return;
     }
 
-    llama_sampler_softmax_impl(candidates);
+    sample_softmax(candidates);
 
     //calculate how many tokens cross the xtc threshold
     size_t last_idx = candidates->size;
@@ -825,18 +928,292 @@ void sample_rep_pen(int n_ctx, int rep_pen_range, float rep_pen, float rep_pen_s
 
 }
 
+void sample_top_p(llama_token_data_array * cur_p, float p, size_t min_keep) {
+    if (p >= 1.0f) {
+        return;
+    }
+
+    sample_softmax(cur_p);
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = cur_p->size;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cum_sum += cur_p->data[i].p;
+
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the top-p tokens
+    cur_p->size = last_idx;
+}
+
+void sample_min_p(llama_token_data_array * cur_p, float p, size_t min_keep) {
+    if (p <= 0.0f || !cur_p->size) {
+        return;
+    }
+
+    bool min_p_applied = false;
+
+    // if the cur_p aren't sorted, try the unsorted implementation first
+    if (!cur_p->sorted) {
+        std::vector<llama_token_data> filtered_tokens;
+
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            max_logit = std::max(max_logit, cur_p->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(cur_p->data[i]);
+            }
+        }
+
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= min_keep) {
+            memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            cur_p->size = filtered_tokens.size();
+            min_p_applied = true;
+        }
+    }
+
+    // if the cur_p are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!cur_p->sorted) {
+            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            cur_p->sorted = true;
+        }
+
+        const float min_logit = cur_p->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit < min_logit && i >= min_keep) {
+                break; // prob too small
+            }
+        }
+
+        // Resize the output vector to keep only the matching tokens
+        cur_p->size = i;
+    }
+}
+
+void sample_tail_free(llama_token_data_array * cur_p, float z, size_t min_keep) {
+    if (z >= 1.0f || cur_p->size <= 2) {
+        return;
+    }
+
+    sample_softmax(cur_p);
+
+    // Compute the first and second derivatives
+    std::vector<float> first_derivatives(cur_p->size - 1);
+    std::vector<float> second_derivatives(cur_p->size - 2);
+
+    for (size_t i = 0; i < first_derivatives.size(); ++i) {
+        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
+    }
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
+    }
+
+    // Calculate absolute value of second derivatives
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = std::abs(second_derivatives[i]);
+    }
+
+    // Normalize the second derivatives
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
+    }
+
+    float cum_sum = 0.0f;
+    size_t last_idx = cur_p->size;
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        cum_sum += second_derivatives[i];
+
+        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+        if (cum_sum > z && i >= min_keep) {
+            last_idx = i;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the tokens above the tail location
+    cur_p->size = last_idx;
+}
+
+void sampler_typical(llama_token_data_array * cur_p, float p, size_t min_keep) {
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (p >= 1.0f) {
+        return;
+    }
+
+    // Compute the softmax of logits and calculate entropy
+    sample_softmax(cur_p);
+
+    float entropy = 0.0f;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        if(cur_p->data[i].p>0)
+        {
+            entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
+        }
+    }
+
+    // Compute the absolute difference between negative log probability and entropy for each candidate
+    std::vector<float> shifted_scores;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
+        shifted_scores.push_back(shifted_score);
+    }
+
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(cur_p->size);
+    std::iota(indices.begin(), indices.end(), 0);
+
+    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+        return shifted_scores[a] < shifted_scores[b];
+    });
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = indices.size();
+
+    for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        cum_sum += cur_p->data[idx].p;
+
+        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+        if (cum_sum > p && i >= min_keep - 1) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the locally typical tokens
+    std::vector<llama_token_data> cur_p_new;
+    for (size_t i = 0; i < last_idx; ++i) {
+        size_t idx = indices[i];
+        cur_p_new.push_back(cur_p->data[idx]);
+    }
+
+    // Replace the data in cur_p with the cur_p_new data
+    std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
+    cur_p->size = cur_p_new.size();
+    cur_p->sorted = false;
+}
+
+void sample_entropy(llama_token_data_array * cur_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) {
+    // no need to do anything if there is only one (or zero) candidates
+    if (cur_p->size <= 1) {
+        return;
+    }
+
+    // Calculate maximum possible entropy
+    float max_entropy = -logf(1.0f / cur_p->size);
+
+    sample_softmax(cur_p);
+
+    // Calculate entropy of the softmax probabilities
+    float entropy = 0.0f;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float prob = cur_p->data[i].p;
+        if (prob > 0.0f) { // Ensure no log(0)
+            entropy -= prob * logf(prob);
+        }
+    }
+
+    // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
+    float normalized_entropy = entropy / max_entropy;
+
+    // Map the normalized entropy to the desired temperature range using the power function
+    float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+    // Apply the dynamically calculated temperature scaling
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= dyn_temp;
+    }
+
+    // Re-compute softmax probabilities after scaling logits with dynamic temperature
+    const double max_l_double = cur_p->data[0].logit;
+
+    double cum_sum_double = 0.0;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        double p = exp(cur_p->data[i].logit - max_l_double);
+        cur_p->data[i].p = p; // Store the scaled probability
+        cum_sum_double += p;
+    }
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+    }
+
+    // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
+    if (smoothing_factor > 0 && cur_p->size > 1) {
+        sample_softmax(cur_p);
+        float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
+        // Apply quadratic transformation using the smoothing_factor
+        for (size_t i = 0; i < cur_p->size; ++i)
+        {
+            float logit_shifted = cur_p->data[i].logit - h;
+            cur_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
+        }
+        sample_softmax(cur_p);
+    }
+
+}
+
 void sample_temperature(llama_token_data_array * candidates_p, float temp, float smoothing_factor)
 {
+    bool isgreedy = false;
     if (temp <= 0)
     {
         // Imitate greedy sampling
         temp = 0.00390625f; //cannot be zero else div0, this is 1/256
-        llama_sampler_temp_impl(candidates_p, temp, 0);
-        llama_sampler_top_k_impl(candidates_p, 1); //only want first candidate
+        smoothing_factor = 0;
+        isgreedy = true;
     }
-    else
+
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].logit /= temp;
+    }
+    // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
+    if (smoothing_factor > 0 && candidates_p->size > 1) {
+        sample_softmax(candidates_p);
+        float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
+        // Apply quadratic transformation using the smoothing_factor
+        for (size_t i = 0; i < candidates_p->size; ++i)
+        {
+            float logit_shifted = candidates_p->data[i].logit - h;
+            candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
+        }
+        sample_softmax(candidates_p);
+    }
+
+    if(isgreedy)
     {
-        llama_sampler_temp_impl(candidates_p, temp, smoothing_factor);
+        sample_top_k(candidates_p, 1); //only want first candidate
     }
 }
 
@@ -907,7 +1284,7 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
     sample_dry(n_ctx, dry_penalty_last_n, dry_multiplier, dry_base, dry_allowed_length, dry_sequence_breakers, &candidates_p);
 
     //prefilter to top 5k tokens for improved speed
-    llama_sampler_top_k_impl(&candidates_p, 5000);
+    sample_top_k(&candidates_p, 5000);
 
     if (mirostat == 1 || mirostat == 2)
     {
@@ -931,20 +1308,20 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
             switch (sampler_order[i])
             {
                 case KCPP_SAMPLER_TOP_K:
-                    llama_sampler_top_k_impl(&candidates_p, top_k);
+                    sample_top_k(&candidates_p, top_k);
                     break;
                 case KCPP_SAMPLER_TOP_A:
                     sample_top_a(&candidates_p, top_a, 1);
                     break;
                 case KCPP_SAMPLER_TOP_P:
-                    llama_sampler_top_p_impl(&candidates_p, top_p, 1);
-                    llama_sampler_min_p_impl(&candidates_p, min_p, 1);
+                    sample_top_p(&candidates_p, top_p, 1);
+                    sample_min_p(&candidates_p, min_p, 1);
                     break;
                 case KCPP_SAMPLER_TFS:
-                    llama_sampler_tail_free_impl(&candidates_p, tfs, 1);
+                    sample_tail_free(&candidates_p, tfs, 1);
                     break;
                 case KCPP_SAMPLER_TYP:
-                    llama_sampler_typical_impl(&candidates_p, typical_p, 1);
+                    sampler_typical(&candidates_p, typical_p, 1);
                     break;
                 case KCPP_SAMPLER_TEMP:
                     if (dynatemp_range>0)
@@ -955,7 +1332,7 @@ const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dyna
                         dynatemp_min = dynatemp_min<0?0:dynatemp_min;
                         dynatemp_max = dynatemp_max<0?0:dynatemp_max;
                         dynatemp_exponent = dynatemp_exponent<0?0:dynatemp_exponent;
-                        llama_sampler_entropy_impl(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
+                        sample_entropy(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
                     }
                     else
                     {
diff --git a/src/llama-impl.h b/src/llama-impl.h
index fa2e09e1f..87012617f 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -101,6 +101,10 @@ struct ring_buffer {
     }
 
     void push_back(const T & value) {
+        if (capacity == 0) {
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+
         if (sz == capacity) {
             // advance the start when buffer is full
             first = (first + 1) % capacity;
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index ad14e392a..64d4cc995 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -13,12 +13,40 @@
 #include <unordered_map>
 
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng, std::vector<float> & probs) {
+#if 1
     probs.resize(cur_p->size);
     for (size_t i = 0; i < cur_p->size; ++i) {
         probs[i] = cur_p->data[i].p;
     }
 
     std::discrete_distribution<size_t> dist(probs.begin(), probs.end());
+#else
+    // avoid the copy with a custom iterator
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wunused-local-typedefs"
+
+    struct probs_iterator {
+        typedef std::input_iterator_tag iterator_category;
+        typedef float value_type;
+        typedef float * pointer;
+        typedef float & reference;
+        typedef size_t difference_type;
+
+        const llama_token_data_array * data;
+        size_t i;
+
+        bool operator==(const probs_iterator & other) const { return data + i == other.data + other.i; }
+        bool operator!=(const probs_iterator & other) const { return data + i != other.data + other.i; }
+        float operator*() const { return data->data[i].p; }
+        probs_iterator & operator++() { ++i; return *this; }
+        probs_iterator operator++(int) { probs_iterator tmp = *this; ++i; return tmp; }
+    };
+    #pragma GCC diagnostic pop
+
+    std::discrete_distribution<size_t> dist(probs_iterator{cur_p, 0}, probs_iterator{cur_p, cur_p->size});
+
+    GGML_UNUSED(probs);
+#endif
 
     return dist(rng);
 }
@@ -72,7 +100,6 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         k = cur_p->size;
     }
 
-    k = std::max(k, (int) 1); //min keep of 1
     k = std::min(k, (int) cur_p->size);
 
     // Sort scores in descending order
@@ -139,313 +166,6 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     cur_p->size = k;
 }
 
-static void llama_sampler_top_p_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
-    if (p >= 1.0f) {
-        return;
-    }
-
-    llama_sampler_softmax_impl(cur_p);
-
-    // Compute the cumulative probabilities
-    float cum_sum = 0.0f;
-    size_t last_idx = cur_p->size;
-
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        cum_sum += cur_p->data[i].p;
-
-        // Check if the running sum is at least p or if we have kept at least min_keep tokens
-        // we set the last index to i+1 to indicate that the current iterate should be included in the set
-        if (cum_sum >= p && i + 1 >= min_keep) {
-            last_idx = i + 1;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the top-p tokens
-    cur_p->size = last_idx;
-}
-
-static void llama_sampler_min_p_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
-    if (p <= 0.0f || !cur_p->size) {
-        return;
-    }
-
-    bool min_p_applied = false;
-
-    // if the cur_p aren't sorted, try the unsorted implementation first
-    if (!cur_p->sorted) {
-        std::vector<llama_token_data> filtered_tokens;
-
-        float max_logit = -FLT_MAX;
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            max_logit = std::max(max_logit, cur_p->data[i].logit);
-        }
-        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
-
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            if (cur_p->data[i].logit >= min_logit) {
-                filtered_tokens.push_back(cur_p->data[i]);
-            }
-        }
-
-        // if we have enough values the operation was a success
-        if (filtered_tokens.size() >= min_keep) {
-            memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
-            cur_p->size = filtered_tokens.size();
-            min_p_applied = true;
-        }
-    }
-
-    // if the cur_p are sorted or the unsorted implementation failed, use this implementation
-    if (!min_p_applied) {
-        // Sort the logits in descending order
-        if (!cur_p->sorted) {
-            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
-                return a.logit > b.logit;
-            });
-            cur_p->sorted = true;
-        }
-
-        const float min_logit = cur_p->data[0].logit + logf(p); // min logit for p_i >= p * p_max
-        size_t i = 1; // first token always matches
-
-        for (; i < cur_p->size; ++i) {
-            if (cur_p->data[i].logit < min_logit && i >= min_keep) {
-                break; // prob too small
-            }
-        }
-
-        // Resize the output vector to keep only the matching tokens
-        cur_p->size = i;
-    }
-}
-
-static void llama_sampler_tail_free_impl(llama_token_data_array * cur_p, float z, size_t min_keep) {
-    if (z >= 1.0f || cur_p->size <= 2) {
-        return;
-    }
-
-    llama_sampler_softmax_impl(cur_p);
-
-    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(cur_p->size - 1);
-    std::vector<float> second_derivatives(cur_p->size - 2);
-
-    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
-    }
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
-    }
-
-    // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = std::abs(second_derivatives[i]);
-    }
-
-    // Normalize the second derivatives
-    {
-        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-
-        if (second_derivatives_sum > 1e-6f) {
-            for (float & value : second_derivatives) {
-                value /= second_derivatives_sum;
-            }
-        } else {
-            for (float & value : second_derivatives) {
-                value = 1.0f / second_derivatives.size();
-            }
-        }
-    }
-
-    float cum_sum = 0.0f;
-    size_t last_idx = cur_p->size;
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        cum_sum += second_derivatives[i];
-
-        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > z && i >= min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the tokens above the tail location
-    cur_p->size = last_idx;
-}
-
-static void llama_sampler_typical_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
-    // Reference implementation:
-    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
-    if (p >= 1.0f) {
-        return;
-    }
-
-    // Compute the softmax of logits and calculate entropy
-    llama_sampler_softmax_impl(cur_p);
-
-    float entropy = 0.0f;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        if(cur_p->data[i].p>0)
-        {
-            entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
-        }
-    }
-
-    // Compute the absolute difference between negative log probability and entropy for each candidate
-    std::vector<float> shifted_scores;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
-        shifted_scores.push_back(shifted_score);
-    }
-
-    // Sort tokens based on the shifted_scores and their corresponding indices
-    std::vector<size_t> indices(cur_p->size);
-    std::iota(indices.begin(), indices.end(), 0);
-
-    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
-        return shifted_scores[a] < shifted_scores[b];
-    });
-
-    // Compute the cumulative probabilities
-    float cum_sum = 0.0f;
-    size_t last_idx = indices.size();
-
-    for (size_t i = 0; i < indices.size(); ++i) {
-        size_t idx = indices[i];
-        cum_sum += cur_p->data[idx].p;
-
-        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep - 1) {
-            last_idx = i + 1;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the locally typical tokens
-    std::vector<llama_token_data> cur_p_new;
-    for (size_t i = 0; i < last_idx; ++i) {
-        size_t idx = indices[i];
-        cur_p_new.push_back(cur_p->data[idx]);
-    }
-
-    // Replace the data in cur_p with the cur_p_new data
-    std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
-    cur_p->size = cur_p_new.size();
-    cur_p->sorted = false;
-}
-
-static void llama_sampler_entropy_impl(llama_token_data_array * cur_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) {
-    // no need to do anything if there is only one (or zero) candidates
-    if (cur_p->size <= 1) {
-        return;
-    }
-
-    // Calculate maximum possible entropy
-    float max_entropy = -logf(1.0f / cur_p->size);
-
-    llama_sampler_softmax_impl(cur_p);
-
-    // Calculate entropy of the softmax probabilities
-    float entropy = 0.0f;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        float prob = cur_p->data[i].p;
-        if (prob > 0.0f) { // Ensure no log(0)
-            entropy -= prob * logf(prob);
-        }
-    }
-
-    // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
-    float normalized_entropy = entropy / max_entropy;
-
-    // Map the normalized entropy to the desired temperature range using the power function
-    float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
-
-    // Apply the dynamically calculated temperature scaling
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        cur_p->data[i].logit /= dyn_temp;
-    }
-
-    // Re-compute softmax probabilities after scaling logits with dynamic temperature
-    const double max_l_double = cur_p->data[0].logit;
-
-    double cum_sum_double = 0.0;
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        double p = exp(cur_p->data[i].logit - max_l_double);
-        cur_p->data[i].p = p; // Store the scaled probability
-        cum_sum_double += p;
-    }
-
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
-    }
-
-    // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
-    if (smoothing_factor > 0 && cur_p->size > 1) {
-        llama_sampler_softmax_impl(cur_p);
-        float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
-        // Apply quadratic transformation using the smoothing_factor
-        for (size_t i = 0; i < cur_p->size; ++i)
-        {
-            float logit_shifted = cur_p->data[i].logit - h;
-            cur_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
-        }
-        llama_sampler_softmax_impl(cur_p);
-    }
-
-}
-
-static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp, float smoothing_factor) {
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        cur_p->data[i].logit /= temp;
-    }
-    // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise.
-    if (smoothing_factor > 0 && cur_p->size > 1) {
-        llama_sampler_softmax_impl(cur_p);
-        float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
-        // Apply quadratic transformation using the smoothing_factor
-        for (size_t i = 0; i < cur_p->size; ++i)
-        {
-            float logit_shifted = cur_p->data[i].logit - h;
-            cur_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
-        }
-        llama_sampler_softmax_impl(cur_p);
-    }
-}
-
-static void llama_sampler_grammar_impl(llama_token_data_array * cur_p, const struct llama_grammar & grammar) {
-    llama_grammar_apply_impl(grammar, cur_p);
-}
-
-void llama_sampler_penalties_impl(
-       llama_token_data_array * cur_p,
-        const llama_token_cnt & token_count,
-                        float   penalty_repeat,
-                        float   penalty_freq,
-                        float   penalty_present) {
-    // Apply frequency and presence penalties to the cur_p
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
-            continue;
-        }
-
-        const int count = token_iter->second;
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (cur_p->data[i].logit <= 0) {
-            cur_p->data[i].logit *= penalty_repeat;
-        } else {
-            cur_p->data[i].logit /= penalty_repeat;
-        }
-
-        cur_p->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
-    }
-
-    cur_p->sorted = false;
-}
-
 // llama_sampler API
 
 const char * llama_sampler_name(const struct llama_sampler * smpl) {
@@ -613,17 +333,23 @@ int llama_sampler_chain_n(const struct llama_sampler * chain) {
 
 // greedy
 
-static struct llama_sampler_i llama_sampler_greedy_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "greedy"; },
-    /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
-        cur_p->selected = 0;
-        for (size_t i = 1; i < cur_p->size; ++i) {
-            if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) {
-                cur_p->selected = i;
-            }
+static const char * llama_sampler_greedy_name(const struct llama_sampler * /*smpl*/) {
+    return "greedy";
+}
+
+static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
+    cur_p->selected = 0;
+    for (size_t i = 1; i < cur_p->size; ++i) {
+        if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) {
+            cur_p->selected = i;
         }
-    },
+    }
+}
+
+static struct llama_sampler_i llama_sampler_greedy_i = {
+    /* .name   = */ llama_sampler_greedy_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_greedy_apply,
     /* .reset  = */ nullptr,
     /* .clone  = */ nullptr,
     /* .free   = */ nullptr,
@@ -646,30 +372,45 @@ struct llama_sampler_dist {
     std::vector<float> probs; // work array
 };
 
+static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
+    return "dist";
+}
+
+static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_dist *) smpl->ctx;
+    cur_p->selected = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
+}
+
+static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_dist *) smpl->ctx;
+    auto * result = llama_sampler_init_dist(ctx->seed);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_dist *) result->ctx;
+
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_dist *) smpl->ctx;
+    ctx->rng = std::mt19937(ctx->seed);
+}
+
+static void llama_sampler_dist_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_dist *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_dist_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "dist"; },
+    /* .name   = */ llama_sampler_dist_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        auto * ctx = (llama_sampler_dist *) smpl->ctx;
-        cur_p->selected = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
-    },
-    /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_dist *) smpl->ctx;
-        auto * result = llama_sampler_init_dist(ctx->seed);
-
-        // copy the state
-        {
-            auto * result_ctx = (llama_sampler_dist *) result->ctx;
-
-            result_ctx->rng = ctx->rng;
-        }
-
-        return result;
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_dist *) smpl->ctx;
-    },
+    /* .apply  = */ llama_sampler_dist_apply,
+    /* .reset  = */ llama_sampler_dist_reset,
+    /* .clone  = */ llama_sampler_dist_clone,
+    /* .free   = */ llama_sampler_dist_free,
 };
 
 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
@@ -685,12 +426,18 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
 
 // softmax
 
+static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) {
+    return "softmax";
+}
+
+static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
+    llama_sampler_softmax_impl(cur_p);
+}
+
 static struct llama_sampler_i llama_sampler_softmax_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "softmax"; },
+    /* .name   = */ llama_sampler_softmax_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
-        llama_sampler_softmax_impl(cur_p);
-    },
+    /* .apply  = */ llama_sampler_softmax_apply,
     /* .reset  = */ nullptr,
     /* .clone  = */ nullptr,
     /* .free   = */ nullptr,
@@ -709,21 +456,31 @@ struct llama_sampler_top_k {
     const int32_t k;
 };
 
+static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl*/) {
+    return "top-k";
+}
+
+static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_top_k *) smpl->ctx;
+    llama_sampler_top_k_impl(cur_p, ctx->k);
+}
+
+static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_top_k *) smpl->ctx;
+    return llama_sampler_init_top_k(ctx->k);
+}
+
+static void llama_sampler_top_k_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_top_k *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_top_k_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "top-k"; },
+    /* .name   = */ llama_sampler_top_k_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        const auto * ctx = (llama_sampler_top_k *) smpl->ctx;
-        llama_sampler_top_k_impl(cur_p, ctx->k);
-    },
+    /* .apply  = */ llama_sampler_top_k_apply,
     /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_top_k *) smpl->ctx;
-        return llama_sampler_init_top_k(ctx->k);
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_top_k *) smpl->ctx;
-    },
+    /* .clone  = */ llama_sampler_top_k_clone,
+    /* .free   = */ llama_sampler_top_k_free,
 };
 
 struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
@@ -742,21 +499,54 @@ struct llama_sampler_top_p {
     const size_t min_keep;
 };
 
+static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
+    return "top-p";
+}
+
+static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_top_p *) smpl->ctx;
+
+    if (ctx->p >= 1.0f) {
+        return;
+    }
+
+    llama_sampler_softmax_impl(cur_p);
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = cur_p->size;
+
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cum_sum += cur_p->data[i].p;
+
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= ctx->p && i + 1 >= ctx->min_keep) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the top-p tokens
+    cur_p->size = last_idx;
+}
+
+static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_top_p *) smpl->ctx;
+    return llama_sampler_init_top_p(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_top_p_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_top_p *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_top_p_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "top-p"; },
+    /* .name   = */ llama_sampler_top_p_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        const auto * ctx = (llama_sampler_top_p *) smpl->ctx;
-        llama_sampler_top_p_impl(cur_p, ctx->p, ctx->min_keep);
-    },
+    /* .apply  = */ llama_sampler_top_p_apply,
     /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_top_p *) smpl->ctx;
-        return llama_sampler_init_top_p(ctx->p, ctx->min_keep);
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_top_p *) smpl->ctx;
-    },
+    /* .clone  = */ llama_sampler_top_p_clone,
+    /* .free   = */ llama_sampler_top_p_free,
 };
 
 struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
@@ -776,21 +566,83 @@ struct llama_sampler_min_p {
     const size_t min_keep;
 };
 
+static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl*/) {
+    return "min-p";
+}
+
+static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_min_p *) smpl->ctx;
+
+    if (ctx->p <= 0.0f || !cur_p->size) {
+        return;
+    }
+
+    bool min_p_applied = false;
+
+    // if the cur_p aren't sorted, try the unsorted implementation first
+    if (!cur_p->sorted) {
+        std::vector<llama_token_data> filtered_tokens;
+
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            max_logit = std::max(max_logit, cur_p->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(ctx->p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(cur_p->data[i]);
+            }
+        }
+
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= ctx->min_keep) {
+            memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            cur_p->size = filtered_tokens.size();
+            min_p_applied = true;
+        }
+    }
+
+    // if the cur_p are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!cur_p->sorted) {
+            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            cur_p->sorted = true;
+        }
+
+        const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit < min_logit && i >= ctx->min_keep) {
+                break; // prob too small
+            }
+        }
+
+        // Resize the output vector to keep only the matching tokens
+        cur_p->size = i;
+    }
+}
+
+static struct llama_sampler * llama_sampler_min_p_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_min_p *) smpl->ctx;
+    return llama_sampler_init_min_p(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_min_p_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_min_p *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_min_p_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "min-p"; },
+    /* .name   = */ llama_sampler_min_p_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        const auto * ctx = (llama_sampler_min_p *) smpl->ctx;
-        llama_sampler_min_p_impl(cur_p, ctx->p, ctx->min_keep);
-    },
+    /* .apply  = */ llama_sampler_min_p_apply,
     /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_min_p *) smpl->ctx;
-        return llama_sampler_init_min_p(ctx->p, ctx->min_keep);
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_min_p *) smpl->ctx;
-    },
+    /* .clone  = */ llama_sampler_min_p_clone,
+    /* .free   = */ llama_sampler_min_p_free,
 };
 
 struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
@@ -810,21 +662,82 @@ struct llama_sampler_tail_free {
     const size_t min_keep;
 };
 
+static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
+    return "tail-free";
+}
+
+static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
+
+    if (ctx->z >= 1.0f || cur_p->size <= 2) {
+        return;
+    }
+
+    llama_sampler_softmax_impl(cur_p);
+
+    // Compute the first and second derivatives
+    std::vector<float> first_derivatives(cur_p->size - 1);
+    std::vector<float> second_derivatives(cur_p->size - 2);
+
+    for (size_t i = 0; i < first_derivatives.size(); ++i) {
+        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
+    }
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
+    }
+
+    // Calculate absolute value of second derivatives
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = std::abs(second_derivatives[i]);
+    }
+
+    // Normalize the second derivatives
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
+    }
+
+    float cum_sum = 0.0f;
+    size_t last_idx = cur_p->size;
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        cum_sum += second_derivatives[i];
+
+        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+        if (cum_sum > ctx->z && i >= ctx->min_keep) {
+            last_idx = i;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the tokens above the tail location
+    cur_p->size = last_idx;
+}
+
+static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
+    return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
+}
+
+static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_tail_free *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_tail_free_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "tail-free"; },
+    /* .name   = */ llama_sampler_tail_free_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
-        llama_sampler_tail_free_impl(cur_p, ctx->z, ctx->min_keep);
-    },
+    /* .apply  = */ llama_sampler_tail_free_apply,
     /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
-        return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_tail_free *) smpl->ctx;
-    },
+    /* .clone  = */ llama_sampler_tail_free_clone,
+    /* .free   = */ llama_sampler_tail_free_free,
 };
 
 struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
@@ -844,21 +757,86 @@ struct llama_sampler_typical {
     const size_t min_keep;
 };
 
+static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) {
+    return "typical";
+}
+
+static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_typical *) smpl->ctx;
+
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (ctx->p >= 1.0f) {
+        return;
+    }
+
+    // Compute the softmax of logits and calculate entropy
+    llama_sampler_softmax_impl(cur_p);
+
+    float entropy = 0.0f;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
+    }
+
+    // Compute the absolute difference between negative log probability and entropy for each candidate
+    std::vector<float> shifted_scores;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
+        shifted_scores.push_back(shifted_score);
+    }
+
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(cur_p->size);
+    std::iota(indices.begin(), indices.end(), 0);
+
+    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+        return shifted_scores[a] < shifted_scores[b];
+    });
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = indices.size();
+
+    for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        cum_sum += cur_p->data[idx].p;
+
+        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+        if (cum_sum > ctx->p && i >= ctx->min_keep - 1) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the locally typical tokens
+    std::vector<llama_token_data> cur_p_new;
+    for (size_t i = 0; i < last_idx; ++i) {
+        size_t idx = indices[i];
+        cur_p_new.push_back(cur_p->data[idx]);
+    }
+
+    // Replace the data in cur_p with the cur_p_new data
+    std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
+    cur_p->size = cur_p_new.size();
+    cur_p->sorted = false;
+}
+
+static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_typical *) smpl->ctx;
+    return llama_sampler_init_typical(ctx->p, ctx->min_keep);
+}
+
+static void llama_sampler_typical_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_typical *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_typical_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "typical"; },
+    /* .name   = */ llama_sampler_typical_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        const auto * ctx = (llama_sampler_typical *) smpl->ctx;
-        llama_sampler_typical_impl(cur_p, ctx->p, ctx->min_keep);
-    },
+    /* .apply  = */ llama_sampler_typical_apply,
     /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_typical *) smpl->ctx;
-        return llama_sampler_init_typical(ctx->p, ctx->min_keep);
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_typical *) smpl->ctx;
-    },
+    /* .clone  = */ llama_sampler_typical_clone,
+    /* .free   = */ llama_sampler_typical_free,
 };
 
 struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
@@ -877,21 +855,33 @@ struct llama_sampler_temp {
     const float temp;
 };
 
+static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*/) {
+    return "temp";
+}
+
+static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_temp *) smpl->ctx;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= ctx->temp;
+    }
+}
+
+static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_temp *) smpl->ctx;
+    return llama_sampler_init_temp(ctx->temp);
+}
+
+static void llama_sampler_temp_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_temp *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_temp_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "temp"; },
+    /* .name   = */ llama_sampler_temp_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        const auto * ctx = (llama_sampler_temp *) smpl->ctx;
-        llama_sampler_temp_impl(cur_p, ctx->temp, 0); //UNUSED IN KCPP
-    },
+    /* .apply  = */ llama_sampler_temp_apply,
     /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_temp *) smpl->ctx;
-        return llama_sampler_init_temp(ctx->temp);
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_temp *) smpl->ctx;
-    },
+    /* .clone  = */ llama_sampler_temp_clone,
+    /* .free   = */ llama_sampler_temp_free,
 };
 
 struct llama_sampler * llama_sampler_init_temp(float temp) {
@@ -911,28 +901,100 @@ struct llama_sampler_temp_ext {
     const float exponent;
 };
 
-static struct llama_sampler_i llama_sampler_temp_ext_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "temp-ext"; },
-    /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
-        if (ctx->delta > 0) {
-            const float temp_min = std::max(0.0f, ctx->temp - ctx->delta);
-            const float temp_max = ctx->temp + ctx->delta;
+static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) {
+    return "temp-ext";
+}
 
-            llama_sampler_entropy_impl(cur_p, temp_min, temp_max, ctx->exponent,0);//UNUSED IN KCPP
-        } else {
-            llama_sampler_temp_impl(cur_p, ctx->temp,0);//UNUSED IN KCPP
+static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx;
+    if (ctx->delta > 0) {
+        const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
+        const float max_temp = ctx->temp + ctx->delta;
+        float exponent_val = ctx->exponent;
+
+        // no need to do anything if there is only one (or zero) candidates
+        if (cur_p->size <= 1) {
+            return;
         }
-    },
+
+        // Calculate maximum possible entropy
+        float max_entropy = -logf(1.0f / cur_p->size);
+
+        llama_sampler_softmax_impl(cur_p);
+
+        // Calculate entropy of the softmax probabilities
+        float entropy = 0.0f;
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            float prob = cur_p->data[i].p;
+            if (prob > 0.0f) { // Ensure no log(0)
+                entropy -= prob * logf(prob);
+            }
+        }
+
+        // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
+        float normalized_entropy = entropy / max_entropy;
+
+        // Map the normalized entropy to the desired temperature range using the power function
+        float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+    #ifdef DEBUG
+        LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
+        LLAMA_LOG_INFO("Entropy: %f\n", entropy);
+        LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
+        LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
+        LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
+        LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
+    #endif
+
+        // Apply the dynamically calculated temperature scaling
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            cur_p->data[i].logit /= dyn_temp;
+        }
+
+        // Re-compute softmax probabilities after scaling logits with dynamic temperature
+        const double max_l_double = cur_p->data[0].logit;
+
+        double cum_sum_double = 0.0;
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            double p = exp(cur_p->data[i].logit - max_l_double);
+            cur_p->data[i].p = p; // Store the scaled probability
+            cum_sum_double += p;
+        }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+        }
+
+    #ifdef DEBUG
+        // Print the updated top 25 probabilities after temperature scaling
+        LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
+        for (size_t i = 0; i < 25 && i < cur_p->size; ++i) {
+            LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f);
+        }
+    #endif
+    } else {
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            cur_p->data[i].logit /= ctx->temp;
+        }
+    }
+}
+
+static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx;
+    return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent);
+}
+
+static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_temp_ext *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_temp_ext_i = {
+    /* .name   = */ llama_sampler_temp_ext_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sampler_temp_ext_apply,
     /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx;
-        return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent);
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_temp_ext *) smpl->ctx;
-    },
+    /* .clone  = */ llama_sampler_temp_ext_clone,
+    /* .free   = */ llama_sampler_temp_ext_free,
 };
 
 struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
@@ -965,65 +1027,77 @@ struct llama_sampler_mirostat {
     std::vector<float> probs;
 };
 
+static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
+    return "mirostat";
+}
+
+static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
+    // Estimate s_hat using the most probable m tokens
+    float s_hat = 0.0;
+    float sum_ti_bi = 0.0;
+    float sum_ti_sq = 0.0;
+    for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) {
+        float t_i = logf(float(i + 2) / float(i + 1));
+        float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p);
+        sum_ti_bi += t_i * b_i;
+        sum_ti_sq += t_i * t_i;
+    }
+    s_hat = sum_ti_bi / sum_ti_sq;
+
+    // Compute k from the estimated s_hat and target surprise value
+    float epsilon_hat = s_hat - 1;
+    float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
+
+    llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
+    llama_sampler_softmax_impl(cur_p);
+
+    const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
+
+    cur_p->selected = idx;
+
+    float observed_surprise = -log2f(cur_p->data[idx].p);
+    float e = observed_surprise - ctx->tau;
+
+    // Update mu using the learning rate and error
+    ctx->mu = ctx->mu - ctx->eta * e;
+}
+
+static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx;
+    auto * result = llama_sampler_init_mirostat(ctx->n_vocab, ctx->seed, ctx->tau, ctx->eta, ctx->m);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_mirostat *) smpl->ctx;
+
+        result_ctx->mu  = ctx->mu;
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
+    ctx->mu = 2.0f*ctx->tau;
+    ctx->rng = std::mt19937(ctx->seed);
+}
+
+static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_mirostat *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_mirostat_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "mirostat"; },
+    /* .name   = */ llama_sampler_mirostat_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
-
-        llama_sampler_softmax_impl(cur_p);
-
-        // Estimate s_hat using the most probable m tokens
-        float s_hat = 0.0;
-        float sum_ti_bi = 0.0;
-        float sum_ti_sq = 0.0;
-        for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) {
-            float t_i = logf(float(i + 2) / float(i + 1));
-            float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p);
-            sum_ti_bi += t_i * b_i;
-            sum_ti_sq += t_i * t_i;
-        }
-        s_hat = sum_ti_bi / sum_ti_sq;
-
-        // Compute k from the estimated s_hat and target surprise value
-        float epsilon_hat = s_hat - 1;
-        float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat);
-
-        llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
-        llama_sampler_softmax_impl(cur_p);
-
-        const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
-
-        cur_p->selected = idx;
-
-        float observed_surprise = -log2f(cur_p->data[idx].p);
-        float e = observed_surprise - ctx->tau;
-
-        // Update mu using the learning rate and error
-        ctx->mu = ctx->mu - ctx->eta * e;
-    },
-    /* .reset  = */ [](struct llama_sampler * smpl) {
-        auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
-        ctx->mu = 2.0f*ctx->tau;
-        ctx->rng = std::mt19937(ctx->seed);
-    },
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx;
-        auto * result = llama_sampler_init_mirostat(ctx->n_vocab, ctx->seed, ctx->tau, ctx->eta, ctx->m);
-
-        // copy the state
-        {
-            auto * result_ctx = (llama_sampler_mirostat *) smpl->ctx;
-
-            result_ctx->mu  = ctx->mu;
-            result_ctx->rng = ctx->rng;
-        }
-
-        return result;
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_mirostat *) smpl->ctx;
-    },
+    /* .apply  = */ llama_sampler_mirostat_apply,
+    /* .reset  = */ llama_sampler_mirostat_reset,
+    /* .clone  = */ llama_sampler_mirostat_clone,
+    /* .free   = */ llama_sampler_mirostat_free,
 };
 
 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
@@ -1057,59 +1131,71 @@ struct llama_sampler_mirostat_v2 {
     std::vector<float> probs;
 };
 
+static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) {
+    return "mirostat-v2";
+}
+
+static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
+
+    llama_sampler_softmax_impl(cur_p);
+
+    // Truncate the words with surprise values greater than mu
+    cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
+        return -log2f(candidate.p) > ctx->mu;
+    }));
+
+    if (cur_p->size == 0) {
+        cur_p->size = 1;
+    }
+
+    // Normalize the probabilities of the remaining words
+    llama_sampler_softmax_impl(cur_p);
+
+    const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
+
+    cur_p->selected = idx;
+
+    float observed_surprise = -log2f(cur_p->data[idx].p);
+    float e = observed_surprise - ctx->tau;
+
+    // Update mu using the learning rate and error
+    ctx->mu = ctx->mu - ctx->eta * e;
+}
+
+static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
+    ctx->mu = 2.0f*ctx->tau;
+    ctx->rng = std::mt19937(ctx->seed);
+}
+
+static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx;
+
+    auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx;
+
+        result_ctx->mu  = ctx->mu;
+        result_ctx->rng = ctx->rng;
+    }
+
+    return result;
+}
+
+static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_mirostat_v2 *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "mirostat-v2"; },
+    /* .name   = */ llama_sampler_mirostat_v2_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
-
-        llama_sampler_softmax_impl(cur_p);
-
-        // Truncate the words with surprise values greater than mu
-        cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
-            return -log2f(candidate.p) > ctx->mu;
-        }));
-
-        if (cur_p->size == 0) {
-            cur_p->size = 1;
-        }
-
-        // Normalize the probabilities of the remaining words
-        llama_sampler_softmax_impl(cur_p);
-
-        const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs);
-
-        cur_p->selected = idx;
-
-        float observed_surprise = -log2f(cur_p->data[idx].p);
-        float e = observed_surprise - ctx->tau;
-
-        // Update mu using the learning rate and error
-        ctx->mu = ctx->mu - ctx->eta * e;
-    },
-    /* .reset  = */ [](struct llama_sampler * smpl) {
-        auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
-        ctx->mu = 2.0f*ctx->tau;
-        ctx->rng = std::mt19937(ctx->seed);
-    },
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx;
-
-        auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta);
-
-        // copy the state
-        {
-            auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx;
-
-            result_ctx->mu  = ctx->mu;
-            result_ctx->rng = ctx->rng;
-        }
-
-        return result;
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_mirostat_v2 *) smpl->ctx;
-    },
+    /* .apply  = */ llama_sampler_mirostat_v2_apply,
+    /* .reset  = */ llama_sampler_mirostat_v2_reset,
+    /* .clone  = */ llama_sampler_mirostat_v2_clone,
+    /* .free   = */ llama_sampler_mirostat_v2_free,
 };
 
 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
@@ -1137,59 +1223,73 @@ struct llama_sampler_grammar {
     struct llama_grammar * grammar;
 };
 
-static struct llama_sampler_i llama_sampler_grammar_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "grammar"; },
-    /* .accept = */ [](struct llama_sampler * smpl, llama_token token) {
-        const auto * ctx = (llama_sampler_grammar *) smpl->ctx;
-        if (ctx->grammar) {
-            llama_grammar_accept_impl(*ctx->grammar, token);
-        }
-    },
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        const auto * ctx = (llama_sampler_grammar *) smpl->ctx;
-        if (ctx->grammar) {
-            llama_sampler_grammar_impl(cur_p, *ctx->grammar);
-        }
-    },
-    /* .reset  = */ [](struct llama_sampler * smpl) {
-        auto * ctx = (llama_sampler_grammar *) smpl->ctx;
-        if (!ctx->grammar) {
-            return;
-        }
+static const char * llama_sampler_grammar_name(const struct llama_sampler * /*smpl*/) {
+    return "grammar";
+}
 
-        auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str());
+static void llama_sampler_grammar_accept_impl(struct llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+    if (ctx->grammar) {
+        llama_grammar_accept_impl(*ctx->grammar, token);
+    }
+}
 
+static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+    if (ctx->grammar) {
+        llama_grammar_apply_impl(*ctx->grammar, cur_p);
+    }
+}
+
+static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+    if (!ctx->grammar) {
+        return;
+    }
+
+    auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str());
+
+    llama_grammar_free_impl(ctx->grammar);
+    ctx->grammar = grammar_new;
+}
+
+static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
+
+    auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_grammar *) result->ctx;
+
+        if (ctx->grammar) {
+            result_ctx->grammar_str  = ctx->grammar_str;
+            result_ctx->grammar_root = ctx->grammar_root;
+
+            result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar);
+        }
+    }
+
+    return result;
+}
+
+static void llama_sampler_grammar_free(struct llama_sampler * smpl) {
+    const auto * ctx = (llama_sampler_grammar *) smpl->ctx;
+
+    if (ctx->grammar) {
         llama_grammar_free_impl(ctx->grammar);
-        ctx->grammar = grammar_new;
-    },
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
+    }
 
-        auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr);
+    delete ctx;
+}
 
-        // copy the state
-        {
-            auto * result_ctx = (llama_sampler_grammar *) result->ctx;
-
-            if (ctx->grammar) {
-                result_ctx->grammar_str  = ctx->grammar_str;
-                result_ctx->grammar_root = ctx->grammar_root;
-
-                result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar);
-            }
-        }
-
-        return result;
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        const auto * ctx = (llama_sampler_grammar *) smpl->ctx;
-
-        if (ctx->grammar) {
-            llama_grammar_free_impl(ctx->grammar);
-        }
-
-        delete ctx;
-    },
+static struct llama_sampler_i llama_sampler_grammar_i = {
+    /* .name   = */ llama_sampler_grammar_name,
+    /* .accept = */ llama_sampler_grammar_accept_impl,
+    /* .apply  = */ llama_sampler_grammar_apply,
+    /* .reset  = */ llama_sampler_grammar_reset,
+    /* .clone  = */ llama_sampler_grammar_clone,
+    /* .free   = */ llama_sampler_grammar_free,
 };
 
 struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) {
@@ -1235,104 +1335,144 @@ struct llama_sampler_penalties {
     ring_buffer<llama_token> prev;
 };
 
+static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
+    return "penalties";
+}
+
+static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    if (ctx->penalty_last_n == 0) {
+        return;
+    }
+
+    ctx->prev.push_back(token);
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+
+    if (ctx->ignore_eos) {
+        assert(ctx->special_eos_id >= 0);
+
+        // optimistically check if the candidates are not yet sorted/shuffled/truncated
+        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
+            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
+        } else {
+            // else, search for the special EOS token
+            for (size_t i = 0; i < cur_p->size; ++i) {
+                if (cur_p->data[i].id == ctx->special_eos_id) {
+                    cur_p->data[i].logit = -INFINITY;
+                    break;
+                }
+            }
+        }
+    }
+
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
+    }
+
+    bool nl_found = false;
+    size_t nl_idx = 0;
+    float nl_logit = -INFINITY;
+    if (!ctx->penalize_nl) {
+        assert(ctx->linefeed_id >= 0);
+
+        // optimistically check if the candidates are not yet sorted/shuffled/truncated
+        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
+            nl_found = true;
+            nl_idx = ctx->linefeed_id;
+            nl_logit = cur_p->data[ctx->linefeed_id].logit;
+        } else {
+            // else, search for the linefeed token
+            for (size_t i = 0; i < cur_p->size; ++i) {
+                if (cur_p->data[i].id == ctx->linefeed_id) {
+                    nl_found = true;
+                    nl_idx = i;
+                    nl_logit = cur_p->data[i].logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    // Create a frequency map to count occurrences of each token in last_tokens
+    // TODO: optimize this by maintaining the token count in the sampler context
+    using llama_token_cnt = std::unordered_map<llama_token, int>;
+    llama_token_cnt token_count;
+
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        token_count[ctx->prev.rat(i)]++;
+    }
+
+    // Apply frequency and presence penalties to the cur_p
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const auto token_iter = token_count.find(cur_p->data[i].id);
+        if (token_iter == token_count.end()) {
+            continue;
+        }
+
+        const int count = token_iter->second;
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (cur_p->data[i].logit <= 0) {
+            cur_p->data[i].logit *= ctx->penalty_repeat;
+        } else {
+            cur_p->data[i].logit /= ctx->penalty_repeat;
+        }
+
+        cur_p->data[i].logit -= float(count) * ctx->penalty_freq + float(count > 0) * ctx->penalty_present;
+    }
+
+    cur_p->sorted = false;
+
+    if (!ctx->penalize_nl && nl_found) {
+        // restore the logit of the newline token if it was penalized
+        cur_p->data[nl_idx].logit = nl_logit;
+    }
+}
+
+static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->prev.clear();
+}
+
+static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
+    auto * result = llama_sampler_init_penalties(
+            ctx->n_vocab,
+            ctx->special_eos_id,
+            ctx->linefeed_id,
+            ctx->penalty_last_n,
+            ctx->penalty_repeat,
+            ctx->penalty_freq,
+            ctx->penalty_present,
+            ctx->penalize_nl,
+            ctx->ignore_eos);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_penalties *) result->ctx;
+
+        result_ctx->prev = ctx->prev;
+    }
+
+    return result;
+}
+
+static void llama_sampler_penalties_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_penalties *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_penalties_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "penalties"; },
-    /* .accept = */ [](struct llama_sampler * smpl, llama_token token) {
-        auto * ctx = (llama_sampler_penalties *) smpl->ctx;
-        ctx->prev.push_back(token);
-    },
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        auto * ctx = (llama_sampler_penalties *) smpl->ctx;
-
-        if (ctx->ignore_eos) {
-            assert(ctx->special_eos_id >= 0);
-
-            // optimistically check if the candidates are not yet sorted/shuffled/truncated
-            if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-                cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-            } else {
-                // else, search for the special EOS token
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (cur_p->data[i].id == ctx->special_eos_id) {
-                        cur_p->data[i].logit = -INFINITY;
-                        break;
-                    }
-                }
-            }
-        }
-
-        if ((ctx->penalty_last_n == 0) ||
-            (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-            return;
-        }
-
-        bool nl_found = false;
-        size_t nl_idx = 0;
-        float nl_logit = -INFINITY;
-        if (!ctx->penalize_nl) {
-            assert(ctx->linefeed_id >= 0);
-
-            // optimistically check if the candidates are not yet sorted/shuffled/truncated
-            if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-                nl_found = true;
-                nl_idx = ctx->linefeed_id;
-                nl_logit = cur_p->data[ctx->linefeed_id].logit;
-            } else {
-                // else, search for the linefeed token
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (cur_p->data[i].id == ctx->linefeed_id) {
-                        nl_found = true;
-                        nl_idx = i;
-                        nl_logit = cur_p->data[i].logit;
-                        break;
-                    }
-                }
-            }
-        }
-
-        // Create a frequency map to count occurrences of each token in last_tokens
-        // TODO: optimize this by maintaining the token count in the sampler context
-        llama_token_cnt token_count;
-        for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-            token_count[ctx->prev.rat(i)]++;
-        }
-
-        llama_sampler_penalties_impl(cur_p, token_count, ctx->penalty_repeat, ctx->penalty_freq, ctx->penalty_present);
-
-        if (!ctx->penalize_nl && nl_found) {
-            // restore the logit of the newline token if it was penalized
-            cur_p->data[nl_idx].logit = nl_logit;
-        }
-    },
-    /* .reset  = */ [](struct llama_sampler * smpl) {
-        auto * ctx = (llama_sampler_penalties *) smpl->ctx;
-        ctx->prev.clear();
-    },
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
-        auto * result = llama_sampler_init_penalties(
-                ctx->n_vocab,
-                ctx->special_eos_id,
-                ctx->linefeed_id,
-                ctx->penalty_last_n,
-                ctx->penalty_repeat,
-                ctx->penalty_freq,
-                ctx->penalty_present,
-                ctx->penalize_nl,
-                ctx->ignore_eos);
-
-        // copy the state
-        {
-            auto * result_ctx = (llama_sampler_penalties *) result->ctx;
-
-            result_ctx->prev = ctx->prev;
-        }
-
-        return result;
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_penalties *) smpl->ctx;
-    },
+    /* .name   = */ llama_sampler_penalties_name,
+    /* .accept = */ llama_sampler_penalties_accept,
+    /* .apply  = */ llama_sampler_penalties_apply,
+    /* .reset  = */ llama_sampler_penalties_reset,
+    /* .clone  = */ llama_sampler_penalties_clone,
+    /* .free   = */ llama_sampler_penalties_free,
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
@@ -1346,11 +1486,11 @@ struct llama_sampler * llama_sampler_init_penalties(
         bool penalize_nl,
         bool ignore_eos) {
     if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = false;
+        penalize_nl = true;
     }
 
     if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = true;
+        ignore_eos = false;
     }
 
     return new llama_sampler {
@@ -1380,41 +1520,50 @@ struct llama_sampler_logit_bias {
     std::vector<llama_logit_bias> to_search;
 };
 
+static const char * llama_sampler_logit_bias_name(const struct llama_sampler * /*smpl*/) {
+    return "logit-bias";
+}
+
+static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;
+
+    ctx->to_search.clear();
+
+    // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id)
+    for (const auto & lb : ctx->logit_bias) {
+        if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) {
+            cur_p->data[lb.token].logit += lb.bias;
+        } else {
+            ctx->to_search.push_back(lb);
+        }
+    }
+
+    // search for the remaining candidates that were not found in the previous step
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        for (const auto & lb : ctx->to_search) {
+            if (cur_p->data[i].id == lb.token) {
+                cur_p->data[i].logit += lb.bias;
+                break;
+            }
+        }
+    }
+}
+static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
+    return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
+}
+
+static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_logit_bias *) smpl->ctx;
+}
+
 static struct llama_sampler_i llama_sampler_logit_bias_i = {
-    /* .name   = */ [](const struct llama_sampler * /*smpl*/) { return "logit-bias"; },
+    /* .name   = */ llama_sampler_logit_bias_name,
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-        auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;
-
-        ctx->to_search.clear();
-
-        // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id)
-        for (const auto & lb : ctx->logit_bias) {
-            if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) {
-                cur_p->data[lb.token].logit += lb.bias;
-            } else {
-                ctx->to_search.push_back(lb);
-            }
-        }
-
-        // search for the remaining candidates that were not found in the previous step
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            for (const auto & lb : ctx->to_search) {
-                if (cur_p->data[i].id == lb.token) {
-                    cur_p->data[i].logit += lb.bias;
-                    break;
-                }
-            }
-        }
-    },
+    /* .apply  = */ llama_sampler_logit_bias_apply,
     /* .reset  = */ nullptr,
-    /* .clone  = */ [](const struct llama_sampler * smpl) {
-        const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
-        return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
-    },
-    /* .free   = */ [](struct llama_sampler * smpl) {
-        delete (llama_sampler_logit_bias *) smpl->ctx;
-    },
+    /* .clone  = */ llama_sampler_logit_bias_clone,
+    /* .free   = */ llama_sampler_logit_bias_free,
 };
 
 struct llama_sampler * llama_sampler_init_logit_bias(
@@ -1429,4 +1578,4 @@ struct llama_sampler * llama_sampler_init_logit_bias(
             /* .to_search  = */ {},
         },
     };
-}
+}
\ No newline at end of file
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
index 137c0025c..d90b14713 100644
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@@ -23,16 +23,6 @@ struct llama_sampler_chain {
     mutable int32_t n_sample;
 };
 
-using llama_token_cnt = std::unordered_map<llama_token, int>;
-
-// TODO: tmp exposed until test-sampling is fixed
-void llama_sampler_penalties_impl(
-       llama_token_data_array * cur_p,
-        const llama_token_cnt & token_count,
-                        float   penalty_repeat,
-                        float   penalty_freq,
-                        float   penalty_present);
-
 struct llama_sampler * llama_sampler_init_grammar_impl(
         const struct llama_vocab & vocab,
                       const char * grammar_str,
diff --git a/src/llama.cpp b/src/llama.cpp
index 3c26d1573..0687cf463 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6449,6 +6449,11 @@ static void llm_load_vocab(
                         )
                    ) {
                     vocab.special_eot_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                     break;
                 }
             }
@@ -6462,6 +6467,11 @@ static void llm_load_vocab(
             const auto & t = vocab.token_to_id.find("<|eom_id|>");
             if (t != vocab.token_to_id.end()) {
                 vocab.special_eom_id = t->second;
+                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                        __func__, t->first.c_str());
+                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
             }
         }
     }
@@ -16143,6 +16153,13 @@ static int llama_decode_internal(
         return -1;
     }
 
+    for (uint32_t i = 0; i < n_tokens_all; ++i) {
+        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
+            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+            return -1;
+        }
+    }
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
@@ -16435,6 +16452,13 @@ static int llama_encode_internal(
         return -1;
     }
 
+    for (uint32_t i = 0; i < n_tokens; ++i) {
+        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
+            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+            return -1;
+        }
+    }
+
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
new file mode 100644
index 000000000..9ad91acc0
--- /dev/null
+++ b/tests/test-arg-parser.cpp
@@ -0,0 +1,119 @@
+#include <string>
+#include <vector>
+#include <sstream>
+#include <unordered_set>
+
+#undef NDEBUG
+#include <cassert>
+
+#include "common.h"
+
+int main(void) {
+    gpt_params params;
+
+    printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
+    for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
+        try {
+            auto options = gpt_params_parser_init(params, (enum llama_example)ex);
+            std::unordered_set<std::string> seen_args;
+            std::unordered_set<std::string> seen_env_vars;
+            for (const auto & opt : options) {
+                // check for args duplications
+                for (const auto & arg : opt.args) {
+                    if (seen_args.find(arg) == seen_args.end()) {
+                        seen_args.insert(arg);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
+                        exit(1);
+                    }
+                }
+                // check for env var duplications
+                if (opt.env) {
+                    if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
+                        seen_env_vars.insert(opt.env);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
+                        exit(1);
+                    }
+                }
+            }
+        } catch (std::exception & e) {
+            printf("%s\n", e.what());
+            assert(false);
+        }
+    }
+
+    auto list_str_to_char = [](std::vector<std::string> & argv) -> std::vector<char *> {
+        std::vector<char *> res;
+        for (auto & arg : argv) {
+            res.push_back(const_cast<char *>(arg.data()));
+        }
+        return res;
+    };
+
+    std::vector<std::string> argv;
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
+
+    printf("test-arg-parser: test invalid usage\n\n");
+
+    argv = {"binary_name", "-m"};
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+
+    argv = {"binary_name", "-ngl", "hello"};
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+
+    argv = {"binary_name", "-sm", "hello"};
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+
+
+    printf("test-arg-parser: test valid usage\n\n");
+
+    argv = {"binary_name", "-m", "model_file.gguf"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(params.model == "model_file.gguf");
+
+    argv = {"binary_name", "-t", "1234"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(params.cpuparams.n_threads == 1234);
+
+    argv = {"binary_name", "--verbose"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(params.verbosity == 1);
+
+    argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(params.model == "abc.gguf");
+    assert(params.n_predict == 6789);
+    assert(params.n_batch == 9090);
+
+// skip this part on windows, because setenv is not supported
+#ifdef _WIN32
+    printf("test-arg-parser: skip on windows build\n");
+#else
+    printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n");
+
+    setenv("LLAMA_ARG_THREADS", "blah", true);
+    argv = {"binary_name"};
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+
+    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
+    setenv("LLAMA_ARG_THREADS", "1010", true);
+    argv = {"binary_name"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(params.model == "blah.gguf");
+    assert(params.cpuparams.n_threads == 1010);
+
+
+    printf("test-arg-parser: test environment variables being overwritten\n\n");
+
+    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
+    setenv("LLAMA_ARG_THREADS", "1010", true);
+    argv = {"binary_name", "-m", "overwritten.gguf"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(params.model == "overwritten.gguf");
+    assert(params.cpuparams.n_threads == 1010);
+#endif // _WIN32
+
+
+    printf("test-arg-parser: all tests OK\n\n");
+}