merged, try to fix metal build

2026-05-09 19:46:11 +00:00 · 2024-03-14 11:15:50 +08:00 · 2024-03-14 11:15:50 +08:00 · ec5dea14d7
commit ec5dea14d7
parent 9f102b9db6 19885d205e
29 changed files with 1541 additions and 967 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -484,6 +484,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_batch = std::stoi(argv[i]);
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ubatch = std::stoi(argv[i]);
        } else if (arg == "--keep") {
            if (++i >= argc) {
                invalid_param = true;
@ -978,7 +984,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        binary file containing multiple choice tasks.\n");
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
-    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  -b N, --batch-size N  logical maximum batch size (default: %d)\n", params.n_batch);
+    printf("  -ub N, --ubatch-size N\n");
+    printf("                        physical maximum batch size (default: %d)\n", params.n_ubatch);
    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n");
    printf("                        (default: %s)\n", sampler_type_names.c_str());
    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
@ -1288,8 +1296,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    auto cparams = llama_context_default_params();

    cparams.n_ctx             = params.n_ctx;
-    cparams.n_batch           = params.n_batch;
    cparams.n_seq_max         = params.n_parallel;
+    cparams.n_batch           = params.n_batch;
+    cparams.n_ubatch          = params.n_ubatch;
    cparams.n_threads         = params.n_threads;
    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    cparams.seed              = params.seed;
@ -1380,6 +1389,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_kv_cache_clear(lctx);
+        llama_synchronize(lctx);
        llama_reset_timings(lctx);
    }

--- a/common/common.h
+++ b/common/common.h
@ -45,7 +45,8 @@ struct gpt_params {
    int32_t n_threads_batch_draft = -1;
    int32_t n_predict             = -1;    // new tokens to predict
    int32_t n_ctx                 = 512;   // context size
-    int32_t n_batch               = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
    int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -17,6 +17,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
            return nullptr;
        }

+        // Ensure that there is a "root" node.
+        if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
+            fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+            delete result;
+            return nullptr;
+        }
+
        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());

        result->grammar = llama_grammar_init(