Merge branch 'upstream' into concedo_experimental

# Conflicts: # .flake8 # .github/workflows/bench.yml # .github/workflows/python-lint.yml # .pre-commit-config.yaml # Makefile # README.md # models/ggml-vocab-bert-bge.gguf.inp # models/ggml-vocab-bert-bge.gguf.out # models/ggml-vocab-deepseek-coder.gguf.inp # models/ggml-vocab-deepseek-coder.gguf.out # models/ggml-vocab-deepseek-llm.gguf.inp # models/ggml-vocab-deepseek-llm.gguf.out # models/ggml-vocab-falcon.gguf.inp # models/ggml-vocab-falcon.gguf.out # models/ggml-vocab-gpt-2.gguf.inp # models/ggml-vocab-gpt-2.gguf.out # models/ggml-vocab-llama-bpe.gguf.inp # models/ggml-vocab-llama-bpe.gguf.out # models/ggml-vocab-llama-spm.gguf.inp # models/ggml-vocab-llama-spm.gguf.out # models/ggml-vocab-mpt.gguf.inp # models/ggml-vocab-mpt.gguf.out # models/ggml-vocab-phi-3.gguf # models/ggml-vocab-phi-3.gguf.inp # models/ggml-vocab-phi-3.gguf.out # models/ggml-vocab-refact.gguf # models/ggml-vocab-starcoder.gguf.inp # models/ggml-vocab-starcoder.gguf.out # requirements/requirements-convert.txt # scripts/compare-llama-bench.py # scripts/run-with-preset.py # scripts/verify-checksum-models.py # tests/CMakeLists.txt # tests/test-tokenizer-0.cpp
2025-09-15 03:19:41 +00:00 · 2024-05-06 18:09:45 +08:00 · 2024-05-06 18:09:45 +08:00 · 6c000cbe7a
commit 6c000cbe7a
parent 173c7272d5 bcdee0daa7
40 changed files with 1593 additions and 936 deletions
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@ -33,6 +33,7 @@ struct split_params {
    int n_split_tensors = 128;
    std::string input;
    std::string output;
+    bool no_tensor_first_split = false;
    bool dry_run = false;
 };

@ -50,6 +51,7 @@ static void split_print_usage(const char * executable) {
    printf("  --merge                 merge multiple GGUF to a single GGUF\n");
    printf("  --split-max-tensors     max tensors in each split (default: %d)\n", default_params.n_split_tensors);
    printf("  --split-max-size N(M|G) max size per split\n");
+    printf("  --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
    printf("  --dry-run               only print out a split plan and exit, without writing any new files\n");
    printf("\n");
 }
@ -101,6 +103,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
            arg_found = true;
            params.dry_run = true;
        }
+        if (arg == "--no-tensor-first-split") {
+            arg_found = true;
+            params.no_tensor_first_split = true;
+        }

        if (is_op_set) {
            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@ -201,10 +207,10 @@ struct split_strategy {
        // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
        int i_split = -1;
        struct gguf_context * ctx_out = NULL;
-        auto new_ctx_out = [&]() {
+        auto new_ctx_out = [&](bool allow_no_tensors) {
            i_split++;
            if (ctx_out != NULL) {
-                if (gguf_get_n_tensors(ctx_out) == 0) {
+                if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
                    fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
                    exit(EXIT_FAILURE);
                }
@ -221,7 +227,12 @@ struct split_strategy {
        };

        // initialize ctx_out for the first split
-        new_ctx_out();
+        new_ctx_out(false);
+
+        // skip first split if no_tensor_first_split is set
+        if (params.no_tensor_first_split) {
+            new_ctx_out(true);
+        }

        // process tensors one by one
        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@ -231,7 +242,7 @@ struct split_strategy {
            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
            size_t next_tensors_size = curr_tensors_size + n_bytes;
            if (should_split(i, next_tensors_size)) {
-                new_ctx_out();
+                new_ctx_out(false);
                curr_tensors_size = n_bytes;
            } else {
                curr_tensors_size = next_tensors_size;