Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	README.md
#	examples/parallel/parallel.cpp
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-blas/CMakeLists.txt
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/gguf.cpp
#	scripts/sync-ggml.last
#	tests/test-gguf.cpp
Concedo 2025-06-02 23:26:43 +08:00
commit b42b618897
8 changed files with 89 additions and 37 deletions


@@ -3814,7 +3814,7 @@ class BertModel(TextModel):
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3827,7 +3827,7 @@ class BertModel(TextModel):
             tokenizer = SentencePieceProcessor()
             tokenizer.LoadFromFile(str(tokenizer_path))
-            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
@@ -3857,33 +3857,26 @@ class BertModel(TextModel):
             unk_token = tokenizer_config_json.get("unk_token")
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
-            for token_id in range(vocab_size):
+            for token_id in range(tokenizer.vocab_size):
-                piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-            if vocab_size > len(tokens):
-                pad_count = vocab_size - len(tokens)
-                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-                for i in range(1, pad_count + 1):
-                    tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.UNUSED)
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
@@ -3896,6 +3889,12 @@ class BertModel(TextModel):
                 SentencePieceTokenTypes.UNKNOWN,
             ] + toktypes[3:-1]
+        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+            # Add mask token missing from sentencepiece.bpe.model
+            tokens[250001] = b'<mask>'
+            scores[250001] = 0.0
+            toktypes[250001] = SentencePieceTokenTypes.CONTROL
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)


@@ -2108,9 +2108,6 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
     GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
-    GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);


@@ -32,6 +32,8 @@
 extern "C" {
 #endif
+void ggml_print_backtrace(void);
 #ifndef MIN
 #    define MIN(a, b) ((a) < (b) ? (a) : (b))
 #endif


@@ -1668,7 +1668,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_
         return {64, 32};
     }
     return {64, 64};
-};
+}
 static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {


@@ -134,7 +134,7 @@ static void ggml_print_backtrace_symbols(void) {
 }
 #endif
-static void ggml_print_backtrace(void) {
+void ggml_print_backtrace(void) {
     const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
     if (GGML_NO_BACKTRACE) {
         return;
@@ -161,6 +161,10 @@ static void ggml_print_backtrace(void) {
     const int parent_pid = getpid();
     const int child_pid = fork();
     if (child_pid < 0) { // error
+#if defined(__linux__)
+        close(lock[1]);
+        close(lock[0]);
+#endif
         return;
     } else if (child_pid == 0) { // child
         char attach[32];
@@ -168,6 +172,7 @@ static void ggml_print_backtrace(void) {
 #if defined(__linux__)
         close(lock[1]);
         (void) !read(lock[0], lock, 1);
+        close(lock[0]);
 #endif
         // try gdb
         execlp("gdb", "gdb", "--batch",
@@ -196,7 +201,7 @@ static void ggml_print_backtrace(void) {
     }
 }
 #else
-static void ggml_print_backtrace(void) {
+void ggml_print_backtrace(void) {
     // platform not supported
 }
 #endif
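
The two ggml.c hunks above close the "lock" pipe descriptors on the fork-error path and in the child after the handshake byte is read. For context, ggml_print_backtrace forks a child that attaches gdb or lldb to the parent, and the pipe lets the parent release the child only once tracing has been permitted. Below is a minimal standalone sketch of that handshake pattern, not the ggml code itself; backtrace_via_child and its simplified error handling are illustrative only.

#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

// Illustrative sketch: a parent forks a child that would attach a debugger;
// a pipe ("lock") lets the parent release the child only after tracing is
// allowed. Every pipe end is closed on every path, which is what the added
// close() calls in the hunks above ensure.
static void backtrace_via_child(void) {
    int lock[2];
    if (pipe(lock) != 0) {
        return;
    }
    pid_t child = fork();
    if (child < 0) {            // fork failed: close both ends before returning
        close(lock[1]);
        close(lock[0]);
        return;
    } else if (child == 0) {    // child: wait for the parent's go-ahead byte
        close(lock[1]);
        char byte;
        (void) !read(lock[0], &byte, 1);
        close(lock[0]);
        // the real implementation exec()s gdb or lldb against the parent pid here
        _exit(0);
    } else {                    // parent: permit tracing, then release the child
        close(lock[0]);
        (void) !write(lock[1], "x", 1);
        close(lock[1]);
        waitpid(child, NULL, 0);
    }
}

int main(void) {
    backtrace_via_child();
    return 0;
}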
@@ -217,6 +222,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
     abort();
 }
+// ggml_print_backtrace is registered with std::set_terminate by ggml.cpp
 //
 // logging
 //

ggml/src/ggml.cpp (new file, 26 lines)

@@ -0,0 +1,26 @@
+#include "ggml-impl.h"
+#include <cstdlib>
+#include <exception>
+static std::terminate_handler previous_terminate_handler;
+GGML_NORETURN static void ggml_uncaught_exception() {
+    ggml_print_backtrace();
+    if (previous_terminate_handler) {
+        previous_terminate_handler();
+    }
+    abort(); // unreachable unless previous_terminate_handler was nullptr
+}
+static bool ggml_uncaught_exception_init = []{
+    const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
+    if (GGML_NO_BACKTRACE) {
+        return false;
+    }
+    const auto prev{std::get_terminate()};
+    GGML_ASSERT(prev != ggml_uncaught_exception);
+    previous_terminate_handler = prev;
+    std::set_terminate(ggml_uncaught_exception);
+    return true;
+}();
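
The new translation unit above installs ggml_print_backtrace as the std::terminate handler during static initialization (skipped when GGML_NO_BACKTRACE is set) and chains to whatever handler was registered before it, so an uncaught C++ exception now prints a backtrace before the process aborts. A rough standalone illustration of the same mechanism follows, with a stderr message standing in for the real backtrace helper; on_uncaught_exception and handler_installed are illustrative names.

#include <cstdio>
#include <cstdlib>
#include <exception>
#include <stdexcept>

static std::terminate_handler previous_handler;

// Stand-in for ggml_print_backtrace(): runs when std::terminate() is reached
// (e.g. on an uncaught exception), then defers to the previously installed handler.
static void on_uncaught_exception() {
    std::fputs("uncaught exception: a backtrace would be printed here\n", stderr);
    if (previous_handler) {
        previous_handler();
    }
    std::abort(); // only reached if no previous handler was installed
}

// Mirrors ggml_uncaught_exception_init: register the handler before main() runs.
[[maybe_unused]] static bool handler_installed = [] {
    previous_handler = std::get_terminate();
    std::set_terminate(on_uncaught_exception);
    return true;
}();

int main() {
    throw std::runtime_error("boom"); // reaches on_uncaught_exception(), then aborts
}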


@@ -358,13 +358,29 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     isggufv1 = false;
     if (ok && gr.read(ctx->version)) {
-        if (ctx->version == 1) {
+        if (ok && ctx->version == 0) {
+            GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version);
+            ok = false;
+        }
+        /*
+         * bit layout is different when reading non-native endian models.
+         * assuming that the GGUF version is 3, the non-native endian model
+         * would read it as 0x30000000. we can use the AND operation against
+         * the last 4 hexadecimal digits to check if the model is the same
+         * endianness as the host system.
+         */
+        if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) {
+            GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version);
+            ok = false;
+        }
+        if (ok && ctx->version == 1) {
             fprintf(stderr, "%s: GGUFv1 is deprecated, please use a more up-to-date version!\n", __func__);
             // ok = false;
             isggufv1 = true;
         }
-        if (ctx->version > GGUF_VERSION) {
-            fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software is designed for version %d, things may break.\n",
+        if (ok && ctx->version > GGUF_VERSION) {
+            GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
                 __func__, ctx->version, GGUF_VERSION);
             // ok = false;
         }
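
The endianness heuristic added above relies on real GGUF versions being small integers: byte-swapping a small 32-bit value leaves its low 16 bits zero, so (version & 0x0000FFFF) == 0 strongly suggests the file was written with the opposite byte order. Here is a small standalone illustration of the check, independent of gguf.cpp; bswap32 is a local helper for the demo, not a gguf API.

#include <cstdint>
#include <cstdio>

// Byte-swap a 32-bit value, mimicking what happens when a GGUF file written
// on a machine of one endianness is read verbatim on the other.
static uint32_t bswap32(uint32_t v) {
    return ((v & 0x000000FFu) << 24) |
           ((v & 0x0000FF00u) <<  8) |
           ((v & 0x00FF0000u) >>  8) |
           ((v & 0xFF000000u) >> 24);
}

int main() {
    const uint32_t version = 3;                 // a real GGUF version
    const uint32_t swapped = bswap32(version);  // what a mismatched host would read

    // The check from the hunk above: a sane version has non-zero low 16 bits,
    // while a byte-swapped small version does not.
    std::printf("native : 0x%08X -> %s\n", version,
                (version & 0x0000FFFFu) == 0 ? "endianness mismatch?" : "ok");
    std::printf("swapped: 0x%08X -> %s\n", swapped,
                (swapped & 0x0000FFFFu) == 0 ? "endianness mismatch?" : "ok");
    return 0;
}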


@@ -2319,9 +2319,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     std::string model_name;
     std::string tokenizer_pre;
+    std::string general_arch;
     ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
     ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+    ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
     // model name to lowercase
     std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2330,8 +2332,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }
     );
-    // set attributes by model/tokenizer name
-    if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+    // set attributes by model/tokenizer/architecture name
+    if (false
+        || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
+        || _contains_any(general_arch, {"nomic-bert-moe"})
+        ) {
         _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
     } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
         for (auto id : cache_special_tokens) {
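
_contains_any in the hunk above does a substring match of a name against a list of candidates; the if (false || ... || ...) formatting simply lets each alternative sit on its own line. A simplified stand-in follows (the real helper in llama-vocab.cpp may differ in signature), showing how the new architecture-based match selects nomic-bert-moe; contains_any here is an illustrative name.

#include <initializer_list>
#include <string>

// Simplified stand-in for the _contains_any helper used above: true if
// `haystack` contains any of the given substrings.
static bool contains_any(const std::string & haystack,
                         std::initializer_list<std::string> needles) {
    for (const auto & n : needles) {
        if (haystack.find(n) != std::string::npos) {
            return true;
        }
    }
    return false;
}

int main() {
    const std::string general_arch = "nomic-bert-moe";
    // Mirrors the new condition: the <mask> token gets LSTRIP for this architecture.
    return contains_any(general_arch, {"nomic-bert-moe"}) ? 0 : 1;
}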