Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 17:44:38 +00:00)
Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	README.md
#	examples/parallel/parallel.cpp
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-blas/CMakeLists.txt
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/gguf.cpp
#	scripts/sync-ggml.last
#	tests/test-gguf.cpp
Commit b42b618897: 8 changed files with 89 additions and 37 deletions
convert_hf_to_gguf.py

@@ -3814,7 +3814,7 @@ class BertModel(TextModel):
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
 
-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3827,7 +3827,7 @@ class BertModel(TextModel):
             tokenizer = SentencePieceProcessor()
             tokenizer.LoadFromFile(str(tokenizer_path))
 
-            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
@@ -3857,33 +3857,26 @@ class BertModel(TextModel):
             unk_token = tokenizer_config_json.get("unk_token")
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
 
-            for token_id in range(vocab_size):
+            for token_id in range(tokenizer.vocab_size):
                 piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
-
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
-
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-
-            if vocab_size > len(tokens):
-                pad_count = vocab_size - len(tokens)
-                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-                for i in range(1, pad_count + 1):
-                    tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.UNUSED)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
 
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
@@ -3896,6 +3889,12 @@ class BertModel(TextModel):
             SentencePieceTokenTypes.UNKNOWN,
         ] + toktypes[3:-1]
 
+        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+            # Add mask token missing from sentencepiece.bpe.model
+            tokens[250001] = b'<mask>'
+            scores[250001] = 0.0
+            toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
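Note on the conversion hunks above: vocab_size is now the larger of the model's hparams value and the tokenizer's count, every slot is pre-filled with a [PAD{i}] placeholder, and only ids the tokenizer actually defines get overwritten, which is what lets the NOMIC_BERT_MOE branch patch the <mask> slot (id 250001) by hand. A minimal C++ sketch of that sizing rule; the sizes below are illustrative assumptions, only the 250001 index comes from the diff:

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    // illustrative sizes, not taken from the commit
    const size_t hparams_vocab   = 250002; // embedding rows the model expects
    const size_t tokenizer_vocab = 250001; // ids the tokenizer actually defines
    const size_t vocab_size = std::max(hparams_vocab, tokenizer_vocab);

    // pre-fill every slot so undefined ids stay as deterministic placeholders
    std::vector<std::string> tokens(vocab_size);
    for (size_t i = 0; i < vocab_size; ++i) {
        tokens[i] = "[PAD" + std::to_string(i) + "]";
    }

    // ...tokenizer-defined pieces overwrite their own ids here...

    tokens[250001] = "<mask>"; // the slot the NOMIC_BERT_MOE branch patches by hand
    std::printf("%zu tokens, id 250001 = %s\n", tokens.size(), tokens[250001].c_str());
    return 0;
}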
ggml/include/ggml.h

@@ -2108,9 +2108,6 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
     GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
 
-    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
-
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
ggml/src/ggml-impl.h

@@ -32,6 +32,8 @@
 extern "C" {
 #endif
 
+void ggml_print_backtrace(void);
+
 #ifndef MIN
 #    define MIN(a, b) ((a) < (b) ? (a) : (b))
 #endif
ggml/src/ggml-vulkan/ggml-vulkan.cpp

@@ -1668,7 +1668,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_
         return {64, 32};
     }
     return {64, 64};
-};
+}
 
 static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {
 
ggml/src/ggml.c

@@ -134,7 +134,7 @@ static void ggml_print_backtrace_symbols(void) {
 }
 #endif
 
-static void ggml_print_backtrace(void) {
+void ggml_print_backtrace(void) {
     const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
     if (GGML_NO_BACKTRACE) {
         return;
@@ -161,6 +161,10 @@ static void ggml_print_backtrace(void) {
     const int parent_pid = getpid();
     const int child_pid = fork();
     if (child_pid < 0) { // error
+#if defined(__linux__)
+        close(lock[1]);
+        close(lock[0]);
+#endif
         return;
     } else if (child_pid == 0) { // child
         char attach[32];
@@ -168,6 +172,7 @@ static void ggml_print_backtrace(void) {
 #if defined(__linux__)
         close(lock[1]);
         (void) !read(lock[0], lock, 1);
+        close(lock[0]);
 #endif
         // try gdb
         execlp("gdb", "gdb", "--batch",
@@ -196,7 +201,7 @@ static void ggml_print_backtrace(void) {
         }
     }
 #else
-static void ggml_print_backtrace(void) {
+void ggml_print_backtrace(void) {
     // platform not supported
 }
 #endif
@@ -217,6 +222,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
     abort();
 }
 
+// ggml_print_backtrace is registered with std::set_terminate by ggml.cpp
+
 //
 // logging
 //
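Note on the two close() additions above: ggml_print_backtrace() opens a pipe named lock before fork() so the child can wait for the parent, and the fix closes both ends on the fork-error path plus the leftover read end in the child. A self-contained sketch of that pipe-and-fork discipline; the lock name mirrors the patch, everything else here is assumed:

#include <unistd.h>

int main() {
    int lock[2]; // [0] = read end, [1] = write end
    if (pipe(lock) != 0) {
        return 1;
    }
    const pid_t pid = fork();
    if (pid < 0) { // error: the pipe will never be used, release both ends
        close(lock[1]);
        close(lock[0]);
        return 1;
    } else if (pid == 0) { // child: block until the parent writes or exits
        close(lock[1]);
        char c;
        (void) !read(lock[0], &c, 1);
        close(lock[0]); // the read end the patch now closes
        _exit(0);
    }
    close(lock[0]); // parent: keep only the write end
    // ... parent work (e.g. let a debugger attach) ...
    close(lock[1]); // unblocks the child's read with EOF
    return 0;
}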
ggml/src/ggml.cpp (new file, +26)

@@ -0,0 +1,26 @@
+#include "ggml-impl.h"
+
+#include <cstdlib>
+#include <exception>
+
+static std::terminate_handler previous_terminate_handler;
+
+GGML_NORETURN static void ggml_uncaught_exception() {
+    ggml_print_backtrace();
+    if (previous_terminate_handler) {
+        previous_terminate_handler();
+    }
+    abort(); // unreachable unless previous_terminate_handler was nullptr
+}
+
+static bool ggml_uncaught_exception_init = []{
+    const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
+    if (GGML_NO_BACKTRACE) {
+        return false;
+    }
+    const auto prev{std::get_terminate()};
+    GGML_ASSERT(prev != ggml_uncaught_exception);
+    previous_terminate_handler = prev;
+    std::set_terminate(ggml_uncaught_exception);
+    return true;
+}();
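Note on the new ggml/src/ggml.cpp above: a static lambda initializer chains ggml's handler in front of whatever std::terminate handler was already installed, so uncaught C++ exceptions also produce a backtrace, and GGML_NO_BACKTRACE skips registration entirely. The same pattern in isolation, as a sketch rather than the ggml code:

#include <cstdio>
#include <cstdlib>
#include <exception>

static std::terminate_handler prev_handler;

[[noreturn]] static void on_terminate() {
    std::fputs("uncaught exception, dumping state\n", stderr);
    if (prev_handler) {
        prev_handler(); // the previous handler normally aborts
    }
    std::abort(); // only reached if prev_handler was null
}

// runs before main(), like ggml_uncaught_exception_init above
static bool registered = [] {
    prev_handler = std::get_terminate();
    std::set_terminate(on_terminate);
    return true;
}();

int main() {
    throw 42; // lands in on_terminate instead of terminating silently
}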
ggml/src/gguf.cpp

@@ -358,13 +358,29 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     isggufv1 = false;
 
     if (ok && gr.read(ctx->version)) {
-        if (ctx->version == 1) {
+        if (ok && ctx->version == 0) {
+            GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version);
+            ok = false;
+        }
+
+        /*
+         * bit layout is different when reading non-native endian models.
+         * assuming that the GGUF version is 3, the non-native endian model
+         * would read it as 0x03000000. we can use the AND operation against
+         * the last 4 hexadecimal digits to check if the model is the same
+         * endianness as the host system.
+         */
+        if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) {
+            GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version);
+            ok = false;
+        }
+
+        if (ok && ctx->version == 1) {
             fprintf(stderr, "%s: GGUFv1 is deprecated, please use a more up-to-date version!\n", __func__);
             // ok = false;
             isggufv1 = true;
         }
-        if (ctx->version > GGUF_VERSION) {
-            fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software is designed for version %d, things may break.\n",
+        if (ok && ctx->version > GGUF_VERSION) {
+            GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
                 __func__, ctx->version, GGUF_VERSION);
             // ok = false;
         }
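Note on the version checks above: GGUF stores the version as a little-endian uint32 right after the magic, so a file written on a machine of the opposite endianness reads back byte-swapped (version 3 becomes 0x03000000) with its low 16 bits all zero, which is what the new mask test catches. A quick sketch of that arithmetic; byteswap32 is a helper written for this example:

#include <cstdint>
#include <cstdio>

// portable 32-bit byte swap (helper written for this sketch)
static uint32_t byteswap32(uint32_t v) {
    return (v >> 24) | ((v >> 8) & 0x0000FF00u) |
           ((v << 8) & 0x00FF0000u) | (v << 24);
}

int main() {
    const uint32_t version = 3;                   // what the writer stored
    const uint32_t foreign = byteswap32(version); // what a wrong-endian reader sees
    std::printf("native  0x%08x low16==0: %d\n", version, (version & 0x0000FFFFu) == 0);
    std::printf("swapped 0x%08x low16==0: %d\n", foreign, (foreign & 0x0000FFFFu) == 0);
    // every real GGUF version is a small number, so its low 16 bits are
    // never all zero; the swapped form always trips the check
    return 0;
}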
src/llama-vocab.cpp

@@ -2319,9 +2319,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         std::string model_name;
         std::string tokenizer_pre;
+        std::string general_arch;
 
         ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+        ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
 
         // model name to lowercase
         std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2330,8 +2332,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         );
 
-        // set attributes by model/tokenizer name
-        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+        // set attributes by model/tokenizer/architecture name
+        if (false
+            || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
+            || _contains_any(general_arch, {"nomic-bert-moe"})
+        ) {
             _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
         } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
             for (auto id : cache_special_tokens) {
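Note on the llama-vocab.cpp hunks above: the loader now also reads general.architecture so the <mask> LSTRIP attribute can be keyed on the nomic-bert-moe architecture rather than only on tokenizer_pre names, and the if (false || ...) layout keeps each condition on its own line for easy extension. A sketch of a _contains_any-style substring helper; the real helper lives in llama-vocab.cpp and its exact signature is assumed here:

#include <initializer_list>
#include <string>

// returns true if any needle occurs as a substring of haystack
static bool contains_any(const std::string & haystack,
                         std::initializer_list<const char *> needles) {
    for (const char * n : needles) {
        if (haystack.find(n) != std::string::npos) {
            return true;
        }
    }
    return false;
}

int main() {
    const std::string general_arch = "nomic-bert-moe";
    return contains_any(general_arch, {"jina-v2-de", "nomic-bert-moe"}) ? 0 : 1;
}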