Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-13 10:29:43 +00:00)

Commit c9c050f323: Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	README.md

7 changed files with 281 additions and 48 deletions
@@ -1632,6 +1632,12 @@ class Qwen2MoeModel(Model):
         super().set_gguf_parameters()
         if (n_experts := self.hparams.get("num_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -1172,7 +1172,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // check if a backend with higher prio wants to offload the op
     if (src_backend_id == sched->n_backends - 1) {
        for (int b = 0; b < src_backend_id; b++) {
-            if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+            if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                SET_CAUSE(tensor, "1.off");
                return b;
            }
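Note: the line changed above makes offloading conditional on the higher-priority backend actually supporting the op, not just volunteering for it. A rough, self-contained sketch of that selection rule, using hypothetical Op/Backend stand-ins rather than the real ggml types:

#include <functional>
#include <vector>

// Hypothetical stand-ins, only to illustrate the rule: a higher-priority
// backend is chosen only if it both supports the op and asks to offload it.
struct Op { int kind; };

struct Backend {
    std::function<bool(const Op &)> supports_op;
    std::function<bool(const Op &)> offload_op;
};

// Returns the index of the first (highest-priority) backend that takes the
// op, or `fallback` if none does.
int pick_backend(const std::vector<Backend> & backends, const Op & op, int fallback) {
    for (int b = 0; b < (int) backends.size(); b++) {
        if (backends[b].supports_op(op) && backends[b].offload_op(op)) {
            return b;
        }
    }
    return fallback;
}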
@@ -17,7 +17,7 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-#if defined(_WIN32)
+#if defined(_MSC_VER)
 
 #define m512bh(p) p
 #define m512i(p) p
ggml-rpc.cpp (11 lines changed)
@@ -73,9 +73,13 @@ struct rpc_tensor {
     uint64_t view_offs;
     uint64_t data;
     char name[GGML_MAX_NAME];
+
+    char padding[4];
 };
 #pragma pack(pop)
 
+static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
+
 // RPC commands
 enum rpc_cmd {
     ALLOC_BUFFER = 0,
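Note: the explicit padding[4] plus the static_assert added above keep sizeof(rpc_tensor) a multiple of 8 even though the struct is packed, so arrays of it stay 8-byte granular on the wire. A simplified, hypothetical mirror of that pattern (field sizes chosen for illustration only, not the real rpc_tensor layout):

#include <cstdint>

// With #pragma pack(push, 1) the compiler adds no implicit padding, so the
// explicit padding member is what restores the 8-byte granularity.
#pragma pack(push, 1)
struct wire_tensor {
    uint32_t type;       //  4 bytes
    uint64_t view_offs;  //  8 bytes
    uint64_t data;       //  8 bytes
    char     name[64];   // 64 bytes -> 84 so far, not a multiple of 8
    char     padding[4]; // explicit padding brings the total to 88
};
#pragma pack(pop)

static_assert(sizeof(wire_tensor) % 8 == 0, "wire_tensor size must be multiple of 8");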
@@ -599,9 +603,8 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
     int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
     output.resize(output_size, 0);
     memcpy(output.data(), &n_nodes, sizeof(n_nodes));
-    uint64_t * out_nodes = (uint64_t *)(output.data() + sizeof(n_nodes));
     for (uint32_t i = 0; i < n_nodes; i++) {
-        out_nodes[i] = reinterpret_cast<uint64_t>(cgraph->nodes[i]);
+        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
     }
     uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
     *out_ntensors = n_tensors;
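Note: the hunk above drops the uint64_t* alias into the byte buffer and writes each node pointer with memcpy, which is the portable way to store a 64-bit value at a possibly unaligned offset. A minimal sketch of the same pattern (put_u64 is an illustrative helper, not part of the codebase):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Casting an offset pointer to uint64_t* and storing through it is undefined
// behaviour when the offset is not 8-byte aligned; memcpy is always
// well-defined, and compilers lower it to a single store where the target
// allows unaligned access.
void put_u64(std::vector<uint8_t> & buf, size_t offset, uint64_t value) {
    std::memcpy(buf.data() + offset, &value, sizeof(value));
}

int main() {
    std::vector<uint8_t> buf(sizeof(uint32_t) + 2 * sizeof(uint64_t), 0);
    put_u64(buf, sizeof(uint32_t), 0x1122334455667788ULL); // offset 4: not aligned for uint64_t
    return 0;
}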
@@ -1036,7 +1039,9 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
     }
     std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
     for (uint32_t i = 0; i < n_nodes; i++) {
-        graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map);
+        int64_t id;
+        memcpy(&id, &nodes[i], sizeof(id));
+        graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
     }
     ggml_status status = ggml_backend_graph_compute(backend, graph);
     // output serialization format: | status (1 byte) |
@@ -33,21 +33,22 @@ class Keys:
         FILE_TYPE = "general.file_type"
 
     class LLM:
         VOCAB_SIZE = "{arch}.vocab_size"
         CONTEXT_LENGTH = "{arch}.context_length"
         EMBEDDING_LENGTH = "{arch}.embedding_length"
         BLOCK_COUNT = "{arch}.block_count"
         LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
         FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
         EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
+        EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
         USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
         TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
         EXPERT_COUNT = "{arch}.expert_count"
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
         EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -394,6 +394,9 @@ class GGUFWriter:
     def add_expert_feed_forward_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
 
+    def add_expert_shared_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
 
llama.cpp (274 lines changed)
@@ -310,6 +310,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -388,21 +389,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
     { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
     { LLM_KV_BLOCK_COUNT, "%s.block_count" },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
     { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
     { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
     { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
     { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
     { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
     { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1302,6 +1304,126 @@ struct no_init {
 };
 
 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %s", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+        // use the Win32 API to do file io instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        } ;
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
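Note: both read_raw and write_raw above cap each ReadFile/WriteFile request at 64 MB and loop until the full request is satisfied. A portable sketch of the same chunking idea, using std::fread instead of the Win32 API so it stays self-contained (read_chunked is an illustrative helper, not part of the codebase):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <stdexcept>

// Never hand the underlying API more than 64 MiB per call; loop until done.
void read_chunked(std::FILE * fp, void * dst, size_t len) {
    const size_t max_chunk = 64u * 1024 * 1024; // 64 MiB per request
    size_t done = 0;
    while (done < len) {
        const size_t chunk = std::min(len - done, max_chunk);
        const size_t got = std::fread(static_cast<char *>(dst) + done, 1, chunk, fp);
        if (got != chunk) {
            throw std::runtime_error("unexpectedly reached end of file");
        }
        done += got;
    }
}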
@@ -1322,7 +1444,10 @@ struct llama_file {
 #else
         long ret = std::ftell(fp);
 #endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
         return (size_t) ret;
     }
 
@@ -1332,7 +1457,9 @@ struct llama_file {
 #else
         int ret = std::fseek(fp, (long) offset, whence);
 #endif
-        GGML_ASSERT(ret == 0); // same
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
     }
 
     void read_raw(void * ptr, size_t len) const {
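Note: the two hunks above replace GGML_ASSERT with exceptions that carry strerror(errno), so a failing seek or tell surfaces a real error message instead of aborting. A small sketch of that style for the ftell case (tell_or_throw is an illustrative name):

#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <stdexcept>
#include <string>

// Report the failure with errno's message rather than asserting.
size_t tell_or_throw(std::FILE * fp) {
    long ret = std::ftell(fp);
    if (ret == -1) {
        throw std::runtime_error(std::string("ftell error: ") + std::strerror(errno));
    }
    return (size_t) ret;
}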
@@ -1375,6 +1502,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 using llama_files = std::vector<std::unique_ptr<llama_file>>;
 
@@ -1872,6 +2000,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
 
@@ -1920,6 +2049,7 @@ struct llama_hparams {
         if (this->n_lora_q != other.n_lora_q) return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp != other.n_ff_exp) return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;
 
         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -3760,6 +3890,44 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+        // NVMe raid configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void*> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmaped io use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3815,12 +3983,36 @@ struct llama_model_loader {
                     }));
                 }
             } else {
-                read_buf.resize(n_size);
-                file->seek(weight->offs, SEEK_SET);
-                file->read_raw(read_buf.data(), n_size);
-                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                    throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+#if defined(GGML_USE_CUDA)
+                // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                if (cuda_backend) {
+                    file->seek(weight->offs, SEEK_SET);
+
+                    size_t bytes_read = 0;
+
+                    while (bytes_read < n_size) {
+                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                        ggml_backend_event_synchronize(events[buffer_idx]);
+                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                        ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[buffer_idx]);
+
+                        bytes_read += read_iteration;
+                        ++buffer_idx;
+                        buffer_idx %= num_buffers;
+                    }
+                }
+                else
+#endif
+                {
+                    read_buf.resize(n_size);
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(read_buf.data(), n_size);
+                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                 }
             }
         }
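Note: the loop above rotates through num_buffers pinned staging buffers, waiting on a buffer's event before reusing it, reading the next file chunk into it, issuing an async upload, and recording the event again. A rough, self-contained sketch of that round-robin bookkeeping, with std::future standing in for the backend events and a memcpy standing in for the upload (names are illustrative, not the real ggml API):

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <future>
#include <vector>

int main() {
    constexpr size_t num_buffers = 4;
    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1 MiB per staging buffer

    std::array<std::vector<uint8_t>, num_buffers> host_buffers; // stand-in for pinned buffers
    std::array<std::future<void>, num_buffers> in_flight;       // stand-in for backend events
    for (auto & b : host_buffers) {
        b.resize(buffer_size);
    }

    std::vector<uint8_t> source(10 * 1024 * 1024, 42); // pretend file contents
    std::vector<uint8_t> device(source.size());        // pretend GPU tensor

    size_t offset = 0;
    size_t buffer_idx = 0; // buffer to use for the next chunk
    while (offset < source.size()) {
        const size_t n = std::min(buffer_size, source.size() - offset);
        if (in_flight[buffer_idx].valid()) {
            in_flight[buffer_idx].wait(); // make sure this buffer is free again
        }
        std::memcpy(host_buffers[buffer_idx].data(), source.data() + offset, n); // "file read"
        in_flight[buffer_idx] = std::async(std::launch::async, [&, buffer_idx, offset, n] {
            std::memcpy(device.data() + offset, host_buffers[buffer_idx].data(), n); // "upload"
        });
        offset += n;
        buffer_idx = (buffer_idx + 1) % num_buffers;
    }
    for (auto & f : in_flight) {
        if (f.valid()) {
            f.wait(); // drain outstanding "uploads" before freeing anything
        }
    }
    return 0;
}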
@@ -3841,6 +4033,18 @@ struct llama_model_loader {
             size_done += n_size;
         }
 
+#if defined(GGML_USE_CUDA)
+        // free temporary resources used for async cuda uploads
+        if (cuda_backend) {
+            for (size_t idx = 0; idx < num_buffers;++idx) {
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
+            }
+            ggml_backend_free(cuda_backend);
+        }
+#endif
+
         // check validation results
         bool validation_failed = false;
         for (auto & future : validation_result) {
@@ -4307,6 +4511,9 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -5111,6 +5318,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }
 
 // Returns false if cancelled by progress_callback
@@ -5261,7 +5473,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
-        const int64_t n_embd_head = n_embd / hparams.n_head;
+        const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5904,16 +6116,17 @@ static bool llm_load_tensors(
                         GGML_ASSERT(hparams.n_expert_used > 0);
 
                         // MoE branch
-                        auto n_ff_exp = n_ff / hparams.n_expert_used;
+                        auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                         layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
                         layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
 
                         // Shared expert branch
+                        auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                         layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
-                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
-                        layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+                        layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
                     }
                 } break;
             case LLM_ARCH_PHI2:
@@ -16371,6 +16584,11 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;