diff --git a/common/arg.cpp b/common/arg.cpp index e97c82ad9..4d01e7c4a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2523,11 +2523,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", + "set model name aliases, comma-separated (to be used by API)", [](common_params & params, const std::string & value) { - params.model_alias = value; + for (auto & alias : string_split(value, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + params.model_alias.insert(alias); + } + } } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); + add_opt(common_arg( + {"--tags"}, "STRING", + "set model tags, comma-separated (informational, not used for routing)", + [](common_params & params, const std::string & value) { + for (auto & tag : string_split(value, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + params.model_tags.insert(tag); + } + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS")); add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA diff --git a/common/common.h b/common/common.h index 4a36bb5c4..a136d9391 100644 --- a/common/common.h +++ b/common/common.h @@ -407,7 +407,8 @@ struct common_params { struct common_params_model model; - std::string model_alias = ""; // model alias // NOLINT + std::set model_alias; // model aliases // NOLINT + std::set model_tags; // model tags (informational, not used for routing) // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index c93e182a7..5757c76b7 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -721,6 +721,8 @@ value member_expression::execute_impl(context & ctx) { int64_t arr_size = 0; if (is_val(object)) { arr_size = object->as_array().size(); + } else if (is_val(object)) { + arr_size = object->as_string().length(); } if (is_stmt(this->property)) { diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e03810959..095441739 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -116,7 +116,8 @@ class ModelBase: split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, - sentence_transformers_dense_modules: bool = False): + sentence_transformers_dense_modules: bool = False, + fuse_gate_up_exps: bool = False): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -135,6 +136,9 @@ class ModelBase: self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.fuse_gate_up_exps = fuse_gate_up_exps + self._gate_exp_buffer: dict[int, Tensor] = {} + self._up_exp_buffer: dict[int, Tensor] = {} self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override @@ -512,8 +516,31 @@ class ModelBase: raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - return [(self.map_tensor_name(name), data_torch)] + new_name = self.map_tensor_name(name) + + # Handle gate/up expert tensor fusion if enabled + if self.fuse_gate_up_exps and bid is not None: + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid): + self._gate_exp_buffer[bid] = data_torch + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + self._up_exp_buffer[bid] = data_torch + + # Check if both gate and up are buffered for this layer + if bid in self._gate_exp_buffer and bid in self._up_exp_buffer: + gate_data = self._gate_exp_buffer.pop(bid) + up_data = self._up_exp_buffer.pop(bid) + # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd) + fused_data = torch.cat([gate_data, up_data], dim=1) + fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid) + logger.info(f"Fused gate_exps and up_exps for layer {bid}") + return [(fused_name, fused_data)] + + # If we buffered a gate/up tensor, wait for the other + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \ + self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + return [] + + return [(new_name, data_torch)] def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid, n_dims # unused @@ -1148,6 +1175,9 @@ class TextModel(ModelBase): if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" + if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d": + # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano + res = "jina-v5-nano" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct res = "smaug-bpe" @@ -6125,6 +6155,32 @@ class NeoBert(BertModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model") +class EuroBertModel(TextModel): + model_arch = gguf.MODEL_ARCH.EUROBERT + + def set_vocab(self): + self.gguf_writer.add_add_bos_token(False) + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # EuroBert is bidirectional (encoder) + self.gguf_writer.add_causal_attention(False) + + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + self._try_set_pooling_type() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Strip "model." prefix from tensor names + if name.startswith("model."): + name = name[6:] + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -11913,6 +11969,11 @@ def parse_args() -> argparse.Namespace: "Default these modules are not included.") ) + parser.add_argument( + "--fuse-gate-up-exps", action="store_true", + help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.", + ) + args = parser.parse_args() if not args.print_supported_models and args.model is None: parser.error("the following arguments are required: model") @@ -12050,7 +12111,8 @@ def main() -> None: split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + fuse_gate_up_exps=args.fuse_gate_up_exps ) if args.vocab_only: diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 53a73759e..b31ddcca7 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -107,6 +107,7 @@ models = [ {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, + {"name": "jina-v5-nano", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", }, {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 1ee63d025..1c032a581 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -640,8 +640,6 @@ struct vk_device_struct { // floor(log2(maxComputeWorkGroupInvocations)) uint32_t max_workgroup_size_log2 {}; - bool flash_attention_fp16; - bool coopmat_support; bool coopmat_acc_f32_support {}; bool coopmat_acc_f16_support {}; @@ -2994,11 +2992,15 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_ } } -static vk_fa_pipeline_state get_fa_pipeline_state(const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, +static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, bool use_mask, bool use_mask_opt, bool use_logit_softcap) { + const bool old_amd_windows = device->vendor_id == VK_VENDOR_ID_AMD && device->driver_id == vk::DriverId::eAmdProprietary && + (device->architecture == AMD_GCN || device->architecture == AMD_RDNA1 || device->architecture == AMD_RDNA2); + uint32_t flags = (use_mask_opt ? 1 : 0) | (use_mask ? 2 : 0) | - (use_logit_softcap ? 4 : 0); + (use_logit_softcap ? 4 : 0) | + (old_amd_windows ? 8 : 0); const uint32_t subgroup_size = params.disable_subgroups ? 0 : params.subgroup_size; @@ -3400,7 +3402,7 @@ static void ggml_vk_load_shaders(vk_device& device) { } \ } - if (device->flash_attention_fp16) { + if (device->fp16) { CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, ) CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, ) CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, ) @@ -5453,10 +5455,6 @@ static vk_device ggml_vk_get_device(size_t idx) { device->mmvq_mode = 1; } - // Driver issues with older AMD GPUs on Windows, see https://github.com/ggml-org/llama.cpp/pull/19625#issuecomment-3940840613 - const bool is_amd_proprietary_gcn = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture == AMD_GCN && device->driver_id == vk::DriverId::eAmdProprietary; - device->flash_attention_fp16 = device->fp16 && !is_amd_proprietary_gcn; - return device; } @@ -8605,7 +8603,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con const uint32_t Br = params.block_rows; const uint32_t Bc = params.block_cols; - const uint32_t float_type_size = device->flash_attention_fp16 ? sizeof(ggml_fp16_t) : sizeof(float); + const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); // tmpsh is overestimated slightly const uint32_t tmpsh = wg_size * sizeof(float); @@ -8728,7 +8726,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx uint32_t workgroups_y = (uint32_t)neq2; uint32_t workgroups_z = (uint32_t)neq3; - const bool f32acc = !ctx->device->flash_attention_fp16 || dst->op_params[3] == GGML_PREC_F32; + const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32; // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga. // For coopmat2 FA, we always use the small size (which is still pretty large for gqa). @@ -8783,7 +8781,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively. bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768; - vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(tuning_params, HSK, HSV, aligned, f32acc, + vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc, mask != nullptr, use_mask_opt, logit_softcap != 0); vk_pipeline pipeline = nullptr; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 135ab1ad6..ec48f5b11 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -465,7 +465,14 @@ void main() { if (SubGroupSize > 0) { [[unroll]] for (uint s = D_split; s < SubGroupSize; s *= 2) { - Of[r][d] += subgroupShuffleXor(Of[r][d], s); + if (!OLD_AMD_WINDOWS) { + Of[r][d] += subgroupShuffleXor(Of[r][d], s); + } else { + // Something about f16vec4 subgroupShuffleXor is broken on AMD Windows RDNA2 and below. + // Shuffle full vec4 as workaround. + // See https://github.com/ggml-org/llama.cpp/issues/19881#issuecomment-3958643697 + Of[r][d] += FLOAT_TYPEV4(subgroupShuffleXor(vec4(Of[r][d]), s)); + } } if (row_split == 1) { barrier(); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl index d444542b5..172d38f03 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl @@ -14,9 +14,10 @@ layout (constant_id = 9) const uint32_t SHMEM_STAGING = 0; layout (constant_id = 10) const uint32_t Flags = 0; layout (constant_id = 11) const uint32_t LIMIT_OCCUPANCY_SHMEM = 0; -const bool USE_MASK_OPT = (Flags & 1) != 0; -const bool MASK_ENABLE = (Flags & 2) != 0; -const bool LOGIT_SOFTCAP = (Flags & 4) != 0; +const bool USE_MASK_OPT = (Flags & 1) != 0; +const bool MASK_ENABLE = (Flags & 2) != 0; +const bool LOGIT_SOFTCAP = (Flags & 4) != 0; +const bool OLD_AMD_WINDOWS = (Flags & 8) != 0; // Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths const uint32_t HSK_pad = (HSK + 15) & ~15; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 689acdc65..839c6e787 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -379,6 +379,7 @@ class MODEL_ARCH(IntEnum): NEO_BERT = auto() JINA_BERT_V2 = auto() JINA_BERT_V3 = auto() + EUROBERT = auto() BLOOM = auto() STABLELM = auto() QWEN = auto() @@ -531,6 +532,7 @@ class MODEL_TENSOR(IntEnum): FFN_GATE_EXP = auto() FFN_DOWN_EXP = auto() FFN_UP_EXP = auto() + FFN_GATE_UP_EXP = auto() FFN_GATE_SHEXP = auto() FFN_DOWN_SHEXP = auto() FFN_UP_SHEXP = auto() @@ -820,6 +822,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.NEO_BERT: "neo-bert", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3", + MODEL_ARCH.EUROBERT: "eurobert", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", @@ -978,6 +981,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps", MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n @@ -1587,6 +1591,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.EUROBERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_DOWN, + ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1805,6 +1822,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.SSM_A, MODEL_TENSOR.SSM_CONV1D, MODEL_TENSOR.SSM_DT, @@ -1894,6 +1912,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.SSM_A, MODEL_TENSOR.SSM_CONV1D, MODEL_TENSOR.SSM_DT, @@ -2595,6 +2614,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index fc468d077..e57561090 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -567,6 +567,10 @@ class TensorNameMap: "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe ), + MODEL_TENSOR.FFN_GATE_UP_EXP: ( + "model.layers.{bid}.mlp.experts.gate_up_proj", + ), + # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 48693ae3e..5fb2755f1 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.17.1" +version = "0.18.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 39ebb9db0..47e8d5278 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -26,6 +26,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NEO_BERT, "neo-bert" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" }, + { LLM_ARCH_EUROBERT, "eurobert" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, @@ -348,6 +349,7 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_GATE_UP_EXPS, "blk.%d.ffn_gate_up_exps" }, { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, @@ -819,6 +821,20 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, }; + case LLM_ARCH_EUROBERT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + }; case LLM_ARCH_MODERN_BERT: return { LLM_TENSOR_TOKEN_EMBD, @@ -989,6 +1005,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -1046,6 +1063,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -1586,6 +1604,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -2670,6 +2689,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, + {LLM_TENSOR_FFN_GATE_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 11daa1413..6d1b1df31 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -30,6 +30,7 @@ enum llm_arch { LLM_ARCH_NEO_BERT, LLM_ARCH_JINA_BERT_V2, LLM_ARCH_JINA_BERT_V3, + LLM_ARCH_EUROBERT, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, @@ -372,6 +373,7 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN_EXPS, // merged experts LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_DOWN_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_UP_SHEXP, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index dc58c0826..23a86ea29 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1165,7 +1165,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in) const { + ggml_tensor * probs_in, + ggml_tensor * gate_up_exps) const { return build_moe_ffn( cur, gate_inp, /* gate_inp_b */ nullptr, @@ -1181,7 +1182,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( w_scale, gating_op, il, - probs_in + probs_in, + gate_up_exps ); } @@ -1204,7 +1206,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in) const { + ggml_tensor * probs_in, + ggml_tensor * gate_up_exps, + ggml_tensor * gate_up_exps_b) const { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN @@ -1343,26 +1347,48 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_weighted", il); } - ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - if (up_exps_b) { - up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); - cb(up, "ffn_moe_up_biased", il); - } - + ggml_tensor * up = nullptr; ggml_tensor * experts = nullptr; - if (gate_exps) { - cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + + if (gate_up_exps) { + // merged gate_up path: one mul_mat_id, then split into gate and up views + ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens] + cb(gate_up, "ffn_moe_gate_up", il); + + if (gate_up_exps_b) { + gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts); + cb(gate_up, "ffn_moe_gate_up_biased", il); + } + + const int64_t n_ff = gate_up->ne[0] / 2; + cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0); cb(cur, "ffn_moe_gate", il); + up = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], n_ff * gate_up->nb[0]); + cb(up, "ffn_moe_up", il); } else { - cur = up; + // separate gate and up path + up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + if (up_exps_b) { + up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); + cb(up, "ffn_moe_up_biased", il); + } + + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } + + if (gate_exps_b) { + cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); + cb(cur, "ffn_moe_gate_biased", il); + } } - if (gate_exps_b) { - cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); - cb(cur, "ffn_moe_gate_biased", il); - } + const bool has_gate = gate_exps || gate_up_exps; switch (type_op) { case LLM_FFN_SILU: @@ -1385,7 +1411,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( break; } } + } + if (has_gate) { cur = ggml_swiglu_split(ctx0, cur, up); cb(cur, "ffn_moe_swiglu", il); } else { @@ -1393,7 +1421,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: - if (gate_exps) { + if (has_gate) { cur = ggml_geglu_split(ctx0, cur, up); cb(cur, "ffn_moe_geglu", il); } else { @@ -1409,7 +1437,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_swiglu_oai", il); } break; case LLM_FFN_RELU: - if (gate_exps) { + if (has_gate) { cur = ggml_reglu_split(ctx0, cur, up); cb(cur, "ffn_moe_reglu", il); } else { @@ -1417,7 +1445,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_relu", il); } break; case LLM_FFN_RELU_SQR: - if (gate_exps) { + if (has_gate) { // TODO: add support for gated squared relu GGML_ABORT("fatal error: gated squared relu not implemented"); } else { diff --git a/src/llama-graph.h b/src/llama-graph.h index 22d11a838..e8f006977 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -814,7 +814,8 @@ struct llm_graph_context { float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in = nullptr) const; + ggml_tensor * probs_in = nullptr, + ggml_tensor * gate_up_exps = nullptr) const; ggml_tensor * build_moe_ffn( ggml_tensor * cur, @@ -835,7 +836,9 @@ struct llm_graph_context { float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in = nullptr) const; + ggml_tensor * probs_in = nullptr, + ggml_tensor * gate_up_exps = nullptr, + ggml_tensor * gate_up_exps_b = nullptr) const; // // inputs diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 04a072c12..69b4cc72f 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -978,6 +978,9 @@ bool llama_kv_cache::get_can_shift() const { if (model.arch == LLM_ARCH_STEP35) { return false; } + if (hparams.n_pos_per_embd() > 1) { + return false; + } return true; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5587c03b9..b78423a60 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -53,6 +53,7 @@ #include "models/dream.cpp" #include "models/ernie4-5-moe.cpp" #include "models/ernie4-5.cpp" +#include "models/eurobert.cpp" #include "models/exaone-moe.cpp" #include "models/exaone.cpp" #include "models/exaone4.cpp" @@ -1092,6 +1093,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { type = LLM_TYPE_250M; } } break; + case LLM_ARCH_EUROBERT: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + + if (hparams.n_layer == 12) { + type = LLM_TYPE_SMALL; // 0.2B + } + } break; case LLM_ARCH_BLOOM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3090,6 +3101,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // TODO: move to a separate function const auto tn = LLM_TN(arch); + + // helper: try merged gate_up_exps first, fall back to separate gate and up + auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) { + layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED); + if (layer.ffn_gate_up_exps == nullptr) { + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); + } + }; switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: @@ -3730,6 +3750,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); } } break; + case LLM_ARCH_EUROBERT: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + } + } break; case LLM_ARCH_JINA_BERT_V2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings @@ -5348,9 +5391,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } // MoE branch - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared expert branch layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); @@ -7552,9 +7594,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared experts layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); @@ -7618,9 +7659,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared experts const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; @@ -8342,6 +8382,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_NEO_BERT: + case LLM_ARCH_EUROBERT: case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_MODERN_BERT: case LLM_ARCH_GEMMA_EMBEDDING: @@ -8539,6 +8580,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_EUROBERT: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_BLOOM: { llm = std::make_unique(*this, params); @@ -9165,6 +9210,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_MODERN_BERT: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: + case LLM_ARCH_EUROBERT: case LLM_ARCH_STABLELM: case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: diff --git a/src/llama-model.h b/src/llama-model.h index 96e407a0b..d7c3e7d1c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -280,14 +280,16 @@ struct llama_layer { struct ggml_tensor * ffn_up_enc = nullptr; // ff MoE - struct ggml_tensor * ffn_gate_inp = nullptr; - struct ggml_tensor * ffn_gate_exps = nullptr; - struct ggml_tensor * ffn_down_exps = nullptr; - struct ggml_tensor * ffn_up_exps = nullptr; - struct ggml_tensor * ffn_gate_inp_b = nullptr; - struct ggml_tensor * ffn_gate_exps_b = nullptr; - struct ggml_tensor * ffn_down_exps_b = nullptr; - struct ggml_tensor * ffn_up_exps_b = nullptr; + struct ggml_tensor * ffn_gate_inp = nullptr; + struct ggml_tensor * ffn_gate_exps = nullptr; + struct ggml_tensor * ffn_down_exps = nullptr; + struct ggml_tensor * ffn_up_exps = nullptr; + struct ggml_tensor * ffn_gate_up_exps = nullptr; + struct ggml_tensor * ffn_gate_inp_b = nullptr; + struct ggml_tensor * ffn_gate_exps_b = nullptr; + struct ggml_tensor * ffn_down_exps_b = nullptr; + struct ggml_tensor * ffn_up_exps_b = nullptr; + struct ggml_tensor * ffn_gate_up_exps_b = nullptr; // ff shared expert (shexp) struct ggml_tensor * ffn_gate_inp_shexp = nullptr; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 8de0424b0..9cf9a1d17 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2126,7 +2126,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "falcon-h1" || tokenizer_pre == "pixtral" || tokenizer_pre == "midm-2.0" || - tokenizer_pre == "lfm2") { + tokenizer_pre == "lfm2" || + tokenizer_pre == "jina-v5-nano") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index b2c1f1606..b608396e5 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -218,7 +218,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr LLM_FFN_SILU, hparams.expert_weights_norm, hparams.expert_weights_scale, hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, - il); + il, + nullptr, + model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // FFN shared expert diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp new file mode 100644 index 000000000..86e3176ed --- /dev/null +++ b/src/models/eurobert.cpp @@ -0,0 +1,97 @@ +#include "models.h" + +llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + ggml_tensor * inp_pos = build_inp_pos(); + + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "inp_embd", -1); + + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * cur = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + + { + ggml_tensor * Qcur; + ggml_tensor * Kcur; + ggml_tensor * Vcur; + + Qcur = build_lora_mm(model.layers[il].wq, cur); + Kcur = build_lora_mm(model.layers[il].wk, cur); + Vcur = build_lora_mm(model.layers[il].wv, cur); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + cb(cur, "kqv_out", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, inpL); + + ggml_tensor * ffn_inp = cur; + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_embd", -1); + res->t_embd = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/models.h b/src/models/models.h index 10f8b5892..0712d03d8 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -424,6 +424,10 @@ struct llm_build_neo_bert : public llm_graph_context { llm_build_neo_bert(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_eurobert : public llm_graph_context { + llm_build_eurobert(const llama_model & model, const llm_graph_params & params); +}; + template struct llm_build_olmo2 : public llm_graph_context { llm_build_olmo2(const llama_model & model, const llm_graph_params & params); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 77f18b5ae..22d708f20 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -380,7 +380,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // Add shared experts if present - following Qwen3Next reference implementation diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index ce6d4fe70..b3267b5ca 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -480,7 +480,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // Add shared experts if present - following Qwen3Next reference implementation diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index d4c6b4c7f..b515f578a 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -248,7 +248,7 @@ int32_t mtmd_helper_decode_image_chunk( int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk); int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; + int32_t n_img_batches = (n_tokens + n_batch - 1) / n_batch; decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd); if (mtmd_decode_use_mrope(ctx)) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 73af81243..aafed4950 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -580,6 +580,8 @@ private: float slot_prompt_similarity = 0.0f; std::string model_name; // name of the loaded model, to be used by API + std::set model_aliases; // additional names for the model + std::set model_tags; // informational tags bool sleeping = false; @@ -813,10 +815,9 @@ private: SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); if (!params_base.model_alias.empty()) { - // user explicitly specified model name - model_name = params_base.model_alias; + // backward compat: use first alias as model name + model_name = *params_base.model_alias.begin(); } else if (!params_base.model.name.empty()) { - // use model name in registry format (for models in cache) model_name = params_base.model.name; } else { // fallback: derive model name from file name @@ -824,6 +825,9 @@ private: model_name = model_path.filename().string(); } + model_aliases = params_base.model_alias; + model_tags = params_base.model_tags; + if (!is_resume) { return init(); } @@ -2363,7 +2367,7 @@ private: //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint"); } else { pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max)); - n_past = slot.prompt.tokens.size_up_to_pos(pos_next); + n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens); SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024); } } @@ -2892,6 +2896,8 @@ server_context_meta server_context::get_meta() const { return server_context_meta { /* build_info */ build_info, /* model_name */ impl->model_name, + /* model_aliases */ impl->model_aliases, + /* model_tags */ impl->model_tags, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, @@ -3688,6 +3694,8 @@ void server_routes::init_routes() { {"data", { { {"id", meta->model_name}, + {"aliases", meta->model_aliases}, + {"tags", meta->model_tags}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 03c29f513..631d573fc 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -6,12 +6,15 @@ #include #include +#include struct server_context_impl; // private implementation struct server_context_meta { std::string build_info; std::string model_name; + std::set model_aliases; + std::set model_tags; std::string model_path; bool has_mtmd; bool has_inp_image; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index efb22da5c..bc601237b 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -184,6 +184,51 @@ void server_models::add_model(server_model_meta && meta) { if (mapping.find(meta.name) != mapping.end()) { throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str())); } + + // check model name does not conflict with existing aliases + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(meta.name)) { + throw std::runtime_error(string_format("model name '%s' conflicts with alias of model '%s'", + meta.name.c_str(), key.c_str())); + } + } + + // parse aliases from preset's --alias option (comma-separated) + std::string alias_str; + if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) { + for (auto & alias : string_split(alias_str, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + meta.aliases.insert(alias); + } + } + } + + // parse tags from preset's --tags option (comma-separated) + std::string tags_str; + if (meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) { + for (auto & tag : string_split(tags_str, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + meta.tags.insert(tag); + } + } + } + + // validate aliases do not conflict with existing names or aliases + for (const auto & alias : meta.aliases) { + if (mapping.find(alias) != mapping.end()) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with existing model name", + alias.c_str(), meta.name.c_str())); + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(alias)) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with alias of model '%s'", + alias.c_str(), meta.name.c_str(), key.c_str())); + } + } + } + meta.update_args(ctx_preset, bin_path); // render args std::string name = meta.name; mapping[name] = instance_t{ @@ -249,6 +294,8 @@ void server_models::load_models() { server_model_meta meta{ /* preset */ preset.second, /* name */ preset.first, + /* aliases */ {}, + /* tags */ {}, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, @@ -265,10 +312,28 @@ void server_models::load_models() { for (const auto & [name, preset] : custom_presets) { custom_names.insert(name); } + auto join_set = [](const std::set & s) { + std::string result; + for (const auto & v : s) { + if (!result.empty()) { + result += ", "; + } + result += v; + } + return result; + }; + SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { bool has_custom = custom_names.find(name) != custom_names.end(); - SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str()); + std::string info; + if (!inst.meta.aliases.empty()) { + info += " (aliases: " + join_set(inst.meta.aliases) + ")"; + } + if (!inst.meta.tags.empty()) { + info += " [tags: " + join_set(inst.meta.tags) + "]"; + } + SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str()); } } @@ -291,7 +356,9 @@ void server_models::load_models() { for (const auto & [name, inst] : mapping) { std::string val; if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) { - models_to_load.push_back(name); + if (common_arg_utils::is_truthy(val)) { + models_to_load.push_back(name); + } } } if ((int)models_to_load.size() > base_params.models_max) { @@ -318,7 +385,15 @@ void server_models::update_meta(const std::string & name, const server_model_met bool server_models::has_model(const std::string & name) { std::lock_guard lk(mutex); - return mapping.find(name) != mapping.end(); + if (mapping.find(name) != mapping.end()) { + return true; + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return true; + } + } + return false; } std::optional server_models::get_meta(const std::string & name) { @@ -327,6 +402,11 @@ std::optional server_models::get_meta(const std::string & nam if (it != mapping.end()) { return it->second.meta; } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return inst.meta; + } + } return std::nullopt; } @@ -764,7 +844,7 @@ static void res_err(std::unique_ptr & res, const json & error_d res->data = safe_json_to_str({{ "error", error_data }}); } -static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { +static bool router_validate_model(std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { if (name.empty()) { res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); return false; @@ -774,6 +854,8 @@ static bool router_validate_model(const std::string & name, server_models & mode res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST)); return false; } + // resolve alias to canonical model name + name = meta->name; if (models_autoload) { models.ensure_model_loaded(name); } else { @@ -845,16 +927,16 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - auto model = models.get_meta(name); - if (!model.has_value()) { + auto meta = models.get_meta(name); + if (!meta.has_value()) { res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); return res; } - if (model->status == SERVER_MODEL_STATUS_LOADED) { + if (meta->status == SERVER_MODEL_STATUS_LOADED) { res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.load(name); + models.load(meta->name); res_ok(res, {{"success", true}}); return res; }; @@ -875,6 +957,7 @@ void server_models_routes::init_routes() { preset_copy.unset_option("LLAMA_ARG_HOST"); preset_copy.unset_option("LLAMA_ARG_PORT"); preset_copy.unset_option("LLAMA_ARG_ALIAS"); + preset_copy.unset_option("LLAMA_ARG_TAGS"); status["preset"] = preset_copy.to_ini(); } if (meta.is_failed()) { @@ -883,6 +966,8 @@ void server_models_routes::init_routes() { } models_json.push_back(json { {"id", meta.name}, + {"aliases", meta.aliases}, + {"tags", meta.tags}, {"object", "model"}, // for OAI-compat {"owned_by", "llamacpp"}, // for OAI-compat {"created", t}, // for OAI-compat @@ -910,7 +995,7 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.unload(name); + models.unload(model->name); res_ok(res, {{"success", true}}); return res; }; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index a397abda4..78abc8d72 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -52,6 +52,8 @@ static std::string server_model_status_to_string(server_model_status status) { struct server_model_meta { common_preset preset; std::string name; + std::set aliases; // additional names that resolve to this model + std::set tags; // informational tags, not used for routing int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d3d431602..542b98453 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { // for consistency between server router mode and single-model mode, we set the same model name as alias if (params.model_alias.empty() && !params.model.name.empty()) { - params.model_alias = params.model.name; + params.model_alias.insert(params.model.name); } common_init(); diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py index 3405be3e2..d1b89cf1a 100644 --- a/tools/server/tests/unit/test_basic.py +++ b/tools/server/tests/unit/test_basic.py @@ -94,3 +94,20 @@ def test_no_webui(): server.start() res = requests.get(url) assert res.status_code == 404 + + +def test_server_model_aliases_and_tags(): + global server + server.model_alias = "tinyllama-2,fim,code" + server.model_tags = "chat,fim,small" + server.start() + res = server.make_request("GET", "/models") + assert res.status_code == 200 + assert len(res.body["data"]) == 1 + model = res.body["data"][0] + # aliases field must contain all aliases + assert set(model["aliases"]) == {"tinyllama-2", "fim", "code"} + # tags field must contain all tags + assert set(model["tags"]) == {"chat", "fim", "small"} + # id is derived from first alias (alphabetical order from std::set) + assert model["id"] == "code" diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index f76bb1a91..5002999d9 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -56,6 +56,7 @@ class ServerProcess: # custom options model_alias: str | None = None + model_tags: str | None = None model_url: str | None = None model_file: str | None = None model_draft: str | None = None @@ -180,6 +181,8 @@ class ServerProcess: server_args.extend(["--pooling", self.pooling]) if self.model_alias: server_args.extend(["--alias", self.model_alias]) + if self.model_tags: + server_args.extend(["--tags", self.model_tags]) if self.n_ctx: server_args.extend(["--ctx-size", self.n_ctx]) if self.n_slots: