diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index d740dac06..23e23ca8c 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -1395,6 +1395,14 @@ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
builder.consume_reasoning_with_xml_tool_calls(form, "", "");
}
+static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
+ builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
+
+ // TODO: Tool calling
+
+ builder.add_content(builder.consume_rest());
+}
+
static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("", "");
builder.add_content(builder.consume_rest());
@@ -1479,6 +1487,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
common_chat_parse_xiaomi_mimo(builder);
break;
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN:
+ common_chat_parse_solar_open(builder);
+ break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}
diff --git a/common/chat.cpp b/common/chat.cpp
index ccd35ea90..e104cc482 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -675,6 +675,7 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2523,6 +2524,27 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
return data;
}
+static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // TODO: Reasoning effort
+ json additional_context = {};
+
+ data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
+ data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
+
+ data.preserved_tokens = {
+ "<|think|>",
+ "<|content|>",
+ "<|begin|>",
+ "<|end|>",
+ };
+
+ // TODO: Tool calling
+
+ return data;
+}
+
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
@@ -2786,6 +2808,13 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_magistral(tmpl, params);
}
+ // Solar Open
+ if (src.find("<|tool_response:begin|>") != std::string::npos &&
+ src.find("<|tool_response:name|>") != std::string::npos &&
+ src.find("<|tool_response:result|>") != std::string::npos) {
+ return common_chat_params_init_solar_open(tmpl, params);
+ }
+
// Plain handler (no tools)
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return common_chat_params_init_without_tools(tmpl, params);
diff --git a/common/chat.h b/common/chat.h
index 6085510a4..8bd4a325f 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -124,6 +124,7 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
COMMON_CHAT_FORMAT_APRIEL_1_5,
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
+ COMMON_CHAT_FORMAT_SOLAR_OPEN,
// These are intended to be parsed by the PEG parser
COMMON_CHAT_FORMAT_PEG_SIMPLE,
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index edc0ed539..7ad20c086 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1062,6 +1062,9 @@ class TextModel(ModelBase):
if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
# ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
res = "grok-2"
+ if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
+ # ref: https://huggingface.co/aari1995/German_Semantic_V3
+ res = "jina-v2-de"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -1230,6 +1233,12 @@ class TextModel(ModelBase):
if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
# ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
res = "kormo"
+ if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
+ # ref: https://huggingface.co/tencent/Youtu-LLM-2B
+ res = "youtu"
+ if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
+ # ref: https://huggingface.co/upstage/Solar-Open-100B
+ res = "solar-open"
if res is None:
logger.warning("\n")
@@ -2486,6 +2495,7 @@ class StableLMModel(TextModel):
"VLlama3ForCausalLM",
"LlavaForConditionalGeneration",
"VoxtralForConditionalGeneration",
+ "IQuestCoderForCausalLM",
"LlamaModel")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
@@ -5284,13 +5294,14 @@ class BertModel(TextModel):
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
# convert to phantom space vocab
- def phantom(tok):
- if tok.startswith("[") and tok.endswith("]"):
+ def phantom(tok, toktype):
+ if toktype == gguf.TokenType.CONTROL:
return tok
if tok.startswith("##"):
return tok[2:]
return "\u2581" + tok
- tokens = list(map(phantom, tokens))
+ assert len(tokens) == len(toktypes)
+ tokens = list(map(phantom, tokens, toktypes))
# add vocab to gguf
self.gguf_writer.add_tokenizer_model("bert")
@@ -7181,6 +7192,7 @@ class DeepseekModel(TextModel):
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"KimiVLForConditionalGeneration",
+ "YoutuForCausalLM",
)
class DeepseekV2Model(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -7247,7 +7259,15 @@ class DeepseekV2Model(TextModel):
super().set_gguf_parameters()
hparams = self.hparams
- self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+ # first_k_dense_replace: number of leading layers using dense FFN instead of MoE
+ # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
+ # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
+ has_moe = hparams.get("n_routed_experts") is not None
+ first_k_dense_replace = hparams.get("first_k_dense_replace")
+ if first_k_dense_replace is None:
+ # Default: if no MoE, all layers are dense; if MoE, none are dense
+ first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
+ self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
@@ -7259,11 +7279,24 @@ class DeepseekV2Model(TextModel):
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
- self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
- self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
- self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
- self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
- self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+ # MoE parameters (required by C++ code for DEEPSEEK2 arch)
+ # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
+ moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+
+ if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
+ self.gguf_writer.add_expert_count(n_routed_experts)
+
+ # expert_shared_count is required by C++ code, default to 0 for non-MoE models
+ n_shared_experts = hparams.get("n_shared_experts", 0)
+ self.gguf_writer.add_expert_shared_count(n_shared_experts)
+
+ # When not set, C++ code will use scale_w = false to skip the no-op scaling
+ if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
+ self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+ if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
+ self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
@@ -7279,10 +7312,17 @@ class DeepseekV2Model(TextModel):
# skip vision tensors and remove "language_model." for Kimi-VL
if "vision_tower" in name or "multi_modal_projector" in name:
return []
-
+ if name.startswith("siglip2.") or name.startswith("merger."):
+ return []
if name.startswith("language_model."):
name = name.replace("language_model.", "")
+ # skip lm_head.weight if tie_word_embeddings is True
+ if self.hparams.get("tie_word_embeddings", False):
+ if name == "lm_head.weight" or name == "model.lm_head.weight":
+ logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
+ return []
+
# rename e_score_correction_bias tensors
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -10617,6 +10657,79 @@ class JanusProVisionModel(MmprojModel):
return []
+@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
+class YOUTUVLVisionModel(MmprojModel):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ assert self.hparams_vision is not None
+ self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
+ self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+
+ # Handle activation function
+ hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
+ if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
+ self.gguf_writer.add_vision_use_gelu(True)
+ elif hidden_act == "silu":
+ self.gguf_writer.add_vision_use_silu(True)
+ else:
+ raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}")
+
+ self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
+
+ window_size = self.hparams.get("window_size")
+ if window_size is not None:
+ self.gguf_writer.add_vision_window_size(window_size)
+ # fullatt_block_indexes contains explicit layer indices that use full attention
+ # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention
+ # All other layers use window attention
+ fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
+ assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl"
+ # Store the explicit layer indices for YoutuVL (irregular pattern approach)
+ self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ # Skip language model tensors
+ skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')
+ if name.startswith(skip_prefixes):
+ return []
+
+ # Try to map the tensor using TensorNameMap (handles vision encoder and projector)
+ try:
+ new_name = self.map_tensor_name(name)
+ return [(new_name, data_torch)]
+ except ValueError:
+ # If mapping fails, log warning and skip
+ logger.warning(f"Cannot map tensor: {name}")
+ return []
+
+
+@ModelBase.register("SolarOpenForCausalLM")
+class SolarOpenModel(Glm4MoeModel):
+ model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+ def set_vocab(self):
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+ tokens, toktypes, tokpre = self.get_vocab_base()
+ self.gguf_writer.add_tokenizer_model("gpt2")
+ self.gguf_writer.add_tokenizer_pre(tokpre)
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_types(toktypes)
+ special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+ special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
+ special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
+ special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+
###### CONVERSION LOGIC ######
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 437837830..74c67e6a9 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -145,6 +145,8 @@ models = [
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
{"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
{"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
+ {"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
+ {"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -165,6 +167,8 @@ pre_computed_hashes = [
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
+ # jina-v2-de variants
+ {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
]
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 4ed5f3577..a9d177864 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -358,7 +358,7 @@ extern "C" {
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
- GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
+ GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);
// Tensor initialization
GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index c0e78e6c4..dd2825d5d 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -2060,7 +2060,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
ggml_free(copy.ctx_unallocated);
}
-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes) {
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
if (copy.buffer == NULL) {
return false;
@@ -2071,22 +2071,22 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
assert(g1->n_nodes == g2->n_nodes);
- if (test_node != nullptr) {
- // Compute the whole graph and only test the output for a specific tensor
+ if (num_test_nodes != 0) {
+ GGML_ASSERT(test_nodes);
+ // Compute the whole graph and only test the output for specific tensors
ggml_backend_graph_compute(backend1, g1);
ggml_backend_graph_compute(backend2, g2);
- int test_node_idx = -1;
+ bool verified = false;
for (int i = 0; i < g1->n_nodes; i++) {
- struct ggml_tensor * t1 = g1->nodes[i];
- if (t1 == test_node) {
- test_node_idx = i;
- break;
+ for (size_t j = 0; j < num_test_nodes; ++j) {
+ if (g1->nodes[i] == test_nodes[j]) {
+ callback(i, g1->nodes[i], g2->nodes[i], user_data);
+ verified = true;
+ }
}
}
- GGML_ASSERT(test_node_idx != -1);
-
- callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
+ GGML_ASSERT(verified);
} else {
for (int i = 0; i < g1->n_nodes; i++) {
struct ggml_tensor * t1 = g1->nodes[i];
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index c4ceb4fc5..ee84303ef 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -12,11 +12,11 @@ const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
template <cpy_kernel_t cpy_1>
-static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13) {
- const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
+ const int64_t nb12, const int64_t nb13) {
+ const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) {
return;
@@ -40,10 +40,10 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
}
template <typename T>
-static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13) {
+static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
+ const int64_t nb12, const int64_t nb13) {
const T* src = reinterpret_cast<const T*>(cx);
T* dst = reinterpret_cast<T*>(cdst);
@@ -117,60 +117,60 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
}
template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13) {
- const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
+ const int64_t nb12, const int64_t nb13) {
+ const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) {
return;
}
- const int i03 = i/(ne00 * ne01 * ne02);
- const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
- const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
- const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
- const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+ const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
- const int i13 = i/(ne10 * ne11 * ne12);
- const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
- const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
- const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
- const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+ const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
cpy_blck(cx + x_offset, cdst + dst_offset);
}
template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13) {
- const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
+ const int64_t nb12, const int64_t nb13) {
+ const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) {
return;
}
- const int i03 = i/(ne00 * ne01 * ne02);
- const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
- const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
- const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
- const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+ const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
- const int i13 = i/(ne10 * ne11 * ne12);
- const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
- const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
- const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
- const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
cpy_blck(cx + x_offset, cdst + dst_offset);
}
template <typename src_t, typename dst_t>
static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
- const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+ const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) {
return;
@@ -188,19 +188,20 @@ static void ggml_cpy_scalar_contiguous_cuda(
cudaStream_t stream) {
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne);
}
template <typename src_t, typename dst_t, bool transposed = false>
static void ggml_cpy_scalar_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
if (transposed) {
GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
- int ne00n, ne01n, ne02n;
+ int64_t ne00n, ne01n, ne02n;
if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
ne00n = ne00;
ne01n = ne01;
@@ -211,143 +212,159 @@ static void ggml_cpy_scalar_cuda(
ne02n = 1;
}
- dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
- (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
- (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
+ int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
+ int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
+ int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
+ GGML_ASSERT(grid_x < UINT_MAX);
+ GGML_ASSERT(grid_y < USHRT_MAX);
+ GGML_ASSERT(grid_z < USHRT_MAX);
+ dim3 dimGrid(grid_x, grid_y, grid_z);
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} else {
- const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
}
static void ggml_cpy_f32_q8_0_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK8_0 == 0);
- const int num_blocks = ne / QK8_0;
+ const int64_t num_blocks = ne / QK8_0;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q8_0_f32_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
- const int num_blocks = ne;
+ const int64_t num_blocks = ne;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q4_0_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_0 == 0);
- const int num_blocks = ne / QK4_0;
+ const int64_t num_blocks = ne / QK4_0;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q4_0_f32_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02,
- const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12,
- const int nb10, const int nb11, const int nb12, const int nb13,
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02,
+ const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
+ const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream) {
- const int num_blocks = ne;
+ const int64_t num_blocks = ne;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q4_1_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_1 == 0);
- const int num_blocks = ne / QK4_1;
+ const int64_t num_blocks = ne / QK4_1;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q4_1_f32_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02,
- const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12,
- const int nb10, const int nb11, const int nb12, const int nb13,
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02,
+ const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
+ const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream) {
- const int num_blocks = ne;
+ const int64_t num_blocks = ne;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q5_0_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK5_0 == 0);
- const int num_blocks = ne / QK5_0;
+ const int64_t num_blocks = ne / QK5_0;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q5_0_f32_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02,
- const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12,
- const int nb10, const int nb11, const int nb12, const int nb13,
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02,
+ const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
+ const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream) {
- const int num_blocks = ne;
+ const int64_t num_blocks = ne;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q_f32<dequantize_block_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q5_1_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK5_1 == 0);
- const int num_blocks = ne / QK5_1;
+ const int64_t num_blocks = ne / QK5_1;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_q5_1_f32_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02,
- const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12,
- const int nb10, const int nb11, const int nb12, const int nb13,
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02,
+ const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
+ const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream) {
- const int num_blocks = ne;
+ const int64_t num_blocks = ne;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_q_f32<cpy_blck_q_f32<dequantize_block_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_iq4_nl_cuda(
- const char * cx, char * cdst, const int ne,
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+ const char * cx, char * cdst, const int64_t ne,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
+ const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_NL == 0);
- const int num_blocks = ne / QK4_NL;
+ const int64_t num_blocks = ne / QK4_NL;
+ GGML_ASSERT(num_blocks < UINT_MAX);
cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
@@ -356,9 +373,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
- GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
- GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b78dd36b9..edb8a12f8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -203,17 +203,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
-//#ifdef GGML_CUDA_FORCE_MMQ
-// GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
-//#else
-// GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
-//#endif // GGML_CUDA_FORCE_MMQ
-//#ifdef GGML_CUDA_FORCE_CUBLAS
-// GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
-//#else
-// GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
-//#endif // GGML_CUDA_FORCE_CUBLAS
-
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
std::vector<std::pair<int, std::string>> turing_devices_without_mma;
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 038c90930..5e86e4e1d 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -450,8 +450,15 @@ static constexpr std::initializer_list topk_moe_early_softmax_norm{ GGM
GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
GGML_OP_RESHAPE };
+
+static constexpr std::initializer_list<ggml_op> topk_moe_sigmoid_norm_bias{ GGML_OP_UNARY, GGML_OP_RESHAPE, GGML_OP_ADD,
+ GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS,
+ GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP,
+ GGML_OP_DIV, GGML_OP_RESHAPE };
+
static constexpr std::initializer_list<ggml_op> topk_moe_early_softmax { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS };
+
static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax { GGML_OP_ARGSORT, GGML_OP_VIEW,
GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
@@ -480,6 +487,32 @@ static constexpr std::initializer_list> topk_moe_early_softma
{ 9, 0, 8 }, // reshape->src[0] == div
};
+//node #436 ( UNARY): ffn_moe_probs-10 ( 256K) [Vulka ] use=2: ffn_moe_logits-10 ( 256K) [Vulka ]
+//node #437 ( RESHAPE): ffn_moe_probs-10 (re ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ]
+//node #438 ( ADD): ffn_moe_probs_biased ( 256K) [Vulka ] use=1: ffn_moe_probs-10 ( 256K) [Vulka ] blk.10.exp_probs_b.b ( 0K) [Vulka ]
+//node #439 ( ARGSORT): ffn_moe_argsort-10 ( 256K) [Vulka ] use=1: ffn_moe_probs_biased ( 256K) [Vulka ]
+//node #440 ( VIEW): ffn_moe_topk-10 ( 255K) [Vulka ] use=3: ffn_moe_argsort-10 ( 256K) [Vulka ]
+//node #441 ( GET_ROWS): ffn_moe_weights-10 ( 12K) [Vulka ] use=1: ffn_moe_probs-10 (re ( 256K) [Vulka ] ffn_moe_topk-10 ( 255K) [Vulka ]
+//node #442 ( RESHAPE): ffn_moe_weights-10 ( ( 12K) [Vulka ] use=2: ffn_moe_weights-10 ( 12K) [Vulka ]
+//node #443 ( SUM_ROWS): ffn_moe_weights_sum- ( 2K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ]
+//node #444 ( CLAMP): ffn_moe_weights_sum_ ( 2K) [Vulka ] use=1: ffn_moe_weights_sum- ( 2K) [Vulka ]
+//node #445 ( DIV): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights-10 ( ( 12K) [Vulka ] ffn_moe_weights_sum_ ( 2K) [Vulka ]
+//node #446 ( RESHAPE): ffn_moe_weights_norm ( 12K) [Vulka ] use=1: ffn_moe_weights_norm ( 12K) [Vulka ]
+static constexpr std::initializer_list<std::array<int, 3>> topk_moe_sigmoid_norm_bias_edges {
+ { 1, 0, 0 }, // reshape->src[0] == sigmoid
+ { 2, 0, 0 }, // add->src[0] == sigmoid
+ { 3, 0, 2 }, // argsort->src[0] == add
+ { 4, 0, 3 }, // view->src[0] == argsort
+ { 5, 0, 1 }, // get_rows->src[0] == reshape
+ { 5, 1, 4 }, // get_rows->src[1] == view
+ { 6, 0, 5 }, // reshape->src[0] == get_rows
+ { 7, 0, 6 }, // sum_rows->src[0] == reshape
+ { 8, 0, 7 }, // clamp->src[0] == sum_rows
+ { 9, 0, 6 }, // div->src[0] == reshape
+ { 9, 1, 8 }, // div->src[1] == clamp
+ {10, 0, 9 }, // reshape->src[0] == div
+};
+
// same as early_softmax_norm but ending after the get_rows
static constexpr std::initializer_list> topk_moe_early_softmax_edges {
{ 1, 0, 0 }, // reshape->src[0] == softmax
@@ -507,16 +540,10 @@ enum topk_moe_mode {
TOPK_MOE_EARLY_SOFTMAX,
TOPK_MOE_EARLY_SOFTMAX_NORM,
TOPK_MOE_LATE_SOFTMAX,
+ TOPK_MOE_SIGMOID_NORM_BIAS,
TOPK_MOE_COUNT,
};
-static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {
- topk_moe_mode mode = num == topk_moe_early_softmax_norm.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX_NORM :
- num == topk_moe_early_softmax.size() - 1 ? TOPK_MOE_EARLY_SOFTMAX :
- TOPK_MOE_LATE_SOFTMAX;
- return mode;
-}
-
static constexpr std::initializer_list<std::array<int, 3>> rope_view_set_rows_edges {
{ 1, 0, 0 }, // view->src[0] == rope
{ 2, 0, 1 }, // set_rows->src[0] == view
@@ -782,7 +809,7 @@ struct vk_device_struct {
vk_pipeline pipeline_count_experts;
// [2] is for whether to take n_experts from spec constant (0) or push constant (1)
- vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];
+ vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][2];
std::vector all_pipelines;
@@ -1197,6 +1224,11 @@ struct vk_op_topk_moe_push_constants {
uint32_t n_expert_used;
float clamp_min;
float clamp_max;
+ uint32_t gating_func;
+ uint32_t has_bias;
+ uint32_t with_norm;
+ float output_scale;
+ float output_bias;
};
struct vk_op_add_id_push_constants {
@@ -1787,6 +1819,8 @@ struct ggml_backend_vk_context {
// Bit 'i' means nodes[start_of_fusion + i] writes to memory.
// If there's no fusion, bit 0 is still set.
int fused_ops_write_mask {};
+ topk_moe_mode fused_topk_moe_mode {};
+ bool fused_topk_moe_scale {};
// for GGML_VK_PERF_LOGGER
std::unique_ptr<vk_perf_logger> perf_logger;
@@ -4307,9 +4341,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
for (uint32_t use_push = 0; use_push < 2; ++use_push) {
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size);
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size);
- ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size);
+ ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][use_push], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 4, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size);
}
}
@@ -8714,10 +8746,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
if (ctx->num_additional_fused_ops) {
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
GGML_ASSERT(idx < num_topk_moe_pipelines);
- topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
// use n_experts from push constant if it's not equal to the power of two spec constant
bool use_push = dst->ne[0] != (1u << idx);
- return ctx->device->pipeline_topk_moe[idx][mode][use_push];
+ return ctx->device->pipeline_topk_moe[idx][use_push];
}
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@@ -10376,14 +10407,16 @@ static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& sub
}
static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
- topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
+ topk_moe_mode mode = ctx->fused_topk_moe_mode;
ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0];
- ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] :
- (mode == TOPK_MOE_EARLY_SOFTMAX) ? cgraph->nodes[node_idx + 4] :
- cgraph->nodes[node_idx + 5];
- ggml_tensor * ids = (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] : cgraph->nodes[node_idx + 3];
+ ggml_tensor * bias = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 2]->src[1] : logits;
+ ggml_tensor * weights = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
+ ggml_tensor * ids = (mode == TOPK_MOE_SIGMOID_NORM_BIAS) ? cgraph->nodes[node_idx + 4] :
+ (mode == TOPK_MOE_LATE_SOFTMAX) ? cgraph->nodes[node_idx + 1] :
+ cgraph->nodes[node_idx + 3];
GGML_ASSERT(logits->type == GGML_TYPE_F32);
+ GGML_ASSERT(bias->type == GGML_TYPE_F32);
GGML_ASSERT(weights->type == GGML_TYPE_F32);
GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -10398,6 +10431,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
vk_subbuffer logits_buf = ggml_vk_tensor_subbuffer(ctx, logits);
+ vk_subbuffer bias_buf = ggml_vk_tensor_subbuffer(ctx, bias);
vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights);
vk_subbuffer ids_buf = ggml_vk_tensor_subbuffer(ctx, ids);
@@ -10405,18 +10439,45 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
pc.n_rows = n_rows;
pc.n_experts_push = n_experts;
pc.n_expert_used = n_expert_used;
+ pc.clamp_min = -std::numeric_limits<float>::infinity();
+ pc.clamp_max = std::numeric_limits<float>::infinity();
if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
+ GGML_ASSERT(clamp->op == GGML_OP_CLAMP);
pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
}
+ if (mode == TOPK_MOE_SIGMOID_NORM_BIAS) {
+ ggml_tensor * clamp = cgraph->nodes[node_idx + 8];
+ GGML_ASSERT(clamp->op == GGML_OP_CLAMP);
+ pc.clamp_min = ggml_get_op_params_f32(clamp, 0);
+ pc.clamp_max = ggml_get_op_params_f32(clamp, 1);
+ }
+
+#define GATING_FUNC_SOFTMAX 0
+#define GATING_FUNC_SIGMOID 1
+#define GATING_FUNC_SOFTMAX_WEIGHT 2
+
+ pc.gating_func = mode == TOPK_MOE_SIGMOID_NORM_BIAS ? GATING_FUNC_SIGMOID :
+ mode == TOPK_MOE_LATE_SOFTMAX ? GATING_FUNC_SOFTMAX_WEIGHT :
+ GATING_FUNC_SOFTMAX;
+ pc.has_bias = mode == TOPK_MOE_SIGMOID_NORM_BIAS;
+ pc.with_norm = mode == TOPK_MOE_EARLY_SOFTMAX_NORM || mode == TOPK_MOE_SIGMOID_NORM_BIAS;
+ if (ctx->fused_topk_moe_scale) {
+ GGML_ASSERT(weights->op == GGML_OP_SCALE);
+ pc.output_scale = ggml_get_op_params_f32(weights, 0);
+ pc.output_bias = ggml_get_op_params_f32(weights, 1);
+ } else {
+ pc.output_scale = 1.0f;
+ pc.output_bias = 0.0f;
+ }
GGML_ASSERT(n_expert_used <= n_experts);
const uint32_t rows_per_block = 4;
std::array<uint32_t, 3> elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 };
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, weights_buf, ids_buf}, pc, elements);
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, bias_buf, weights_buf, ids_buf}, pc, elements);
}
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) {
@@ -12158,6 +12219,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
break;
case GGML_OP_UNARY:
+ if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
+ ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
+ break;
+ }
+
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SILU:
@@ -12205,7 +12271,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
break;
case GGML_OP_SOFT_MAX:
- if (ctx->num_additional_fused_ops) {
+ if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
} else {
ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node);
@@ -12225,7 +12291,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
break;
case GGML_OP_ARGSORT:
- if (ctx->num_additional_fused_ops) {
+ if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx);
} else {
ggml_vk_argsort(ctx, compute_ctx, src0, node);
@@ -13078,6 +13144,24 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
get_rows = cgraph->nodes[node_idx + 4];
argsort = cgraph->nodes[node_idx + 2];
break;
+ case TOPK_MOE_SIGMOID_NORM_BIAS:
+ softmax = cgraph->nodes[node_idx + 0]; // really sigmoid
+ weights = cgraph->nodes[node_idx + 10];
+ get_rows = cgraph->nodes[node_idx + 5];
+ argsort = cgraph->nodes[node_idx + 3];
+ if (ggml_get_unary_op(softmax) != GGML_UNARY_OP_SIGMOID) {
+ return false;
+ }
+ // bias is expected to be 1D
+ if (ggml_nrows(cgraph->nodes[node_idx + 2]->src[1]) != 1 ||
+ !ggml_is_contiguous(cgraph->nodes[node_idx + 2]->src[1])) {
+ return false;
+ }
+ // sigmoid fusion seems to generate infinities on moltenvk
+ if (ctx->device->driver_id == vk::DriverId::eMoltenvk) {
+ return false;
+ }
+ break;
case TOPK_MOE_EARLY_SOFTMAX:
softmax = cgraph->nodes[node_idx + 0];
weights = cgraph->nodes[node_idx + 4];
@@ -13101,26 +13185,28 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
probs = probs->src[0];
ggml_tensor * selection_probs = argsort->src[0];
- if (probs != selection_probs) {
+ if (probs != selection_probs && mode != TOPK_MOE_SIGMOID_NORM_BIAS) {
return false;
}
- const float * op_params = (const float *)softmax->op_params;
-
- float scale = op_params[0];
- float max_bias = op_params[1];
-
if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
return false;
}
- if (scale != 1.0f || max_bias != 0.0f) {
- return false;
- }
+ if (softmax->op == GGML_OP_SOFT_MAX) {
+ const float * op_params = (const float *)softmax->op_params;
- // don't fuse when masks or sinks are present
- if (softmax->src[1] || softmax->src[2]) {
- return false;
+ float scale = op_params[0];
+ float max_bias = op_params[1];
+
+ if (scale != 1.0f || max_bias != 0.0f) {
+ return false;
+ }
+
+ // don't fuse when masks or sinks are present
+ if (softmax->src[1] || softmax->src[2]) {
+ return false;
+ }
}
const int n_expert = softmax->ne[0];
@@ -13393,6 +13479,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
total_mul_mat_bytes += bytes;
}
+ ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
+ ctx->fused_topk_moe_scale = false;
const char *fusion_string {};
if (!ctx->device->disable_fusion) {
uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i);
@@ -13438,13 +13526,23 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
// view of argsort writes to memory
ctx->fused_ops_write_mask |= 1 << 3;
+ ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX_NORM;
fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
+ } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_sigmoid_norm_bias, { i + 4, i + 10 }) &&
+ ggml_check_edges(cgraph, i, topk_moe_sigmoid_norm_bias_edges) &&
+ ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_SIGMOID_NORM_BIAS)) {
+ ctx->num_additional_fused_ops = topk_moe_sigmoid_norm_bias.size() - 1;
+ // view of argsort writes to memory
+ ctx->fused_ops_write_mask |= 1 << 4;
+ ctx->fused_topk_moe_mode = TOPK_MOE_SIGMOID_NORM_BIAS;
+ fusion_string = "TOPK_MOE_SIGMOID_NORM_BIAS";
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
// view of argsort writes to memory
ctx->fused_ops_write_mask |= 1 << 3;
+ ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX;
fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
@@ -13452,8 +13550,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
// view of argsort writes to memory
ctx->fused_ops_write_mask |= 1 << 1;
+ ctx->fused_topk_moe_mode = TOPK_MOE_LATE_SOFTMAX;
fusion_string = "TOPK_MOE_LATE_SOFTMAX";
}
+ if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
+ // Look for an additional scale op to fuse - occurs in deepseek2 and nemotron3 nano.
+ if (ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops - 1, { GGML_OP_DIV, GGML_OP_RESHAPE, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 }) ||
+ ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops, { GGML_OP_GET_ROWS, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 })) {
+ ctx->fused_topk_moe_scale = true;
+ ctx->num_additional_fused_ops++;
+ }
+ }
}
ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
@@ -13632,6 +13739,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
if (keep_pattern(topk_moe_early_softmax_norm)) {
continue;
}
+ if (keep_pattern(topk_moe_sigmoid_norm_bias)) {
+ continue;
+ }
if (keep_pattern(topk_moe_early_softmax)) {
continue;
}
@@ -13658,6 +13768,7 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
}
// Don't pull forward nodes from fusion patterns
if (match_pattern(topk_moe_early_softmax_norm, j) ||
+ match_pattern(topk_moe_sigmoid_norm_bias, j) ||
match_pattern(topk_moe_early_softmax, j) ||
match_pattern(topk_moe_late_softmax, j)) {
continue;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
index b83a2b9d2..4bf6d2bcb 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
@@ -7,6 +7,10 @@
#include "types.glsl"
+#define GATING_FUNC_SOFTMAX 0
+#define GATING_FUNC_SIGMOID 1
+#define GATING_FUNC_SOFTMAX_WEIGHT 2
+
layout (push_constant) uniform parameter
{
uint n_rows;
@@ -14,15 +18,18 @@ layout (push_constant) uniform parameter
uint n_expert_used;
float clamp_min;
float clamp_max;
+ uint gating_func;
+ uint has_bias;
+ uint with_norm;
+ float output_scale;
+ float output_bias;
};
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
layout(constant_id = 0) const uint WARP_SIZE = 32;
layout(constant_id = 1) const uint n_experts_spec = 512;
-layout(constant_id = 2) const bool with_norm = true;
-layout(constant_id = 3) const bool late_softmax = false;
-layout(constant_id = 4) const bool nexperts_use_push = false;
+layout(constant_id = 2) const bool nexperts_use_push = false;
uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
@@ -31,8 +38,9 @@ uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE);
layout (binding = 0, std430) readonly buffer Logits {float logits[];};
-layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
-layout (binding = 2, std430) writeonly buffer Ids {uint ids[];};
+layout (binding = 1, std430) readonly buffer BiasProbs {float bias[];};
+layout (binding = 2, std430) writeonly buffer Weights {float weights[];};
+layout (binding = 3, std430) writeonly buffer Ids {uint ids[];};
const float INFINITY = 1.0 / 0.0;
@@ -87,20 +95,40 @@ void main() {
}
const uint logits_offset = n_experts * row;
+ const uint bias_offset = 0; // 1D
const uint weights_offset = n_expert_used * row;
const uint ids_offset = n_experts * row;
const uint lane = gl_SubgroupInvocationID;
- float wt[experts_per_thread];
+ float probs[experts_per_thread];
[[unroll]]
for (uint i = 0; i < n_experts; i += WARP_SIZE) {
const uint expert = i + lane;
- wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
+ probs[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
}
- if (!late_softmax) {
- softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push);
+ if (gating_func == GATING_FUNC_SOFTMAX) {
+ softmax_warp_inplace(probs, n_experts, lane, nexperts_use_push);
+ } else if (gating_func == GATING_FUNC_SIGMOID) {
+ [[unroll]]
+ for (int i = 0; i < experts_per_thread; i++) {
+ probs[i] = 1.f / (1.f + exp(-probs[i]));
+ }
+ }
+
+ float selection_probs[experts_per_thread];
+ if (has_bias != 0) {
+ [[unroll]]
+ for (uint i = 0; i < n_experts; i += WARP_SIZE) {
+ const uint expert = i + lane;
+ selection_probs[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? probs[i / WARP_SIZE] + bias[bias_offset + expert] : -INFINITY;
+ }
+ } else {
+ [[unroll]]
+ for (int i = 0; i < experts_per_thread; i++) {
+ selection_probs[i] = probs[i];
+ }
}
// at this point, each thread holds a portion of softmax,
@@ -117,14 +145,16 @@ void main() {
}
for (int k = 0; k < n_expert_used; k++) {
- float max_val = wt[0];
+ float max_val = probs[0];
+ float max_val_s = selection_probs[0];
uint max_expert = lane;
[[unroll]]
for (int i = 1; i < experts_per_thread; i++) {
const uint expert = lane + i * WARP_SIZE;
- if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
- max_val = wt[i];
+ if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_probs[i] > max_val_s) {
+ max_val = probs[i];
+ max_val_s = selection_probs[i];
max_expert = expert;
}
}
@@ -132,9 +162,11 @@ void main() {
[[unroll]]
for (uint mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
const float val = subgroupShuffleXor(max_val, mask);
+ const float val_s = subgroupShuffleXor(max_val_s, mask);
const uint expert = subgroupShuffleXor(max_expert, mask);
- if (val > max_val || (val == max_val && expert < max_expert)) {
+ if (val_s > max_val_s || (val_s == max_val_s && expert < max_expert)) {
max_val = val;
+ max_val_s = val_s;
max_expert = expert;
}
}
@@ -144,16 +176,14 @@ void main() {
}
if ((max_expert & (WARP_SIZE - 1)) == lane) {
- wt[max_expert / WARP_SIZE] = -INFINITY;
+ selection_probs[max_expert / WARP_SIZE] = -INFINITY;
ids[ids_offset + k] = max_expert;
- if (with_norm) {
- wt_sum += max_val;
- }
+ wt_sum += max_val;
}
}
- if (with_norm) {
+ if (with_norm != 0) {
wt_sum = subgroupAdd(wt_sum);
wt_sum = clamp(wt_sum, clamp_min, clamp_max);
const float inv_sum = 1.0f / wt_sum;
@@ -164,7 +194,7 @@ void main() {
}
}
- if (late_softmax) {
+ if (gating_func == GATING_FUNC_SOFTMAX_WEIGHT) {
softmax_warp_inplace(output_weights, n_expert_used, lane, true);
}
@@ -172,7 +202,7 @@ void main() {
for (uint i = 0; i < experts_per_thread; ++i) {
uint idx = i * WARP_SIZE + lane;
if (idx < n_expert_used) {
- weights[weights_offset + idx] = output_weights[i];
+ weights[weights_offset + idx] = output_scale * output_weights[i] + output_bias;
}
}
}
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c2a0f41c1..0ac512ff3 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -294,7 +294,9 @@ class Keys:
USE_GELU = "clip.use_gelu"
USE_SILU = "clip.use_silu"
N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
+ WA_LAYER_INDEXES = "clip.vision.wa_layer_indexes" # used by youtuvl
IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"
+ WINDOW_SIZE = "clip.vision.window_size"
class Attention:
HEAD_COUNT = "clip.vision.attention.head_count"
@@ -3494,6 +3496,7 @@ class VisionProjectorType:
LFM2A = "lfm2a" # audio
MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"
+ YOUTUVL = "youtuvl"
# Items here are (block size, type size)
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 6a4a504f8..612a978e4 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1129,11 +1129,40 @@ class GGUFWriter:
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
def add_vision_n_wa_pattern(self, value: int) -> None:
+ """Add window attention pattern interval for vision models.
+
+ This defines the pattern interval for window attention vs full attention layers.
+ For example, if n_wa_pattern=4, then layers 3, 7, 11, ... use full attention,
+ while other layers use window attention.
+
+ Used by models like Qwen2.5-VL where full attention layers follow a regular pattern.
+ """
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
+ def add_vision_wa_layer_indexes(self, layers: Sequence[int]) -> None:
+ """Add explicit layer indexes that use full attention in vision models.
+
+ This specifies the exact layer indices (0-based) that should use full attention
+ instead of window attention. All other layers will use window attention.
+
+ Args:
+ layers: List of layer indices that use full attention (e.g., [3, 7, 11, 15])
+
+ Used by models like YoutuVL where full attention layers are explicitly specified
+ rather than following a regular pattern.
+
+ Difference from add_vision_n_wa_pattern:
+ - n_wa_pattern: Defines a regular interval pattern (every Nth layer uses full attention)
+ - wa_layer_indexes: Explicitly lists which layers use full attention (irregular pattern)
+ """
+ self.add_array(Keys.ClipVision.WA_LAYER_INDEXES, layers)
+
def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)
+ def add_vision_window_size(self, value: int) -> None:
+ self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
+
# audio models
def add_audio_projection_dim(self, value: int) -> None:
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 115df6c7c..64dd4ddca 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1221,6 +1221,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
"visual.merger.mlp.{bid}", # qwen2vl
+ "merger.mlp.{bid}",
),
MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1258,6 +1259,7 @@ class TensorNameMap:
"visual.patch_embed.proj", # qwen2vl
"vision_tower.patch_embed.proj", # kimi-vl
"model.vision.patch_embedding.proj", # cogvlm
+ "siglip2.vision_model.embeddings.patch_embedding",
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1291,6 +1293,7 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
+ "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
),
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1308,6 +1311,7 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
+ "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
),
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1325,6 +1329,7 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
+ "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1339,6 +1344,7 @@ class TensorNameMap:
"visual.blocks.{bid}.norm1", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
+ "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
),
MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1354,6 +1360,7 @@ class TensorNameMap:
"visual.blocks.{bid}.attn.proj", # qwen2vl
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
+ "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
),
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1368,6 +1375,7 @@ class TensorNameMap:
"visual.blocks.{bid}.norm2", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
+ "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
),
MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1383,6 +1391,7 @@ class TensorNameMap:
"visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
+ "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1404,6 +1413,7 @@ class TensorNameMap:
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
+ "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
),
MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1430,6 +1440,7 @@ class TensorNameMap:
"visual.merger.ln_q", # qwen2vl
"vision_tower.encoder.final_layernorm", # kimi-vl
"visual.post_layernorm", # glm4v
+ "siglip2.vision_model.post_layernorm",
),
MODEL_TENSOR.V_MM_POST_NORM: (
@@ -1446,6 +1457,7 @@ class TensorNameMap:
"multi_modal_projector.pre_norm",
"pre_mm_projector_norm",
"model.vision.linear_proj.norm1", # cogvlm
+ "merger.ln_q",
),
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index fc6a6223c..b54ebbd15 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -74,6 +74,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
+ { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
};
llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GROK_2;
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
return LLM_CHAT_TEMPLATE_PANGU_EMBED;
+ } else if (tmpl_contains("<|begin|>") && tmpl_contains("<|end|>") && tmpl_contains("<|content|>")) {
+ return LLM_CHAT_TEMPLATE_SOLAR_OPEN;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@@ -845,6 +848,14 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "[unused9]助手:";
}
+ } else if (tmpl == LLM_CHAT_TEMPLATE_SOLAR_OPEN) {
+ for (auto message : chat) {
+ std::string role(message->role);
+ ss << "<|begin|>" << role << "<|content|>" << message->content << "<|end|>";
+ }
+ if (add_ass) {
+ ss << "<|begin|>assistant";
+ }
} else {
// template not supported
return -1;
diff --git a/src/llama-chat.h b/src/llama-chat.h
index 684efb4d6..e1f795249 100644
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -54,6 +54,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_PANGU_EMBED,
+ LLM_CHAT_TEMPLATE_SOLAR_OPEN,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index eed102b25..1d434f0ce 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -234,6 +234,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
+ case LLM_TYPE_102B_A12B: return "102B.A12B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
@@ -1790,7 +1791,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -1886,6 +1887,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
switch (hparams.n_layer) {
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
+ case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
default: type = LLM_TYPE_UNKNOWN;
}
@@ -3481,7 +3483,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
+
+ const auto tn_ffn_up_weight = tn(LLM_TENSOR_FFN_UP, "weight", i);
+ ggml_tensor * t_ffn_up = ml.get_tensor_meta(tn_ffn_up_weight.str().c_str());
+ const int64_t n_ffn_up = t_ffn_up ? t_ffn_up->ne[1] : n_ff;
+
+ GGML_ASSERT(n_ffn_up == n_ff || n_ffn_up == n_ff * 2);
+ layer.ffn_up = create_tensor(tn_ffn_up_weight, {n_embd, n_ffn_up}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ffn_up}, TENSOR_NOT_REQUIRED);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -4937,7 +4946,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ // try to load output.weight, if not found, use token_embd (tied embeddings)
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@@ -5000,7 +5013,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ // try to load output.weight, if not found, use token_embd (tied embeddings)
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@@ -5367,9 +5384,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED | flags);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED | flags);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED | flags);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
@@ -7605,7 +7622,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
} break;
case LLM_ARCH_MODERN_BERT:
{
- llm = std::make_unique<llm_build_modern_bert<true>>(*this, params);
+ llm = std::make_unique<llm_build_modern_bert>(*this, params);
} break;
case LLM_ARCH_NEO_BERT:
{
diff --git a/src/llama-model.h b/src/llama-model.h
index f4f44a92b..79200a0d9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -119,6 +119,7 @@ enum llm_type {
LLM_TYPE_31B_A3_5B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_100B_A6B,
+ LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index f0e39fc2c..5592a192a 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -539,6 +539,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
};
break;
+ case LLAMA_VOCAB_PRE_TYPE_YOUTU:
+ regex_exprs = {
+ "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥ぁ-ゟ゠-ヿ]+",
+ "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = {
"[\r\n]",
@@ -580,6 +586,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
+ case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
regex_exprs = {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -2096,6 +2103,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "deepseek-v3") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
clean_spaces = false;
+ } else if (
+ tokenizer_pre == "youtu") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
+ clean_spaces = false;
+ ignore_merges = true;
} else if (
tokenizer_pre == "falcon") {
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -2251,6 +2263,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "minimax-m2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
clean_spaces = false;
+ } else if (
+ tokenizer_pre == "solar-open") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
+ clean_spaces = false;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@@ -2597,6 +2613,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|end|>"
|| t.first == "<|return|>" // o200k_harmony
|| t.first == "<|call|>" // o200k_harmony
+ || t.first == "<|flush|>" // solar-open
+ || t.first == "<|calls|>" // solar-open
|| t.first == ""
|| t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>"
@@ -2643,13 +2661,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
- // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
- // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+ // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
+ // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
// we remove the "<|end|>" token from the EOG list
{
bool has_return = false;
bool has_call = false;
bool has_end = false;
+ bool has_flush = false;
llama_token end_id = LLAMA_TOKEN_NULL;
@@ -2659,18 +2678,20 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
if (id_to_token[tid].text == "<|return|>") {
has_return = true;
- } else if (id_to_token[tid].text == "<|call|>") {
+ } else if (id_to_token[tid].text == "<|call|>" || id_to_token[tid].text == "<|calls|>") {
has_call = true;
+ } else if (id_to_token[tid].text == "<|flush|>") {
+ has_flush = true;
} else if (id_to_token[tid].text == "<|end|>") {
has_end = true;
end_id = tid;
}
}
- if (has_return && has_call && has_end) {
+ if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
special_eog_ids.erase(end_id);
id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
- LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+ LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
}
}
}
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 87f0c9f1d..57e064217 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -52,6 +52,8 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
+ LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
+ LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
};
struct LLM_KV;
diff --git a/src/models/bert.cpp b/src/models/bert.cpp
index 3274fa3b9..bca0e254f 100644
--- a/src/models/bert.cpp
+++ b/src/models/bert.cpp
@@ -142,11 +142,13 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
LLM_FFN_GELU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+ const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
+ auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
cur = build_ffn(cur,
- model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
- model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
+ type_op, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
cur = build_ffn(cur,
diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp
index 49382874b..ca63a62ad 100644
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
diff --git a/src/models/models.h b/src/models/models.h
index e2cd4e484..e78a788d4 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -332,7 +332,6 @@ struct llm_build_mistral3 : public llm_graph_context {
llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
};
-template <bool iswa>
struct llm_build_modern_bert : public llm_graph_context {
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
};
diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp
index c7809bded..6df418ecd 100644
--- a/src/models/modern-bert.cpp
+++ b/src/models/modern-bert.cpp
@@ -1,7 +1,6 @@
#include "models.h"
-template <bool iswa>
-llm_build_modern_bert<iswa>::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -24,13 +23,7 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, co
auto * inp_attn = build_attn_inp_no_cache();
for (int il = 0; il < n_layer; ++il) {
- float freq_base_l = 0.0f;
-
- if constexpr (iswa) {
- freq_base_l = model.get_rope_freq_base(cparams, il);
- } else {
- freq_base_l = freq_base;
- }
+ float freq_base_l = model.get_rope_freq_base(cparams, il);
cur = inpL;
@@ -120,7 +113,3 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, co
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}
-
-// Explicit template instantiations
-template struct llm_build_modern_bert<false>;
-template struct llm_build_modern_bert<true>;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index b3e3ffc04..c04086ebc 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -972,6 +972,11 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
{ "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
{ "\\p{S}", unicode_cpt_flags::SYMBOL },
+ { "\\p{Lu}", unicode_cpt_flags::LETTER }, // Uppercase letter
+ { "\\p{Ll}", unicode_cpt_flags::LETTER }, // Lowercase letter
+ { "\\p{Lt}", unicode_cpt_flags::LETTER }, // Titlecase letter
+ { "\\p{Lm}", unicode_cpt_flags::LETTER }, // Modifier letter
+ { "\\p{Lo}", unicode_cpt_flags::LETTER }, // Other letter
};
static const std::map<int, int> k_ucat_cpt = {
@@ -1082,22 +1087,26 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
continue;
}
- if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
+ // Match \p{...} Unicode properties of varying lengths
+ if (regex_expr[i + 0] == '\\' && i + 3 < regex_expr.size() &&
regex_expr[i + 1] == 'p' &&
- regex_expr[i + 2] == '{' &&
- regex_expr[i + 4] == '}') {
- const std::string pat = regex_expr.substr(i, 5);
- if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
- if (!inside) {
- regex_expr_collapsed += '[';
+ regex_expr[i + 2] == '{') {
+ // Find the closing brace
+ size_t closing_brace = regex_expr.find('}', i + 3);
+ if (closing_brace != std::string::npos && closing_brace <= i + 10) { // reasonable limit
+ const std::string pat = regex_expr.substr(i, closing_brace - i + 1);
+ if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
+ if (!inside) {
+ regex_expr_collapsed += '[';
+ }
+ regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
+ regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
+ if (!inside) {
+ regex_expr_collapsed += ']';
+ }
+ i = closing_brace;
+ continue;
}
- regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
- regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
- if (!inside) {
- regex_expr_collapsed += ']';
- }
- i += 4;
- continue;
}
}
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index cd63acac6..d74964e18 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -45,13 +45,14 @@
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
-#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
-#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
-#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
-#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
-#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
-#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
-#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
+#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
+#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
+#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
+#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
@@ -188,6 +189,7 @@ enum projector_type {
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
+ PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -218,6 +220,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
+ { PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 1e5aa87b9..702e10151 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -61,6 +61,7 @@ struct clip_hparams {
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
+ std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
// audio
int32_t n_mel_bins = 0; // whisper preprocessor
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 4cd9d7e96..6d5118732 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -62,6 +62,7 @@
#include "models/qwen3vl.cpp"
#include "models/siglip.cpp"
#include "models/whisper-enc.cpp"
+#include "models/youtuvl.cpp"
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
@@ -890,6 +891,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique(ctx, img);
} break;
+ case PROJECTOR_TYPE_YOUTUVL:
+ {
+ builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
+ } break;
default:
GGML_ABORT("missing cgraph builder");
}
@@ -1232,6 +1237,20 @@ struct clip_model_loader {
// LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
// }
} break;
+ case PROJECTOR_TYPE_YOUTUVL:
+ {
+ hparams.n_merge = 2;
+ get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+ get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+ std::vector<int> wa_layer_indexes_vec;
+ get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
+ for (auto & layer : wa_layer_indexes_vec) {
+ hparams.wa_layer_indexes.insert(layer);
+ }
+ // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
+ hparams.set_limit_image_tokens(1, 62500);
+ hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
+ } break;
case PROJECTOR_TYPE_GLM4V:
{
hparams.rope_theta = 10000.0f;
@@ -1300,7 +1319,14 @@ struct clip_model_loader {
LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
- LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+ LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+ if (!hparams.wa_layer_indexes.empty()) {
+ LOG_INF("%s: wa_layer_indexes: ", __func__);
+ for (auto & layer : hparams.wa_layer_indexes) {
+ LOG_INF("%d ", layer);
+ }
+ LOG_INF("\n");
+ }
if (hparams.image_min_pixels > 0) {
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
}
@@ -1568,6 +1594,14 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
+ case PROJECTOR_TYPE_YOUTUVL:
+ {
+ model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+ } break;
case PROJECTOR_TYPE_GLM4V:
{
model.projection = get_tensor(TN_MM_PROJECTOR);
@@ -2895,6 +2929,57 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32));
} break;
+ case PROJECTOR_TYPE_YOUTUVL:
+ {
+ const int patch_size = params.patch_size; // typically 16
+ const int merge_size = params.n_merge; // typically 2
+ const int align_size = patch_size * merge_size; // 32
+
+ const int max_num_patches = params.image_max_pixels > 0 ?
+ params.image_max_pixels / (patch_size * patch_size) : 256;
+
+ // Linear search for optimal scale to fit within max_num_patches
+ float scale = 1.0f;
+ int target_height = original_size.height;
+ int target_width = original_size.width;
+
+ auto get_scaled_image_size = [align_size](float scale, int size) -> int {
+ float scaled_size = size * scale;
+ // Round up to nearest multiple of align_size
+ int aligned = static_cast(std::ceil(scaled_size / align_size)) * align_size;
+ // Ensure at least one patch
+ return std::max(align_size, aligned);
+ };
+
+ // Linear search with 0.02 step size
+ while (scale > 0.0f) {
+ target_height = get_scaled_image_size(scale, original_size.height);
+ target_width = get_scaled_image_size(scale, original_size.width);
+
+ int num_patches_h = target_height / patch_size;
+ int num_patches_w = target_width / patch_size;
+ int num_patches = num_patches_h * num_patches_w;
+
+ if (num_patches > max_num_patches) {
+ scale -= 0.02f;
+ } else {
+ break;
+ }
+ }
+
+ clip_image_size new_size = {target_width, target_height};
+
+ // Resize the image
+ clip_image_u8 resized;
+ img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
+
+ // Normalize to float32
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
+
+ // Add to results
+ res_imgs->entries.push_back(std::move(img_f32));
+ } break;
case PROJECTOR_TYPE_IDEFICS3:
{
@@ -3127,6 +3212,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
+ case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
default:
break;
@@ -3142,6 +3228,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
+ case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2;
default:
break;
@@ -3202,6 +3289,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
+ case PROJECTOR_TYPE_YOUTUVL:
{
// dynamic size (2 conv, so double patch size)
int x_patch = img->nx / (params.patch_size * 2);
@@ -3329,7 +3417,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int pos_w = image_size_width / patch_size;
const int pos_h = image_size_height / patch_size;
- const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) {
ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3478,9 +3565,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_QWEN25VL:
+ case PROJECTOR_TYPE_YOUTUVL:
{
// pw * ph = number of tokens output by ViT after apply patch merger
// ipw * ipw = number of vision token been processed inside ViT
+ const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio;
@@ -3491,7 +3580,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
std::vector<int> inv_idx(ph * pw);
if (use_window_attn) {
- const int attn_window_size = 112;
+ const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3926,6 +4015,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_JANUS_PRO:
+ case PROJECTOR_TYPE_YOUTUVL:
return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index e08c33f35..74e94f60e 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -27,6 +27,11 @@ struct clip_graph_qwen3vl : clip_graph {
ggml_cgraph * build() override;
};
+struct clip_graph_youtuvl : clip_graph {
+ clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
diff --git a/tools/mtmd/models/youtuvl.cpp b/tools/mtmd/models/youtuvl.cpp
new file mode 100644
index 000000000..ffbf2be55
--- /dev/null
+++ b/tools/mtmd/models/youtuvl.cpp
@@ -0,0 +1,179 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_youtuvl::build() {
+ GGML_ASSERT(model.class_embedding == nullptr); // this ViT variant has no CLS/class embedding
+ const int batch_size = 1; // graph handles a single image (asserted again below)
+ const bool use_window_attn = !hparams.wa_layer_indexes.empty(); // wa_layer_indexes lists the layers that keep FULL attention; empty set disables windowing entirely
+ const int n_pos = n_patches;
+ const int num_position_ids = n_pos * 4; // 4 position components per token (consumed by ggml_rope_multi below)
+ const int m = 2; // spatial merge factor (2x2 patch groups)
+ const int Wp = n_patches_x;
+ const int Hp = n_patches_y;
+ const int Hm = Hp / m; // grid height after 2x2 merging
+ const int Wm = Wp / m; // grid width after 2x2 merging
+ norm_type norm_t = NORM_TYPE_NORMAL;
+
+ int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; // equal M-RoPE section split across the head dimension
+
+ ggml_tensor * inp = build_inp_raw();
+
+ // implement the conv3d patch embedding as a single linear (mul_mat) instead:
+ // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
+ {
+ inp = ggml_reshape_4d(
+ ctx0, inp,
+ Wm * m * patch_size, m * patch_size, Hm, 3);
+ inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ m * patch_size * 3, Wm, m * patch_size, Hm);
+
+ inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ m * patch_size * 3, patch_size, m, Hm * Wm);
+
+ inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
+ inp = ggml_cont_4d(
+ ctx0, inp,
+ patch_size, 3, patch_size, Hm * Wm * m * m);
+
+ inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
+ inp = ggml_cont_3d(
+ ctx0, inp,
+ 3*patch_size* patch_size, Hm * Wm * m * m, 1); // one row per patch: all channels * pixels flattened
+ }
+ inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); // linear patch projection
+
+ if (model.patch_bias) {
+ inp = ggml_add(ctx0, inp, model.patch_bias);
+ }
+
+ inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+
+ ggml_tensor * inpL = inp;
+ ggml_tensor * window_mask = nullptr;
+ ggml_tensor * window_idx = nullptr;
+ ggml_tensor * inv_window_idx = nullptr;
+
+ ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); // M-RoPE position ids, filled by the caller at encode time
+ ggml_set_name(positions, "positions");
+ ggml_set_input(positions);
+
+ // pre-layernorm
+ if (model.pre_ln_w) {
+ inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+ }
+ if (use_window_attn) {
+ inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); // one index per 2x2 merge group
+ ggml_set_name(inv_window_idx, "inv_window_idx");
+ ggml_set_input(inv_window_idx);
+ // mask for window attention
+ window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+ ggml_set_name(window_mask, "window_mask");
+ ggml_set_input(window_mask);
+
+ // if flash attn is used, we need to pad the mask and cast to f16
+ if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+ window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+ }
+
+ // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+ GGML_ASSERT(batch_size == 1);
+ inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); // group rows 4-at-a-time so get_rows moves whole merge groups
+ inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); // reorder patches into window order
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+ }
+
+ // loop over layers
+ for (int il = 0; il < n_layer; il++) {
+ const auto & layer = model.layers[il];
+ const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true; // listed layers attend globally, all others use the window mask
+
+ ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+ // layernorm1
+ cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+ // self-attention
+ {
+ ggml_tensor * Qcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+ ggml_tensor * Kcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+ ggml_tensor * Vcur = ggml_add(ctx0,
+ ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+ Qcur = ggml_rope_multi(
+ ctx0, Qcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); // vision M-RoPE; freq_base 10000, n_ctx_orig 32768 — hardcoded, TODO confirm against the model config
+ Kcur = ggml_rope_multi(
+ ctx0, Kcur, positions, nullptr,
+ d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+ ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; // nullptr mask == unrestricted (full) attention
+
+ cur = build_attn(layer.o_w, layer.o_b,
+ Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+ }
+ // re-add the layer input, i.e. the residual
+ cur = ggml_add(ctx0, cur, inpL);
+
+ inpL = cur; // inpL = residual, cur = hidden_states
+
+ // layernorm2
+ cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+
+ // ffn
+ cur = build_ffn(cur,
+ layer.ff_up_w, layer.ff_up_b,
+ nullptr, nullptr,
+ layer.ff_down_w, layer.ff_down_b,
+ hparams.ffn_op, il);
+
+ // residual 2
+ cur = ggml_add(ctx0, inpL, cur);
+
+ inpL = cur;
+ }
+
+ ggml_tensor * embeddings = inpL;
+ if (use_window_attn) {
+ const int spatial_merge_unit = 4; // 2x2 merged patches per unit
+ window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
+ ggml_set_name(window_idx, "window_idx");
+ ggml_set_input(window_idx);
+ GGML_ASSERT(batch_size == 1);
+ embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
+ embeddings = ggml_get_rows(ctx0, embeddings, window_idx); // undo the window reordering applied before the layer loop
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
+ cb(embeddings, "window_order_restored", -1);
+ }
+
+ // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
+ if (model.post_ln_w) {
+ embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+ }
+
+ // Now apply merger (VLPatchMerger):
+ // 1. Apply RMS norm (ln_q in VLPatchMerger)
+ embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
+ cb(embeddings, "merger_normed", -1);
+
+ // 2. First reshape for spatial merge (merge 2x2 patches)
+ embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+ cb(embeddings, "merger_reshaped", -1);
+
+ embeddings = build_ffn(embeddings, // 3. two-layer GELU MLP projecting merged patches into the LLM embedding space
+ model.mm_0_w, model.mm_0_b,
+ nullptr, nullptr,
+ model.mm_1_w, model.mm_1_b,
+ FFN_GELU,
+ -1);
+ ggml_build_forward_expand(gf, embeddings);
+
+ return gf;
+}
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index b0b5ab42a..fca55b76f 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -283,7 +283,7 @@ struct mtmd_context {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]";
- } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
+ } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>";
img_end = "<|vision_end|>";
diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index d1c10eed9..b3983b2b1 100644
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/webui/src/lib/utils/clipboard.ts b/tools/server/webui/src/lib/utils/clipboard.ts
index 91e8ea75a..940e64c8f 100644
--- a/tools/server/webui/src/lib/utils/clipboard.ts
+++ b/tools/server/webui/src/lib/utils/clipboard.ts
@@ -65,10 +65,7 @@ export async function copyCodeToClipboard(
successMessage = 'Code copied to clipboard',
errorMessage = 'Failed to copy code'
): Promise {
- const doc = new DOMParser().parseFromString(rawCode, 'text/html');
- const decodedCode = doc.body.textContent ?? rawCode;
-
- return copyToClipboard(decodedCode, successMessage, errorMessage);
+ return copyToClipboard(rawCode, successMessage, errorMessage);
}
/**