Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/cvt.cl
#	requirements/requirements-tool_bench.txt
This commit is contained in:
Concedo 2026-04-12 16:22:26 +08:00
commit 5361b45fba
15 changed files with 177 additions and 42 deletions

View file

@ -11279,6 +11279,48 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
@ModelBase.register("MERaLiON2ForConditionalGeneration")
class MERaLiONWhisperEncoderModel(WhisperEncoderModel):
has_vision_encoder = False
has_audio_encoder = True
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("speech_config")
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION)
self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("text_decoder."):
return
if name.startswith("speech_encoder."):
name = name.replace("speech_encoder.", "audio_tower.")
yield from super().modify_tensors(data_torch, name, bid)
return
suffix = "." + name.rsplit(".", 1)[-1]
if name.startswith("ln_speech."):
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch)
return
if name.startswith("speech_audio_adapter."):
if ".mlp_adapter.0." in name:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=suffix), data_torch)
elif ".gate_proj." in name:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=suffix), data_torch)
elif ".pool_proj." in name:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 2, suffix=suffix), data_torch)
elif ".out_proj." in name:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 3, suffix=suffix), data_torch)
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("VoxtralForConditionalGeneration")
class VoxtralWhisperEncoderModel(WhisperEncoderModel):
has_vision_encoder = False # no vision encoder

View file

@ -664,6 +664,7 @@ void ggml_compute_forward_add(
{
ggml_compute_forward_add_non_quantized(params, dst);
} break;
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@ -1113,6 +1114,7 @@ void ggml_compute_forward_add1(
GGML_ABORT("fatal error");
}
} break;
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@ -1242,6 +1244,7 @@ void ggml_compute_forward_acc(
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@ -4331,6 +4334,7 @@ void ggml_compute_forward_out_prod(
const ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@ -4606,6 +4610,7 @@ void ggml_compute_forward_set(
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:

View file

@ -1192,7 +1192,9 @@ struct ggml_cuda_graph {
bool warmup_complete = false;
struct node_properties {
ggml_tensor node;
void * node_src_data_ptrs[GGML_MAX_SRC];
void * node_src_data_ptrs[GGML_MAX_SRC];
int64_t node_src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
size_t node_src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
};
std::vector<node_properties> node_props;

View file

@ -75,13 +75,17 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
return;
}
if (use_gqa_opt && gqa_ratio % 2 == 0) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
if constexpr (DKQ <= 256) {
if (use_gqa_opt && gqa_ratio % 2 == 0) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
return;
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
return;
} else {
GGML_ABORT("fatal error");
}
}
if (use_gqa_opt && gqa_ratio > 4) {
@ -94,12 +98,16 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
return;
}
if (use_gqa_opt && gqa_ratio > 1) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
if constexpr (DKQ <= 256) {
if (use_gqa_opt && gqa_ratio > 1) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
return;
}
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
} else {
GGML_ABORT("fatal error");
}
}
static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View file

@ -3086,16 +3086,18 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
ggml_cuda_graph::node_properties prop = {};
memcpy(&prop.node, cgraph->nodes[i], sizeof(ggml_tensor));
// if the backend scheduler is making copies of CPU tensors, the src pointers can be the same but with different data, see:
// https://github.com/ggml-org/llama.cpp/pull/21472#discussion_r3052235188
for (int j = 0; j < GGML_MAX_SRC; ++j) {
prop.node_src_data_ptrs[j] = cgraph->nodes[i]->src[j] ? cgraph->nodes[i]->src[j]->data : nullptr;
if (cgraph->nodes[i]->src[j]) {
prop.node_src_data_ptrs[j] = cgraph->nodes[i]->src[j]->data;
memcpy(prop.node_src_ne[j], cgraph->nodes[i]->src[j]->ne, sizeof(prop.node_src_ne[j]));
memcpy(prop.node_src_nb[j], cgraph->nodes[i]->src[j]->nb, sizeof(prop.node_src_nb[j]));
}
}
if (!res && memcmp(&graph->node_props[i], &prop, sizeof(prop)) != 0) {
if (res || memcmp(&graph->node_props[i], &prop, sizeof(prop)) != 0) {
graph->node_props[i] = prop;
res = true;
}
graph->node_props[i] = prop;
}
return res;

View file

@ -4115,6 +4115,7 @@ class VisionProjectorType:
GLMA = "glma" # audio
QWEN25O = "qwen2.5o" # omni
VOXTRAL = "voxtral"
MERALION = "meralion" # audio: Whisper + gated MLP adaptor
LFM2 = "lfm2"
KIMIVL = "kimivl"
PADDLEOCR = "paddleocr"

View file

@ -2041,7 +2041,7 @@ class TensorNameMap:
# this prefix is added in the conversion code in modify_tensors()
MODEL_TENSOR.A_MMPROJ: (
"audio.multi_modal_projector.linear_{bid}", # ultravox
"audio.multi_modal_projector.linear_{bid}", # ultravox, meralion
"audio_adapter.model.{bid}" # lfm2
),

View file

@ -2127,6 +2127,7 @@ void kcpp_init_audio_proj(clip_ctx * ctx_a)
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
case PROJECTOR_TYPE_MERALION:
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
break;
case PROJECTOR_TYPE_LFM2A:

View file

@ -317,24 +317,37 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
const int64_t n_v_heads = hparams.ssm_dt_rank;
const int64_t key_dim = head_k_dim * n_k_heads;
const int64_t value_dim = head_v_dim * n_v_heads;
const int64_t head_ratio = n_v_heads / n_k_heads;
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) {
GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim);
return std::vector<int64_t>(2 + head_ratio, key_dim);
}
if (std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
return std::vector<int64_t>(head_ratio, key_dim);
}
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
return std::vector<int64_t>(head_ratio, n_k_heads);
}
if (std::regex_match(tensor_name, pattern_r_cache)) {
return std::vector<int64_t>(2 + head_ratio, key_dim * (hparams.ssm_d_conv - 1));
}
if (std::regex_match(tensor_name, pattern_s_cache)) {
return std::vector<int64_t>(head_ratio, n_k_heads * head_v_dim * head_v_dim);
// both Qwen 3 Next and Qwen 3.5 support n_v_heads > n_k_heads but the broadcasting pattern is different:
// - Qwen 3 Next: [k0_v0, k0_v1, k1_v2, k1_v3] (this is the default split pattern)
// - Qwen 3.5: [k0_v0, k1_v1, k0_v2, k1_v3] (needs segmenting of V on the scale of K to get the correct pattern)
if (ud->model->arch == LLM_ARCH_QWEN3NEXT) {
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) {
GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim);
return {key_dim, key_dim, value_dim};
}
} else {
const int64_t head_ratio = n_v_heads / n_k_heads;
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_ssm_conv1d)) {
GGML_ASSERT(tensor->ne[axis] == 2*key_dim + value_dim);
return std::vector<int64_t>(2 + head_ratio, key_dim);
}
if (std::regex_match(tensor_name, pattern_attn_gate_weight) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
return std::vector<int64_t>(head_ratio, key_dim);
}
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
return std::vector<int64_t>(head_ratio, n_k_heads);
}
if (std::regex_match(tensor_name, pattern_r_cache)) {
return std::vector<int64_t>(2 + head_ratio, key_dim * (hparams.ssm_d_conv - 1));
}
if (std::regex_match(tensor_name, pattern_s_cache)) {
return std::vector<int64_t>(head_ratio, n_k_heads * head_v_dim * head_v_dim);
}
}
// the FFN is the same for Qwen 3 Next and Qwen 3.5:
if (std::regex_match(tensor_name, pattern_ffn_gate_up_weight)) {
const int64_t n_ff_exp = hparams.n_ff_exp;
GGML_ASSERT(tensor->ne[axis] == 2*n_ff_exp);
@ -364,13 +377,16 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
const int64_t head_dim = hparams.ssm_d_state;
const int64_t granularity_qkv = std::lcm(blck_size, head_dim);
if (std::regex_match(tensor_name, pattern_qkv_weight) || std::regex_match(tensor_name, pattern_attn_gate_weight) ||
std::regex_match(tensor_name, pattern_ssm_conv1d) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
std::regex_match(tensor_name, pattern_ssm_conv1d) || std::regex_match(tensor_name, pattern_ssm_out_weight)) {
return std::vector<int64_t>(segments.size(), granularity_qkv);
}
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
if (std::regex_match(tensor_name, pattern_ssm_dt) || std::regex_match(tensor_name, pattern_ssm_a) ||
std::regex_match(tensor_name, pattern_ssm_alpha) || std::regex_match(tensor_name, pattern_ssm_beta)) {
return std::vector<int64_t>(segments.size(), granularity_qkv / head_dim);
}
if (std::regex_match(tensor_name, pattern_ssm_beta_alpha)) {
return std::vector<int64_t>(segments.size(), 2 * (granularity_qkv / head_dim));
}
if (std::regex_match(tensor_name, pattern_r_cache)) {
return std::vector<int64_t>(segments.size(), granularity_qkv * (hparams.ssm_d_conv - 1));
}
@ -415,7 +431,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str
// FFN
if (std::regex_match(tensor_name, pattern_ffn_up_gate_weight) || std::regex_match(tensor_name, pattern_ffn_up_gate_bias) ||
std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) {
std::regex_match(tensor_name, pattern_ffn_gate_up_weight) || std::regex_match(tensor_name, pattern_ffn_down_weight)) {
GGML_ASSERT(segments.size() <= 2);
return std::vector<int64_t>(segments.size(), blck_size);
}

View file

@ -355,7 +355,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(last_conv_states, "last_conv_states", il);
ggml_tensor * state_update_target =
ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs,
ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1],
kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
cb(state_update_target, "state_update_target", il);
@ -446,7 +446,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
// Update the recurrent states
ggml_build_forward_expand(gf,
ggml_cpy(ctx0, new_state,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]

View file

@ -259,6 +259,7 @@ enum projector_type {
PROJECTOR_TYPE_GLMA,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_MERALION,
PROJECTOR_TYPE_MUSIC_FLAMINGO,
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
@ -302,6 +303,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLMA, "glma"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
{ PROJECTOR_TYPE_MERALION, "meralion"},
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},

View file

@ -467,7 +467,8 @@ struct clip_model {
bool audio_has_stack_frames() const {
return proj_type == PROJECTOR_TYPE_ULTRAVOX
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|| proj_type == PROJECTOR_TYPE_MERALION;
}
};

View file

@ -945,6 +945,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MERALION:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
@ -1482,10 +1483,12 @@ struct clip_model_loader {
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MERALION:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
model.proj_type == PROJECTOR_TYPE_MERALION ||
model.proj_type == PROJECTOR_TYPE_GLMA;
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
hparams.ffn_op = FFN_GELU_ERF;
@ -2105,6 +2108,30 @@ struct clip_model_loader {
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
} break;
case PROJECTOR_TYPE_MERALION:
{
// Whisper encoder conv layers
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
// MERaLiON adaptor: 4 linear layers + ln_pre
// linear_0 = frame compression (19200->6400) + SiLU
// linear_1 = gate_proj (6400->6400) for GLU
// linear_2 = pool_proj (6400->6400) for GLU
// linear_3 = out_proj (6400->3584)
model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
// ln_speech (LayerNorm before adaptor)
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
} break;
case PROJECTOR_TYPE_QWEN2A:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@ -3297,6 +3324,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_MERALION:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
{
n_patches = img->nx;
@ -3786,6 +3814,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MERALION:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_PHI4:
@ -4148,6 +4177,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_MERALION:
return ctx->model.mm_3_w->ne[1]; // out_proj output dim
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
return ctx->model.mm_3_w->ne[1];
@ -4225,6 +4256,7 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_MERALION:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return true;
default:

View file

@ -95,6 +95,28 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
FFN_GELU_ERF,
-1);
} else if (proj_type == PROJECTOR_TYPE_MERALION) {
// stack (above) -> ln -> linear0+silu -> GLU -> out
cur = ggml_norm(ctx0, cur, hparams.eps);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
cur = ggml_mul_mat(ctx0, model.mm_0_w, cur);
cur = ggml_add(ctx0, cur, model.mm_0_b);
cur = ggml_silu(ctx0, cur);
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_1_w, cur);
gate = ggml_add(ctx0, gate, model.mm_1_b);
gate = ggml_silu(ctx0, gate);
ggml_tensor * pool = ggml_mul_mat(ctx0, model.mm_2_w, cur);
pool = ggml_add(ctx0, pool, model.mm_2_b);
cur = ggml_mul(ctx0, gate, pool);
cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
cur = ggml_add(ctx0, cur, model.mm_3_b);
} else if (proj_type == PROJECTOR_TYPE_GLMA) {
cur = ggml_norm(ctx0, cur, hparams.eps);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

View file

@ -476,6 +476,7 @@ struct mtmd_context {
} break;
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_MERALION:
{
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
} break;