mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
* mtmd: add chunks and fix preproc for qwen3a * add attn_mask * limit mtmd_chunk size (avoid blow up memory) * correct audio tokens * re-order the set_input case * remove attn_mask
88 lines
3.5 KiB
C++
88 lines
3.5 KiB
C++
#include "models.h"
|
|
|
|
ggml_cgraph * clip_graph_qwen3a::build() {
|
|
// Ref implementation: https://github.com/QwenLM/Qwen3-ASR/blob/main/qwen_asr/core/transformers_backend/modeling_qwen3_asr.py
|
|
|
|
// inp_raw: [n_frames, n_mel, 1] (nx=n_frames, ny=n_mel)
|
|
ggml_tensor * inp = build_inp_raw(1);
|
|
|
|
const int64_t n_frames = inp->ne[0]; // total frames, padded to multiple of chunk_size
|
|
const int64_t n_mel = inp->ne[1]; // 128
|
|
const int64_t chunk_size = 100; // n_window * 2 (n_window=50 from model config)
|
|
const int64_t n_chunks = n_frames / chunk_size;
|
|
|
|
GGML_ASSERT(n_frames % chunk_size == 0); // preprocessor should already pad the input
|
|
GGML_ASSERT(inp->type == GGML_TYPE_F32);
|
|
|
|
// View mel spectrogram as batched 100-frame chunks: [chunk_size, n_mel, 1, n_chunks]
|
|
inp = ggml_view_4d(ctx0, inp,
|
|
chunk_size, n_mel, 1, n_chunks,
|
|
n_frames * (int64_t)sizeof(float), // nb[1]: stride over mel bins
|
|
chunk_size * (int64_t)sizeof(float), // nb[2]: stride for C=1 (unused)
|
|
chunk_size * (int64_t)sizeof(float), // nb[3]: stride over chunks
|
|
0);
|
|
inp = ggml_cont(ctx0, inp);
|
|
cb(inp, "inp_chunks", -1);
|
|
|
|
// 3 x conv2d + gelu
|
|
{
|
|
// conv output [OW, OH, C_out, n_chunks]
|
|
auto conv_block = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
|
|
x = ggml_conv_2d(ctx0, w, x, 2, 2, 1, 1, 1, 1);
|
|
if (b) {
|
|
x = ggml_add(ctx0, x, ggml_reshape_4d(ctx0, b, 1, 1, x->ne[2], 1));
|
|
}
|
|
return ggml_gelu_erf(ctx0, x);
|
|
};
|
|
|
|
inp = conv_block(inp, model.conv2d_1_w, model.conv2d_1_b);
|
|
inp = conv_block(inp, model.conv2d_2_w, model.conv2d_2_b);
|
|
inp = conv_block(inp, model.conv2d_3_w, model.conv2d_3_b);
|
|
// inp: [OW=13, OH=16, OC=480, n_chunks]
|
|
cb(inp, "after_conv_blocks", -1);
|
|
}
|
|
|
|
// permute [OW=25, OH=16, OC=480, n_chunks] -> [OH=16, OC=480, OW=25, n_chunks]
|
|
// reshape to [OH*OC=7680, OW*n_chunks]
|
|
// feature index h+16*c = c*16+f (matches python code)
|
|
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 2, 0, 1, 3));
|
|
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2] * inp->ne[3]);
|
|
|
|
// Project to d_model: [d_model, 25*n_chunks]
|
|
inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
|
|
if (model.conv_out_b) {
|
|
inp = ggml_add(ctx0, inp, model.conv_out_b);
|
|
}
|
|
cb(inp, "after_conv_out", -1);
|
|
|
|
const int64_t n_pos = inp->ne[1]; // 25 * n_chunks
|
|
|
|
// Per-chunk positional embeddings: repeat pos[0:13] for each chunk
|
|
// (position indices reset 0..12 per chunk, not sequential across chunks)
|
|
{
|
|
const int64_t tokens_per_chunk = n_pos / n_chunks; // 13
|
|
ggml_tensor * pos_tmp = ggml_view_2d(ctx0, model.position_embeddings,
|
|
model.position_embeddings->ne[0], tokens_per_chunk,
|
|
model.position_embeddings->nb[1], 0);
|
|
ggml_tensor * tgt = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,
|
|
model.position_embeddings->ne[0], n_pos);
|
|
inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, pos_tmp, tgt));
|
|
}
|
|
|
|
ggml_tensor * cur = build_vit(inp, n_pos,
|
|
NORM_TYPE_NORMAL, hparams.ffn_op,
|
|
nullptr, // pos embd already added above
|
|
nullptr);
|
|
cb(cur, "after_transformer", -1);
|
|
|
|
// MLP projector
|
|
cur = build_ffn(cur,
|
|
model.mm_1_w, model.mm_1_b,
|
|
nullptr, nullptr,
|
|
model.mm_2_w, model.mm_2_b,
|
|
FFN_GELU_ERF, -1);
|
|
cb(cur, "projected", -1);
|
|
|
|
ggml_build_forward_expand(gf, cur);
|
|
return gf;
|
|
}
|