mtmd: qwen3 audio support (qwen3-omni and qwen3-asr) (#19441)

* add qwen3a

* wip

* vision ok

* no more deepstack for audio

* convert ASR model ok

* qwen3 asr working

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* nits

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix bad merge

* fix multi inheritance

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Xuan-Son Nguyen 2026-04-12 23:57:25 +02:00 committed by GitHub
parent 1e9d771e2c
commit 21a4933042
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 301 additions and 21 deletions

View file

@@ -455,6 +455,7 @@ struct mtmd_context {
// set preprocessor
switch (proj) {
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_QWEN3A:
case PROJECTOR_TYPE_QWEN25O:
{
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
@@ -1027,6 +1028,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
}
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
if (ctx->ctx_v == nullptr && ctx->proj_type_a() == PROJECTOR_TYPE_QWEN3A) {
// qwen3-asr
return true;
}
switch (ctx->proj_type_v()) {
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL: