From 345de3cd870ef363398f3c3cffb4f34b398770ff Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Wed, 25 Mar 2026 09:36:27 -0700 Subject: [PATCH 01/11] Use docker in build-android.yml (#20928) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * use docker instead of SDK separately * fix whitespaces * Update .github/workflows/build-android.yml Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Max Krasnyansky Co-authored-by: Sigbjørn Skjæret --- .github/workflows/build-android.yml | 83 ++++++++--------------------- 1 file changed, 22 insertions(+), 61 deletions(-) diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml index cd9d99ffa..8dc264681 100644 --- a/.github/workflows/build-android.yml +++ b/.github/workflows/build-android.yml @@ -40,13 +40,9 @@ jobs: steps: - name: Clone uses: actions/checkout@v6 - - # Disabled due to size (400MB) and always 0 cache hits - # - name: ccache - # uses: ggml-org/ccache-action@v1.2.16 - # with: - # key: android-build - # evict-old-files: 1d + with: + fetch-depth: 0 + lfs: false - name: Set up JDK uses: actions/setup-java@v5 @@ -66,10 +62,11 @@ jobs: android-ndk: runs-on: ubuntu-latest - - env: - OPENCL_VERSION: 2025.07.22 - + container: + image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3' + defaults: + run: + shell: bash strategy: matrix: include: @@ -82,59 +79,23 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v6 + with: + fetch-depth: 0 + lfs: false - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.build == 'arm64-snapdragon' }} - run: | - mkdir opencl - curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz - curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz - curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz - tar -xaf opencl/headers.tar.gz -C opencl - tar -xaf opencl/clhpp.tar.gz -C opencl - tar -xaf opencl/icd-loader.tar.gz -C opencl - sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include - sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL - cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION} - cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared - cmake --build build - sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android - rm -rf opencl - - - name: Install Hexagon SDK - id: install_hexsdk - if: ${{ matrix.build == 'arm64-snapdragon' }} - env: - HEXSDK_VER: 6.4.0.2 - HEXTLS_VER: 19.0.04 - run: | - curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz - mkdir hex-sdk - tar -xaf hex-sdk.tar.gz -C hex-sdk - ls -l hex-sdk - sudo mv hex-sdk /opt/hexagon - echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV" - echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV" - echo 
"DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV" - echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV" - echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV" - echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV" - - - name: Update CMake presets - id: update_presets - if: ${{ matrix.build == 'arm64-snapdragon' }} - run: | - cp docs/backend/snapdragon/CMakeUserPresets.json . - - - name: Build - id: ndk_build + - name: Build Llama.CPP for Hexagon Android + id: build_llama_cpp_hexagon_android run: | + if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then + cp docs/backend/snapdragon/CMakeUserPresets.json . + fi cmake ${{ matrix.defines }} -B build cmake --build build cmake --install build --prefix pkg-adb/llama.cpp - - name: Test - id: cmake_test - run: | - echo "FIXME: test on devices" + - name: Upload Llama.CPP Hexagon Android Build Artifact + if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }} + uses: actions/upload-artifact@v6 + with: + name: llama-cpp-android-${{ matrix.build }} + path: pkg-adb/llama.cpp From 1922f87c2f753d8f6630050a7ce8292994c27356 Mon Sep 17 00:00:00 2001 From: Aparna M P Date: Wed, 25 Mar 2026 22:13:12 +0530 Subject: [PATCH 02/11] snapdragon: add missing features to WoS scripts to achieve parity with ADB scripts (#20884) * Add missing features to WoS scripts to achieve parity with ADB scripts * Fix line-ending in run-mtmd.ps1 Signed-off-by: Max Krasnyansky --------- Signed-off-by: Max Krasnyansky Co-authored-by: Max Krasnyansky --- scripts/snapdragon/windows/run-bench.ps1 | 12 +++ scripts/snapdragon/windows/run-cli.ps1 | 6 +- scripts/snapdragon/windows/run-completion.ps1 | 6 +- scripts/snapdragon/windows/run-mtmd.ps1 | 74 +++++++++++++++++++ scripts/snapdragon/windows/run-tool.ps1 | 4 + 5 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 scripts/snapdragon/windows/run-mtmd.ps1 diff --git a/scripts/snapdragon/windows/run-bench.ps1 b/scripts/snapdragon/windows/run-bench.ps1 index 21fd063eb..1957028d1 100644 --- a/scripts/snapdragon/windows/run-bench.ps1 +++ b/scripts/snapdragon/windows/run-bench.ps1 @@ -20,6 +20,14 @@ if ($null -ne $env:V) { $env:GGML_HEXAGON_VERBOSE=$env:V } +if ($null -ne $env:E) { + $env:GGML_HEXAGON_EXPERIMENTAL=$env:E +} + +if ($null -ne $env:PROF) { + $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 +} + if ($null -ne $env:OPMASK) { $env:GGML_HEXAGON_OPMASK=$env:OPMASK } @@ -32,6 +40,10 @@ if ($null -ne $env:NDEV) { $env:GGML_HEXAGON_NDEV=$env:NDEV } +if ($null -ne $env:HB) { + $env:GGML_HEXAGON_HOSTBUF=$env:HB +} + $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\llama-bench.exe" ` diff --git a/scripts/snapdragon/windows/run-cli.ps1 b/scripts/snapdragon/windows/run-cli.ps1 index 5891c894a..668c525f5 100644 --- a/scripts/snapdragon/windows/run-cli.ps1 +++ b/scripts/snapdragon/windows/run-cli.ps1 @@ -44,10 +44,14 @@ if ($null -ne $env:NDEV) { $env:GGML_HEXAGON_NDEV=$env:NDEV } +if ($null -ne $env:HB) { + $env:GGML_HEXAGON_HOSTBUF=$env:HB +} + $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\llama-cli.exe" ` --no-mmap -m $basedir\..\..\gguf\$model ` --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` - --ctx-size 8192 --ubatch-size 128 -fa on ` + --ctx-size 8192 --ubatch-size 256 -fa on ` -ngl 99 --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-completion.ps1 b/scripts/snapdragon/windows/run-completion.ps1 index 8a48d2d74..1221330f2 100644 --- a/scripts/snapdragon/windows/run-completion.ps1 +++ b/scripts/snapdragon/windows/run-completion.ps1 @@ -44,10 +44,14 @@ if ($null -ne 
$env:NDEV) { $env:GGML_HEXAGON_NDEV=$env:NDEV } +if ($null -ne $env:HB) { + $env:GGML_HEXAGON_HOSTBUF=$env:HB +} + $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\llama-completion.exe" ` --no-mmap -m $basedir\..\..\gguf\$model ` --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` - --ctx-size 8192 --batch-size 128 -fa on ` + --ctx-size 8192 --batch-size 256 -fa on ` -ngl 99 -no-cnv --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-mtmd.ps1 b/scripts/snapdragon/windows/run-mtmd.ps1 new file mode 100644 index 000000000..f47d942f5 --- /dev/null +++ b/scripts/snapdragon/windows/run-mtmd.ps1 @@ -0,0 +1,74 @@ +#!/usr/bin/env pwsh + +# Basedir on device +$basedir=".\pkg-snapdragon" + +$cli_opts=$args + +$model="gemma-3-4b-it-Q4_0.gguf" +if ($null -ne $env:M) { + $model=$env:M +} + +$mmproj="mmproj-F16.gguf" +if ($null -ne $env:MMPROJ) { + $mmproj=$env:MMPROJ +} + +$image="" +if ($null -ne $env:IMG) { + $image=$env:IMG +} + +$device="HTP0" +if ($null -ne $env:D) { + $device=$env:D +} + +if ($null -ne $env:V) { + $env:GGML_HEXAGON_VERBOSE=$env:V +} + +# Default experimental to 1 +$env:GGML_HEXAGON_EXPERIMENTAL=1 +if ($null -ne $env:E) { + $env:GGML_HEXAGON_EXPERIMENTAL=$env:E +} + +if ($null -ne $env:SCHED) { + $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v" +} + +if ($null -ne $env:PROF) { + $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 +} + +if ($null -ne $env:OPMASK) { + $env:GGML_HEXAGON_OPMASK=$env:OPMASK +} + +if ($null -ne $env:NHVX) { + $env:GGML_HEXAGON_NHVX=$env:NHVX +} + +if ($null -ne $env:NDEV) { + $env:GGML_HEXAGON_NDEV=$env:NDEV +} + +if ($null -ne $env:HB) { + $env:GGML_HEXAGON_HOSTBUF=$env:HB +} + +if ($null -ne $env:MTMD_DEVICE) { + $env:MTMD_BACKEND_DEVICE=$env:MTMD_DEVICE +} + +$env:ADSP_LIBRARY_PATH="$basedir\lib" + +& "$basedir\bin\llama-mtmd-cli.exe" ` + --no-mmap -m $basedir\..\..\gguf\$model ` + --mmproj $basedir\..\..\gguf\$mmproj ` + --image $basedir\..\..\gguf\$image ` + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` + --ctx-size 8192 --ubatch-size 256 -fa on ` + -ngl 99 --device $device -v $cli_opts diff --git a/scripts/snapdragon/windows/run-tool.ps1 b/scripts/snapdragon/windows/run-tool.ps1 index 70094af9b..78ccd5b21 100644 --- a/scripts/snapdragon/windows/run-tool.ps1 +++ b/scripts/snapdragon/windows/run-tool.ps1 @@ -50,6 +50,10 @@ if ($null -ne $env:NDEV) { $env:GGML_HEXAGON_NDEV=$env:NDEV } +if ($null -ne $env:HB) { + $env:GGML_HEXAGON_HOSTBUF=$env:HB +} + $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\$tool" ` From 44c51e526b1f51dd9906cb49858eac9722829a0c Mon Sep 17 00:00:00 2001 From: Dowon Date: Thu, 26 Mar 2026 02:12:38 +0900 Subject: [PATCH 03/11] model : allow causal_attn and pooling_type on all architectures (#20973) * models : allow causal_attn and pooling_type on all architectures * fix: move location --- src/llama-model.cpp | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1abebf3d7..7a5544abf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -370,6 +370,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); 
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); @@ -748,8 +750,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_BERT: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); switch (hparams.n_layer) { case 3: @@ -781,8 +781,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); switch (hparams.n_layer) { case 12: @@ -797,8 +795,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_JINA_BERT_V2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); hparams.f_max_alibi_bias = 8.0f; switch (hparams.n_layer) { @@ -810,8 +806,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_JINA_BERT_V3: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); switch (hparams.n_layer) { case 24: @@ -823,8 +817,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_NOMIC_BERT_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { @@ -838,8 +830,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_NEO_BERT: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); if (hparams.n_layer == 28) { type = LLM_TYPE_250M; @@ -848,8 +838,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_EUROBERT: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); if (hparams.n_layer == 12) { type = LLM_TYPE_SMALL; // 0.2B @@ -913,7 +901,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { // fall through case LLM_ARCH_QWEN2: { - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break; @@ -995,7 +982,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_QWEN3: { - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 28: type = hparams.n_embd == 1024 ? 
LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; @@ -1287,7 +1273,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); //applied only if model converted with --sentence-transformers-dense-modules ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false); @@ -2084,7 +2069,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); } break; case LLM_ARCH_BAILINGMOE: { From 80322ebdaf8ab41fe3e66a17d7a4ec9360c54eac Mon Sep 17 00:00:00 2001 From: Saba Fallah <10401143+sfallah@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:33:42 +0100 Subject: [PATCH 04/11] model: codefuse-ai/F2LLM-v2 support --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + src/llama-vocab.cpp | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 06365bb49..cb255763d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1503,6 +1503,9 @@ class TextModel(ModelBase): if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 res = "kanana2" + if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015": + # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B + res = "f2llmv2" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index b31ddcca7..1e8b29fb2 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -154,6 +154,7 @@ models = [ {"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", }, {"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", }, {"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", }, + {"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 13934339d..e5e414172 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1952,7 +1952,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else if ( tokenizer_pre == "qwen2" || tokenizer_pre == "deepseek-r1-qwen" || - tokenizer_pre == "kormo") { + tokenizer_pre == "kormo" || + tokenizer_pre == "f2llmv2") { pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; clean_spaces = false; } else if ( From ec54ac13a8e3977b3a5cfe75ee841226049ee1aa Mon Sep 17 00:00:00 2001 From: uvos Date: Wed, 25 Mar 2026 19:00:37 +0100 Subject: [PATCH 05/11] ci : fix parsing of vgpr counts in hip-quality-check (#20987) * scripts: hip: gcn-cdna-vgpr-check: fix parsing of vgpr counts when an amdclang Remark block is interlieved with another from a different process * Return warning ignore * obay pep8 inline double space before inline commets * add # noqa: NP100 for 
other prints too * Add script changes to cause autotrigger --- .github/workflows/hip-quality-check.yml | 6 ++- scripts/hip/gcn-cdna-vgpr-check.py | 67 ++++++++++++++++--------- 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/.github/workflows/hip-quality-check.yml b/.github/workflows/hip-quality-check.yml index 04ae96d64..474c0ad41 100644 --- a/.github/workflows/hip-quality-check.yml +++ b/.github/workflows/hip-quality-check.yml @@ -8,7 +8,8 @@ on: paths: [ '.github/workflows/hip-quality-check.yml', '**/*.cu', - '**/*.cuh' + '**/*.cuh', + 'scripts/hip/gcn-cdna-vgpr-check.py' ] pull_request: @@ -16,7 +17,8 @@ on: paths: [ '.github/workflows/hip-quality-check.yml', '**/*.cu', - '**/*.cuh' + '**/*.cuh', + 'scripts/hip/gcn-cdna-vgpr-check.py' ] concurrency: diff --git a/scripts/hip/gcn-cdna-vgpr-check.py b/scripts/hip/gcn-cdna-vgpr-check.py index 934728d4a..38db47d3d 100644 --- a/scripts/hip/gcn-cdna-vgpr-check.py +++ b/scripts/hip/gcn-cdna-vgpr-check.py @@ -2,37 +2,51 @@ import sys from collections import defaultdict +import re def parse_log_file(filepath): - """Parse log file and extract function VGPR usage.""" - import re - functions = defaultdict(lambda: {'vgprs': 0, 'spill': 0, 'location': ''}) + func_stack = [] try: with open(filepath, 'r') as f: - content = f.read() - # Find all function entries with VGPR usage including location - pattern = r'([^:]+:\d+):.*?Function Name: (\S+).*?VGPRs: (\d+).*?VGPRs Spill: (\d+)' - matches = re.findall(pattern, content, re.DOTALL) + for line in f: + # Match function name lines + func_match = re.search(r'remark: ([^:]+):(\d+):\d+: Function Name: (\S+)', line) + if func_match: + location = func_match.group(1) + ':' + func_match.group(2) + func_name = func_match.group(3) + # Extract just the filename and line number + parts = location.split('/') + short_location = parts[-1] if len(parts) > 0 else location + functions[func_name]['location'] = short_location + # Push function onto stack with its location + func_stack.append({'name': func_name, 'location': location}) + continue - for location, func_name, vgprs, spill in matches: - functions[func_name]['vgprs'] = int(vgprs) - functions[func_name]['spill'] = int(spill) - # Extract just the filename and line number - parts = location.split('/') - if len(parts) > 0: - short_location = parts[-1] # Get last part (filename) - # Check if there's a line number after filename - if ':' in short_location: - functions[func_name]['location'] = short_location - else: - functions[func_name]['location'] = location - else: - functions[func_name]['location'] = location + # Match VGPR usage lines (only if we have functions in stack) + vgpr_match = re.search(r'remark: ([^:]+):(\d+):\d+:\s+VGPRs: (\d+)', line) + if vgpr_match: + location = vgpr_match.group(1) + ':' + vgpr_match.group(2) + # Find the most recent function with matching location + for i in range(len(func_stack) - 1, -1, -1): + if func_stack[i]['location'] == location: + functions[func_stack[i]['name']]['vgprs'] = int(vgpr_match.group(3)) + break + continue + + spill_match = re.search(r'remark: ([^:]+):(\d+):\d+:\s+VGPRs Spill: (\d+)', line) + if spill_match: + location = spill_match.group(1) + ':' + spill_match.group(2) + # Find the most recent function with matching location + for i in range(len(func_stack) - 1, -1, -1): + if func_stack[i]['location'] == location: + functions[func_stack[i]['name']]['spill'] = int(spill_match.group(3)) + break + continue except FileNotFoundError: - print(f"Error: File {filepath} not found", file=sys.stderr) # noqa: 
NP100 + print(f"Error: File {filepath} not found", file=sys.stderr) # noqa: NP100 sys.exit(1) return functions @@ -40,7 +54,7 @@ def parse_log_file(filepath): def main(): if len(sys.argv) < 2: - print("Usage: ./vgpr_check.py ", file=sys.stderr) # noqa: NP100 + print("Usage: ./vgpr_check.py ", file=sys.stderr) # noqa: NP100 sys.exit(1) log_file = sys.argv[1] @@ -123,6 +137,9 @@ def main(): '_ZL18flash_attn_ext_f16ILi128ELi128ELi32ELi2ELb1ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil', '_ZL18flash_attn_ext_f16ILi128ELi128ELi4ELi8ELb1ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil', '_ZL18flash_attn_ext_f16ILi96ELi96ELi4ELi8ELb0ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil', + '_ZL18flash_attn_ext_vecILi128ELi2EL9ggml_type2ELS0_2ELb0EEvPKcS2_S2_S2_S2_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS6_IjLj3EEiiiiiiiiiiiliiliiiiil', + '_ZL9mul_mat_qIL9ggml_type10ELi16ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii', + '_ZL9mul_mat_qIL9ggml_type12ELi128ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii' } functions = parse_log_file(log_file) @@ -134,7 +151,7 @@ def main(): total_vgprs = int(data['vgprs']) + int(data['spill']) if total_vgprs > 256 and func_name in ignored and func_name not in printed_ignored: location = data.get('location', log_file) - print(f"{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) [IGNORED]") # noqa: NP100 + print(f"{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) [IGNORED]") # noqa: NP100 printed_ignored.add(func_name) # Then print new functions with issues in red @@ -146,7 +163,7 @@ def main(): # Print in red if not ignored color_code = "\033[91m" if func_name not in ignored else "" reset_code = "\033[0m" if func_name not in ignored else "" - print(f"{color_code}{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) {status}{reset_code}") # noqa: NP100 + print(f"{color_code}{location}: {func_name} - Total VGPRs: {total_vgprs} ({data['vgprs']} + {data['spill']}) {status}{reset_code}") # noqa: NP100 if func_name not in ignored: found_issues = True From f2c72b8f1f39ee189b8f4c161532080dbe8608f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 25 Mar 2026 19:18:06 +0100 Subject: [PATCH 06/11] common : fix gguf selection in common_list_cached_models (#20996) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- common/download.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/download.cpp b/common/download.cpp index b0abbba8f..fa2e6fb28 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -454,7 +454,9 @@ static gguf_split_info get_gguf_split_info(const std::string & path) { std::smatch m; std::string prefix = path; - string_remove_suffix(prefix, ".gguf"); + if (!string_remove_suffix(prefix, ".gguf")) { + return {}; + } int index = 1; int count = 1; From 056b50c319724ef4788da46c605673b94dc2374e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 25 Mar 2026 19:41:01 +0100 Subject: [PATCH 07/11] common : fix verbosity setup (#20989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verbosity threshold was set at the end of common_params_parse_ex(), after doing many things (like downloading files..) 
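
For illustration, a minimal sketch of the ordering problem (assuming the
surrounding code in common/arg.cpp; bodies elided, not a verbatim excerpt):

    // before: the threshold was only applied at the very end of parsing,
    // so everything above it (including downloads) logged at the default level
    static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
        common_params & params = ctx_arg.params;
        // ... option handling, file downloads, etc. ...
        common_log_set_verbosity_thold(params.verbosity); // too late for earlier output
        return true;
    }

    // after: apply the current threshold up front; the -v / --verbosity
    // handlers then update it immediately when those options are parsed
    static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
        common_params & params = ctx_arg.params;
        common_log_set_verbosity_thold(params.verbosity);
        // ... the rest of parsing now honors the requested verbosity ...
        return true;
    }
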
Signed-off-by: Adrien Gallouët --- common/arg.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 71f63ba63..0096e7a68 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -423,6 +423,9 @@ static bool parse_bool_value(const std::string & value) { static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { common_params & params = ctx_arg.params; + // setup log directly from params.verbosity: see tools/cli/cli.cpp + common_log_set_verbosity_thold(params.verbosity); + std::unordered_map> arg_to_options; for (auto & opt : ctx_arg.options) { for (const auto & arg : opt.args) { @@ -631,8 +634,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context )); } - common_log_set_verbosity_thold(params.verbosity); - return true; } @@ -3244,6 +3245,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", [](common_params & params) { params.verbosity = INT_MAX; + common_log_set_verbosity_thold(INT_MAX); } )); add_opt(common_arg( @@ -3264,6 +3266,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "(default: %d)\n", params.verbosity), [](common_params & params, int value) { params.verbosity = value; + common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); add_opt(common_arg( From a970515bdb0b1d09519106847660b0d0c84d2472 Mon Sep 17 00:00:00 2001 From: Saba Fallah <10401143+sfallah@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:57:40 +0100 Subject: [PATCH 08/11] mtmd: Add DeepSeekOCR Support (#17400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * mtmd: llama.cpp DeepSeekOCR support init commit * loading sam tensors * mtmd: fix vision model processing * deepseek-ocr clip-vit model impl * mtmd: add DeepSeek-OCR LM support with standard attention * mtmd: successfully runs DeepSeek-OCR LM in llama-cli * mtmd: Fix RoPE type for DeepSeek-OCR LM. * loading LM testing Vision model loading * sam warmup working * sam erroneous return corrected * clip-vit: corrected cls_embd concat * clip-vit: model convert qkv_proj split * corrected combining of image encoders' results * fix: update callback for ffn_moe_weighted and add callback for attn_out in deepseek2 model * concat image_newline and image_seperator tokens * visual_model warmup (technically) works * window partitioning using standard ggml ops * sam implementation without using CPU only ops * clip: fixed warnings * Merge branch 'sf/deepseek-ocr' of github.com:sfallah/llama.cpp into sf/deepseek-ocr * mtmd: fix get_rel_pos * mtmd: fixed the wrong scaler for get_rel_pos * image encoding technically works but the output can't be checked singe image decoding fails * mtmd: minor changed * mtmd: add native resolution support * - image encoding debugged - issues fixed mainly related wrong config like n_patches etc. 
- configs need to be corrected in the converter * mtmd: correct token order * - dynamic resizing - changes are concerning PR https://github.com/sfallah/llama.cpp/pull/4 * mtmd: quick fix token order * mtmd: fix danling pointer * mtmd: SAM numerically works * mtmd: debug CLIP-L (vit_pre_ln) * mtmd: debug CLIP-L & first working DeepSeek-OCR model * mtmd : add --dsocr-mode CLI argument for DeepSeek-OCR resolution control & all native resolution modes work * mtmd: simplify SAM patch embedding * mtmd: adapt Pillow image resizing function * mtmd: simplify DeepSeek-OCR dynamic resolution preprocessing * mtmd: remove --dsocr-mode argument * mtmd: refactor code & remove unused helper functions * mtmd: fix tensor names for image newlines and view separator * clean up * reverting automatically removed spaces * reverting automatically removed spaces * mtmd: fixed bad ocr check in Deepseek2 (LM) * mtmd: support combined QKV projection in buid_vit * using common build_attn in sam * corrected code-branch when flash-attn disabled enabling usage of --flash-attn option * mtmd: minor fix * minor formatting and style * fixed flake8 lint issues * minor editorconfig-check fixes * minor editorconfig-check fixes * mtmd: simplify get_rel_pos * mtmd: make sam hparams configurable * mtmd: add detailed comments for resize_bicubic_pillow * mtmd: fixed wrong input setting * mtmd: convert model in FP16 * mtmd: minor fix * mtmd: remove tweak to llama-mtmd-cli & deepseek-ocr template * fix: test-1.jpg ORC issue with small (640) resolution setting min-resolution base (1024) max large (1280) for dynamic-resolution * minor: editconfig-check fix * merge with changes from https://github.com/ggml-org/llama.cpp/pull/17909 added new opt to tests.sh to disable flash-attn * minor: editconfig-check fix * testing deepseek-ocr quick and dirty test script comparing results of Qwen2.5-VL vs DeepSeek-OCR * quick and (potential) dirty merge with https://github.com/ggml-org/llama.cpp/pull/17909 * refactoring, one single builder function and static helpers * added deepseek-ocr test to tests.sh * minor formatting fixes * check with fixed expected resutls * minor formatting * editorconfig-check fix * merge with changes from https://github.com/ggml-org/llama.cpp/pull/18042 * minor - added GLM-4.6V to big tests - added missing deps for python test * convert: minor fix * mtmd: format code * convert: quick fix * convert: quick fix * minor python formatting * fixed merge build issue * merge resolved - fixed issues in convert - tested several deepseek models * minor fix * minor * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * - removed clip_is_deepseekocr - removed redundant RESIZE_ALGO_BICUBIC_PILLOW resize-algo - simplified image-preprocessing - removed/simplified debug functions * - cleaning commented out code * fixing instabilities issues reintroducing resize_bicubic_pillow * - use f16 model for deepseek-ocr test - ignore llama-arch test for deepseek-ocr * rename fc_w --> mm_fc_w * add links to OCR discussion * cleaner loading code * add missing .weight to some tensors * add default jinja template (to be used by server) * move test model to ggml-org * rolling back upscale change * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: bluebread Co-authored-by: Sigbjørn Skjæret Co-authored-by: Xuan Son Nguyen Co-authored-by: Xuan-Son Nguyen --- convert_hf_to_gguf.py | 113 +++++- docs/multimodal.md | 7 + ggml/src/ggml.c | 2 + gguf-py/gguf/constants.py | 93 +++++ gguf-py/gguf/gguf_writer.py | 9 + 
gguf-py/gguf/tensor_mapping.py | 75 +++- src/llama-arch.cpp | 4 + src/llama-arch.h | 1 + src/llama-chat.cpp | 6 + src/llama-chat.h | 1 + src/llama-graph.cpp | 2 +- src/llama-kv-cache.cpp | 1 - src/llama-model.cpp | 78 ++++- src/llama-quant.cpp | 5 +- src/llama-vocab.cpp | 1 + src/models/deepseek2.cpp | 36 +- tests/test-llama-archs.cpp | 3 + tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-impl.h | 24 +- tools/mtmd/clip-model.h | 43 ++- tools/mtmd/clip.cpp | 438 +++++++++++++++++++++++- tools/mtmd/models/deepseekocr.cpp | 324 ++++++++++++++++++ tools/mtmd/models/glm4v.cpp | 2 +- tools/mtmd/models/models.h | 5 + tools/mtmd/models/siglip.cpp | 2 +- tools/mtmd/tests.sh | 2 + tools/mtmd/tests/test-1-extracted.md | 85 +++++ tools/mtmd/tests/test-1-extracted.txt | 42 +++ tools/mtmd/tests/test-deepseek-ocr.py | 186 ++++++++++ tools/mtmd/tests/tests-requirements.txt | 5 + 30 files changed, 1569 insertions(+), 27 deletions(-) create mode 100644 tools/mtmd/models/deepseekocr.cpp create mode 100644 tools/mtmd/tests/test-1-extracted.md create mode 100644 tools/mtmd/tests/test-1-extracted.txt create mode 100644 tools/mtmd/tests/test-deepseek-ocr.py create mode 100644 tools/mtmd/tests/tests-requirements.txt diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cb255763d..f377738f8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -947,6 +947,9 @@ class ModelBase: if "thinker_config" in config: # rename for Qwen2.5-Omni config["text_config"] = config["thinker_config"]["text_config"] + if "language_config" in config: + # rename for DeepSeekOCR + config["text_config"] = config["language_config"] if "lfm" in config: # rename for LFM2-Audio config["text_config"] = config["lfm"] @@ -2074,7 +2077,7 @@ class MmprojModel(ModelBase): preprocessor_config: dict[str, Any] global_config: dict[str, Any] - n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"] + n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"] has_vision_encoder: bool = True # by default has_audio_encoder: bool = False @@ -6938,6 +6941,68 @@ class ConformerAudioModel(MmprojModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("DeepseekOCRForCausalLM") +class DeepseekOCRVisionModel(MmprojModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + # calculate proj_scale_factor (used by tinygemma3 test model) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + n_per_side = int(image_seq_length ** 0.5) + image_size = self.hparams["image_size"] + patch_size = self.hparams["patch_size"] + proj_scale_factor = (image_size // patch_size) // n_per_side + if proj_scale_factor > 0 and proj_scale_factor != 4: + # we only need to write this if it's not the default value + # in this case, we are converting a test model + self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) + # @bluebread: there's no window_size in config but just add it here anyway + self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14)) + + # SAM configuration + sam_hparams = hparams['sam'] + 
self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers']) + self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width']) + self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads']) + + def get_vision_config(self) -> dict[str, Any]: + vision_config: dict[str, Any] | None = self.global_config.get("vision_config") + + if not vision_config: + raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found") + + vision_config['sam'] = vision_config['width']['sam_vit_b'] + vision_config.update(vision_config['width']['clip-l-14-224']) + vision_config['hidden_size'] = vision_config['width'] + vision_config['num_heads'] = vision_config['heads'] + vision_config['intermediate_size'] = vision_config['heads'] * 4 + + return vision_config + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".embeddings." in name or 'pos_embed' in name: + return gguf.GGMLQuantizationType.F32 + if ".rel_pos_h" in name or '.rel_pos_w' in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Only process vision-related tensors, skip language model tensors + # Vision components: sam_model, vision_model, projector, image_newline, view_seperator + # Language model components to skip: lm_head, embed_tokens, layers, norm + if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")): + return + + if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"): + name += ".weight" + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Gemma3nForConditionalGeneration") class Gemma3nVisionAudioModel(ConformerAudioModel): has_audio_encoder = True @@ -8283,6 +8348,19 @@ class DeepseekV2Model(TextModel): merge_expert = True + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + # special handling for Deepseek OCR + if self.origin_hf_arch == "DeepseekOCRForCausalLM": + self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + # default jinja template + self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}") + def set_vocab(self): try: self._set_vocab_gpt2() @@ -8338,9 +8416,15 @@ class DeepseekV2Model(TextModel): raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") def set_gguf_parameters(self): + is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR) - # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) - self.hparams["num_key_value_heads"] = 1 + if is_ocr: + self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0) + else: + # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + + self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6) super().set_gguf_parameters() hparams = self.hparams @@ -8354,16 +8438,18 @@ class DeepseekV2Model(TextModel): # Default: if no MoE, all layers are dense; if MoE, none are dense first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0 self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) + kv_lora_rank = 
hparams.get("kv_lora_rank", 512) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) + if not is_ocr: + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(kv_lora_rank) + self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) # MoE parameters (required by C++ code for DEEPSEEK2 arch) # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length @@ -8395,8 +8481,15 @@ class DeepseekV2Model(TextModel): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5 - if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name: + # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5, and DeepSeek-OCR + if ("vision_tower" in name + or "multi_modal_projector" in name + or "mm_projector" in name + or "vision_model" in name + or "image_newline" in name + or "model.projector" in name + or "sam_model" in name + or "view_seperator" in name): return if name.startswith("siglip2.") or name.startswith("merger."): return diff --git a/docs/multimodal.md b/docs/multimodal.md index e2e12d07d..f2fc1510c 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -31,6 +31,13 @@ llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.g llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload ``` +> [!IMPORTANT] +> +> OCR models are trained with specific prompt and input structure, please refer to these discussions for more info: +> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825 +> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677 +> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400 + ## Pre-quantized models These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. 
They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 4c0764a0a..e9b6720c0 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4962,6 +4962,7 @@ static struct ggml_tensor * ggml_interpolate_impl( GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT); // TODO: implement antialias for modes other than bilinear GGML_ASSERT(!(mode & GGML_SCALE_FLAG_ANTIALIAS) || (mode & 0xFF) == GGML_SCALE_MODE_BILINEAR); + GGML_ASSERT(a->type == GGML_TYPE_F32); struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); @@ -5307,6 +5308,7 @@ struct ggml_tensor * ggml_flash_attn_ext( GGML_ASSERT(q->ne[3] == v->ne[3]); if (mask) { + GGML_ASSERT(mask->type == GGML_TYPE_F16); GGML_ASSERT(ggml_is_contiguous(mask)); //GGML_ASSERT(ggml_can_repeat_rows(mask, qk)); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 9383644ab..b35c976e8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -326,6 +326,11 @@ class Keys: class Projector: SCALE_FACTOR = "clip.vision.projector.scale_factor" + class SAM: + BLOCK_COUNT = "clip.vision.sam.block_count" + EMBEDDING_LENGTH = "clip.vision.sam.embedding_length" + HEAD_COUNT = "clip.vision.sam.head_count" + class ClipAudio: PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models NUM_MEL_BINS = "clip.audio.num_mel_bins" @@ -434,6 +439,7 @@ class MODEL_ARCH(IntEnum): ARCTIC = auto() DEEPSEEK = auto() DEEPSEEK2 = auto() + DEEPSEEK2OCR = auto() CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() @@ -755,6 +761,22 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm + V_SAM_POS_EMBD = auto() # Deepseek-OCR + V_SAM_PATCH_EMBD = auto() # Deepseek-OCR + V_SAM_PRE_NORM = auto() # Deepseek-OCR + V_SAM_POST_NORM = auto() # Deepseek-OCR + V_SAM_ATTN_POS_H = auto() # Deepseek-OCR + V_SAM_ATTN_POS_W = auto() # Deepseek-OCR + V_SAM_ATTN_QKV = auto() # Deepseek-OCR + V_SAM_ATTN_OUT = auto() # Deepseek-OCR + V_SAM_MLP_LIN_1 = auto() # Deepseek-OCR + V_SAM_MLP_LIN_2 = auto() # Deepseek-OCR + V_SAM_NECK = auto() # Deepseek-OCR + V_SAM_NET_2 = auto() # Deepseek-OCR + V_SAM_NET_3 = auto() # Deepseek-OCR + V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR + V_ENC_EMBD_VSEP = auto() # Deepseek-OCR + # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_EMBD_NORM = auto() @@ -880,6 +902,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK2: "deepseek2", + MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr", MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.GLM4_MOE: "glm4moe", @@ -1199,6 +1222,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_MM_GATE: "mm.gate", MODEL_TENSOR.V_TOK_BOI: "v.boi", MODEL_TENSOR.V_TOK_EOI: "v.eoi", + # DeepSeek-OCR SAM + MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd", + MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd", + MODEL_TENSOR.V_SAM_PRE_NORM: "v.sam.blk.{bid}.pre_ln", + MODEL_TENSOR.V_SAM_POST_NORM: "v.sam.blk.{bid}.post_ln", + MODEL_TENSOR.V_SAM_ATTN_POS_H: "v.sam.blk.{bid}.attn.pos_h", + MODEL_TENSOR.V_SAM_ATTN_POS_W: "v.sam.blk.{bid}.attn.pos_w", + MODEL_TENSOR.V_SAM_ATTN_QKV: "v.sam.blk.{bid}.attn.qkv", + MODEL_TENSOR.V_SAM_ATTN_OUT: "v.sam.blk.{bid}.attn.out", + MODEL_TENSOR.V_SAM_MLP_LIN_1: "v.sam.blk.{bid}.mlp.lin1", + MODEL_TENSOR.V_SAM_MLP_LIN_2: "v.sam.blk.{bid}.mlp.lin2", + MODEL_TENSOR.V_SAM_NECK: 
"v.sam.neck.{bid}", + MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2", + MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", + MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR + MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR # audio (mtmd) # note: all audio tensor names must use prefix "a." or "mm.a." MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", @@ -1265,6 +1304,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_ENC_EMBD_PATCH, MODEL_TENSOR.V_ENC_EMBD_NORM, MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_EMBD_IMGNL, + MODEL_TENSOR.V_ENC_EMBD_VSEP, MODEL_TENSOR.V_ENC_INPUT_NORM, MODEL_TENSOR.V_ENC_ATTN_QKV, MODEL_TENSOR.V_ENC_ATTN_Q, @@ -1317,6 +1358,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_MM_GATE, MODEL_TENSOR.V_TOK_BOI, MODEL_TENSOR.V_TOK_EOI, + MODEL_TENSOR.V_SAM_POS_EMBD, + MODEL_TENSOR.V_SAM_PATCH_EMBD, + MODEL_TENSOR.V_SAM_PRE_NORM, + MODEL_TENSOR.V_SAM_POST_NORM, + MODEL_TENSOR.V_SAM_ATTN_POS_H, + MODEL_TENSOR.V_SAM_ATTN_POS_W, + MODEL_TENSOR.V_SAM_ATTN_QKV, + MODEL_TENSOR.V_SAM_ATTN_OUT, + MODEL_TENSOR.V_SAM_MLP_LIN_1, + MODEL_TENSOR.V_SAM_MLP_LIN_2, + MODEL_TENSOR.V_SAM_NECK, + MODEL_TENSOR.V_SAM_NET_2, + MODEL_TENSOR.V_SAM_NET_3, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_EMBD_NORM, @@ -2612,7 +2666,41 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_V_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + ], + MODEL_ARCH.DEEPSEEK2OCR: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, @@ -3741,6 +3829,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DEEPSEEK2OCR: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.ROPE_FREQS, ], @@ -3938,6 +4030,7 @@ class VisionProjectorType: LIGHTONOCR = "lightonocr" COGVLM = "cogvlm" JANUS_PRO = "janus_pro" + DEEPSEEKOCR = "deepseekocr" LFM2A = "lfm2a" # audio MUSIC_FLAMINGO = "musicflamingo" # audio GLM4V = "glm4v" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 010dfeea1..37b987993 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1218,6 +1218,15 @@ class GGUFWriter: def add_vision_window_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value) + def add_vision_sam_layers_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value) + + def add_vision_sam_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value) + + def 
add_vision_sam_head_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.SAM.HEAD_COUNT, value) + # audio models def add_clip_audio_projector_type(self, value: str) -> None: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 18131e540..281c1a830 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1344,6 +1344,7 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ_FC: ( "model.connector.modality_projection.proj", # SmolVLM "model.vision.linear_proj.linear_proj", # cogvlm + "model.projector.layers", # Deepseek-OCR "visual.merger.proj", # glm4v ), @@ -1364,6 +1365,7 @@ class TensorNameMap: "vision_model.class_embedding", # llama 4 "model.vision.patch_embedding.cls_embedding", # cogvlm "vision_model.radio_model.model.patch_generator.cls_token.token", # Nemotron Nano v2 VL + "model.vision_model.embeddings.class_embedding", # Deepseek-OCR ), MODEL_TENSOR.V_ENC_EMBD_PATCH: ( @@ -1377,6 +1379,7 @@ class TensorNameMap: "visual.patch_embed.proj", # qwen2vl "vision_tower.patch_embed.proj", # kimi-vl "model.vision.patch_embedding.proj", # cogvlm + "model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP "siglip2.vision_model.embeddings.patch_embedding", "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL ), @@ -1398,10 +1401,19 @@ class TensorNameMap: "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL ), + MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( + "model.image_newline", # Deepseek-OCR + ), + + MODEL_TENSOR.V_ENC_EMBD_VSEP: ( + "model.view_seperator", # Deepseek-OCR + ), + MODEL_TENSOR.V_ENC_ATTN_QKV: ( "visual.blocks.{bid}.attn.qkv", # qwen3vl "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm - "vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5 + "model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP + "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5 "vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL ), @@ -1416,6 +1428,7 @@ class TensorNameMap: "visual.blocks.{bid}.attn.q", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl + "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1434,6 +1447,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated + "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj", ), @@ -1454,6 +1468,7 @@ class TensorNameMap: "visual.blocks.{bid}.attn.v", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1468,6 +1483,7 @@ class TensorNameMap: "visual.blocks.{bid}.norm1", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1) "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm + "model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.layer_norm1", "vision_model.radio_model.model.blocks.{bid}.norm1", # 
Nemotron Nano v2 VL ), @@ -1485,6 +1501,7 @@ class TensorNameMap: "visual.blocks.{bid}.attn.proj", # qwen2vl "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm + "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL ), @@ -1501,6 +1518,7 @@ class TensorNameMap: "visual.blocks.{bid}.norm2", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1) "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm + "model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.layer_norm2", "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL ), @@ -1517,6 +1535,7 @@ class TensorNameMap: "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1) + "model.vision_model.transformer.layers.{bid}.mlp.fc1", # Deepseek-OCR CLIP "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1", "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL @@ -1541,6 +1560,7 @@ class TensorNameMap: "visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1) "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm + "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2", "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL ), @@ -1560,6 +1580,7 @@ class TensorNameMap: "vision_tower.ln_pre", # pixtral-hf "vision_encoder.ln_pre", # pixtral "vision_model.layernorm_pre", # llama4 + "model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP ), MODEL_TENSOR.V_POST_NORM: ( @@ -1662,6 +1683,58 @@ class TensorNameMap: "model.visual.deepstack_merger_list.{bid}.linear_fc2", # deepstack in qwen3vl ), + MODEL_TENSOR.V_SAM_POS_EMBD: ( + "model.sam_model.pos_embed", + ), + + MODEL_TENSOR.V_SAM_PATCH_EMBD: ( + "model.sam_model.patch_embed.proj", + ), + + MODEL_TENSOR.V_SAM_PRE_NORM: ( + "model.sam_model.blocks.{bid}.norm1", # deepstack in qwen3vl + ), + + MODEL_TENSOR.V_SAM_POST_NORM: ( + "model.sam_model.blocks.{bid}.norm2", # deepstack in qwen3vl + ), + + MODEL_TENSOR.V_SAM_ATTN_POS_H: ( + "model.sam_model.blocks.{bid}.attn.rel_pos_h", + ), + + MODEL_TENSOR.V_SAM_ATTN_POS_W: ( + "model.sam_model.blocks.{bid}.attn.rel_pos_w", + ), + + MODEL_TENSOR.V_SAM_ATTN_QKV: ( + "model.sam_model.blocks.{bid}.attn.qkv", + ), + + MODEL_TENSOR.V_SAM_ATTN_OUT: ( + "model.sam_model.blocks.{bid}.attn.proj", + ), + + MODEL_TENSOR.V_SAM_MLP_LIN_1: ( + "model.sam_model.blocks.{bid}.mlp.lin1", + ), + + MODEL_TENSOR.V_SAM_MLP_LIN_2: ( + "model.sam_model.blocks.{bid}.mlp.lin2", + ), + + MODEL_TENSOR.V_SAM_NECK: ( + "model.sam_model.neck.{bid}", + ), + + MODEL_TENSOR.V_SAM_NET_2: ( + "model.sam_model.net_2", + ), + + MODEL_TENSOR.V_SAM_NET_3: ( + "model.sam_model.net_3", + ), + MODEL_TENSOR.V_MM_POST_FC_NORM: ( "model.vision.linear_proj.norm1", # cogvlm ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b05178859..a90518763 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -73,6 +73,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ARCTIC, "arctic" }, { 
LLM_ARCH_DEEPSEEK, "deepseek" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, + { LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" }, { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, @@ -1571,6 +1572,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_UP_SHEXP, }; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK2OCR: case LLM_ARCH_MISTRAL4: return { LLM_TENSOR_TOKEN_EMBD, @@ -1579,6 +1581,8 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM, + LLM_TENSOR_ATTN_K, // deepseek-ocr + LLM_TENSOR_ATTN_V, // deepseek-ocr LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_Q_A, LLM_TENSOR_ATTN_Q_B, diff --git a/src/llama-arch.h b/src/llama-arch.h index 9b9eec2f5..4c5b6a1ad 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -77,6 +77,7 @@ enum llm_arch { LLM_ARCH_ARCTIC, LLM_ARCH_DEEPSEEK, LLM_ARCH_DEEPSEEK2, + LLM_ARCH_DEEPSEEK2OCR, LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index c415a998f..78cbc38db 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -49,6 +49,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK }, { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 }, { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 }, + { "deepseek-ocr", LLM_CHAT_TEMPLATE_DEEPSEEK_OCR }, { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 }, @@ -548,6 +549,11 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << LU8("<|Assistant|>"); } + } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_OCR) { + for (auto message : chat) { + // no template + ss << message->content; + } } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) { // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb // EXAONE-3.0-7.8B-Instruct diff --git a/src/llama-chat.h b/src/llama-chat.h index 9ed1db128..ef7dfeceb 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -28,6 +28,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_DEEPSEEK, LLM_CHAT_TEMPLATE_DEEPSEEK_2, LLM_CHAT_TEMPLATE_DEEPSEEK_3, + LLM_CHAT_TEMPLATE_DEEPSEEK_OCR, LLM_CHAT_TEMPLATE_COMMAND_R, LLM_CHAT_TEMPLATE_LLAMA_3, LLM_CHAT_TEMPLATE_CHATGLM_3, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 9a215bb77..11759ae1e 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1516,7 +1516,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( if (!weight_before_ffn) { experts = ggml_mul(ctx0, experts, weights); - cb(cur, "ffn_moe_weighted", il); + cb(experts, "ffn_moe_weighted", il); } ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 01166fac9..5f57ba9e1 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1561,7 +1561,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift( // ref: https://github.com/ggml-org/llama.cpp/pull/13870 ? 
LLAMA_ROPE_TYPE_NEOX : hparams.rope_type; - ggml_tensor * tmp; if (ggml_is_quantized(cur->type)) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7a5544abf..ba3eed595 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1621,6 +1621,26 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_DEEPSEEK2OCR: + { + // similar to deepseek2, but without MLA + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + + if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { + hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; + } + + switch (hparams.n_layer) { + case 12: type = LLM_TYPE_3B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_PLM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -4951,6 +4971,60 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); + // Shared expert branch + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); + } + } + } break; + case LLM_ARCH_DEEPSEEK2OCR: + { + // similar to deepseek2, but without MLA + const int64_t n_ff_exp = hparams.n_ff_exp; + const int64_t n_expert_shared = hparams.n_expert_shared; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + // try to load output.weight, if not found, use token_embd (tied embeddings) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (!output) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + + // norm + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + if (i < (int) hparams.n_layer_dense_lead) { + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = 
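+                        // dense lead layers (i < n_layer_dense_lead) use a plain gated
+                        // FFN; later layers switch to the MoE branch in the else-arm below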
create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + } else { + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + + if (n_expert == 0) { + throw std::runtime_error("n_expert must be > 0"); + } + if (n_expert_used == 0) { + throw std::runtime_error("n_expert_used must be > 0"); + } + + // MoE branch + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); + // Shared expert branch layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0); @@ -7842,7 +7916,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); } - if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { + if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); @@ -8419,6 +8493,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK2OCR: case LLM_ARCH_GLM_DSA: case LLM_ARCH_MISTRAL4: { @@ -8819,6 +8894,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK2OCR: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: case LLM_ARCH_GRANITE: diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c414656e0..a1f87b3b3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -344,7 +344,10 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param quantize &= name.find("attn_rel_b.weight") == std::string::npos; // do not quantize specific multimodal tensors - quantize &= name.find(".position_embd.") == std::string::npos; + quantize &= name.find(".position_embd") == std::string::npos; + quantize &= name.find("sam.patch_embd") == std::string::npos; + quantize &= name.find("sam.pos_embd") == std::string::npos; + quantize &= name.find(".rel_pos") == std::string::npos; return quantize; } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index e5e414172..f51b4badc 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2490,6 +2490,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "[EOS]" // Kimi-K2 || t.first == "<|end_of_text|>" || t.first == "" // smoldocling + || t.first == "<|end▁of▁sentence|>" // deepseek-ocr ) { special_eog_ids.insert(t.second); if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index d437fe29e..ef9c8420e 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -2,6 +2,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // lite variants include 
DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + bool is_ocr = model.arch == LLM_ARCH_DEEPSEEK2OCR; + const bool is_mla = hparams.is_mla(); // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA @@ -54,7 +57,38 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr cb(cur, "attn_norm", il); // self_attention - { + if (is_ocr) { + const int n_embed_head = hparams.n_embd / hparams.n_head(); + const int ocr_rope_type = GGML_ROPE_TYPE_NEOX; + GGML_ASSERT(n_embed_head == n_embd_head_k && n_embed_head == n_embd_head_v); + + ggml_tensor * Qcur = NULL; + ggml_tensor * Kcur = NULL; + ggml_tensor * Vcur = NULL; + + Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Qcur, "q", il); + cb(Kcur, "k", il); + cb(Vcur, "v", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embed_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embed_head, n_head, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embed_head, n_head, n_tokens); + + GGML_ASSERT(fabs(freq_base - 10000.0) < 1e-4); + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0); + cb(Qcur, "q_pe", il); + cb(Kcur, "k_pe", il); + + cur = build_attn(inp_attn_kv, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + else { ggml_tensor * q = NULL; const bool is_lite = model.layers[il].wq; diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 0b0876426..df21ced74 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -467,6 +467,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (arch == LLM_ARCH_PLM) { continue; // TODO tensor shapes } + if (arch == LLM_ARCH_DEEPSEEK2OCR) { + continue; // TODO tensor shapes + } // FIXME some models are segfaulting with WebGPU: #ifdef GGML_USE_WEBGPU diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index a39de8c92..71ad149ad 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -30,6 +30,7 @@ add_library(mtmd models/qwen3vl.cpp models/siglip.cpp models/whisper-enc.cpp + models/deepseekocr.cpp models/mobilenetv5.cpp models/youtuvl.cpp ) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index bf55cec7e..4bf34e65b 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -57,7 +57,9 @@ #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" - +#define KEY_SAM_N_HEAD "clip.vision.sam.head_count" +#define KEY_SAM_N_BLOCK "clip.vision.sam.block_count" +#define KEY_SAM_N_EMBD "clip.vision.sam.embedding_length" // audio-specific #define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" @@ -99,12 +101,13 @@ #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" -#define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_IMAGE_NEWLINE "v.image_newline" +#define TN_IMAGE_SEPERATOR "v.view_seperator" #define TN_MM_INP_NORM "mm.input_norm.weight" #define TN_MM_INP_NORM_B 
"mm.input_norm.bias" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 -#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_MM_PROJECTOR "mm.model.fc.%s" // idefics3, deepseekocr #define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral #define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) @@ -143,6 +146,19 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" +// deepseek-ocr +#define TN_SAM_POS_EMBD "v.sam.pos_embd.%s" +#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s" +#define TN_SAM_PRE_NORM "v.sam.blk.%d.pre_ln.%s" +#define TN_SAM_POST_NORM "v.sam.blk.%d.post_ln.%s" +#define TN_SAM_ATTN_POS_H "v.sam.blk.%d.attn.pos_h.%s" +#define TN_SAM_ATTN_POS_W "v.sam.blk.%d.attn.pos_w.%s" +#define TN_SAM_ATTN_QKV "v.sam.blk.%d.attn.qkv.%s" +#define TN_SAM_ATTN_OUT "v.sam.blk.%d.attn.out.%s" +#define TN_SAM_FFN_UP "v.sam.blk.%d.mlp.lin1.%s" +#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s" +#define TN_SAM_NECK "v.sam.neck.%d.%s" +#define TN_SAM_NET "v.sam.net_%d.%s" // (conformer) lfm2 #define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" #define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" @@ -236,6 +252,7 @@ enum projector_type { PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_DEEPSEEKOCR, PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_YOUTUVL, @@ -273,6 +290,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"}, { PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 265a17130..e9c454fe6 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -67,6 +67,11 @@ struct clip_hparams { int32_t n_wa_pattern = 0; std::unordered_set wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) + // deepseek-ocr (sam) + int32_t sam_n_layer = 0; + int32_t sam_n_head = 0; + int32_t sam_n_embd = 0; + // audio int32_t n_mel_bins = 0; // whisper preprocessor int32_t proj_stack_factor = 0; // ultravox @@ -102,6 +107,21 @@ struct clip_hparams { warmup_image_size = n_tok_per_side * patch_size * cur_merge; // TODO: support warmup size for custom token numbers } + // sam vit deepseek-ocr + std::vector global_attn_indices() const { + return { 2, 5, 8, 11 }; + } + bool is_global_attn(int32_t layer) const { + const auto indices = global_attn_indices(); + + for (const auto & idx : indices) { + if (layer == idx) { + return true; + } + } + + return false; + } }; struct clip_layer { @@ -148,6 +168,9 @@ struct clip_layer { ggml_tensor * deepstack_fc2_w = nullptr; ggml_tensor * deepstack_fc2_b = nullptr; + // sam rel_pos + ggml_tensor * rel_pos_w = nullptr; + ggml_tensor * rel_pos_h = nullptr; // lfm2 ggml_tensor * ff_norm_w = nullptr; ggml_tensor * ff_norm_b = nullptr; @@ -240,7 +263,6 @@ struct clip_model { ggml_tensor * post_ln_w; ggml_tensor * post_ln_b; - ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) ggml_tensor * mm_fc_w; ggml_tensor * mm_fc_b; ggml_tensor * mm_ffn_up_w = nullptr; @@ -261,6 +283,8 @@ struct clip_model { ggml_tensor * mm_2_b = nullptr; ggml_tensor * image_newline = nullptr; + ggml_tensor * 
view_seperator = nullptr; + // Yi type models with mlp+normalization projection ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 @@ -372,6 +396,23 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; + // deepseek ocr sam + ggml_tensor * patch_embed_proj_w = nullptr; + ggml_tensor * patch_embed_proj_b = nullptr; + ggml_tensor * pos_embed = nullptr; + + ggml_tensor * neck_0_w; + ggml_tensor * neck_1_w; + ggml_tensor * neck_1_b; + ggml_tensor * neck_2_w; + ggml_tensor * neck_3_w; + ggml_tensor * neck_3_b; + ggml_tensor * net_2; + ggml_tensor * net_3; + + int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder + + std::vector sam_layers; // lfm2 audio std::array pre_encode_conv_X_w = {nullptr}; std::array pre_encode_conv_X_b = {nullptr}; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index a47f1f495..b7237d661 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -870,6 +870,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_LFM2A: { builder = std::make_unique(ctx, img); @@ -1302,6 +1306,17 @@ struct clip_model_loader { hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + hparams.patch_size = 16; + hparams.image_size = 1024; + hparams.warmup_image_size = 1024; + + get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true); + get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true); + get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true); + get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + } break; case PROJECTOR_TYPE_LFM2A: { // audio preprocessing params @@ -1626,7 +1641,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_GLM4V: { - model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight")); model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false); model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); @@ -1738,7 +1753,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_IDEFICS3: { - model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); } break; case PROJECTOR_TYPE_LFM2: { @@ -1853,13 +1868,13 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_LLAMA4: { - model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); } break; case PROJECTOR_TYPE_COGVLM: { - model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight")); model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias")); model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); @@ -1882,6 +1897,42 @@ struct clip_model_loader { model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + model.pos_embed = 
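+                    // SAM's learned absolute position grid; it is resized bicubically
+                    // at graph build time when the runtime grid differs from the checkpoint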
get_tensor(string_format(TN_SAM_POS_EMBD, "weight")); + model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight")); + model.patch_embed_proj_b = get_tensor(string_format(TN_SAM_PATCH_EMBD, "bias")); + model.sam_layers.resize(model.n_sam_layers); + for (int il = 0; il < model.n_sam_layers; ++il) { + auto & layer = model.sam_layers[il]; + layer.qkv_w = get_tensor(string_format(TN_SAM_ATTN_QKV, il, "weight")); + layer.qkv_b = get_tensor(string_format(TN_SAM_ATTN_QKV, il, "bias")); + layer.o_w = get_tensor(string_format(TN_SAM_ATTN_OUT, il, "weight")); + layer.o_b = get_tensor(string_format(TN_SAM_ATTN_OUT, il, "bias")); + layer.ln_1_w = get_tensor(string_format(TN_SAM_PRE_NORM, il, "weight")); + layer.ln_1_b = get_tensor(string_format(TN_SAM_PRE_NORM, il, "bias")); + layer.ln_2_w = get_tensor(string_format(TN_SAM_POST_NORM, il, "weight")); + layer.ln_2_b = get_tensor(string_format(TN_SAM_POST_NORM, il, "bias")); + layer.rel_pos_h = get_tensor(string_format(TN_SAM_ATTN_POS_H, il, "weight")); + layer.rel_pos_w = get_tensor(string_format(TN_SAM_ATTN_POS_W, il, "weight")); + layer.ff_up_w = get_tensor(string_format(TN_SAM_FFN_UP, il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_SAM_FFN_UP, il, "bias")); + layer.ff_down_w = get_tensor(string_format(TN_SAM_FFN_DOWN, il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_SAM_FFN_DOWN, il, "bias")); + } + model.neck_0_w = get_tensor(string_format(TN_SAM_NECK, 0, "weight")); + model.neck_1_b = get_tensor(string_format(TN_SAM_NECK, 1, "bias")); + model.neck_1_w = get_tensor(string_format(TN_SAM_NECK, 1, "weight")); + model.neck_2_w = get_tensor(string_format(TN_SAM_NECK, 2, "weight")); + model.neck_3_b = get_tensor(string_format(TN_SAM_NECK, 3, "bias")); + model.neck_3_w = get_tensor(string_format(TN_SAM_NECK, 3, "weight")); + model.net_2 = get_tensor(string_format(TN_SAM_NET, 2, "weight")); + model.net_3 = get_tensor(string_format(TN_SAM_NET, 3, "weight")); + model.image_newline = get_tensor(TN_IMAGE_NEWLINE); + model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR); + model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); + model.mm_fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias")); + } break; case PROJECTOR_TYPE_LFM2A: { for (int i : {0, 2, 3, 5, 6}) { @@ -2353,6 +2404,7 @@ struct img_tool { enum resize_algo { RESIZE_ALGO_BILINEAR, RESIZE_ALGO_BICUBIC, + RESIZE_ALGO_BICUBIC_PILLOW, // RESIZE_ALGO_LANCZOS, // TODO }; @@ -2382,6 +2434,9 @@ struct img_tool { case RESIZE_ALGO_BICUBIC: resize_bicubic(src, dst, target_resolution.width, target_resolution.height); break; + case RESIZE_ALGO_BICUBIC_PILLOW: + resize_bicubic_pillow(src, dst, target_resolution.width, target_resolution.height); + break; default: throw std::runtime_error("Unsupported resize algorithm"); } @@ -2401,6 +2456,9 @@ struct img_tool { case RESIZE_ALGO_BICUBIC: resize_bicubic(src, resized_image, new_width, new_height); break; + case RESIZE_ALGO_BICUBIC_PILLOW: + resize_bicubic_pillow(src, resized_image, new_width, new_height); + break; default: throw std::runtime_error("Unsupported resize algorithm"); } @@ -2611,6 +2669,255 @@ private: return true; } + // Bicubic resize function using Pillow's ImagingResample algorithm + // Adapted from https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Resample.c + // + // Key Difference with resize_bicubic: + // 1. Uses separable filtering: horizontal pass followed by vertical pass + // 2. Pre-computes normalized filter coefficients for each output pixel + // 3. 
Applies convolution using fixed-point integer arithmetic for performance + static bool resize_bicubic_pillow(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + // Fixed-point precision: 22 bits = 32 (int32_t) - 8 (uint8_t pixels) - 2 (headroom for accumulation) + // This allows encoding fractional weights as integers: weight * 2^22 + const int PRECISION_BITS = 32 - 8 - 2; + + // Bicubic filter function with a = -0.5 (Note that GGML/PyTorch takes a = -0.75) + // Returns filter weight for distance x from pixel center + // Support: [-2, 2], meaning the filter influences pixels within 2 units of distance + auto bicubic_filter = [](double x) -> double { + constexpr double a = -0.5; + if (x < 0.0) { + x = -x; + } + if (x < 1.0) { + return ((a + 2.0) * x - (a + 3.0)) * x * x + 1; + } + if (x < 2.0) { + return (((x - 5) * x + 8) * x - 4) * a; + } + return 0.0; // Zero outside [-2, 2] + }; + + // Filter support radius: bicubic extends 2 pixels in each direction + constexpr double filter_support = 2.0; + + // Clipping function for 8-bit values + auto clip8 = [](int val) -> uint8_t { + if (val < 0) return 0; + if (val > 255) return 255; + return static_cast(val); + }; + + // Precompute filter coefficients for ONE dimension (horizontal or vertical) + // + // Parameters: + // inSize - Number of pixels in input dimension (e.g., src_width or src_height) + // outSize - Number of pixels in output dimension (e.g., target_width or target_height) + // bounds - [OUTPUT] Array of size outSize*2 storing input pixel ranges: + // bounds[xx*2+0] = first input pixel index for output pixel xx (xmin) + // bounds[xx*2+1] = number of input pixels for output pixel xx (xcnt) + // weights - [OUTPUT] Array of size outSize*ksize storing fixed-point filter weights: + // kk[xx*ksize + x] = weight for input pixel x contributing to output pixel xx + // + // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel + auto precompute_weights = [&](int inSize, int outSize, + std::vector & bounds, std::vector & weights) -> int { + double support, scale, filterscale; + double center, ww, ss; + int xx, x, ksize, xmin, xmax, xcnt; + + // Calculate scaling factor: ratio of input range to output size + filterscale = scale = (double)inSize / outSize; + // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness + // For downsampling (scale > 1), widen filter to prevent aliasing + if (filterscale < 1.0) { + filterscale = 1.0; + } + + // Determine filter support radius and kernel size + support = filter_support * filterscale; // Widen filter when downsampling + ksize = static_cast(std::ceil(support)) * 2 + 1; // Total pixels in kernel + + std::vector pre_weights(outSize * ksize); // Temporary weights + bounds.resize(outSize * 2); + + // For each output pixel, compute its filter coefficients + for (xx = 0; xx < outSize; xx++) { + // Calculate the center position in input space (pixel-center convention: +0.5) + center = (xx + 0.5) * scale; + ww = 0.0; // Sum of weights for normalization + ss = 1.0 / filterscale; // Scale factor for filter function + + // Determine the range of input pixels that contribute to this output pixel + xmin = static_cast(center - support + 0.5); + if (xmin < 0) { + xmin = 0; + } + + xmax = static_cast(center + support + 0.5); + if (xmax > inSize) { + xmax = inSize; + } + + xcnt = xmax - xmin; + + // Compute filter weights for each contributing input pixel + for (x = 0; x < xcnt; x++) { + // Distance from input pixel center to 
output pixel center in input space + double w = bicubic_filter((x + xmin - center + 0.5) * ss); + pre_weights[xx * ksize + x] = w; + ww += w; // Accumulate for normalization + } + + // Normalize weights to sum to 1.0 (preserves brightness) + for (x = 0; x < xcnt; x++) { + if (ww != 0.0) { + pre_weights[xx * ksize + x] /= ww; + } + } + + // Zero-pad remaining kernel positions + for (; x < ksize; x++) { + pre_weights[xx * ksize + x] = 0; + } + + // Store input pixel range for this output pixel + bounds[xx * 2 + 0] = xmin; + bounds[xx * 2 + 1] = xcnt; + } + + // Convert floating-point coefficients to fixed-point integers + // Formula: int32 = round(float * 2^PRECISION_BITS) + weights.resize(outSize * ksize); + for (int i = 0; i < outSize * ksize; i++) { + if (pre_weights[i] < 0) { + weights[i] = static_cast(-0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + } else { + weights[i] = static_cast(0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + } + } + + return ksize; + }; + + // Horizontal resampling pass + // Resizes width from imIn.nx to imOut.nx, preserving height + auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int ksize, const std::vector & bounds, const std::vector & weights) { + imOut.ny = imIn.ny; + imOut.buf.resize(3 * imOut.nx * imOut.ny); + + // Process each row independently + for (int yy = 0; yy < imOut.ny; yy++) { + // For each output pixel in this row + for (int xx = 0; xx < imOut.nx; xx++) { + // Get the range of input pixels and filter coefficients + int xmin = bounds[xx * 2 + 0]; // First input pixel index + int xcnt = bounds[xx * 2 + 1]; // Number of input pixels + + // Initialize accumulators for RGB channels with rounding bias (0.5 in fixed-point) + int32_t ss0 = 1 << (PRECISION_BITS - 1); + int32_t ss1 = 1 << (PRECISION_BITS - 1); + int32_t ss2 = 1 << (PRECISION_BITS - 1); + + // Convolve: sum weighted input pixels + for (int x = 0; x < xcnt; x++) { + int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3; + ss0 += static_cast(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel + ss1 += static_cast(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel + ss2 += static_cast(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel + } + + // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255] + int dst_idx = (yy * imOut.nx + xx) * 3; + imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); + imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); + imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + } + } + }; + + // Vertical resampling pass + // Resizes height from imIn.ny to imOut.ny, preserving width + auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int ksize, const std::vector & bounds, const std::vector & weight) { + imOut.nx = imIn.nx; + imOut.buf.resize(3 * imOut.nx * imOut.ny); + + // For each output row + for (int yy = 0; yy < imOut.ny; yy++) { + // Get the range of input rows and filter coefficients + int ymin = bounds[yy * 2 + 0]; // First input row index + int ycnt = bounds[yy * 2 + 1]; // Number of input rows + + // Process each column in this output row + for (int xx = 0; xx < imOut.nx; xx++) { + // Initialize accumulators for RGB channels with rounding bias + int32_t ss0 = 1 << (PRECISION_BITS - 1); + int32_t ss1 = 1 << (PRECISION_BITS - 1); + int32_t ss2 = 1 << (PRECISION_BITS - 1); + + // Convolve: sum weighted input pixels vertically + for (int y = 0; y < ycnt; y++) { + int src_idx = ((y + ymin) * imIn.nx + xx) * 3; + ss0 += 
static_cast(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel + ss1 += static_cast(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel + ss2 += static_cast(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel + } + + // Convert back from fixed-point and clamp to [0,255] + int dst_idx = (yy * imOut.nx + xx) * 3; + imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); + imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); + imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + } + } + }; + + // Main resampling logic using separable two-pass approach + const int src_width = img.nx; + const int src_height = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + + bool need_horizontal = (target_width != src_width); + bool need_vertical = (target_height != src_height); + + // Precompute filter coefficients for both dimensions + std::vector bounds_horiz, bounds_vert; + std::vector weights_horiz, weights_vert; + int ksize_horiz = 0, ksize_vert = 0; + + if (need_horizontal) { + ksize_horiz = precompute_weights(src_width, target_width, bounds_horiz, weights_horiz); + } + + if (need_vertical) { + ksize_vert = precompute_weights(src_height, target_height, bounds_vert, weights_vert); + } + + // Perform two-pass resampling + if (need_horizontal && need_vertical) { + // Both horizontal and vertical + clip_image_u8 temp; + temp.nx = target_width; + resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz); + resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert); + } else if (need_horizontal) { + // Only horizontal + resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz); + } else if (need_vertical) { + // Only vertical + resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert); + } else { + // No resizing needed - direct copy + dst.buf = img.buf; + } + + return true; + } + static inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); } @@ -3377,6 +3684,89 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } } } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + const std::vector native_resolutions = { + /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */ + }; + // original image size + const int orig_w = original_size.width; + const int orig_h = original_size.height; + const int orig_area = orig_h * orig_w; + std::array color; + + for (int i = 0; i < 3; i++) { + color[i] = static_cast(params.image_mean[i] * 255.0f); + } + + size_t mode_i = 0; + int min_diff = orig_area; + + for (size_t i = 0; i < native_resolutions.size(); i++) { + int r = native_resolutions[i]; + if (std::abs(orig_area - r * r) < min_diff) { + mode_i = i; + min_diff = std::abs(orig_area - r * r); + } + } + + /* Native Resolution (Base/Large) */ + const int image_size = native_resolutions[mode_i]; + + // Resize maintaining an aspect ratio, then pad to square + float scale = std::min( + static_cast(image_size) / orig_w, + static_cast(image_size) / orig_h + ); + int new_w = static_cast(orig_w * scale); + int new_h = static_cast(orig_h * scale); + + clip_image_u8_ptr scaled_img(clip_image_u8_init()); + img_tool::resize(*img, *scaled_img, clip_image_size{new_w, new_h}, + img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true, color); + + // Use mean color for padding + unsigned char pad_r = static_cast(params.image_mean[0] * 255.0f); + unsigned char pad_g = static_cast(params.image_mean[1] * 255.0f); + unsigned char pad_b = static_cast(params.image_mean[2] * 255.0f); + + // Pad to 
image_size × image_size (center padding) + clip_image_u8_ptr padded_img(clip_image_u8_init()); + padded_img->nx = image_size; + padded_img->ny = image_size; + padded_img->buf.resize(image_size * image_size * 3); // black padding + + // Fill with mean color + for (int i = 0; i < image_size * image_size; ++i) + { + padded_img->buf[i * 3 + 0] = pad_r; + padded_img->buf[i * 3 + 1] = pad_g; + padded_img->buf[i * 3 + 2] = pad_b; + } + + // Calculate padding offsets (center the image) + int pad_x = (image_size - new_w) / 2; + int pad_y = (image_size - new_h) / 2; + + // Copy scaled image into padded canvas + for (int y = 0; y < new_h; ++y){ + for (int x = 0; x < new_w; ++x){ + int src_idx = (y * new_w + x) * 3; + int dst_idx = ((y + pad_y) * image_size + (x + pad_x)) * 3; + padded_img->buf[dst_idx + 0] = scaled_img->buf[src_idx + 0]; + padded_img->buf[dst_idx + 1] = scaled_img->buf[src_idx + 1]; + padded_img->buf[dst_idx + 2] = scaled_img->buf[src_idx + 2]; + } + } + + // Normalize and output + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + + res_imgs->grid_x = 1; + res_imgs->grid_y = 1; + } break; default: LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type()); @@ -3608,6 +3998,18 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { n_patches += 2; // for BOI and EOI token embeddings } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + // SAM encoder applies two stride-2 convolutions (net_2 and net_3) + // which reduces spatial dimensions by 4x in each direction (16x total) + // E.g., 64x64 -> 16x16 patches + n_patches /= 16; + + // build_global_local_features adds image newlines and view separator + // Formula: h*(w+1) + 1 where h = w = sqrt(n_patches) + int h = static_cast(std::sqrt(static_cast(n_patches))); + n_patches = h * (h + 1) + 1; + } break; case PROJECTOR_TYPE_LFM2A: { n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; @@ -3965,6 +4367,30 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } set_input_i32("patches", patches); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + GGML_ASSERT(pos_w == pos_h); + + const int window = hparams.attn_window_size; + const int pos = pos_w; + std::vector rel_pos_indices_local(window * window); + std::vector rel_pos_indices_global(pos * pos); + + for (int q = 0; q < window; q++) { + for (int k = 0; k < window; k++) { + rel_pos_indices_local[q * window + k] = q - k + window - 1; + } + } + + for (int q = 0; q < pos; q++) { + for (int k = 0; k < pos; k++) { + rel_pos_indices_global[q * pos + k] = q - k + pos - 1; + } + } + + set_input_i32("rel_pos_indices_local", rel_pos_indices_local); + set_input_i32("rel_pos_indices_global", rel_pos_indices_global); + } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3NV: case PROJECTOR_TYPE_IDEFICS3: @@ -4129,7 +4555,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_GEMMA3NV: return ctx->model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: - return ctx->model.projection->ne[1]; + return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_MUSIC_FLAMINGO: @@ -4150,6 +4576,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; + case PROJECTOR_TYPE_DEEPSEEKOCR: + return ctx->model.mm_fc_w->ne[1]; case 
PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; case PROJECTOR_TYPE_GLM4V: diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp new file mode 100644 index 000000000..b1f6ead5b --- /dev/null +++ b/tools/mtmd/models/deepseekocr.cpp @@ -0,0 +1,324 @@ +#include "models.h" + +// Implementation based on approach suggested by Acly +// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 +static ggml_tensor * window_partition(ggml_context * ctx0, ggml_tensor * x, const int window) { + auto [c, w, h, b] = x->ne; + // same as + // x = ggml_win_part(m, x, window); + // x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]); + + const int64_t px = (window - w % window) % window; + const int64_t py = (window - h % window) % window; + const int64_t npw = (w + px) / window; + const int64_t nph = (h + py) / window; + + ggml_tensor * cur = x; + if (px > 0 || py > 0) { + cur = ggml_pad(ctx0, cur, 0, static_cast(px), static_cast(py), 0); + } + cur = ggml_reshape_4d(ctx0, cur, c * window, npw, window, nph * b); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_4d(ctx0, cur, c, window, window, npw * nph * b); + return cur; +} + +// Implementation based on approach suggested by Acly +// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 +static ggml_tensor * window_unpartition(ggml_context * ctx0, + ggml_tensor * x, + const int w, + const int h, + const int window) { + const int64_t c = x->ne[0]; + // same as + // x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]); + // x = ggml_win_unpart(m, x, w, h, window); + + const int64_t px = (window - w % window) % window; + const int64_t py = (window - h % window) % window; + const int64_t npw = (w + px) / window; + const int64_t nph = (h + py) / window; + + const int64_t b = x->ne[3] / (npw * nph); + ggml_tensor * cur = x; + cur = ggml_reshape_4d(ctx0, cur, c * window, window, npw, nph * b); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_4d(ctx0, cur, c, w + px, h + py, b); + cur = ggml_view_4d(ctx0, cur, cur->ne[0], w, h, cur->ne[3], cur->nb[1], cur->nb[2], cur->nb[3], 0); + cur = ggml_cont(ctx0, cur); + return cur; +} + +static ggml_tensor * get_rel_pos(ggml_context * ctx0, + ggml_tensor * rel_pos, // [L, C] + ggml_tensor * indices, // [q_size, k_size] + const int q_size, + const int k_size) { + const int64_t C = rel_pos->ne[0]; // channels + const int64_t L = rel_pos->ne[1]; // length + + GGML_ASSERT(indices != nullptr); + GGML_ASSERT(indices->type == GGML_TYPE_I32); + GGML_ASSERT(indices->ne[0] == k_size); + GGML_ASSERT(indices->ne[1] == q_size); + + const auto max_rel_dist = 2 * std::max(q_size, k_size) - 1; + ggml_tensor * cur = rel_pos; + + if (max_rel_dist != L) { + // Linear interpolation + const int64_t ne0 = cur->ne[0]; + const int64_t ne1 = cur->ne[1]; + const int64_t ne2 = cur->ne[2]; + const int64_t ne3 = cur->ne[3]; + + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)), ne1, 1, ne0 * ne2 * ne3); + cur = ggml_reshape_4d( + ctx0, ggml_interpolate(ctx0, cur, max_rel_dist, 1, ne0 * ne2 * ne3, 1, GGML_SCALE_MODE_BILINEAR), + max_rel_dist, ne0, ne2, ne3); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)); + } + + // Flatten indices to 1D for ggml_get_rows + const int qk = q_size * k_size; + + cur = ggml_reshape_3d(ctx0, ggml_get_rows(ctx0, cur, ggml_reshape_1d(ctx0, indices, qk)), C, k_size, q_size); + + return cur; // [C, k_size, q_size] +} + 
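+
+// The SAM blocks use decomposed relative position attention: for a query at
+// (qh, qw) and a key at (kh, kw), the q·k logit is biased by
+//   q · rel_pos_h[d(qh, kh)] + q · rel_pos_w[d(qw, kw)]
+// with the tables indexed by relative distance via the q - k + size - 1
+// indices prepared in clip.cpp. build() materializes this bias as an explicit
+// f16 mask and hands it to build_attn() rather than fusing it into the kernel.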
+ggml_cgraph * clip_graph_deepseekocr::build() { + // patch embedding + ggml_tensor * inp_raw = build_inp_raw(); + + ggml_tensor * sam_out; + // Building SAM + { + const int n_embd = hparams.sam_n_embd; + const int n_layer = hparams.sam_n_layer; + const int n_heads = hparams.sam_n_head; + const int d_heads = n_embd / n_heads; + const int window = hparams.attn_window_size; + + ggml_tensor * inpL; + + inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); + inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); + inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); + + ggml_tensor * rel_pos_indices_local; + ggml_tensor * rel_pos_indices_global; + + rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); + rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); + ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); + ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); + ggml_set_input(rel_pos_indices_local); + ggml_set_input(rel_pos_indices_global); + + ggml_tensor * cur; + const auto tgt_size = inpL->ne[1]; + const auto str_size = model.pos_embed->ne[1]; + + if (str_size != tgt_size) { + ggml_tensor * old_pos_embed = nullptr; + old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); + ggml_tensor * new_pos_embed = + ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC); + new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); + cur = ggml_add(ctx0, inpL, new_pos_embed); + } else { + cur = ggml_add(ctx0, inpL, model.pos_embed); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.sam_layers[il]; + ggml_tensor * shortcut = cur; + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + + const int64_t w0 = cur->ne[1]; + const int64_t h0 = cur->ne[2]; + + ggml_tensor * indices; + + if (hparams.is_global_attn(il)) { + indices = rel_pos_indices_global; + } else { + // local attention layer - apply window partition + cur = window_partition(ctx0, cur, window); + indices = rel_pos_indices_local; + } + + const int64_t W = cur->ne[1]; + const int64_t H = cur->ne[2]; + // self-attention + { + const int B = cur->ne[3]; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape + cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); + + ggml_tensor * Q; + ggml_tensor * K; + ggml_tensor * V; + + Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]); + Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B); + + K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]); + K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B); + + V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]); + V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B); + + ggml_tensor * mask; + ggml_tensor * rw; + ggml_tensor * rh; + ggml_tensor * qr; + + rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] + rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] + qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); + qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); + + rw = ggml_mul_mat(ctx0, rw, + ggml_cont(ctx0, 
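+                              // make the permuted q view contiguous for ggml_mul_mat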
ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] + rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] + rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B); + rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B); + rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H] + rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B); + mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W] + mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B); + mask = ggml_cast(ctx0, mask, GGML_TYPE_F16); + + const float scale = 1.0f / sqrtf(static_cast(d_heads)); + + cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, + il); // [B, H*W, n_embd] + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); + } + + if (hparams.is_global_attn(il) == false) { + // local attention layer - reverse window partition + cur = window_unpartition(ctx0, cur, w0, h0, window); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, shortcut); + + ggml_tensor * inpFF = cur; + + // layernorm2 + cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + + // ffn + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, cur, inpFF); + cb(cur, "sam_layer_out", il); + } + + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); + cb(cur, "sam_output", -1); + + ggml_build_forward_expand(gf, cur); + sam_out = cur; + } + + ggml_tensor * clip_out; + // Building DS-OCR CLIP + { + ggml_tensor * inp; + + inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out)); + inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + ggml_tensor * new_pos_embd = + ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings)); + + int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] + const auto tgt_size = static_cast(std::sqrt(inp->ne[1])); + const auto src_size = static_cast(std::sqrt(n_pos - 1)); + + if (tgt_size != src_size) { + ggml_tensor * old_pos_embd; + ggml_tensor * cls_tok; + + old_pos_embd = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], src_size * src_size, + ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0); + cls_tok = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], 1, + ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size); + new_pos_embd = ggml_interpolate(ctx0, old_pos_embd, tgt_size, tgt_size, new_pos_embd->ne[0], 1, + GGML_SCALE_MODE_BICUBIC); + new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1); + new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1); + n_pos = tgt_size * tgt_size + 1; + } + + // add CLS token + inp = 
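+        // prepend the learned class embedding so the sequence length matches
+        // n_pos (= patches + 1), which indexes the position table below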
ggml_concat(ctx0, model.class_embedding, inp, 1); + + // for selecting learned pos embd, used by ViT + ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32); + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions); + + ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr); + + ggml_build_forward_expand(gf, cur); + clip_out = cur; + } + + const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; + + sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); + sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); + clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); + + ggml_tensor * cur; + cur = ggml_concat(ctx0, clip_out, sam_out, 0); + cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); + cur = ggml_cont(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); + const auto w = h; + const auto n_dim = cur->ne[0]; + + ggml_tensor * imgnl; + ggml_tensor * vs; + + imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); + vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) + cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); + cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); + cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) + + cb(cur, "dsocr_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index 9dbb162c5..623d2e384 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -97,7 +97,7 @@ ggml_cgraph * clip_graph_glm4v::build() { // FC projector { - cur = build_mm(model.projection, cur); + cur = build_mm(model.mm_fc_w, cur); // default LayerNorm (post_projection_norm) cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); cur = ggml_gelu_erf(ctx0, cur); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index aff222c71..5705d7f21 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -77,6 +77,11 @@ struct clip_graph_whisper_enc : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_deepseekocr : clip_graph { + clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + struct clip_graph_conformer : clip_graph { clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/models/siglip.cpp b/tools/mtmd/models/siglip.cpp index 9dafa35ea..7ef98eed0 100644 --- a/tools/mtmd/models/siglip.cpp +++ b/tools/mtmd/models/siglip.cpp @@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_siglip::build() { // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 const int scale_factor = model.hparams.n_merge; cur = build_patch_merge_permute(cur, scale_factor); - cur = build_mm(model.projection, cur); + cur = build_mm(model.mm_fc_w, cur); } else if (proj_type == PROJECTOR_TYPE_LFM2) { // pixel unshuffle block diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index d2b7e684a..e081bde87 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -88,6 +88,7 @@ add_test_vision 
"ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M" add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0" add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0" add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0" +add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0" add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M" @@ -108,6 +109,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M" # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working + add_test_vision "ggml-org/GLM-4.6V-Flash-GGUF:Q4_K_M" -p "extract all texts from this image" add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M" add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M" diff --git a/tools/mtmd/tests/test-1-extracted.md b/tools/mtmd/tests/test-1-extracted.md new file mode 100644 index 000000000..a92dcd959 --- /dev/null +++ b/tools/mtmd/tests/test-1-extracted.md @@ -0,0 +1,85 @@ +<|ref|>title<|/ref|><|det|>[[61, 255, 907, 533]]<|/det|> +# MEN WALK ON MOON +ASTRONAUTS LAND ON PLAIN; +COLLECT ROCKS, PLANT FLAG + +<|ref|>text<|/ref|><|det|>[[56, 559, 268, 629]]<|/det|> +Voice From Moon: +Eagle Has Landed' + +<|ref|>text<|/ref|><|det|>[[74, 645, 262, 675]]<|/det|> +EAGLE (the lunar surface, Houston, Truesquily) +Base here, The Eagle has landed. + +<|ref|>text<|/ref|><|det|>[[74, 675, 262, 720]]<|/det|> +BOOTHROOM: Lounge, Truesquily, we enjoy you on the ground. You've got a bunch of guys about to toss bikes. We're breaking again. Thanks a lot. + +<|ref|>text<|/ref|><|det|>[[74, 720, 262, 750]]<|/det|> +TRAVELLING MADE: Time you. BOOTHROOM: You're looking good here. + +<|ref|>text<|/ref|><|det|>[[74, 750, 262, 780]]<|/det|> +TRAVELLING MADE: A very smooth touchdown. BEDROOM: Eagle, you are very far. I'll. (The first sign in the lunar appearance) (Over.) + +<|ref|>text<|/ref|><|det|>[[74, 780, 262, 810]]<|/det|> +TRAVELLING MADE: Eagle, stay for I'll. BOOTHROOM: Bumper and we are you waiting the cue. + +<|ref|>text<|/ref|><|det|>[[74, 810, 262, 830]]<|/det|> +TRAVELLING MADE: Eagle, and service mobility. + +<|ref|>text<|/ref|><|det|>[[74, 830, 262, 850]]<|/det|> +How do you read me? + +<|ref|>text<|/ref|><|det|>[[74, 850, 262, 880]]<|/det|> +TRAVELLING COLUMBIA, he has landed Truesquily. Base, Eagle is at Truesquily. I read you first by. Over. + +<|ref|>text<|/ref|><|det|>[[74, 880, 262, 900]]<|/det|> +COLUMBIA: Yes, I heard the whole thing. + +<|ref|>text<|/ref|><|det|>[[74, 900, 262, 920]]<|/det|> +BOOTHROOM: Well, it's a good show. + +<|ref|>text<|/ref|><|det|>[[74, 920, 262, 940]]<|/det|> +COLUMBIA: Fantastic. + +<|ref|>text<|/ref|><|det|>[[74, 940, 262, 960]]<|/det|> +TRAVELLING MADE: I'll read that. + +<|ref|>text<|/ref|><|det|>[[74, 960, 262, 980]]<|/det|> +APOLLO CONTROL: The most major sky to sky will be for the 23 event, that is at 21 minutes 26 sec- + +<|ref|>text<|/ref|><|det|>[[74, 980, 262, 990]]<|/det|> +tion of lunar descent. + +<|ref|>image<|/ref|><|det|>[[270, 545, 697, 990]]<|/det|> + + +<|ref|>text<|/ref|><|det|>[[715, 559, 911, 629]]<|/det|> +A Powdery Surface +Is Closely Explored + +<|ref|>text<|/ref|><|det|>[[733, 645, 851, 665]]<|/det|> +BY JOHN NOBLE WILFORD + +<|ref|>text<|/ref|><|det|>[[715, 669, 911, 700]]<|/det|> +HOUSTON, Monday, July 21—New hires landed and walked on the moon. 
+ +<|ref|>text<|/ref|><|det|>[[715, 700, 911, 750]]<|/det|> +Two Americans, astronauts of Apollo 11, steered their Eagle-shaped lunar module safely and smoothly to the lunar landing yesterday at 4:17:40 P.M., Eastern day-light time. + +<|ref|>text<|/ref|><|det|>[[715, 750, 911, 780]]<|/det|> +Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the landing team here. + +<|ref|>text<|/ref|><|det|>[[715, 780, 911, 830]]<|/det|> +"Boom, Truesquily! Base here. The Eagle has landed," the first man to reach the moon—Neil Armstrong and his engineer, Capt. Charles E. Alder, of the Jet Propulsion Laboratory, the space agency's rocket and space program manager. + +<|ref|>text<|/ref|><|det|>[[715, 830, 911, 880]]<|/det|> +About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and descended as he pointed his first landing footguard on the lunar crater. + +<|ref|>text<|/ref|><|det|>[[715, 880, 911, 920]]<|/det|> +"That's one small step for man, one giant leap for mankind." + +<|ref|>text<|/ref|><|det|>[[715, 920, 911, 960]]<|/det|> +His first step on the moon came on 10:56:29 P.M., as a television camera recorded the craft's transmitted his every word to an aerial and excited audiences of hundreds of millions of people on earth. + +<|ref|>text<|/ref|><|det|>[[749, 960, 861, 974]]<|/det|> +Testable Slope Test Soil diff --git a/tools/mtmd/tests/test-1-extracted.txt b/tools/mtmd/tests/test-1-extracted.txt new file mode 100644 index 000000000..4fe273e31 --- /dev/null +++ b/tools/mtmd/tests/test-1-extracted.txt @@ -0,0 +1,42 @@ +MEN WALK ON MOON +ASTRONAUTS LAND ON PLAIN; +COLLECT ROCKS, PLANT FLAG + +Voice From Moon: +'Eagle Has Landed' + +A Powder Surface +Is Closely Explored + +By JOHN NOBLE WILFORD +NOVEMBER, Monday, July 21—New York Herald and +wished on the moon. + +Two American astronauts of Apollo 11, steered their +frigate Eagle toward the moon's surface and smoothly to +the lunar landing yesterday at 4:17:40 P.M., Eastern day- +light time. + +Neil A. Armstrong, the 38-year-old civilian commander, +landed on the soft sand of the moon's surface here. + +"Beautiful, Triumph!" he said. "The Eagle has landed." + +The first man to reach the moon—Neil Armstrong and +his co-pilot, Charles E. "Pete" Conrad, 26, of the Pentagon, +brought their ship to rest on a level, rock-strewn plain near +the moon's surface. The two men and two of the three +astronauts on board, Armstrong, Conrad and Edwin E. +Aldrin, 38, of Houston, stepped slowly down the ladder +and descended as he pointed his first full-flaming footpad +at the lunar crater. + +"That's one small step for man, one giant leap for +mankind." + +His first step on the moon came at 10:56:20 P.M., as +a television camera rolled the earth's thousandth line every +second to an aerial and studied audiences of hundreds of +millions of people on earth. + +Textile Slope Test Soil diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py new file mode 100644 index 000000000..674a35001 --- /dev/null +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +""" +Test script to compare llama.cpp mtmd-cli output with HuggingFace reference implementation +for DeepSeek-OCR model using embedding similarity. 
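+
+The script runs llama-mtmd-cli twice (once with the plain "Free OCR." prompt,
+once with the grounding markdown prompt) and scores each output against the
+stored reference transcription via cosine similarity of sentence embeddings.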
+""" + +import argparse +import subprocess +import sys +from pathlib import Path + +from sentence_transformers import SentenceTransformer +from sentence_transformers import util + + +def run_mtmd_deepseek_ocr( + model_path: str, + mmproj_path: str, + image_path: str, + bin_path: str, + prompt: str = "Free OCR." +) -> str: + """ + Run inference using llama.cpp mtmd-cli. + """ + cmd = [ + bin_path, + "-m", model_path, + "--mmproj", mmproj_path, + "--image", image_path, + # "-p", "<|grounding|>Convert the document to markdown.", + "-p", prompt, + "--chat-template", "deepseek-ocr", + "--temp", "0", + "-n", "1024", + # "--verbose" + ] + + print(f"Running llama.cpp command: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + capture_output=True, + text=False, + timeout=300 + ) + + if result.returncode != 0: + stderr = result.stderr.decode('utf-8', errors='replace') + print(f"llama.cpp stderr: {stderr}") + raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}") + + output = result.stdout.decode('utf-8', errors='replace').strip() + print(f"llama.cpp output length: {len(output)} chars") + return output + + +def compute_embedding_similarity(text1: str, text2: str, model_name: str) -> float: + """ + Compute cosine similarity between two texts using embedding model. + """ + print(f"Loading embedding model: {model_name}") + + # Use sentence-transformers for easier embedding extraction + embed_model = SentenceTransformer(model_name) + + print("Computing embeddings...") + embeddings = embed_model.encode([text1, text2], convert_to_numpy=True) + + similarity = util.similarity.cos_sim([embeddings[0]], [embeddings[1]])[0][0] + return float(similarity) + + +def read_expected_output(file_path: str) -> str: + """ + Read expected OCR output from file. 
+ """ + cur_path = Path(__file__).parent + expected_path = str(cur_path / file_path) + with open(expected_path, "r", encoding="utf-8") as f: + return f.read().strip() + + +def main(): + ap = argparse.ArgumentParser(description="Compare llama.cpp and HuggingFace DeepSeek-OCR outputs") + ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-f16.gguf", + help="Path to llama.cpp GGUF model") + ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-f16.gguf", + help="Path to mmproj GGUF file") + ap.add_argument("--image", default="test-1.jpeg", + help="Path to test image") + ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli", + help="Path to llama-mtmd-cli binary") + ap.add_argument("--embedding-model", default="Qwen/Qwen3-Embedding-0.6B", + help="Embedding model for similarity computation") + ap.add_argument("--threshold", type=float, default=0.7, + help="Minimum similarity threshold for pass") + args = ap.parse_args() + + # Validate paths + # script directory + image + mtmd_dir = Path(__file__).parent.parent + args.image = str(mtmd_dir / args.image) + # project directory + llama model + args.llama_model = str(mtmd_dir.parent.parent / args.llama_model) + # project directory + mmproj + args.mmproj = str(mtmd_dir.parent.parent / args.mmproj) + args.llama_bin = str(mtmd_dir.parent.parent / args.llama_bin) + if not Path(args.image).exists(): + print(f"Error: Image not found: {args.image}") + sys.exit(1) + if not Path(args.llama_model).exists(): + print(f"Error: Model not found: {args.llama_model}") + sys.exit(1) + if not Path(args.mmproj).exists(): + print(f"Error: mmproj not found: {args.mmproj}") + sys.exit(1) + + print("=" * 60) + print("DeepSeek-OCR: llama.cpp vs HuggingFace Comparison") + print("=" * 60) + + # Default paths based on your command + + # Run llama.cpp inference + print("\n[2/3] Running llama.cpp implementation...") + llama_free_ocr = run_mtmd_deepseek_ocr( + args.llama_model, + args.mmproj, + args.image, + args.llama_bin + ) + + llama_md_ocr = run_mtmd_deepseek_ocr( + args.llama_model, + args.mmproj, + args.image, + args.llama_bin, + prompt="<|grounding|>Convert the document to markdown." 
+ ) + + expected_free_ocr = read_expected_output("test-1-extracted.txt") + expected_md_ocr = read_expected_output("test-1-extracted.md") + + # Compute similarity + print("\n[3/3] Computing embedding similarity...") + free_ocr_similarity = compute_embedding_similarity( + expected_free_ocr, + llama_free_ocr, + args.embedding_model + ) + + md_ocr_similarity = compute_embedding_similarity( + expected_md_ocr, + llama_md_ocr, + args.embedding_model + ) + + # Results + print("\n" + "=" * 60) + print("RESULTS") + print("=" * 60) + print(f"\nReference Model output:\n{'-' * 40}") + print(expected_free_ocr) + print(f"\nDeepSeek-OCR output:\n{'-' * 40}") + print(llama_free_ocr) + print(f"\n{'=' * 60}") + print(f"Cosine Similarity: {free_ocr_similarity:.4f}") + print(f"Threshold: {args.threshold}") + print(f"Result: {'PASS' if free_ocr_similarity >= args.threshold else 'FAIL'}") + print("=" * 60) + + # Markdown OCR results + print(f"\nReference Model Markdown output:\n{'-' * 40}") + print(expected_md_ocr) + print(f"\nDeepSeek-OCR Markdown output:\n{'-' * 40}") + print(llama_md_ocr) + print(f"\n{'=' * 60}") + print(f"Cosine Similarity (Markdown): {md_ocr_similarity:.4f}") + print(f"Threshold: {args.threshold}") + print(f"Result: {'PASS' if md_ocr_similarity >= args.threshold else 'FAIL'}") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/tools/mtmd/tests/tests-requirements.txt b/tools/mtmd/tests/tests-requirements.txt new file mode 100644 index 000000000..3134d098d --- /dev/null +++ b/tools/mtmd/tests/tests-requirements.txt @@ -0,0 +1,5 @@ +sentence-transformers +transformers +tokenizers +torch +torchvision From c0159f9c1f874da15e94f371d136f5920b4b5335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 25 Mar 2026 22:28:04 +0100 Subject: [PATCH 09/11] common : do not delete old files from the old cache when updating (#21000) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- common/hf-cache.cpp | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp index ce66f6467..6645aceb2 100644 --- a/common/hf-cache.cpp +++ b/common/hf-cache.cpp @@ -504,7 +504,7 @@ static std::string make_old_cache_filename(const std::string & owner, return result; } -static bool migrate_single_file(const fs::path & old_cache, +static void migrate_single_file(const fs::path & old_cache, const std::string & owner, const std::string & repo, const nl::json & node, @@ -513,7 +513,7 @@ static bool migrate_single_file(const fs::path & old_cache, if (!node.contains("rfilename") || !node.contains("lfs") || !node["lfs"].contains("sha256")) { - return false; + return; } std::string path = node["rfilename"]; @@ -536,27 +536,19 @@ static bool migrate_single_file(const fs::path & old_cache, LOG_WRN("%s: %s is orphan, deleting...\n", __func__, etag_path.string().c_str()); fs::remove(etag_path); } - return false; + return; } - bool delete_old_path = false; - if (!file_info) { - LOG_WRN("%s: %s not found in current repo, deleting...\n", __func__, old_filename.c_str()); - delete_old_path = true; + LOG_WRN("%s: %s not found in current repo, ignoring...\n", __func__, old_filename.c_str()); + return; } else if (!sha256.empty() && !file_info->oid.empty() && sha256 != file_info->oid) { - LOG_WRN("%s: %s is not up to date (sha256 mismatch), deleting...\n", __func__, old_filename.c_str()); - delete_old_path = true; + LOG_WRN("%s: %s is not up to date 
(sha256 mismatch), ignoring...\n", __func__, old_filename.c_str()); + return; } std::error_code ec; - if (delete_old_path) { - fs::remove(old_path, ec); - fs::remove(etag_path, ec); - return true; - } - fs::path new_path(file_info->local_path); fs::create_directories(new_path.parent_path(), ec); @@ -566,7 +558,7 @@ static bool migrate_single_file(const fs::path & old_cache, fs::copy_file(old_path, new_path, ec); if (ec) { LOG_WRN("%s: failed to move/copy %s: %s\n", __func__, old_path.string().c_str(), ec.message().c_str()); - return false; + return; } } fs::remove(old_path, ec); @@ -575,8 +567,6 @@ static bool migrate_single_file(const fs::path & old_cache, std::string filename = finalize_file(*file_info); LOG_INF("%s: migrated %s -> %s\n", __func__, old_filename.c_str(), filename.c_str()); - - return true; } void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { From 0a524f240456d7727570043f97757ea2c249003b Mon Sep 17 00:00:00 2001 From: Yihao Wang <42559837+AgainstEntropy@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:19:14 -0700 Subject: [PATCH 10/11] CUDA & CPU: support F32 kernel type for `CONV_TRANSPOSE_2D` (#17094) * Refactor CUDA 2D transpose implementation to support multiple kernel types and improve parameter handling - Introduced a `conv2d_transpose_params` struct for better parameter management. - Updated `conv2d_transpose_kernel` to be templated for different kernel types (float and half). - Modified `ggml_cuda_conv_2d_transpose_p0` to handle both F16 and F32 kernel types. - Enhanced test cases to validate functionality for both kernel types. * Refactor test cases for 2D convolution transpose to support dynamic kernel types - Updated `test_conv_transpose_2d` structure to improve parameter handling by reordering constructor arguments. - Enhanced test case generation to iterate over kernel types, allowing for flexible testing of different configurations. - Removed hardcoded kernel type instances in favor of a loop for better maintainability and scalability. * Refactor ggml_compute_forward_conv_transpose_2d to support both F16 and F32 tensor types. * Refactor conv2d transpose kernel to use a template for kernel type, enhancing flexibility for different data types. Update test cases to include both F16 and F32 tensor types for comprehensive coverage. * Update ggml/src/ggml-cuda/conv2d-transpose.cu Co-authored-by: Aman Gupta * Update ggml/src/ggml-cpu/ggml-cpu.c Co-authored-by: Aman Gupta * Refactor conv2d transpose implementation by removing the conv2d_transpose_params struct and dispatching with direct kernel launch. * Enhance cpu conv2d transpose implementation by introducing a templated kernel type for improved flexibility with F16 and F32 data types. 
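As a standalone illustration of the dispatch pattern described in the notes above (one templated implementation, with the F16/F32 difference resolved at compile time via `if constexpr`), here is a minimal sketch. It is not ggml code: `toy_fp16` and `toy_fp16_to_f32` are invented stand-ins for `ggml_fp16_t` and its conversion helper.

#include <cstdint>
#include <cstdio>
#include <type_traits>
#include <vector>

// Invented stand-in for a 16-bit kernel type (not ggml_fp16_t).
struct toy_fp16 { uint16_t bits; };

// Toy conversion: treat the bits as a 6.10 fixed-point value.
static float toy_fp16_to_f32(toy_fp16 h) { return h.bits / 1024.0f; }

// One templated routine covers both kernel types; the branch below
// disappears at compile time, as in the patch that follows.
template <typename kernel_t>
static float dot(const std::vector<kernel_t> & k, const std::vector<float> & x) {
    float acc = 0.0f;
    for (size_t i = 0; i < k.size() && i < x.size(); ++i) {
        if constexpr (std::is_same_v<kernel_t, toy_fp16>) {
            acc += toy_fp16_to_f32(k[i]) * x[i]; // convert, accumulate in f32
        } else {
            acc += k[i] * x[i];                  // kernel is already f32
        }
    }
    return acc;
}

int main() {
    const std::vector<float>    x    = {1.0f, 2.0f, 3.0f};
    const std::vector<float>    kf32 = {0.5f, 0.5f, 0.5f};
    const std::vector<toy_fp16> kf16 = {{512}, {512}, {512}}; // 512/1024 = 0.5
    std::printf("f32 kernel: %.3f\n", dot(kf32, x)); // prints 3.000
    std::printf("f16 kernel: %.3f\n", dot(kf16, x)); // prints 3.000
    return 0;
}

Both instantiations produce the same result; only the storage type of the kernel differs, which is exactly the property the new F16/F32 test cases in this patch exercise.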
---------

Co-authored-by: Aman Gupta
---
 ggml/src/ggml-cpu/ggml-cpu.c            |  8 ++-
 ggml/src/ggml-cpu/ops.cpp               | 69 ++++++++++++++++++-------
 ggml/src/ggml-cuda/conv2d-transpose.cu  | 64 ++++++++++++++++-------
 ggml/src/ggml-cuda/conv2d-transpose.cuh |  1 +
 tests/test-backend-ops.cpp              | 33 +++++++-----
 5 files changed, 122 insertions(+), 53 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 8b323bd9b..df17cc553 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2871,8 +2871,12 @@ struct ggml_cplan ggml_graph_plan(
                     const int64_t ne11 = node->src[1]->ne[1]; // H
                     const int64_t ne12 = node->src[1]->ne[2]; // Channels In
 
-                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
-                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+                    GGML_ASSERT(node->src[0]->type == GGML_TYPE_F16 || node->src[0]->type == GGML_TYPE_F32);
+                    GGML_ASSERT(node->src[1]->type == GGML_TYPE_F32);
+
+                    cur += ggml_type_size(node->src[0]->type) * ne00 * ne01 * ne02 * ne03;
+                    cur += ggml_type_size(node->src[0]->type) * ne10 * ne11 * ne12;
+
                 } break;
             case GGML_OP_TOP_K:
                 {
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 3f85e531d..d950972c8 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6923,16 +6923,15 @@ void ggml_compute_forward_conv_3d(
     ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type);
 }
 
-// ggml_compute_forward_conv_transpose_2d
-
-void ggml_compute_forward_conv_transpose_2d(
-        const ggml_compute_params * params,
-        ggml_tensor * dst) {
+template <typename kernel_t>
+static void ggml_compute_forward_conv_transpose_2d_impl(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
 
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6943,7 +6942,7 @@ void ggml_compute_forward_conv_transpose_2d(
 
     const int nk = ne00*ne01*ne02*ne03;
 
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == ggml_type_size(src0->type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (ith == 0) {
@@ -6951,12 +6950,12 @@ void ggml_compute_forward_conv_transpose_2d(
 
         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
         {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+            kernel_t * const wdata = (kernel_t *) params->wdata + 0;
 
             for (int64_t i03 = 0; i03 < ne03; i03++) {
                 for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
-                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+                    const kernel_t * const src = (kernel_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+                    kernel_t * dst_data = wdata + i02*ne01*ne00*ne03;
                     for (int64_t i01 = 0; i01 < ne01; i01++) {
                         for (int64_t i00 = 0; i00 < ne00; i00++) {
                             dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
@@ -6968,13 +6967,17 @@ void ggml_compute_forward_conv_transpose_2d(
 
         // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
         {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            kernel_t * const wdata = (kernel_t *) params->wdata + nk;
             for (int i12 = 0; i12 < ne12; i12++) {
                 for (int i11 = 0; i11 < ne11; i11++) {
                     const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
-                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+                    kernel_t * dst_data = wdata + i11*ne10*ne12;
                     for (int i10 = 0; i10 < ne10; i10++) {
-                        dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
+                        if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
+                            dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
+                        } else {
+                            dst_data[i10*ne12 + i12] = src[i10];
+                        }
                     }
                 }
             }
@@ -6996,21 +6999,27 @@ void ggml_compute_forward_conv_transpose_2d(
     const int ip0 = dp*ith;
     const int ip1 = MIN(ip0 + dp, np);
 
-    ggml_fp16_t * const wdata     = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src = wdata + nk;
+    kernel_t * const wdata     = (kernel_t *) params->wdata + 0;
+    kernel_t * const wdata_src = wdata + nk;
 
     for (int i2 = ip0; i2 < ip1; i2++) { // Cout
         float * dst_data = (float *)((char *) dst->data + i2*nb2);
-        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+        kernel_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
         for (int i11 = 0; i11 < ne11; i11++) {
             for (int i10 = 0; i10 < ne10; i10++) {
                 const int i1n = i11*ne10*ne12 + i10*ne12;
                 for (int i01 = 0; i01 < ne01; i01++) {
                     for (int i00 = 0; i00 < ne00; i00++) {
                         float v = 0;
-                        ggml_vec_dot_f16(ne03, &v, 0,
-                                wdata_src + i1n, 0,
-                                wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
+                        if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
+                            ggml_vec_dot_f16(ne03, &v, 0,
+                                    wdata_src + i1n, 0,
+                                    wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
+                        } else {
+                            ggml_vec_dot_f32(ne03, &v, 0,
+                                    wdata_src + i1n, 0,
+                                    wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
+                        }
                         dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
                     }
                 }
@@ -7019,6 +7028,28 @@ void ggml_compute_forward_conv_transpose_2d(
     }
 }
 
+void ggml_compute_forward_conv_transpose_2d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_transpose_2d_impl<ggml_fp16_t>(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_conv_transpose_2d_impl<float>(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_conv_2d_dw
 
 struct ggml_conv_2d_dw_params {
diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cu b/ggml/src/ggml-cuda/conv2d-transpose.cu
index 03224e404..6cbd6f879 100644
--- a/ggml/src/ggml-cuda/conv2d-transpose.cu
+++ b/ggml/src/ggml-cuda/conv2d-transpose.cu
@@ -1,12 +1,20 @@
-#include <algorithm>
-
 #include "conv2d-transpose.cuh"
-#include "ggml.h"
+#include "convert.cuh"
 
-__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
-                                        float * __restrict__ output, const int in_w, const int in_h, const int out_w,
-                                        const int out_h, const int kernel_w, const int kernel_h, const int stride,
-                                        const int c_in, const int c_out, const int batches) {
+template <typename kernel_t>
+static __global__ void conv2d_transpose_kernel(const float * __restrict__ input,
+                                               const kernel_t * __restrict__ kernel,
+                                               float * __restrict__ output,
+                                               const int in_w,
+                                               const int in_h,
+                                               const int out_w,
+                                               const int out_h,
+                                               const int kernel_w,
+                                               const int kernel_h,
+                                               const int stride,
+                                               const int c_in,
+                                               const int c_out,
+                                               const int batches) {
     const int global_idx     = blockIdx.x * blockDim.x + threadIdx.x;
     const int total_elements = out_w * out_h * c_out * batches;
 
@@ -26,24 +34,32 @@
     for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
         for (int kh = 0; kh < kernel_h; ++kh) {
             int in_y = out_y_idx - kh;
-            if (in_y < 0 || in_y % stride) continue;
+            if (in_y < 0 || in_y % stride) {
+                continue;
+            }
             in_y /= stride;
-            if (in_y >= in_h) continue;
+            if (in_y >= in_h) {
+                continue;
+            }
             for (int kw = 0; kw < kernel_w; ++kw) {
                 int in_x = out_x_idx - kw;
-                if (in_x < 0 || in_x % stride) continue;
+                if (in_x < 0 || in_x % stride) {
+                    continue;
+                }
                 in_x /= stride;
-                if (in_x >= in_w) continue;
+                if (in_x >= in_w) {
+                    continue;
+                }
 
                 const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
                 const int kernel_idx =
                     (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;
 
-                float input_val = input[input_idx];
-                half  kern_val  = kernel[kernel_idx];
+                float    input_val = input[input_idx];
+                kernel_t kern_val  = kernel[kernel_idx];
 
-                accumulator += input_val * (float) kern_val;
+                accumulator += input_val * ggml_cuda_cast<float>(kern_val);
             }
         }
     }
@@ -56,11 +72,12 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
     const ggml_tensor * kernel = dst->src[0];
     const ggml_tensor * input  = dst->src[1];
 
-    GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
+    GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
 
     const float * input_data  = (const float *) input->data;
     float *       output_data = (float *) dst->data;
-    const half *  kernel_data = (const half *) kernel->data;
+    const void *  kernel_data = kernel->data;
 
     const int input_w = input->ne[0];
     const int input_h = input->ne[1];
@@ -82,10 +99,17 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
     GGML_ASSERT(ggml_is_contiguous(kernel));
     GGML_ASSERT(ggml_is_contiguous(dst));
 
-    const int total  = (output_w * output_h * channels_out * batches);
+    const int total  = output_w * output_h * channels_out * batches;
    const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;
 
-    conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
-        input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
-        channels_in, channels_out, batches);
+    if (kernel->type == GGML_TYPE_F16) {
+        conv2d_transpose_kernel<half><<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
+            input_data, (const half *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w,
+            kernel_h, stride, channels_in, channels_out, batches);
+
+    } else {
+        conv2d_transpose_kernel<float><<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
+            input_data, (const float *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w,
+            kernel_h, stride, channels_in, channels_out, batches);
+    }
 }
diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cuh b/ggml/src/ggml-cuda/conv2d-transpose.cuh
index c9430b248..72889c5f0 100644
--- a/ggml/src/ggml-cuda/conv2d-transpose.cuh
+++ b/ggml/src/ggml-cuda/conv2d-transpose.cuh
@@ -1,4 +1,5 @@
 #include "common.cuh"
 
 #define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256
+
 void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 82333a3a6..ac284db2b 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4823,28 +4823,33 @@ struct test_conv_transpose_1d : public test_case {
 
 // GGML_OP_CONV_TRANSPOSE_2D
 struct test_conv_transpose_2d : public test_case {
+    // Dimensions
     const std::array<int64_t, 4> ne_input;
     const std::array<int64_t, 4> ne_kernel;
     const int stride;
+    // Types
+    const ggml_type kernel_type;
 
     std::string vars() override {
-        return VARS_TO_STR3(ne_input, ne_kernel, stride);
+        return VARS_TO_STR4(kernel_type, ne_input, ne_kernel, stride);
     }
 
     double max_nmse_err() override {
         return 5e-4; // The default 1e-7 is too small for Vulkan.
     }
 
-    test_conv_transpose_2d(std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
-                           std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
-                           int stride = 1)
-        : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride){}
+    test_conv_transpose_2d(
+        std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
+        std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
+        int stride = 1,
+        ggml_type kernel_type = GGML_TYPE_F16
+    ) : ne_input(ne_input), ne_kernel(ne_kernel), stride(stride), kernel_type(kernel_type) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
         ggml_set_name(input, "input");
 
-        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne_kernel.data());
+        ggml_tensor * kernel = ggml_new_tensor(ctx, kernel_type, 4, ne_kernel.data());
         ggml_set_name(kernel, "kernel");
 
         ggml_tensor * out = ggml_conv_transpose_2d_p0(ctx, kernel, input, stride);
@@ -7704,9 +7709,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
-    test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1));
-    test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
-    test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1));
+    for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+        test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1, kernel_type));
+        test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
+        test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1, kernel_type));
+    }
 
     test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 500, 1, 1}));
     test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 5000, 1, 1}));
@@ -8892,9 +8899,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, false));
     test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true));
 
-    test_cases.emplace_back(new test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1));
-    test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1));
-    test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
+    for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+        test_cases.emplace_back(new test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1, kernel_type));
+        test_cases.emplace_back(new test_conv_transpose_2d({16, 16, 16, 1}, {3, 3, 8, 16}, 1, kernel_type));
+        test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2, kernel_type));
+    }
 
     test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));

From 0fac87b157305eb82a70902327abffbbce25bd3e Mon Sep 17 00:00:00 2001
From: SamareshSingh <97642706+ssam18@users.noreply.github.com>
Date: Thu, 26 Mar 2026 02:14:36 -0500
Subject: [PATCH 11/11] imatrix : fix crash when using --show-statistics with
 zero counts (#19532)

* imatrix: fix crash when using --show-statistics with zero counts

Fixes division by zero that caused floating point exceptions when
processing imatrix files with zero count
values. Added checks to skip zero counts and handle empty
activation vectors.

Fixes #19190

* imatrix: lower log level for zero-count skip message to DBG
---
 tools/imatrix/imatrix.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp
index bbedb159c..fa21f6c9d 100644
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -143,11 +143,20 @@ static void compute_statistics(std::vector<tensor_statistics> & tstats, const st
         activations.reserve(e.values.size());
 
         for (int i = 0; i < n_mat; ++i) {
+            if (e.counts[i] == 0) {
+                LOG_DBG("%s: skipping tensor %s due to zero count at index %d\n", __func__, name.c_str(), i);
+                continue;
+            }
             for (int j = 0; j < row_size; ++j) {
                 activations.push_back(e.values[i*row_size + j] / e.counts[i]);
             }
         }
 
+        if (activations.empty()) {
+            LOG_ERR("%s: all counts are zero for tensor %s, skipping statistics computation\n", __func__, name.c_str());
+            return;
+        }
+
         const float act_total = std::accumulate(activations.begin(), activations.end(), 0.0f);
         const float act_max   = *std::max_element(activations.begin(), activations.end());
         const float act_min   = *std::min_element(activations.begin(), activations.end());
@@ -1142,10 +1151,12 @@ static bool show_statistics(const common_params & params) {
             blk = -1; // not a block layer
         }
 
+        const float entropy_norm = (tstat.elements > 0) ? 100.0f * (tstat.entropy / std::log2(tstat.elements)) : 0.0f;
+
         LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
                 layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract,
                 tstat.mean_sqract, tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy,
-                100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim);
+                entropy_norm, 100.0f * tstat.zd, tstat.cossim);
 
         const float weighted_bias = tstat.elements * tstat.total_sqract;
         const float weighted_zd   = tstat.elements * tstat.zd;
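To make the guard added by this patch concrete, here is a minimal standalone sketch of the same pattern on toy data (invented numbers, not the actual imatrix structures): slices with a zero count are skipped instead of divided by, and an early return covers the all-zero case.

#include <cstdio>
#include <vector>

int main() {
    // Toy stand-ins for per-slice activation sums and event counts.
    const std::vector<float> sums   = {6.0f, 0.0f, 9.0f};
    const std::vector<int>   counts = {3, 0, 3}; // index 1 would divide by zero

    std::vector<float> means;
    for (size_t i = 0; i < sums.size(); ++i) {
        if (counts[i] == 0) {
            std::printf("skipping index %zu: zero count\n", i);
            continue; // as in the patch: skip rather than divide by zero
        }
        means.push_back(sums[i] / counts[i]);
    }

    if (means.empty()) {
        std::printf("all counts are zero, no statistics to compute\n");
        return 0; // as in the patch: bail out before computing statistics
    }

    std::printf("kept %zu of %zu slices; first mean: %.2f\n",
                means.size(), sums.size(), means[0]);
    return 0;
}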