From 5d3a4a7da5e3dd42f5922aba2fe21b520e96e830 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 5 Apr 2026 09:14:02 -0500 Subject: [PATCH 1/7] server : fix logging of build + system info (#21460) This PR changes the logging that occurs at startup of llama-server. Currently, it is redundant (including CPU information twice) and it is missing the build + commit info. --- tools/server/server.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index a7afa7743..b9e320d9c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -108,10 +108,8 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); - LOG_INF("\n"); + LOG_INF("build_info: %s\n", build_info.c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); server_http_context ctx_http; if (!ctx_http.init(params)) { From 761797ffdf2ce3f118e82c663b1ad7d935fbd656 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 5 Apr 2026 20:29:48 +0200 Subject: [PATCH 2/7] ci : use default RISE RISC-V Runners (#21263) --- .github/workflows/build-riscv.yml | 38 +++++++++---------------- .github/workflows/build.yml | 47 ++++++++++++------------------- 2 files changed, 32 insertions(+), 53 deletions(-) diff --git a/.github/workflows/build-riscv.yml b/.github/workflows/build-riscv.yml index 36a3a1155..9733dbaa7 100644 --- a/.github/workflows/build-riscv.yml +++ b/.github/workflows/build-riscv.yml @@ -35,7 +35,7 @@ env: jobs: ubuntu-riscv64-native-sanitizer: - runs-on: RISCV64 + runs-on: ubuntu-24.04-riscv continue-on-error: true @@ -50,17 +50,18 @@ jobs: sudo apt-get update # Install necessary packages - sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache 
git-lfs + sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential wget git-lfs # Set gcc-14 and g++-14 as the default compilers sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 - sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc - sudo ln -sf /usr/bin/g++-14 /usr/bin/g++ - # Install Rust stable version - rustup install stable - rustup default stable + if ! which rustc; then + # Install Rust stable version + sudo apt-get install -y rustup + rustup install stable + rustup default stable + fi git lfs install @@ -73,23 +74,12 @@ jobs: id: checkout uses: actions/checkout@v6 - - name: Setup ccache - run: | - # Unique cache directory per matrix combination - export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}" - mkdir -p "$CCACHE_DIR" - - # Configure ccache - ccache --set-config=max_size=5G - ccache --set-config=compression=true - ccache --set-config=compression_level=6 - ccache --set-config=cache_dir="$CCACHE_DIR" - ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime - ccache --set-config=hash_dir=false - - # Export for subsequent steps - echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV - echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV + # FIXME: Enable when ggml-org/ccache-action works on riscv64 + # - name: ccache + # uses: ggml-org/ccache-action@v1.2.21 + # with: + # key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }} + # save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 491fc0c42..f4ae36756 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -996,7 +996,7 @@ jobs: cmake --build build -j ${env:NUMBER_OF_PROCESSORS} ubuntu-cpu-riscv64-native: - runs-on: RISCV64 + runs-on: ubuntu-24.04-riscv 
steps: - name: Install dependencies @@ -1004,24 +1004,21 @@ jobs: sudo apt-get update # Install necessary packages - sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs + sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential libssl-dev wget git-lfs # Set gcc-14 and g++-14 as the default compilers sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100 - sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc - sudo ln -sf /usr/bin/g++-14 /usr/bin/g++ - # Install Rust stable version - rustup install stable - rustup default stable + if ! which rustc; then + # Install Rust stable version + sudo apt-get install -y rustup + rustup install stable + rustup default stable + fi git lfs install - - name: Clone - id: checkout - uses: actions/checkout@v6 - - name: Check environment run: | uname -a @@ -1031,25 +1028,17 @@ jobs: cmake --version rustc --version - - name: Setup ccache - run: | - # Set unique cache directory for this job - export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native" - mkdir -p "$CCACHE_DIR" + - name: Clone + id: checkout + uses: actions/checkout@v6 - # Configure ccache for optimal performance - ccache --set-config=max_size=5G - ccache --set-config=compression=true - ccache --set-config=compression_level=6 - ccache --set-config=cache_dir="$CCACHE_DIR" - - # Enable more aggressive caching - ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime - ccache --set-config=hash_dir=false - - # Export for subsequent steps - echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV - echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV + # FIXME: Enable when ggml-org/ccache-action works on riscv64 + # - name: ccache + # uses: ggml-org/ccache-action@v1.2.21 + # with: + # key: ubuntu-cpu-riscv64-native + # evict-old-files: 1d + # save: ${{ github.event_name == 'push' && github.ref == 
'refs/heads/master' }} - name: Build id: cmake_build From af76639f728c69c74c873cc45f038aaa17afd09e Mon Sep 17 00:00:00 2001 From: Richard Davison Date: Sun, 5 Apr 2026 23:32:14 +0200 Subject: [PATCH 3/7] model : add HunyuanOCR support (#21395) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * HunyuanOCR: add support for text and vision models - Add HunyuanOCR vision projector (perceiver-based) with Conv2d merge - Add separate HUNYUAN_OCR chat template (content-before-role format) - Handle HunyuanOCR's invalid pad_token_id=-1 in converter - Fix EOS/EOT token IDs from generation_config.json - Support xdrope RoPE scaling type - Add tensor mappings for perceiver projector (mm.before_rms, mm.after_rms, etc.) - Register HunYuanVLForConditionalGeneration for both text and mmproj conversion * fix proper mapping * Update gguf-py/gguf/tensor_mapping.py Co-authored-by: Xuan-Son Nguyen * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen * address comments * update * Fix typecheck * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Xuan-Son Nguyen Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 100 ++++++++++++++++++++++++++++--- gguf-py/gguf/constants.py | 10 ++++ gguf-py/gguf/tensor_mapping.py | 27 +++++++++ src/llama-chat.cpp | 19 ++++++ src/llama-chat.h | 1 + tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-impl.h | 7 +++ tools/mtmd/clip-model.h | 8 ++- tools/mtmd/clip.cpp | 39 ++++++++++++ tools/mtmd/models/hunyuanocr.cpp | 59 ++++++++++++++++++ tools/mtmd/models/models.h | 5 ++ tools/mtmd/mtmd.cpp | 7 +++ 12 files changed, 273 insertions(+), 10 deletions(-) create mode 100644 tools/mtmd/models/hunyuanocr.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 
d4929d6b6..7ba6f6a74 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -11521,13 +11521,50 @@ class LLaDAMoEModel(TextModel): raise ValueError(f"Unprocessed experts: {experts}") -@ModelBase.register("HunYuanDenseV1ForCausalLM") +@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration") class HunYuanModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE + def _get_eod_token_id(self) -> int | None: + """Get the actual end-of-generation token from config (eod_token_id).""" + return self.hparams.get("eod_token_id") + + def _get_eot_token_id(self) -> int | None: + """Get the end-of-turn token from generation_config.json. + This is the first entry in eos_token_id when it's a list.""" + gen_cfg_path = self.dir_model / "generation_config.json" + if gen_cfg_path.is_file(): + with open(gen_cfg_path, encoding="utf-8") as f: + gen_cfg = json.load(f) + eos = gen_cfg.get("eos_token_id") + if isinstance(eos, list) and len(eos) >= 2: + return eos[0] + return None + + def _fix_special_tokens(self): + """Fix EOS/EOT tokens that are incorrect in upstream configs.""" + eod_id = self._get_eod_token_id() + if eod_id is not None: + self.gguf_writer.add_eos_token_id(eod_id) + eot_id = self._get_eot_token_id() + if eot_id is not None: + self.gguf_writer.add_eot_token_id(eot_id) + def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): - self._set_vocab_gpt2() + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab + token_types = None + if (self.hparams.get("pad_token_id") or 0) < 0: + token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask') + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types) + 
special_vocab.add_to_gguf(self.gguf_writer) + self._fix_special_tokens() else: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) @@ -11579,13 +11616,18 @@ class HunYuanModel(TextModel): # FIX for BOS token: Overwrite incorrect id read from config.json if self.hparams['hidden_size'] == 4096: self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token + self._fix_special_tokens() def set_gguf_parameters(self): + # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it + saved_num_experts = self.hparams.pop("num_experts", None) super().set_gguf_parameters() + if saved_num_experts is not None and saved_num_experts > 1: + self.hparams["num_experts"] = saved_num_experts hparams = self.hparams # Rope - if self.rope_parameters.get("rope_type") == "dynamic": + if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"): # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) alpha = self.rope_parameters.get("alpha", 50) @@ -11595,13 +11637,14 @@ class HunYuanModel(TextModel): self.gguf_writer.add_rope_freq_base(scaled_base) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_rope_scaling_factor(1) - # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k - self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length - self.gguf_writer.add_context_length(256 * 1024) # 256k context length + if self.rope_parameters.get("rope_type") == "dynamic": + # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k + 
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length + self.gguf_writer.add_context_length(256 * 1024) # 256k context length - # if any of our assumptions about the values are wrong, something has changed and this may need to be updated - assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ - "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" + # if any of our assumptions about the values are wrong, something has changed and this may need to be updated + assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ + "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name == "lm_head.weight": @@ -11609,9 +11652,48 @@ class HunYuanModel(TextModel): logger.info("Skipping tied output layer 'lm_head.weight'") return + # skip vision tensors for HunyuanVL models + if name.startswith("vit."): + return + yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanOCRVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + # HunyuanOCR uses max_image_size instead of image_size + if "image_size" not in self.hparams_vision: + self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + hparams = self.hparams_vision + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5)) + 
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2)) + self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) + self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if not name.startswith("vit."): + return # skip text tensors + # strip CLS token (row 0) from position embeddings so resize_position_embeddings works + if "position_embedding" in name: + data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd] + yield from super().modify_tensors(data_torch, name, bid) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal + if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): + return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + @ModelBase.register("SmolLM3ForCausalLM") class SmolLM3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.SMOLLM3 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3ebd9de5f..6b1a19a30 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -734,6 +734,7 @@ class MODEL_TENSOR(IntEnum): V_LAYER_OUT_SCALE = auto() V_PRE_NORM = auto() V_POST_NORM = auto() + V_MM_PRE_NORM = auto() # hunyuanocr V_MM_POST_NORM = auto() V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 @@ -769,6 +770,8 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm + V_TOK_IMG_BEGIN = auto() # hunyuanocr + V_TOK_IMG_END = auto() # hunyuanocr V_STD_BIAS = auto() # gemma4 V_STD_SCALE = auto() # gemma4 V_SAM_POS_EMBD = auto() # Deepseek-OCR @@ -1246,6 +1249,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_MM_GATE: 
"mm.gate", MODEL_TENSOR.V_TOK_BOI: "v.boi", MODEL_TENSOR.V_TOK_EOI: "v.eoi", + MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm", + MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin", + MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end", MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4 MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4 # DeepSeek-OCR SAM @@ -1393,6 +1399,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_MM_GATE, MODEL_TENSOR.V_TOK_BOI, MODEL_TENSOR.V_TOK_EOI, + MODEL_TENSOR.V_MM_PRE_NORM, + MODEL_TENSOR.V_TOK_IMG_BEGIN, + MODEL_TENSOR.V_TOK_IMG_END, MODEL_TENSOR.V_STD_BIAS, MODEL_TENSOR.V_STD_SCALE, MODEL_TENSOR.V_SAM_POS_EMBD, @@ -4113,6 +4122,7 @@ class VisionProjectorType: GLM4V = "glm4v" YOUTUVL = "youtuvl" NEMOTRON_V2_VL = "nemotron_v2_vl" + HUNYUANOCR = "hunyuanocr" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a7c7ce464..1c324976c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1359,6 +1359,7 @@ class TensorNameMap: "visual.merger.mlp.{bid}", # qwen2vl "mlp_AR.linear_{bid}", # PaddleOCR-VL "merger.mlp.{bid}", + "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2) ), MODEL_TENSOR.V_MMPROJ_FC: ( @@ -1366,6 +1367,7 @@ class TensorNameMap: "model.vision.linear_proj.linear_proj", # cogvlm "model.projector.layers", # Deepseek-OCR "visual.merger.proj", # glm4v + "vit.perceive.mlp", # HunyuanOCR ), MODEL_TENSOR.V_MMPROJ_MLP: ( @@ -1393,6 +1395,7 @@ class TensorNameMap: "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 "vpm.embeddings.patch_embedding", "model.vision_model.embeddings.patch_embedding", # SmolVLM + "vit.embeddings.patch_embedding", # HunyuanOCR "vision_tower.patch_conv", # pixtral-hf "vision_encoder.patch_conv", # pixtral "vision_model.patch_embedding.linear", # llama 4 @@ -1414,6 +1417,7 @@ class TensorNameMap: "model.vision_tower.embeddings.position_embeddings", # Intern-S1 
"vpm.embeddings.position_embedding", "model.vision_model.embeddings.position_embedding", # SmolVLM + "vit.embeddings.position_embedding", # HunyuanOCR "vision_model.positional_embedding_vlm", # llama 4 "vision_tower.patch_embed.pos_emb", # kimi-vl "visual.pos_embed", # qwen3vl @@ -1425,10 +1429,12 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( "model.image_newline", # Deepseek-OCR + "vit.perceive.image_newline", # HunyuanOCR ), MODEL_TENSOR.V_ENC_EMBD_VSEP: ( "model.view_seperator", # Deepseek-OCR + "vit.perceive.image_sep", # HunyuanOCR ), MODEL_TENSOR.V_ENC_ATTN_QKV: ( @@ -1444,6 +1450,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral @@ -1466,6 +1473,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral @@ -1488,6 +1496,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf 
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral @@ -1504,6 +1513,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + "vit.layers.{bid}.input_layernorm", # HunyuanOCR "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4 @@ -1521,6 +1531,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf @@ -1540,6 +1551,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral @@ -1557,6 +1569,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 + "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral "vision_model.model.layers.{bid}.mlp.fc1", # llama4 @@ -1583,6 +1596,7 
@@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 + "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral "vision_model.model.layers.{bid}.mlp.fc2", # llama4 @@ -1639,6 +1653,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_POST_NORM: ( "visual.merger.post_projection_norm", # glm4v + "vit.perceive.after_rms", # HunyuanOCR ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1806,6 +1821,18 @@ class TensorNameMap: "model.vision.eoi", # cogvlm ), + MODEL_TENSOR.V_MM_PRE_NORM: ( + "vit.perceive.before_rms", # HunyuanOCR + ), + + MODEL_TENSOR.V_TOK_IMG_BEGIN: ( + "vit.perceive.image_begin", # HunyuanOCR + ), + + MODEL_TENSOR.V_TOK_IMG_END: ( + "vit.perceive.image_end", # HunyuanOCR + ), + MODEL_TENSOR.V_STD_BIAS: ( "model.vision_tower.std_bias", # gemma4 ), diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 80a88fade..6554a89b2 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -73,6 +73,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE }, { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE }, { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE }, + { "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR }, { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, @@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_HUNYUAN_MOE; } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) { return LLM_CHAT_TEMPLATE_OPENAI_MOE; + } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) { + return LLM_CHAT_TEMPLATE_HUNYUAN_OCR; } else if (tmpl_contains("<|hy_Assistant|>") && 
tmpl_contains("<|hy_place▁holder▁no▁3|>")) { return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE; } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) { @@ -822,6 +825,22 @@ int32_t llm_chat_apply_template( ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>"; } } + } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) { + // tencent/HunyuanOCR + ss << "<|hy_begin▁of▁sentence|>"; + for (size_t i = 0; i < chat.size(); i++) { + std::string role(chat[i]->role); + if (i == 0 && role == "system") { + ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>"; + continue; + } + + if (role == "user") { + ss << chat[i]->content << "<|hy_User|>"; + } else if (role == "assistant") { + ss << chat[i]->content << "<|hy_Assistant|>"; + } + } } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) { // moonshotai/Kimi-K2-Instruct for (auto message : chat) { diff --git a/src/llama-chat.h b/src/llama-chat.h index 2542f3cc8..13f936a94 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -53,6 +53,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_HUNYUAN_MOE, LLM_CHAT_TEMPLATE_OPENAI_MOE, LLM_CHAT_TEMPLATE_HUNYUAN_DENSE, + LLM_CHAT_TEMPLATE_HUNYUAN_OCR, LLM_CHAT_TEMPLATE_KIMI_K2, LLM_CHAT_TEMPLATE_SEED_OSS, LLM_CHAT_TEMPLATE_GROK_2, diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 675464c6b..6ffdb674d 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -19,6 +19,7 @@ add_library(mtmd models/conformer.cpp models/gemma4v.cpp models/glm4v.cpp + models/hunyuanocr.cpp models/internvl.cpp models/kimivl.cpp models/kimik25.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 5fa487367..1f2f7cfaa 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -148,6 +148,11 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" +// hunyuanocr +#define TN_MM_PRE_NORM "mm.pre_norm.%s" +#define TN_TOK_IMG_BEGIN "mm.image_begin" +#define TN_TOK_IMG_END "mm.image_end" + // deepseek-ocr #define TN_SAM_POS_EMBD "v.sam.pos_embd.%s" #define 
TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s" @@ -266,6 +271,7 @@ enum projector_type { PROJECTOR_TYPE_YOUTUVL, PROJECTOR_TYPE_KIMIK25, PROJECTOR_TYPE_NEMOTRON_V2_VL, + PROJECTOR_TYPE_HUNYUANOCR, PROJECTOR_TYPE_UNKNOWN, }; @@ -306,6 +312,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, { PROJECTOR_TYPE_KIMIK25, "kimik25"}, { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, + { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 70270d6e7..b85c4122e 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -358,7 +358,8 @@ struct clip_model { // MINICPMV projection ggml_tensor * mm_model_pos_embed_k = nullptr; ggml_tensor * mm_model_query = nullptr; - ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_proj_b = nullptr; ggml_tensor * mm_model_kv_proj = nullptr; ggml_tensor * mm_model_attn_q_w = nullptr; ggml_tensor * mm_model_attn_q_b = nullptr; @@ -419,6 +420,11 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; + // hunyuanocr perceiver + ggml_tensor * mm_pre_norm_w = nullptr; + ggml_tensor * mm_img_begin = nullptr; + ggml_tensor * mm_img_end = nullptr; + // deepseek ocr sam ggml_tensor * patch_embed_proj_w = nullptr; ggml_tensor * patch_embed_proj_b = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 12517123e..2faf595a9 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -902,6 +902,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_HUNYUANOCR: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP_NORM: case PROJECTOR_TYPE_LDP: @@ -1408,6 +1412,14 @@ struct clip_model_loader { get_u32(KEY_SAM_N_EMBD, 
hparams.sam_n_embd, true); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); } break; + case PROJECTOR_TYPE_HUNYUANOCR: + { + hparams.n_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); + get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); + hparams.set_warmup_n_tokens(28*28); + } break; case PROJECTOR_TYPE_LFM2A: { // audio preprocessing params @@ -2035,6 +2047,22 @@ struct clip_model_loader { model.mm_boi = get_tensor(TN_TOK_BOI); model.mm_eoi = get_tensor(TN_TOK_EOI); } break; + case PROJECTOR_TYPE_HUNYUANOCR: + { + // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear) + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); + model.mm_model_proj_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias")); + model.mm_pre_norm_w = get_tensor(string_format(TN_MM_PRE_NORM, "weight")); + model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight")); + model.mm_img_begin = get_tensor(TN_TOK_IMG_BEGIN); + model.mm_img_end = get_tensor(TN_TOK_IMG_END); + model.image_newline = get_tensor(TN_IMAGE_NEWLINE); + model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false); + } break; case PROJECTOR_TYPE_JANUS_PRO: { model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); @@ -2584,6 +2612,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: + case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_YOUTUVL: return (img->nx / params.patch_size) / 2; default: @@ -2768,6 +2797,13 @@ int clip_n_output_tokens(const struct 
clip_ctx * ctx, struct clip_image_f32 * im int h = static_cast(std::sqrt(static_cast(n_patches))); n_patches = h * (h + 1) + 1; } break; + case PROJECTOR_TYPE_HUNYUANOCR: + { + int merge = ctx->model.hparams.n_merge; + int ow = (img->nx / patch_size) / merge; + int oh = (img->ny / patch_size) / merge; + n_patches = (ow + 1) * oh + 2; + } break; case PROJECTOR_TYPE_LFM2A: { n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; @@ -3175,6 +3211,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_COGVLM: + case PROJECTOR_TYPE_HUNYUANOCR: { // do nothing } break; @@ -3346,6 +3383,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_KIMIK25: return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_HUNYUANOCR: + return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; case PROJECTOR_TYPE_DEEPSEEKOCR: diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanocr.cpp new file mode 100644 index 000000000..37d1e2b86 --- /dev/null +++ b/tools/mtmd/models/hunyuanocr.cpp @@ -0,0 +1,59 @@ +#include "models.h" + +ggml_cgraph * clip_graph_hunyuanocr::build() { + const int merge = hparams.n_merge; + const int pw = n_patches_x; + const int ph = n_patches_y; + + ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr); + + // perceiver projector + cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1); + + // [C, W*H] -> [W, H, C] for conv2d + cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); + cur = ggml_cont(ctx0, cur); + + // Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1) + cur = ggml_conv_2d(ctx0, model.mm_0_w, 
cur, merge, merge, 0, 0, 1, 1); + if (model.mm_0_b) { + cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0])); + } + cur = ggml_gelu(ctx0, cur); + cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1); + if (model.mm_1_b) { + cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0])); + } + + const int ow = pw / merge; + const int oh = ph / merge; + const int idim = (int)cur->ne[2]; // OC = 4608 + + // append newline along W (dim 0) + ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1); + nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1); + cur = ggml_concat(ctx0, cur, nl, 0); + + // [OW+1, OH, OC] -> [OC, (OW+1)*OH] + cur = ggml_permute(ctx0, cur, 1, 2, 0, 3); + cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh); + + // project to LLM hidden size + cur = build_mm(model.mm_model_proj, cur); + if (model.mm_model_proj_b) { + cur = ggml_add(ctx0, cur, model.mm_model_proj_b); + } + + // wrap with begin/end tokens + cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1); + cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1); + + cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 992eda04b..6f9632b62 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -98,6 +98,11 @@ struct clip_graph_glm4v : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_hunyuanocr : clip_graph { + clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + struct clip_graph_mobilenetv5 : clip_graph { clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git 
a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 35b4396fd..4b6dd44f0 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -406,6 +406,13 @@ struct mtmd_context { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_HUNYUANOCR: + { + // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary + img_beg = "<|hy_place▁holder▁no▁100|>"; + img_end = "<|hy_place▁holder▁no▁101|>"; + image_preproc = std::make_unique(ctx_v); + } break; default: throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj)); } From 58190cc84d846d8575ba26e8486bc29d9fd8ad55 Mon Sep 17 00:00:00 2001 From: anchortense Date: Mon, 6 Apr 2026 09:40:38 +1000 Subject: [PATCH 4/7] llama : correct platform-independent loading of BOOL metadata (#21428) * model-loader : fix GGUF bool array conversion * model-loader : fix remaining GGUF bool pointer uses --- src/llama-impl.cpp | 2 +- src/llama-model-loader.cpp | 5 +++-- tools/mtmd/clip-impl.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 4c0188ee7..b3a94b946 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -128,7 +128,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); - case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + case GGUF_TYPE_BOOL: return ((const int8_t *)data)[i] != 0 ? 
"true" : "false"; default: return format("unknown type %d", type); } } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 3d549cae5..4f821e0dc 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -374,8 +374,9 @@ namespace GGUFMeta { } } else { if (arr_info.gt == GGUF_TYPE_BOOL) { - std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) { - return static_cast(x); + const int8_t * values = (const int8_t *) arr_info.data; + std::transform(values, values + arr_info.length, result.begin(), [](int8_t x) { + return static_cast(x != 0); }); } else { std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 1f2f7cfaa..81b92841c 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -522,7 +522,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); - case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + case GGUF_TYPE_BOOL: return ((const int8_t *)data)[i] != 0 ? 
"true" : "false"; default: return string_format("unknown type %d", type); } } From 25eec6f32713886390debe9b03785aa82f015ce3 Mon Sep 17 00:00:00 2001 From: Yarden Tal Date: Mon, 6 Apr 2026 04:30:25 +0300 Subject: [PATCH 5/7] hexagon: slight optimization for argosrt output init (#21463) --- ggml/src/ggml-hexagon/htp/argsort-ops.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/argsort-ops.c b/ggml/src/ggml-hexagon/htp/argsort-ops.c index 170220e8f..3ec26a4c1 100644 --- a/ggml/src/ggml-hexagon/htp/argsort-ops.c +++ b/ggml/src/ggml-hexagon/htp/argsort-ops.c @@ -164,6 +164,12 @@ static void quicksort_values_indices_desc(float * values, int32_t * indices, int if (i < right) quicksort_values_indices_desc(values, indices, i, right); } +// LUT for ramp initialization of argsort output (first 32 members) +int32_t argosrt_ramp_lut[32] __attribute__((aligned(VLEN))) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +}; + static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) { struct htp_argsort_context * actx = (struct htp_argsort_context *)data; struct htp_ops_context * octx = actx->octx; @@ -205,8 +211,12 @@ static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) { // Padded to 128 bytes. 
size_t values_size = hex_round_up(ne00 * sizeof(float), 128); + size_t num_vec_ind_values = hmx_ceil_div(ne00, VLEN/(sizeof(int32_t))); float * values_buf = (float *) spad; int32_t * indices_buf = (int32_t *) (spad + values_size); + HVX_Vector * indices_buf_vec = (HVX_Vector *) (spad + values_size); + const HVX_Vector ind_init_vec = *(HVX_Vector *)argosrt_ramp_lut; + const HVX_Vector ind_diff_vec = Q6_V_vsplat_R(32); for (uint32_t r = start_row; r < end_row; r++) { uint32_t src_offset = r * nb01; @@ -218,9 +228,11 @@ static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) { hex_l2fetch(src_ptr, ne00 * sizeof(float), ne00 * sizeof(float), 1); hvx_copy_f32_au((uint8_t*)values_buf, src_ptr, ne00); - // Initialize indices - for (uint32_t j = 0; j < ne00; j++) { - indices_buf[j] = j; + // Initialize indices - Start with values 0..31, add 32 for additional vec iterations + HVX_Vector curr_ind_vec = ind_init_vec; + for (uint32_t j_vec = 0; j_vec < num_vec_ind_values; j_vec++) { + indices_buf_vec[j_vec] = curr_ind_vec; + curr_ind_vec = Q6_Vw_vadd_VwVw(curr_ind_vec, ind_diff_vec); } // Sort values and mirror swaps to indices From f51fd36d79a0bbb89c2ddb1f10b10edaa18f3ca0 Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Mon, 6 Apr 2026 18:28:00 +0800 Subject: [PATCH 6/7] sycl : handle other FA case (#21377) --- ggml/src/ggml-sycl/fattn-tile.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml/src/ggml-sycl/fattn-tile.hpp b/ggml/src/ggml-sycl/fattn-tile.hpp index c4d24613a..b4d4e0ae9 100644 --- a/ggml/src/ggml-sycl/fattn-tile.hpp +++ b/ggml/src/ggml-sycl/fattn-tile.hpp @@ -1252,6 +1252,16 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_sycl_context & ctx, ggm return; } + { + constexpr int cols_per_block = ncols2*2; + const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size; + const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc); + launch_fattn, warp_size> + (ctx, 
dst, nwarps, nbytes_shared, nbatch_fa, true, true, false); + return; + } + GGML_ABORT("fatal error"); } From 400ac8e194ba1aa09d07f302681b8cbc8787d5f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 6 Apr 2026 13:52:07 +0300 Subject: [PATCH 7/7] convert : set "add bos" == True for Gemma 4 (#21500) * convert : set "add bos" == True for Gemma 4 * cont : handle old GGUFs --- convert_hf_to_gguf.py | 2 +- src/llama-vocab.cpp | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7ba6f6a74..c1737bb2c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7472,7 +7472,7 @@ class Gemma4Model(Gemma3Model): special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) - self.gguf_writer.add_add_bos_token(False) # already added via the chat template + self.gguf_writer.add_add_bos_token(True) def set_gguf_parameters(self): super().set_gguf_parameters() diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index cb55b46b7..75dbaa91e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2325,6 +2325,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) { add_sep = temp; } + + // workaround for Gemma 4 + // ref: https://github.com/ggml-org/llama.cpp/pull/21500 + if (pre_type == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && !add_bos) { + add_bos = true; + + LLAMA_LOG_WARN("%s: override '%s' to 'true' for Gemma4\n", __func__, kv(LLM_KV_TOKENIZER_ADD_BOS).c_str()); + } } // auto-detect special tokens by text