diff --git a/common/arg.cpp b/common/arg.cpp index 8b68da3f3..5bdc25e15 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -426,6 +426,9 @@ static bool parse_bool_value(const std::string & value) { static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { common_params & params = ctx_arg.params; + // setup log directly from params.verbosity: see tools/cli/cli.cpp + common_log_set_verbosity_thold(params.verbosity); + std::unordered_map> arg_to_options; for (auto & opt : ctx_arg.options) { for (const auto & arg : opt.args) { @@ -634,8 +637,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context )); } - common_log_set_verbosity_thold(params.verbosity); - return true; } @@ -3247,6 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", [](common_params & params) { params.verbosity = INT_MAX; + common_log_set_verbosity_thold(INT_MAX); } )); add_opt(common_arg( @@ -3267,6 +3269,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "(default: %d)\n", params.verbosity), [](common_params & params, int value) { params.verbosity = value; + common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); add_opt(common_arg( diff --git a/common/download.cpp b/common/download.cpp index 61351b555..06e91fb3f 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -458,7 +458,9 @@ static gguf_split_info get_gguf_split_info(const std::string & path) { std::smatch m; std::string prefix = path; - string_remove_suffix(prefix, ".gguf"); + if (!string_remove_suffix(prefix, ".gguf")) { + return {}; + } int index = 1; int count = 1; diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp index ce66f6467..6645aceb2 100644 --- a/common/hf-cache.cpp +++ b/common/hf-cache.cpp @@ -504,7 +504,7 @@ static std::string make_old_cache_filename(const std::string & owner, return result; } -static bool migrate_single_file(const fs::path & old_cache, +static void migrate_single_file(const fs::path & old_cache, const std::string & owner, const std::string & repo, const nl::json & node, @@ -513,7 +513,7 @@ static bool migrate_single_file(const fs::path & old_cache, if (!node.contains("rfilename") || !node.contains("lfs") || !node["lfs"].contains("sha256")) { - return false; + return; } std::string path = node["rfilename"]; @@ -536,27 +536,19 @@ static bool migrate_single_file(const fs::path & old_cache, LOG_WRN("%s: %s is orphan, deleting...\n", __func__, etag_path.string().c_str()); fs::remove(etag_path); } - return false; + return; } - bool delete_old_path = false; - if (!file_info) { - LOG_WRN("%s: %s not found in current repo, deleting...\n", __func__, old_filename.c_str()); - delete_old_path = true; + LOG_WRN("%s: %s not found in current repo, ignoring...\n", __func__, old_filename.c_str()); + return; } else if (!sha256.empty() && !file_info->oid.empty() && sha256 != file_info->oid) { - LOG_WRN("%s: %s is not up to date (sha256 mismatch), deleting...\n", __func__, old_filename.c_str()); - delete_old_path = true; + LOG_WRN("%s: %s is not up to date (sha256 mismatch), ignoring...\n", __func__, old_filename.c_str()); + return; } std::error_code ec; - if (delete_old_path) { - fs::remove(old_path, ec); - fs::remove(etag_path, ec); - return true; - } - fs::path new_path(file_info->local_path); fs::create_directories(new_path.parent_path(), ec); @@ -566,7 +558,7 @@ static bool migrate_single_file(const fs::path & old_cache, fs::copy_file(old_path, new_path, ec); if (ec) { LOG_WRN("%s: failed to move/copy %s: %s\n", __func__, old_path.string().c_str(), ec.message().c_str()); - return false; + return; } } fs::remove(old_path, ec); @@ -575,8 +567,6 @@ static bool migrate_single_file(const fs::path & old_cache, std::string filename = finalize_file(*file_info); LOG_INF("%s: migrated %s -> %s\n", __func__, old_filename.c_str(), filename.c_str()); - - return true; } void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 06365bb49..f377738f8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -947,6 +947,9 @@ class ModelBase: if "thinker_config" in config: # rename for Qwen2.5-Omni config["text_config"] = config["thinker_config"]["text_config"] + if "language_config" in config: + # rename for DeepSeekOCR + config["text_config"] = config["language_config"] if "lfm" in config: # rename for LFM2-Audio config["text_config"] = config["lfm"] @@ -1503,6 +1506,9 @@ class TextModel(ModelBase): if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 res = "kanana2" + if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015": + # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B + res = "f2llmv2" if res is None: logger.warning("\n") @@ -2071,7 +2077,7 @@ class MmprojModel(ModelBase): preprocessor_config: dict[str, Any] global_config: dict[str, Any] - n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"] + n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"] has_vision_encoder: bool = True # by default has_audio_encoder: bool = False @@ -6935,6 +6941,68 @@ class ConformerAudioModel(MmprojModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("DeepseekOCRForCausalLM") +class DeepseekOCRVisionModel(MmprojModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + # calculate proj_scale_factor (used by tinygemma3 test model) + image_seq_length = self.preprocessor_config.get("image_seq_length", 256) + n_per_side = int(image_seq_length ** 0.5) + image_size = self.hparams["image_size"] + patch_size = self.hparams["patch_size"] + proj_scale_factor = (image_size // patch_size) // n_per_side + if proj_scale_factor > 0 and proj_scale_factor != 4: + # we only need to write this if it's not the default value + # in this case, we are converting a test model + self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) + # @bluebread: there's no window_size in config but just add it here anyway + self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14)) + + # SAM configuration + sam_hparams = hparams['sam'] + self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers']) + self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width']) + self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads']) + + def get_vision_config(self) -> dict[str, Any]: + vision_config: dict[str, Any] | None = self.global_config.get("vision_config") + + if not vision_config: + raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found") + + vision_config['sam'] = vision_config['width']['sam_vit_b'] + vision_config.update(vision_config['width']['clip-l-14-224']) + vision_config['hidden_size'] = vision_config['width'] + vision_config['num_heads'] = vision_config['heads'] + vision_config['intermediate_size'] = vision_config['heads'] * 4 + + return vision_config + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".embeddings." in name or 'pos_embed' in name: + return gguf.GGMLQuantizationType.F32 + if ".rel_pos_h" in name or '.rel_pos_w' in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Only process vision-related tensors, skip language model tensors + # Vision components: sam_model, vision_model, projector, image_newline, view_seperator + # Language model components to skip: lm_head, embed_tokens, layers, norm + if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")): + return + + if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"): + name += ".weight" + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Gemma3nForConditionalGeneration") class Gemma3nVisionAudioModel(ConformerAudioModel): has_audio_encoder = True @@ -8280,6 +8348,19 @@ class DeepseekV2Model(TextModel): merge_expert = True + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] + + # special handling for Deepseek OCR + if self.origin_hf_arch == "DeepseekOCRForCausalLM": + self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + # default jinja template + self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}") + def set_vocab(self): try: self._set_vocab_gpt2() @@ -8335,9 +8416,15 @@ class DeepseekV2Model(TextModel): raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") def set_gguf_parameters(self): + is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR) - # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) - self.hparams["num_key_value_heads"] = 1 + if is_ocr: + self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0) + else: + # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + + self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6) super().set_gguf_parameters() hparams = self.hparams @@ -8351,16 +8438,18 @@ class DeepseekV2Model(TextModel): # Default: if no MoE, all layers are dense; if MoE, none are dense first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0 self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace) + kv_lora_rank = hparams.get("kv_lora_rank", 512) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) - self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA - self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) + if not is_ocr: + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(kv_lora_rank) + self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) # MoE parameters (required by C++ code for DEEPSEEK2 arch) # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length @@ -8392,8 +8481,15 @@ class DeepseekV2Model(TextModel): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5 - if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name: + # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5, and DeepSeek-OCR + if ("vision_tower" in name + or "multi_modal_projector" in name + or "mm_projector" in name + or "vision_model" in name + or "image_newline" in name + or "model.projector" in name + or "sam_model" in name + or "view_seperator" in name): return if name.startswith("siglip2.") or name.startswith("merger."): return diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index b31ddcca7..1e8b29fb2 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -154,6 +154,7 @@ models = [ {"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", }, {"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", }, {"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", }, + {"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 728b13753..86e081db4 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3708,8 +3708,12 @@ struct ggml_cplan ggml_graph_plan( const int64_t ne11 = node->src[1]->ne[1]; // H const int64_t ne12 = node->src[1]->ne[2]; // Channels In - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; - cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; + GGML_ASSERT(node->src[0]->type == GGML_TYPE_F16 || node->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(node->src[1]->type == GGML_TYPE_F32); + + cur += ggml_type_size(node->src[0]->type) * ne00 * ne01 * ne02 * ne03; + cur += ggml_type_size(node->src[0]->type) * ne10 * ne11 * ne12; + } break; case GGML_OP_TOP_K: { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 3f85e531d..d950972c8 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6923,16 +6923,15 @@ void ggml_compute_forward_conv_3d( ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type); } -// ggml_compute_forward_conv_transpose_2d - -void ggml_compute_forward_conv_transpose_2d( - const ggml_compute_params * params, - ggml_tensor * dst) { +template +static void ggml_compute_forward_conv_transpose_2d_impl( + const ggml_compute_params * params, + ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; - GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -6943,7 +6942,7 @@ void ggml_compute_forward_conv_transpose_2d( const int nk = ne00*ne01*ne02*ne03; - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); GGML_ASSERT(nb10 == sizeof(float)); if (ith == 0) { @@ -6951,12 +6950,12 @@ void ggml_compute_forward_conv_transpose_2d( // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + kernel_t * const wdata = (kernel_t *) params->wdata + 0; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); - ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; + const kernel_t * const src = (kernel_t *)((char *) src0->data + i03*nb03 + i02*nb02); + kernel_t * dst_data = wdata + i02*ne01*ne00*ne03; for (int64_t i01 = 0; i01 < ne01; i01++) { for (int64_t i00 = 0; i00 < ne00; i00++) { dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; @@ -6968,13 +6967,17 @@ void ggml_compute_forward_conv_transpose_2d( // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + kernel_t * const wdata = (kernel_t *) params->wdata + nk; for (int i12 = 0; i12 < ne12; i12++) { for (int i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); - ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; + kernel_t * dst_data = wdata + i11*ne10*ne12; for (int i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]); + if constexpr (std::is_same_v) { + dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]); + } else { + dst_data[i10*ne12 + i12] = src[i10]; + } } } } @@ -6996,21 +6999,27 @@ void ggml_compute_forward_conv_transpose_2d( const int ip0 = dp*ith; const int ip1 = MIN(ip0 + dp, np); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - ggml_fp16_t * const wdata_src = wdata + nk; + kernel_t * const wdata = (kernel_t *) params->wdata + 0; + kernel_t * const wdata_src = wdata + nk; for (int i2 = ip0; i2 < ip1; i2++) { // Cout float * dst_data = (float *)((char *) dst->data + i2*nb2); - ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; + kernel_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; for (int i11 = 0; i11 < ne11; i11++) { for (int i10 = 0; i10 < ne10; i10++) { const int i1n = i11*ne10*ne12 + i10*ne12; for (int i01 = 0; i01 < ne01; i01++) { for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - ggml_vec_dot_f16(ne03, &v, 0, - wdata_src + i1n, 0, - wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); + if constexpr (std::is_same_v) { + ggml_vec_dot_f16(ne03, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); + } else { + ggml_vec_dot_f32(ne03, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); + } dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; } } @@ -7019,6 +7028,28 @@ void ggml_compute_forward_conv_transpose_2d( } } +void ggml_compute_forward_conv_transpose_2d( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_transpose_2d_impl(params, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_transpose_2d_impl(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_conv_2d_dw struct ggml_conv_2d_dw_params { diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cu b/ggml/src/ggml-cuda/conv2d-transpose.cu index 03224e404..6cbd6f879 100644 --- a/ggml/src/ggml-cuda/conv2d-transpose.cu +++ b/ggml/src/ggml-cuda/conv2d-transpose.cu @@ -1,12 +1,20 @@ -#include - #include "conv2d-transpose.cuh" -#include "ggml.h" +#include "convert.cuh" -__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel, - float * __restrict__ output, const int in_w, const int in_h, const int out_w, - const int out_h, const int kernel_w, const int kernel_h, const int stride, - const int c_in, const int c_out, const int batches) { +template +static __global__ void conv2d_transpose_kernel(const float * __restrict__ input, + const kernel_t * __restrict__ kernel, + float * __restrict__ output, + const int in_w, + const int in_h, + const int out_w, + const int out_h, + const int kernel_w, + const int kernel_h, + const int stride, + const int c_in, + const int c_out, + const int batches) { const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; const int total_elements = out_w * out_h * c_out * batches; @@ -26,24 +34,32 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { for (int kh = 0; kh < kernel_h; ++kh) { int in_y = out_y_idx - kh; - if (in_y < 0 || in_y % stride) continue; + if (in_y < 0 || in_y % stride) { + continue; + } in_y /= stride; - if (in_y >= in_h) continue; + if (in_y >= in_h) { + continue; + } for (int kw = 0; kw < kernel_w; ++kw) { int in_x = out_x_idx - kw; - if (in_x < 0 || in_x % stride) continue; + if (in_x < 0 || in_x % stride) { + continue; + } in_x /= stride; - if (in_x >= in_w) continue; + if (in_x >= in_w) { + continue; + } const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; const int kernel_idx = (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; - float input_val = input[input_idx]; - half kern_val = kernel[kernel_idx]; + float input_val = input[input_idx]; + kernel_t kern_val = kernel[kernel_idx]; - accumulator += input_val * (float) kern_val; + accumulator += input_val * ggml_cuda_cast(kern_val); } } } @@ -56,11 +72,12 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor const ggml_tensor * kernel = dst->src[0]; const ggml_tensor * input = dst->src[1]; - GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32); + GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); const float * input_data = (const float *) input->data; float * output_data = (float *) dst->data; - const half * kernel_data = (const half *) kernel->data; + const void * kernel_data = kernel->data; const int input_w = input->ne[0]; const int input_h = input->ne[1]; @@ -82,10 +99,17 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor GGML_ASSERT(ggml_is_contiguous(kernel)); GGML_ASSERT(ggml_is_contiguous(dst)); - const int total = (output_w * output_h * channels_out * batches); + const int total = output_w * output_h * channels_out * batches; const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE; - conv2d_transpose_kernel<<>>( - input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride, - channels_in, channels_out, batches); + if (kernel->type == GGML_TYPE_F16) { + conv2d_transpose_kernel<<>>( + input_data, (const half *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, + kernel_h, stride, channels_in, channels_out, batches); + + } else { + conv2d_transpose_kernel<<>>( + input_data, (const float *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, + kernel_h, stride, channels_in, channels_out, batches); + } } diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cuh b/ggml/src/ggml-cuda/conv2d-transpose.cuh index c9430b248..72889c5f0 100644 --- a/ggml/src/ggml-cuda/conv2d-transpose.cuh +++ b/ggml/src/ggml-cuda/conv2d-transpose.cuh @@ -1,4 +1,5 @@ #include "common.cuh" #define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256 + void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 59f2838da..4b417a1b5 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4978,6 +4978,7 @@ static struct ggml_tensor * ggml_interpolate_impl( GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT); // TODO: implement antialias for modes other than bilinear GGML_ASSERT(!(mode & GGML_SCALE_FLAG_ANTIALIAS) || (mode & 0xFF) == GGML_SCALE_MODE_BILINEAR); + GGML_ASSERT(a->type == GGML_TYPE_F32); struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); @@ -5323,6 +5324,7 @@ struct ggml_tensor * ggml_flash_attn_ext( GGML_ASSERT(q->ne[3] == v->ne[3]); if (mask) { + GGML_ASSERT(mask->type == GGML_TYPE_F16); GGML_ASSERT(ggml_is_contiguous(mask)); //GGML_ASSERT(ggml_can_repeat_rows(mask, qk)); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 9383644ab..b35c976e8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -326,6 +326,11 @@ class Keys: class Projector: SCALE_FACTOR = "clip.vision.projector.scale_factor" + class SAM: + BLOCK_COUNT = "clip.vision.sam.block_count" + EMBEDDING_LENGTH = "clip.vision.sam.embedding_length" + HEAD_COUNT = "clip.vision.sam.head_count" + class ClipAudio: PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models NUM_MEL_BINS = "clip.audio.num_mel_bins" @@ -434,6 +439,7 @@ class MODEL_ARCH(IntEnum): ARCTIC = auto() DEEPSEEK = auto() DEEPSEEK2 = auto() + DEEPSEEK2OCR = auto() CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() @@ -755,6 +761,22 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm + V_SAM_POS_EMBD = auto() # Deepseek-OCR + V_SAM_PATCH_EMBD = auto() # Deepseek-OCR + V_SAM_PRE_NORM = auto() # Deepseek-OCR + V_SAM_POST_NORM = auto() # Deepseek-OCR + V_SAM_ATTN_POS_H = auto() # Deepseek-OCR + V_SAM_ATTN_POS_W = auto() # Deepseek-OCR + V_SAM_ATTN_QKV = auto() # Deepseek-OCR + V_SAM_ATTN_OUT = auto() # Deepseek-OCR + V_SAM_MLP_LIN_1 = auto() # Deepseek-OCR + V_SAM_MLP_LIN_2 = auto() # Deepseek-OCR + V_SAM_NECK = auto() # Deepseek-OCR + V_SAM_NET_2 = auto() # Deepseek-OCR + V_SAM_NET_3 = auto() # Deepseek-OCR + V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR + V_ENC_EMBD_VSEP = auto() # Deepseek-OCR + # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_EMBD_NORM = auto() @@ -880,6 +902,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK2: "deepseek2", + MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr", MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.GLM4_MOE: "glm4moe", @@ -1199,6 +1222,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_MM_GATE: "mm.gate", MODEL_TENSOR.V_TOK_BOI: "v.boi", MODEL_TENSOR.V_TOK_EOI: "v.eoi", + # DeepSeek-OCR SAM + MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd", + MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd", + MODEL_TENSOR.V_SAM_PRE_NORM: "v.sam.blk.{bid}.pre_ln", + MODEL_TENSOR.V_SAM_POST_NORM: "v.sam.blk.{bid}.post_ln", + MODEL_TENSOR.V_SAM_ATTN_POS_H: "v.sam.blk.{bid}.attn.pos_h", + MODEL_TENSOR.V_SAM_ATTN_POS_W: "v.sam.blk.{bid}.attn.pos_w", + MODEL_TENSOR.V_SAM_ATTN_QKV: "v.sam.blk.{bid}.attn.qkv", + MODEL_TENSOR.V_SAM_ATTN_OUT: "v.sam.blk.{bid}.attn.out", + MODEL_TENSOR.V_SAM_MLP_LIN_1: "v.sam.blk.{bid}.mlp.lin1", + MODEL_TENSOR.V_SAM_MLP_LIN_2: "v.sam.blk.{bid}.mlp.lin2", + MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}", + MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2", + MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", + MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR + MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR # audio (mtmd) # note: all audio tensor names must use prefix "a." or "mm.a." MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", @@ -1265,6 +1304,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_ENC_EMBD_PATCH, MODEL_TENSOR.V_ENC_EMBD_NORM, MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_EMBD_IMGNL, + MODEL_TENSOR.V_ENC_EMBD_VSEP, MODEL_TENSOR.V_ENC_INPUT_NORM, MODEL_TENSOR.V_ENC_ATTN_QKV, MODEL_TENSOR.V_ENC_ATTN_Q, @@ -1317,6 +1358,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_MM_GATE, MODEL_TENSOR.V_TOK_BOI, MODEL_TENSOR.V_TOK_EOI, + MODEL_TENSOR.V_SAM_POS_EMBD, + MODEL_TENSOR.V_SAM_PATCH_EMBD, + MODEL_TENSOR.V_SAM_PRE_NORM, + MODEL_TENSOR.V_SAM_POST_NORM, + MODEL_TENSOR.V_SAM_ATTN_POS_H, + MODEL_TENSOR.V_SAM_ATTN_POS_W, + MODEL_TENSOR.V_SAM_ATTN_QKV, + MODEL_TENSOR.V_SAM_ATTN_OUT, + MODEL_TENSOR.V_SAM_MLP_LIN_1, + MODEL_TENSOR.V_SAM_MLP_LIN_2, + MODEL_TENSOR.V_SAM_NECK, + MODEL_TENSOR.V_SAM_NET_2, + MODEL_TENSOR.V_SAM_NET_3, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_EMBD_NORM, @@ -2612,7 +2666,41 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_V_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + ], + MODEL_ARCH.DEEPSEEK2OCR: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, @@ -3741,6 +3829,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DEEPSEEK2OCR: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.ROPE_FREQS, ], @@ -3938,6 +4030,7 @@ class VisionProjectorType: LIGHTONOCR = "lightonocr" COGVLM = "cogvlm" JANUS_PRO = "janus_pro" + DEEPSEEKOCR = "deepseekocr" LFM2A = "lfm2a" # audio MUSIC_FLAMINGO = "musicflamingo" # audio GLM4V = "glm4v" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 010dfeea1..37b987993 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1218,6 +1218,15 @@ class GGUFWriter: def add_vision_window_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value) + def add_vision_sam_layers_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value) + + def add_vision_sam_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value) + + def add_vision_sam_head_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.SAM.HEAD_COUNT, value) + # audio models def add_clip_audio_projector_type(self, value: str) -> None: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 18131e540..281c1a830 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1344,6 +1344,7 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ_FC: ( "model.connector.modality_projection.proj", # SmolVLM "model.vision.linear_proj.linear_proj", # cogvlm + "model.projector.layers", # Deepseek-OCR "visual.merger.proj", # glm4v ), @@ -1364,6 +1365,7 @@ class TensorNameMap: "vision_model.class_embedding", # llama 4 "model.vision.patch_embedding.cls_embedding", # cogvlm "vision_model.radio_model.model.patch_generator.cls_token.token", # Nemotron Nano v2 VL + "model.vision_model.embeddings.class_embedding", # Deepseek-OCR ), MODEL_TENSOR.V_ENC_EMBD_PATCH: ( @@ -1377,6 +1379,7 @@ class TensorNameMap: "visual.patch_embed.proj", # qwen2vl "vision_tower.patch_embed.proj", # kimi-vl "model.vision.patch_embedding.proj", # cogvlm + "model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP "siglip2.vision_model.embeddings.patch_embedding", "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL ), @@ -1398,10 +1401,19 @@ class TensorNameMap: "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL ), + MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( + "model.image_newline", # Deepseek-OCR + ), + + MODEL_TENSOR.V_ENC_EMBD_VSEP: ( + "model.view_seperator", # Deepseek-OCR + ), + MODEL_TENSOR.V_ENC_ATTN_QKV: ( "visual.blocks.{bid}.attn.qkv", # qwen3vl "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm - "vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5 + "model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP + "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5 "vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL ), @@ -1416,6 +1428,7 @@ class TensorNameMap: "visual.blocks.{bid}.attn.q", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl + "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1434,6 +1447,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated + "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj", ), @@ -1454,6 +1468,7 @@ class TensorNameMap: "visual.blocks.{bid}.attn.v", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1468,6 +1483,7 @@ class TensorNameMap: "visual.blocks.{bid}.norm1", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1) "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm + "model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.layer_norm1", "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL ), @@ -1485,6 +1501,7 @@ class TensorNameMap: "visual.blocks.{bid}.attn.proj", # qwen2vl "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm + "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL ), @@ -1501,6 +1518,7 @@ class TensorNameMap: "visual.blocks.{bid}.norm2", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1) "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm + "model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.layer_norm2", "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL ), @@ -1517,6 +1535,7 @@ class TensorNameMap: "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl "visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1) + "model.vision_model.transformer.layers.{bid}.mlp.fc1", # Deepseek-OCR CLIP "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1", "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL @@ -1541,6 +1560,7 @@ class TensorNameMap: "visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1) "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm + "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2", "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL ), @@ -1560,6 +1580,7 @@ class TensorNameMap: "vision_tower.ln_pre", # pixtral-hf "vision_encoder.ln_pre", # pixtral "vision_model.layernorm_pre", # llama4 + "model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP ), MODEL_TENSOR.V_POST_NORM: ( @@ -1662,6 +1683,58 @@ class TensorNameMap: "model.visual.deepstack_merger_list.{bid}.linear_fc2", # deepstack in qwen3vl ), + MODEL_TENSOR.V_SAM_POS_EMBD: ( + "model.sam_model.pos_embed", + ), + + MODEL_TENSOR.V_SAM_PATCH_EMBD: ( + "model.sam_model.patch_embed.proj", + ), + + MODEL_TENSOR.V_SAM_PRE_NORM: ( + "model.sam_model.blocks.{bid}.norm1", # deepstack in qwen3vl + ), + + MODEL_TENSOR.V_SAM_POST_NORM: ( + "model.sam_model.blocks.{bid}.norm2", # deepstack in qwen3vl + ), + + MODEL_TENSOR.V_SAM_ATTN_POS_H: ( + "model.sam_model.blocks.{bid}.attn.rel_pos_h", + ), + + MODEL_TENSOR.V_SAM_ATTN_POS_W: ( + "model.sam_model.blocks.{bid}.attn.rel_pos_w", + ), + + MODEL_TENSOR.V_SAM_ATTN_QKV: ( + "model.sam_model.blocks.{bid}.attn.qkv", + ), + + MODEL_TENSOR.V_SAM_ATTN_OUT: ( + "model.sam_model.blocks.{bid}.attn.proj", + ), + + MODEL_TENSOR.V_SAM_MLP_LIN_1: ( + "model.sam_model.blocks.{bid}.mlp.lin1", + ), + + MODEL_TENSOR.V_SAM_MLP_LIN_2: ( + "model.sam_model.blocks.{bid}.mlp.lin2", + ), + + MODEL_TENSOR.V_SAM_NECK: ( + "model.sam_model.neck.{bid}", + ), + + MODEL_TENSOR.V_SAM_NET_2: ( + "model.sam_model.net_2", + ), + + MODEL_TENSOR.V_SAM_NET_3: ( + "model.sam_model.net_3", + ), + MODEL_TENSOR.V_MM_POST_FC_NORM: ( "model.vision.linear_proj.norm1", # cogvlm ), diff --git a/scripts/snapdragon/windows/run-completion.ps1 b/scripts/snapdragon/windows/run-completion.ps1 index 8a48d2d74..1221330f2 100644 --- a/scripts/snapdragon/windows/run-completion.ps1 +++ b/scripts/snapdragon/windows/run-completion.ps1 @@ -44,10 +44,14 @@ if ($null -ne $env:NDEV) { $env:GGML_HEXAGON_NDEV=$env:NDEV } +if ($null -ne $env:HB) { + $env:GGML_HEXAGON_HOSTBUF=$env:HB +} + $env:ADSP_LIBRARY_PATH="$basedir\lib" & "$basedir\bin\llama-completion.exe" ` --no-mmap -m $basedir\..\..\gguf\$model ` --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` - --ctx-size 8192 --batch-size 128 -fa on ` + --ctx-size 8192 --batch-size 256 -fa on ` -ngl 99 -no-cnv --device $device $cli_opts diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b05178859..a90518763 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -73,6 +73,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_DEEPSEEK, "deepseek" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, + { LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" }, { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, @@ -1571,6 +1572,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_UP_SHEXP, }; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK2OCR: case LLM_ARCH_MISTRAL4: return { LLM_TENSOR_TOKEN_EMBD, @@ -1579,6 +1581,8 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM, + LLM_TENSOR_ATTN_K, // deepseek-ocr + LLM_TENSOR_ATTN_V, // deepseek-ocr LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_Q_A, LLM_TENSOR_ATTN_Q_B, diff --git a/src/llama-arch.h b/src/llama-arch.h index 9b9eec2f5..4c5b6a1ad 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -77,6 +77,7 @@ enum llm_arch { LLM_ARCH_ARCTIC, LLM_ARCH_DEEPSEEK, LLM_ARCH_DEEPSEEK2, + LLM_ARCH_DEEPSEEK2OCR, LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index c415a998f..78cbc38db 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -49,6 +49,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK }, { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 }, { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 }, + { "deepseek-ocr", LLM_CHAT_TEMPLATE_DEEPSEEK_OCR }, { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 }, @@ -548,6 +549,11 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << LU8("<|Assistant|>"); } + } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_OCR) { + for (auto message : chat) { + // no template + ss << message->content; + } } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) { // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb // EXAONE-3.0-7.8B-Instruct diff --git a/src/llama-chat.h b/src/llama-chat.h index 9ed1db128..ef7dfeceb 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -28,6 +28,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_DEEPSEEK, LLM_CHAT_TEMPLATE_DEEPSEEK_2, LLM_CHAT_TEMPLATE_DEEPSEEK_3, + LLM_CHAT_TEMPLATE_DEEPSEEK_OCR, LLM_CHAT_TEMPLATE_COMMAND_R, LLM_CHAT_TEMPLATE_LLAMA_3, LLM_CHAT_TEMPLATE_CHATGLM_3, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 9a215bb77..11759ae1e 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1516,7 +1516,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( if (!weight_before_ffn) { experts = ggml_mul(ctx0, experts, weights); - cb(cur, "ffn_moe_weighted", il); + cb(experts, "ffn_moe_weighted", il); } ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 42fe2f2ca..034d81988 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1566,7 +1566,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift( // ref: https://github.com/ggml-org/llama.cpp/pull/13870 ? LLAMA_ROPE_TYPE_NEOX : hparams.rope_type; - ggml_tensor * tmp; if (ggml_is_quantized(cur->type)) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 3739a16da..ecac2811f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -484,6 +484,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); @@ -862,8 +864,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_BERT: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); switch (hparams.n_layer) { case 3: @@ -895,8 +895,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); switch (hparams.n_layer) { case 12: @@ -911,8 +909,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_JINA_BERT_V2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); hparams.f_max_alibi_bias = 8.0f; switch (hparams.n_layer) { @@ -924,8 +920,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_JINA_BERT_V3: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); switch (hparams.n_layer) { case 24: @@ -937,8 +931,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_NOMIC_BERT_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { @@ -952,8 +944,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_NEO_BERT: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); if (hparams.n_layer == 28) { type = LLM_TYPE_250M; @@ -962,8 +952,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_EUROBERT: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); if (hparams.n_layer == 12) { type = LLM_TYPE_SMALL; // 0.2B @@ -1027,7 +1015,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { // fall through case LLM_ARCH_QWEN2: { - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break; @@ -1109,7 +1096,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_QWEN3: { - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; @@ -1401,7 +1387,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); //applied only if model converted with --sentence-transformers-dense-modules ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false); @@ -1750,6 +1735,26 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_DEEPSEEK2OCR: + { + // similar to deepseek2, but without MLA + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + + if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { + hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; + } + + switch (hparams.n_layer) { + case 12: type = LLM_TYPE_3B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_PLM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -2198,7 +2203,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); } break; case LLM_ARCH_BAILINGMOE: { @@ -5125,6 +5129,60 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); + // Shared expert branch + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); + } + } + } break; + case LLM_ARCH_DEEPSEEK2OCR: + { + // similar to deepseek2, but without MLA + const int64_t n_ff_exp = hparams.n_ff_exp; + const int64_t n_expert_shared = hparams.n_expert_shared; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + // try to load output.weight, if not found, use token_embd (tied embeddings) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (!output) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + + // norm + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + if (i < (int) hparams.n_layer_dense_lead) { + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + } else { + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + + if (n_expert == 0) { + throw std::runtime_error("n_expert must be > 0"); + } + if (n_expert_used == 0) { + throw std::runtime_error("n_expert_used must be > 0"); + } + + // MoE branch + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); + // Shared expert branch layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0); @@ -8016,7 +8074,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); } - if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { + if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); @@ -8593,6 +8651,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK2OCR: case LLM_ARCH_GLM_DSA: case LLM_ARCH_MISTRAL4: { @@ -8993,6 +9052,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_DEEPSEEK2OCR: case LLM_ARCH_PLM: case LLM_ARCH_CHATGLM: case LLM_ARCH_GRANITE: diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 04e6c7a86..56c9e8197 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -344,7 +344,10 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param quantize &= name.find("attn_rel_b.weight") == std::string::npos; // do not quantize specific multimodal tensors - quantize &= name.find(".position_embd.") == std::string::npos; + quantize &= name.find(".position_embd") == std::string::npos; + quantize &= name.find("sam.patch_embd") == std::string::npos; + quantize &= name.find("sam.pos_embd") == std::string::npos; + quantize &= name.find(".rel_pos") == std::string::npos; return quantize; } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index cd89f65fc..a809076ef 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2188,7 +2188,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else if ( tokenizer_pre == "qwen2" || tokenizer_pre == "deepseek-r1-qwen" || - tokenizer_pre == "kormo") { + tokenizer_pre == "kormo" || + tokenizer_pre == "f2llmv2") { pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; clean_spaces = false; } else if ( @@ -2728,6 +2729,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "[EOS]" // Kimi-K2 || t.first == "<|end_of_text|>" || t.first == "" // smoldocling + || t.first == "<|end▁of▁sentence|>" // deepseek-ocr ) { special_eog_ids.insert(t.second); if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index d437fe29e..ef9c8420e 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -2,6 +2,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B + bool is_ocr = model.arch == LLM_ARCH_DEEPSEEK2OCR; + const bool is_mla = hparams.is_mla(); // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA @@ -54,7 +57,38 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr cb(cur, "attn_norm", il); // self_attention - { + if (is_ocr) { + const int n_embed_head = hparams.n_embd / hparams.n_head(); + const int ocr_rope_type = GGML_ROPE_TYPE_NEOX; + GGML_ASSERT(n_embed_head == n_embd_head_k && n_embed_head == n_embd_head_v); + + ggml_tensor * Qcur = NULL; + ggml_tensor * Kcur = NULL; + ggml_tensor * Vcur = NULL; + + Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Qcur, "q", il); + cb(Kcur, "k", il); + cb(Vcur, "v", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embed_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embed_head, n_head, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embed_head, n_head, n_tokens); + + GGML_ASSERT(fabs(freq_base - 10000.0) < 1e-4); + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_embed_head, ocr_rope_type, 0, freq_base, 1, 0, 1, 0, 0); + cb(Qcur, "q_pe", il); + cb(Kcur, "k_pe", il); + + cur = build_attn(inp_attn_kv, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + else { ggml_tensor * q = NULL; const bool is_lite = model.layers[il].wq; diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 683666b1b..e3f4fdedb 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -57,7 +57,9 @@ #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" - +#define KEY_SAM_N_HEAD "clip.vision.sam.head_count" +#define KEY_SAM_N_BLOCK "clip.vision.sam.block_count" +#define KEY_SAM_N_EMBD "clip.vision.sam.embedding_length" // audio-specific #define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" @@ -99,12 +101,13 @@ #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" -#define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_IMAGE_NEWLINE "v.image_newline" +#define TN_IMAGE_SEPERATOR "v.view_seperator" #define TN_MM_INP_NORM "mm.input_norm.weight" #define TN_MM_INP_NORM_B "mm.input_norm.bias" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 -#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_MM_PROJECTOR "mm.model.fc.%s" // idefics3, deepseekocr #define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral #define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) @@ -143,6 +146,19 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" +// deepseek-ocr +#define TN_SAM_POS_EMBD "v.sam.pos_embd.%s" +#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s" +#define TN_SAM_PRE_NORM "v.sam.blk.%d.pre_ln.%s" +#define TN_SAM_POST_NORM "v.sam.blk.%d.post_ln.%s" +#define TN_SAM_ATTN_POS_H "v.sam.blk.%d.attn.pos_h.%s" +#define TN_SAM_ATTN_POS_W "v.sam.blk.%d.attn.pos_w.%s" +#define TN_SAM_ATTN_QKV "v.sam.blk.%d.attn.qkv.%s" +#define TN_SAM_ATTN_OUT "v.sam.blk.%d.attn.out.%s" +#define TN_SAM_FFN_UP "v.sam.blk.%d.mlp.lin1.%s" +#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s" +#define TN_SAM_NECK "v.sam.neck.%d.%s" +#define TN_SAM_NET "v.sam.net_%d.%s" // (conformer) lfm2 #define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" #define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" @@ -236,6 +252,7 @@ enum projector_type { PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_DEEPSEEKOCR, PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_YOUTUVL, @@ -273,6 +290,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"}, { PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 265a17130..e9c454fe6 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -67,6 +67,11 @@ struct clip_hparams { int32_t n_wa_pattern = 0; std::unordered_set wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) + // deepseek-ocr (sam) + int32_t sam_n_layer = 0; + int32_t sam_n_head = 0; + int32_t sam_n_embd = 0; + // audio int32_t n_mel_bins = 0; // whisper preprocessor int32_t proj_stack_factor = 0; // ultravox @@ -102,6 +107,21 @@ struct clip_hparams { warmup_image_size = n_tok_per_side * patch_size * cur_merge; // TODO: support warmup size for custom token numbers } + // sam vit deepseek-ocr + std::vector global_attn_indices() const { + return { 2, 5, 8, 11 }; + } + bool is_global_attn(int32_t layer) const { + const auto indices = global_attn_indices(); + + for (const auto & idx : indices) { + if (layer == idx) { + return true; + } + } + + return false; + } }; struct clip_layer { @@ -148,6 +168,9 @@ struct clip_layer { ggml_tensor * deepstack_fc2_w = nullptr; ggml_tensor * deepstack_fc2_b = nullptr; + // sam rel_pos + ggml_tensor * rel_pos_w = nullptr; + ggml_tensor * rel_pos_h = nullptr; // lfm2 ggml_tensor * ff_norm_w = nullptr; ggml_tensor * ff_norm_b = nullptr; @@ -240,7 +263,6 @@ struct clip_model { ggml_tensor * post_ln_w; ggml_tensor * post_ln_b; - ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) ggml_tensor * mm_fc_w; ggml_tensor * mm_fc_b; ggml_tensor * mm_ffn_up_w = nullptr; @@ -261,6 +283,8 @@ struct clip_model { ggml_tensor * mm_2_b = nullptr; ggml_tensor * image_newline = nullptr; + ggml_tensor * view_seperator = nullptr; + // Yi type models with mlp+normalization projection ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 @@ -372,6 +396,23 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; + // deepseek ocr sam + ggml_tensor * patch_embed_proj_w = nullptr; + ggml_tensor * patch_embed_proj_b = nullptr; + ggml_tensor * pos_embed = nullptr; + + ggml_tensor * neck_0_w; + ggml_tensor * neck_1_w; + ggml_tensor * neck_1_b; + ggml_tensor * neck_2_w; + ggml_tensor * neck_3_w; + ggml_tensor * neck_3_b; + ggml_tensor * net_2; + ggml_tensor * net_3; + + int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder + + std::vector sam_layers; // lfm2 audio std::array pre_encode_conv_X_w = {nullptr}; std::array pre_encode_conv_X_b = {nullptr}; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 72b9827c4..a8726b018 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -66,6 +66,7 @@ #include "models/qwen3vl.cpp" #include "models/siglip.cpp" #include "models/whisper-enc.cpp" +#include "models/deepseekocr.cpp" #include "models/mobilenetv5.cpp" #include "models/youtuvl.cpp" @@ -920,6 +921,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_LFM2A: { builder = std::make_unique(ctx, img); @@ -1381,6 +1386,17 @@ struct clip_model_loader { hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + hparams.patch_size = 16; + hparams.image_size = 1024; + hparams.warmup_image_size = 1024; + + get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true); + get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true); + get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true); + get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + } break; case PROJECTOR_TYPE_LFM2A: { // audio preprocessing params @@ -1705,7 +1721,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_GLM4V: { - model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight")); model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false); model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); @@ -1817,7 +1833,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_IDEFICS3: { - model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); } break; case PROJECTOR_TYPE_LFM2: { @@ -1932,13 +1948,13 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_LLAMA4: { - model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); } break; case PROJECTOR_TYPE_COGVLM: { - model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight")); model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias")); model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); @@ -1961,6 +1977,42 @@ struct clip_model_loader { model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + model.pos_embed = get_tensor(string_format(TN_SAM_POS_EMBD, "weight")); + model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight")); + model.patch_embed_proj_b = get_tensor(string_format(TN_SAM_PATCH_EMBD, "bias")); + model.sam_layers.resize(model.n_sam_layers); + for (int il = 0; il < model.n_sam_layers; ++il) { + auto & layer = model.sam_layers[il]; + layer.qkv_w = get_tensor(string_format(TN_SAM_ATTN_QKV, il, "weight")); + layer.qkv_b = get_tensor(string_format(TN_SAM_ATTN_QKV, il, "bias")); + layer.o_w = get_tensor(string_format(TN_SAM_ATTN_OUT, il, "weight")); + layer.o_b = get_tensor(string_format(TN_SAM_ATTN_OUT, il, "bias")); + layer.ln_1_w = get_tensor(string_format(TN_SAM_PRE_NORM, il, "weight")); + layer.ln_1_b = get_tensor(string_format(TN_SAM_PRE_NORM, il, "bias")); + layer.ln_2_w = get_tensor(string_format(TN_SAM_POST_NORM, il, "weight")); + layer.ln_2_b = get_tensor(string_format(TN_SAM_POST_NORM, il, "bias")); + layer.rel_pos_h = get_tensor(string_format(TN_SAM_ATTN_POS_H, il, "weight")); + layer.rel_pos_w = get_tensor(string_format(TN_SAM_ATTN_POS_W, il, "weight")); + layer.ff_up_w = get_tensor(string_format(TN_SAM_FFN_UP, il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_SAM_FFN_UP, il, "bias")); + layer.ff_down_w = get_tensor(string_format(TN_SAM_FFN_DOWN, il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_SAM_FFN_DOWN, il, "bias")); + } + model.neck_0_w = get_tensor(string_format(TN_SAM_NECK, 0, "weight")); + model.neck_1_b = get_tensor(string_format(TN_SAM_NECK, 1, "bias")); + model.neck_1_w = get_tensor(string_format(TN_SAM_NECK, 1, "weight")); + model.neck_2_w = get_tensor(string_format(TN_SAM_NECK, 2, "weight")); + model.neck_3_b = get_tensor(string_format(TN_SAM_NECK, 3, "bias")); + model.neck_3_w = get_tensor(string_format(TN_SAM_NECK, 3, "weight")); + model.net_2 = get_tensor(string_format(TN_SAM_NET, 2, "weight")); + model.net_3 = get_tensor(string_format(TN_SAM_NET, 3, "weight")); + model.image_newline = get_tensor(TN_IMAGE_NEWLINE); + model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR); + model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); + model.mm_fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias")); + } break; case PROJECTOR_TYPE_LFM2A: { for (int i : {0, 2, 3, 5, 6}) { @@ -2557,6 +2609,7 @@ struct img_tool { enum resize_algo { RESIZE_ALGO_BILINEAR, RESIZE_ALGO_BICUBIC, + RESIZE_ALGO_BICUBIC_PILLOW, // RESIZE_ALGO_LANCZOS, // TODO }; @@ -2586,6 +2639,9 @@ struct img_tool { case RESIZE_ALGO_BICUBIC: resize_bicubic(src, dst, target_resolution.width, target_resolution.height); break; + case RESIZE_ALGO_BICUBIC_PILLOW: + resize_bicubic_pillow(src, dst, target_resolution.width, target_resolution.height); + break; default: throw std::runtime_error("Unsupported resize algorithm"); } @@ -2605,6 +2661,9 @@ struct img_tool { case RESIZE_ALGO_BICUBIC: resize_bicubic(src, resized_image, new_width, new_height); break; + case RESIZE_ALGO_BICUBIC_PILLOW: + resize_bicubic_pillow(src, resized_image, new_width, new_height); + break; default: throw std::runtime_error("Unsupported resize algorithm"); } @@ -2815,6 +2874,255 @@ private: return true; } + // Bicubic resize function using Pillow's ImagingResample algorithm + // Adapted from https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Resample.c + // + // Key Difference with resize_bicubic: + // 1. Uses separable filtering: horizontal pass followed by vertical pass + // 2. Pre-computes normalized filter coefficients for each output pixel + // 3. Applies convolution using fixed-point integer arithmetic for performance + static bool resize_bicubic_pillow(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + // Fixed-point precision: 22 bits = 32 (int32_t) - 8 (uint8_t pixels) - 2 (headroom for accumulation) + // This allows encoding fractional weights as integers: weight * 2^22 + const int PRECISION_BITS = 32 - 8 - 2; + + // Bicubic filter function with a = -0.5 (Note that GGML/PyTorch takes a = -0.75) + // Returns filter weight for distance x from pixel center + // Support: [-2, 2], meaning the filter influences pixels within 2 units of distance + auto bicubic_filter = [](double x) -> double { + constexpr double a = -0.5; + if (x < 0.0) { + x = -x; + } + if (x < 1.0) { + return ((a + 2.0) * x - (a + 3.0)) * x * x + 1; + } + if (x < 2.0) { + return (((x - 5) * x + 8) * x - 4) * a; + } + return 0.0; // Zero outside [-2, 2] + }; + + // Filter support radius: bicubic extends 2 pixels in each direction + constexpr double filter_support = 2.0; + + // Clipping function for 8-bit values + auto clip8 = [](int val) -> uint8_t { + if (val < 0) return 0; + if (val > 255) return 255; + return static_cast(val); + }; + + // Precompute filter coefficients for ONE dimension (horizontal or vertical) + // + // Parameters: + // inSize - Number of pixels in input dimension (e.g., src_width or src_height) + // outSize - Number of pixels in output dimension (e.g., target_width or target_height) + // bounds - [OUTPUT] Array of size outSize*2 storing input pixel ranges: + // bounds[xx*2+0] = first input pixel index for output pixel xx (xmin) + // bounds[xx*2+1] = number of input pixels for output pixel xx (xcnt) + // weights - [OUTPUT] Array of size outSize*ksize storing fixed-point filter weights: + // kk[xx*ksize + x] = weight for input pixel x contributing to output pixel xx + // + // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel + auto precompute_weights = [&](int inSize, int outSize, + std::vector & bounds, std::vector & weights) -> int { + double support, scale, filterscale; + double center, ww, ss; + int xx, x, ksize, xmin, xmax, xcnt; + + // Calculate scaling factor: ratio of input range to output size + filterscale = scale = (double)inSize / outSize; + // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness + // For downsampling (scale > 1), widen filter to prevent aliasing + if (filterscale < 1.0) { + filterscale = 1.0; + } + + // Determine filter support radius and kernel size + support = filter_support * filterscale; // Widen filter when downsampling + ksize = static_cast(std::ceil(support)) * 2 + 1; // Total pixels in kernel + + std::vector pre_weights(outSize * ksize); // Temporary weights + bounds.resize(outSize * 2); + + // For each output pixel, compute its filter coefficients + for (xx = 0; xx < outSize; xx++) { + // Calculate the center position in input space (pixel-center convention: +0.5) + center = (xx + 0.5) * scale; + ww = 0.0; // Sum of weights for normalization + ss = 1.0 / filterscale; // Scale factor for filter function + + // Determine the range of input pixels that contribute to this output pixel + xmin = static_cast(center - support + 0.5); + if (xmin < 0) { + xmin = 0; + } + + xmax = static_cast(center + support + 0.5); + if (xmax > inSize) { + xmax = inSize; + } + + xcnt = xmax - xmin; + + // Compute filter weights for each contributing input pixel + for (x = 0; x < xcnt; x++) { + // Distance from input pixel center to output pixel center in input space + double w = bicubic_filter((x + xmin - center + 0.5) * ss); + pre_weights[xx * ksize + x] = w; + ww += w; // Accumulate for normalization + } + + // Normalize weights to sum to 1.0 (preserves brightness) + for (x = 0; x < xcnt; x++) { + if (ww != 0.0) { + pre_weights[xx * ksize + x] /= ww; + } + } + + // Zero-pad remaining kernel positions + for (; x < ksize; x++) { + pre_weights[xx * ksize + x] = 0; + } + + // Store input pixel range for this output pixel + bounds[xx * 2 + 0] = xmin; + bounds[xx * 2 + 1] = xcnt; + } + + // Convert floating-point coefficients to fixed-point integers + // Formula: int32 = round(float * 2^PRECISION_BITS) + weights.resize(outSize * ksize); + for (int i = 0; i < outSize * ksize; i++) { + if (pre_weights[i] < 0) { + weights[i] = static_cast(-0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + } else { + weights[i] = static_cast(0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + } + } + + return ksize; + }; + + // Horizontal resampling pass + // Resizes width from imIn.nx to imOut.nx, preserving height + auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int ksize, const std::vector & bounds, const std::vector & weights) { + imOut.ny = imIn.ny; + imOut.buf.resize(3 * imOut.nx * imOut.ny); + + // Process each row independently + for (int yy = 0; yy < imOut.ny; yy++) { + // For each output pixel in this row + for (int xx = 0; xx < imOut.nx; xx++) { + // Get the range of input pixels and filter coefficients + int xmin = bounds[xx * 2 + 0]; // First input pixel index + int xcnt = bounds[xx * 2 + 1]; // Number of input pixels + + // Initialize accumulators for RGB channels with rounding bias (0.5 in fixed-point) + int32_t ss0 = 1 << (PRECISION_BITS - 1); + int32_t ss1 = 1 << (PRECISION_BITS - 1); + int32_t ss2 = 1 << (PRECISION_BITS - 1); + + // Convolve: sum weighted input pixels + for (int x = 0; x < xcnt; x++) { + int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3; + ss0 += static_cast(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel + ss1 += static_cast(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel + ss2 += static_cast(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel + } + + // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255] + int dst_idx = (yy * imOut.nx + xx) * 3; + imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); + imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); + imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + } + } + }; + + // Vertical resampling pass + // Resizes height from imIn.ny to imOut.ny, preserving width + auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut, + int ksize, const std::vector & bounds, const std::vector & weight) { + imOut.nx = imIn.nx; + imOut.buf.resize(3 * imOut.nx * imOut.ny); + + // For each output row + for (int yy = 0; yy < imOut.ny; yy++) { + // Get the range of input rows and filter coefficients + int ymin = bounds[yy * 2 + 0]; // First input row index + int ycnt = bounds[yy * 2 + 1]; // Number of input rows + + // Process each column in this output row + for (int xx = 0; xx < imOut.nx; xx++) { + // Initialize accumulators for RGB channels with rounding bias + int32_t ss0 = 1 << (PRECISION_BITS - 1); + int32_t ss1 = 1 << (PRECISION_BITS - 1); + int32_t ss2 = 1 << (PRECISION_BITS - 1); + + // Convolve: sum weighted input pixels vertically + for (int y = 0; y < ycnt; y++) { + int src_idx = ((y + ymin) * imIn.nx + xx) * 3; + ss0 += static_cast(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel + ss1 += static_cast(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel + ss2 += static_cast(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel + } + + // Convert back from fixed-point and clamp to [0,255] + int dst_idx = (yy * imOut.nx + xx) * 3; + imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS); + imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS); + imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS); + } + } + }; + + // Main resampling logic using separable two-pass approach + const int src_width = img.nx; + const int src_height = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + + bool need_horizontal = (target_width != src_width); + bool need_vertical = (target_height != src_height); + + // Precompute filter coefficients for both dimensions + std::vector bounds_horiz, bounds_vert; + std::vector weights_horiz, weights_vert; + int ksize_horiz = 0, ksize_vert = 0; + + if (need_horizontal) { + ksize_horiz = precompute_weights(src_width, target_width, bounds_horiz, weights_horiz); + } + + if (need_vertical) { + ksize_vert = precompute_weights(src_height, target_height, bounds_vert, weights_vert); + } + + // Perform two-pass resampling + if (need_horizontal && need_vertical) { + // Both horizontal and vertical + clip_image_u8 temp; + temp.nx = target_width; + resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz); + resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert); + } else if (need_horizontal) { + // Only horizontal + resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz); + } else if (need_vertical) { + // Only vertical + resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert); + } else { + // No resizing needed - direct copy + dst.buf = img.buf; + } + + return true; + } + static inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); } @@ -3581,6 +3889,89 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } } } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + const std::vector native_resolutions = { + /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */ + }; + // original image size + const int orig_w = original_size.width; + const int orig_h = original_size.height; + const int orig_area = orig_h * orig_w; + std::array color; + + for (int i = 0; i < 3; i++) { + color[i] = static_cast(params.image_mean[i] * 255.0f); + } + + size_t mode_i = 0; + int min_diff = orig_area; + + for (size_t i = 0; i < native_resolutions.size(); i++) { + int r = native_resolutions[i]; + if (std::abs(orig_area - r * r) < min_diff) { + mode_i = i; + min_diff = std::abs(orig_area - r * r); + } + } + + /* Native Resolution (Base/Large) */ + const int image_size = native_resolutions[mode_i]; + + // Resize maintaining an aspect ratio, then pad to square + float scale = std::min( + static_cast(image_size) / orig_w, + static_cast(image_size) / orig_h + ); + int new_w = static_cast(orig_w * scale); + int new_h = static_cast(orig_h * scale); + + clip_image_u8_ptr scaled_img(clip_image_u8_init()); + img_tool::resize(*img, *scaled_img, clip_image_size{new_w, new_h}, + img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true, color); + + // Use mean color for padding + unsigned char pad_r = static_cast(params.image_mean[0] * 255.0f); + unsigned char pad_g = static_cast(params.image_mean[1] * 255.0f); + unsigned char pad_b = static_cast(params.image_mean[2] * 255.0f); + + // Pad to image_size × image_size (center padding) + clip_image_u8_ptr padded_img(clip_image_u8_init()); + padded_img->nx = image_size; + padded_img->ny = image_size; + padded_img->buf.resize(image_size * image_size * 3); // black padding + + // Fill with mean color + for (int i = 0; i < image_size * image_size; ++i) + { + padded_img->buf[i * 3 + 0] = pad_r; + padded_img->buf[i * 3 + 1] = pad_g; + padded_img->buf[i * 3 + 2] = pad_b; + } + + // Calculate padding offsets (center the image) + int pad_x = (image_size - new_w) / 2; + int pad_y = (image_size - new_h) / 2; + + // Copy scaled image into padded canvas + for (int y = 0; y < new_h; ++y){ + for (int x = 0; x < new_w; ++x){ + int src_idx = (y * new_w + x) * 3; + int dst_idx = ((y + pad_y) * image_size + (x + pad_x)) * 3; + padded_img->buf[dst_idx + 0] = scaled_img->buf[src_idx + 0]; + padded_img->buf[dst_idx + 1] = scaled_img->buf[src_idx + 1]; + padded_img->buf[dst_idx + 2] = scaled_img->buf[src_idx + 2]; + } + } + + // Normalize and output + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + + res_imgs->grid_x = 1; + res_imgs->grid_y = 1; + } break; default: LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type()); @@ -3812,6 +4203,18 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { n_patches += 2; // for BOI and EOI token embeddings } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + // SAM encoder applies two stride-2 convolutions (net_2 and net_3) + // which reduces spatial dimensions by 4x in each direction (16x total) + // E.g., 64x64 -> 16x16 patches + n_patches /= 16; + + // build_global_local_features adds image newlines and view separator + // Formula: h*(w+1) + 1 where h = w = sqrt(n_patches) + int h = static_cast(std::sqrt(static_cast(n_patches))); + n_patches = h * (h + 1) + 1; + } break; case PROJECTOR_TYPE_LFM2A: { n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; @@ -4169,6 +4572,30 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } set_input_i32("patches", patches); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR: + { + GGML_ASSERT(pos_w == pos_h); + + const int window = hparams.attn_window_size; + const int pos = pos_w; + std::vector rel_pos_indices_local(window * window); + std::vector rel_pos_indices_global(pos * pos); + + for (int q = 0; q < window; q++) { + for (int k = 0; k < window; k++) { + rel_pos_indices_local[q * window + k] = q - k + window - 1; + } + } + + for (int q = 0; q < pos; q++) { + for (int k = 0; k < pos; k++) { + rel_pos_indices_global[q * pos + k] = q - k + pos - 1; + } + } + + set_input_i32("rel_pos_indices_local", rel_pos_indices_local); + set_input_i32("rel_pos_indices_global", rel_pos_indices_global); + } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3NV: case PROJECTOR_TYPE_IDEFICS3: @@ -4530,7 +4957,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_GEMMA3NV: return ctx->model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: - return ctx->model.projection->ne[1]; + return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_MUSIC_FLAMINGO: @@ -4551,6 +4978,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; + case PROJECTOR_TYPE_DEEPSEEKOCR: + return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; case PROJECTOR_TYPE_GLM4V: diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp new file mode 100644 index 000000000..b1f6ead5b --- /dev/null +++ b/tools/mtmd/models/deepseekocr.cpp @@ -0,0 +1,324 @@ +#include "models.h" + +// Implementation based on approach suggested by Acly +// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 +static ggml_tensor * window_partition(ggml_context * ctx0, ggml_tensor * x, const int window) { + auto [c, w, h, b] = x->ne; + // same as + // x = ggml_win_part(m, x, window); + // x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]); + + const int64_t px = (window - w % window) % window; + const int64_t py = (window - h % window) % window; + const int64_t npw = (w + px) / window; + const int64_t nph = (h + py) / window; + + ggml_tensor * cur = x; + if (px > 0 || py > 0) { + cur = ggml_pad(ctx0, cur, 0, static_cast(px), static_cast(py), 0); + } + cur = ggml_reshape_4d(ctx0, cur, c * window, npw, window, nph * b); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_4d(ctx0, cur, c, window, window, npw * nph * b); + return cur; +} + +// Implementation based on approach suggested by Acly +// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 +static ggml_tensor * window_unpartition(ggml_context * ctx0, + ggml_tensor * x, + const int w, + const int h, + const int window) { + const int64_t c = x->ne[0]; + // same as + // x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]); + // x = ggml_win_unpart(m, x, w, h, window); + + const int64_t px = (window - w % window) % window; + const int64_t py = (window - h % window) % window; + const int64_t npw = (w + px) / window; + const int64_t nph = (h + py) / window; + + const int64_t b = x->ne[3] / (npw * nph); + ggml_tensor * cur = x; + cur = ggml_reshape_4d(ctx0, cur, c * window, window, npw, nph * b); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_4d(ctx0, cur, c, w + px, h + py, b); + cur = ggml_view_4d(ctx0, cur, cur->ne[0], w, h, cur->ne[3], cur->nb[1], cur->nb[2], cur->nb[3], 0); + cur = ggml_cont(ctx0, cur); + return cur; +} + +static ggml_tensor * get_rel_pos(ggml_context * ctx0, + ggml_tensor * rel_pos, // [L, C] + ggml_tensor * indices, // [q_size, k_size] + const int q_size, + const int k_size) { + const int64_t C = rel_pos->ne[0]; // channels + const int64_t L = rel_pos->ne[1]; // length + + GGML_ASSERT(indices != nullptr); + GGML_ASSERT(indices->type == GGML_TYPE_I32); + GGML_ASSERT(indices->ne[0] == k_size); + GGML_ASSERT(indices->ne[1] == q_size); + + const auto max_rel_dist = 2 * std::max(q_size, k_size) - 1; + ggml_tensor * cur = rel_pos; + + if (max_rel_dist != L) { + // Linear interpolation + const int64_t ne0 = cur->ne[0]; + const int64_t ne1 = cur->ne[1]; + const int64_t ne2 = cur->ne[2]; + const int64_t ne3 = cur->ne[3]; + + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)), ne1, 1, ne0 * ne2 * ne3); + cur = ggml_reshape_4d( + ctx0, ggml_interpolate(ctx0, cur, max_rel_dist, 1, ne0 * ne2 * ne3, 1, GGML_SCALE_MODE_BILINEAR), + max_rel_dist, ne0, ne2, ne3); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)); + } + + // Flatten indices to 1D for ggml_get_rows + const int qk = q_size * k_size; + + cur = ggml_reshape_3d(ctx0, ggml_get_rows(ctx0, cur, ggml_reshape_1d(ctx0, indices, qk)), C, k_size, q_size); + + return cur; // [C, k_size, q_size] +} + +ggml_cgraph * clip_graph_deepseekocr::build() { + // patch embedding + ggml_tensor * inp_raw = build_inp_raw(); + + ggml_tensor * sam_out; + // Building SAM + { + const int n_embd = hparams.sam_n_embd; + const int n_layer = hparams.sam_n_layer; + const int n_heads = hparams.sam_n_head; + const int d_heads = n_embd / n_heads; + const int window = hparams.attn_window_size; + + ggml_tensor * inpL; + + inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); + inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); + inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); + + ggml_tensor * rel_pos_indices_local; + ggml_tensor * rel_pos_indices_global; + + rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); + rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); + ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); + ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); + ggml_set_input(rel_pos_indices_local); + ggml_set_input(rel_pos_indices_global); + + ggml_tensor * cur; + const auto tgt_size = inpL->ne[1]; + const auto str_size = model.pos_embed->ne[1]; + + if (str_size != tgt_size) { + ggml_tensor * old_pos_embed = nullptr; + old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); + ggml_tensor * new_pos_embed = + ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC); + new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); + cur = ggml_add(ctx0, inpL, new_pos_embed); + } else { + cur = ggml_add(ctx0, inpL, model.pos_embed); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.sam_layers[il]; + ggml_tensor * shortcut = cur; + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + + const int64_t w0 = cur->ne[1]; + const int64_t h0 = cur->ne[2]; + + ggml_tensor * indices; + + if (hparams.is_global_attn(il)) { + indices = rel_pos_indices_global; + } else { + // local attention layer - apply window partition + cur = window_partition(ctx0, cur, window); + indices = rel_pos_indices_local; + } + + const int64_t W = cur->ne[1]; + const int64_t H = cur->ne[2]; + // self-attention + { + const int B = cur->ne[3]; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape + cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); + + ggml_tensor * Q; + ggml_tensor * K; + ggml_tensor * V; + + Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]); + Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B); + + K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]); + K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B); + + V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]); + V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B); + + ggml_tensor * mask; + ggml_tensor * rw; + ggml_tensor * rh; + ggml_tensor * qr; + + rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] + rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] + qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); + qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); + + rw = ggml_mul_mat(ctx0, rw, + ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] + rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] + rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B); + rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B); + rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H] + rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B); + mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W] + mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B); + mask = ggml_cast(ctx0, mask, GGML_TYPE_F16); + + const float scale = 1.0f / sqrtf(static_cast(d_heads)); + + cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, + il); // [B, H*W, n_embd] + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); + } + + if (hparams.is_global_attn(il) == false) { + // local attention layer - reverse window partition + cur = window_unpartition(ctx0, cur, w0, h0, window); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, shortcut); + + ggml_tensor * inpFF = cur; + + // layernorm2 + cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + + // ffn + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, cur, inpFF); + cb(cur, "sam_layer_out", il); + } + + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); + cb(cur, "sam_output", -1); + + ggml_build_forward_expand(gf, cur); + sam_out = cur; + } + + ggml_tensor * clip_out; + // Building DS-OCR CLIP + { + ggml_tensor * inp; + + inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out)); + inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + ggml_tensor * new_pos_embd = + ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings)); + + int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] + const auto tgt_size = static_cast(std::sqrt(inp->ne[1])); + const auto src_size = static_cast(std::sqrt(n_pos - 1)); + + if (tgt_size != src_size) { + ggml_tensor * old_pos_embd; + ggml_tensor * cls_tok; + + old_pos_embd = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], src_size * src_size, + ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0); + cls_tok = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], 1, + ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size); + new_pos_embd = ggml_interpolate(ctx0, old_pos_embd, tgt_size, tgt_size, new_pos_embd->ne[0], 1, + GGML_SCALE_MODE_BICUBIC); + new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1); + new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1); + n_pos = tgt_size * tgt_size + 1; + } + + // add CLS token + inp = ggml_concat(ctx0, model.class_embedding, inp, 1); + + // for selecting learned pos embd, used by ViT + ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32); + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions); + + ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr); + + ggml_build_forward_expand(gf, cur); + clip_out = cur; + } + + const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; + + sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); + sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); + clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); + + ggml_tensor * cur; + cur = ggml_concat(ctx0, clip_out, sam_out, 0); + cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); + cur = ggml_cont(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); + const auto w = h; + const auto n_dim = cur->ne[0]; + + ggml_tensor * imgnl; + ggml_tensor * vs; + + imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); + vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) + cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); + cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); + cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) + + cb(cur, "dsocr_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index 9dbb162c5..623d2e384 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -97,7 +97,7 @@ ggml_cgraph * clip_graph_glm4v::build() { // FC projector { - cur = build_mm(model.projection, cur); + cur = build_mm(model.mm_fc_w, cur); // default LayerNorm (post_projection_norm) cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); cur = ggml_gelu_erf(ctx0, cur); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index aff222c71..5705d7f21 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -77,6 +77,11 @@ struct clip_graph_whisper_enc : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_deepseekocr : clip_graph { + clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + struct clip_graph_conformer : clip_graph { clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/models/siglip.cpp b/tools/mtmd/models/siglip.cpp index 9dafa35ea..7ef98eed0 100644 --- a/tools/mtmd/models/siglip.cpp +++ b/tools/mtmd/models/siglip.cpp @@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_siglip::build() { // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 const int scale_factor = model.hparams.n_merge; cur = build_patch_merge_permute(cur, scale_factor); - cur = build_mm(model.projection, cur); + cur = build_mm(model.mm_fc_w, cur); } else if (proj_type == PROJECTOR_TYPE_LFM2) { // pixel unshuffle block diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index d2b7e684a..e081bde87 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -88,6 +88,7 @@ add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M" add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0" add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0" add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0" +add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0" add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M" @@ -108,6 +109,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M" # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working + add_test_vision "ggml-org/GLM-4.6V-Flash-GGUF:Q4_K_M" -p "extract all texts from this image" add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M" add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"