diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt new file mode 100644 index 000000000..6c53ce0e4 --- /dev/null +++ b/app/CMakeLists.txt @@ -0,0 +1,20 @@ +set(TARGET llama-app) + +add_executable(${TARGET} llama.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama) + +target_link_libraries(${TARGET} PRIVATE + llama-server-impl + llama-cli-impl + llama-completion-impl + llama-bench-impl + llama-batched-bench-impl + llama-fit-params-impl + llama-quantize-impl + llama-perplexity-impl +) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/app/llama.cpp b/app/llama.cpp new file mode 100644 index 000000000..e149975d2 --- /dev/null +++ b/app/llama.cpp @@ -0,0 +1,86 @@ +#include "build-info.h" + +#include +#include +#include + +// visible +int llama_server(int argc, char ** argv); +int llama_cli(int argc, char ** argv); + +// hidden +int llama_completion(int argc, char ** argv); +int llama_bench(int argc, char ** argv); +int llama_batched_bench(int argc, char ** argv); +int llama_fit_params(int argc, char ** argv); +int llama_quantize(int argc, char ** argv); +int llama_perplexity(int argc, char ** argv); + +static int help(int argc, char ** argv); +static int version(int argc, char ** argv); + +struct command { + const char * name; + const char * desc; + std::vector aliases; + bool hidden; + int (*func)(int, char **); +}; + +static const command cmds[] = { + {"serve", "HTTP API server", {"server"}, false, llama_server }, + {"cli", "Command-line interactive interface", {"client"}, false, llama_cli }, + {"completion", "Text completion", {"complete"}, true, llama_completion }, + {"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench }, + {"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench}, + {"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params }, + {"quantize", "Quantize a model", {}, true, llama_quantize }, + {"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity }, + {"version", "Show version", {}, true, version }, + {"help", "Show available commands", {}, true, help }, +}; + +static int version(int argc, char ** argv) { + printf("%s\n", llama_build_info()); + return 0; +} + +static int help(int argc, char ** argv) { + const bool show_all = argc >= 2 && std::string(argv[1]) == "all"; + + printf("Usage: llama [options]\n\nAvailable commands:\n"); + + for (const auto & cmd : cmds) { + if (show_all || !cmd.hidden) { + printf(" %-15s %s\n", cmd.name, cmd.desc); + } + } + printf("\nRun 'llama --help' for command-specific usage.\n"); + + return 0; +} + +static bool matches(const std::string & arg, const command & cmd) { + if (arg == cmd.name) { + return true; + } + for (const auto & alias : cmd.aliases) { + if (arg == alias) { + return true; + } + } + return false; +} + +int main(int argc, char ** argv) { + const std::string arg = argc >= 2 ? argv[1] : "help"; + + for (const auto & cmd : cmds) { + if (matches(arg, cmd)) { + return cmd.func(argc - 1, argv + 1); + } + } + + fprintf(stderr, "error: unknown command '%s'\n", arg.c_str()); + return 1; +} diff --git a/common/arg.cpp b/common/arg.cpp index 558c9697a..be561114c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3593,6 +3593,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.draft.p_min = std::stof(value); } ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN")); + add_opt(common_arg( + {"--spec-draft-backend-sampling"}, + {"--no-spec-draft-backend-sampling"}, + string_format("offload draft sampling to the backend (default: %s)", + params.speculative.draft.backend_sampling ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.speculative.draft.backend_sampling = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING")); add_opt(common_arg( {"--spec-draft-device", "-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" diff --git a/common/common.h b/common/common.h index 23bc9d631..1ffc4095d 100644 --- a/common/common.h +++ b/common/common.h @@ -306,6 +306,8 @@ struct common_params_speculative_draft { float p_split = 0.1f; // speculative decoding split probability float p_min = 0.0f; // minimum speculative decoding probability (greedy) + bool backend_sampling = true; // offload draft sampling to the backend (default: on) + common_params_model mparams; llama_context * ctx_tgt = nullptr; diff --git a/common/speculative.cpp b/common/speculative.cpp index 723db5cf5..dfbe2bdb6 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -33,16 +33,15 @@ const std::map common_speculative_type_fro }; static std::string common_speculative_get_devices_str(const std::vector & devices) { - if (devices.empty()) { - return "default"; - } - std::string result; for (size_t i = 0; i < devices.size(); i++) { - if (i > 0) result += ", "; + if (devices[i] == nullptr) { + continue; + } + if (!result.empty()) result += ", "; result += ggml_backend_dev_name(devices[i]); } - return result; + return result.empty() ? "default" : result; } struct common_speculative_config { @@ -414,6 +413,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { std::vector smpls; + // backend sampler chain per seq, attached to ctx_dft + std::vector backend_chains; + int32_t n_embd = 0; // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. @@ -445,7 +447,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); - LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling); LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, this->params.n_gpu_layers, ggml_type_name(this->params.cache_type_k), @@ -469,6 +471,22 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); } + // offload draft sampling to the backend + backend_chains.assign(n_seq, nullptr); + if (this->params.backend_sampling) { + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params()); + llama_sampler_chain_add(chain, llama_sampler_init_top_k(10)); + + if (!llama_set_sampler(ctx_dft, seq_id, chain)) { + LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id); + llama_sampler_free(chain); + chain = nullptr; + } + backend_chains[seq_id] = chain; + } + } + llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false); llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true); @@ -484,6 +502,18 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { } ~common_speculative_impl_draft_mtp() override { + auto * ctx_dft = this->params.ctx_dft; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) { + if (backend_chains[seq_id] == nullptr) { + continue; + } + if (ctx_dft) { + llama_set_sampler(ctx_dft, seq_id, nullptr); + } + llama_sampler_free(backend_chains[seq_id]); + } + backend_chains.clear(); + if (batch.token != nullptr) { free(batch.token); batch.token = nullptr; diff --git a/conversion/base.py b/conversion/base.py index 30c2124c2..8e12af6c5 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1610,6 +1610,42 @@ class TextModel(ModelBase): special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_hybriddna(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute] + assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] + added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] + + tokens: list[str] = [] + toktypes: list[int] = [] + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + tokpre = self.get_vocab_base_pre(tokenizer) + self.gguf_writer.add_tokenizer_model("hybriddna") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_qwen(self): from .qwen import QwenModel diff --git a/conversion/hunyuan.py b/conversion/hunyuan.py index be54f5810..537f023aa 100644 --- a/conversion/hunyuan.py +++ b/conversion/hunyuan.py @@ -189,7 +189,8 @@ class HunYuanModel(TextModel): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab + # Some HunYuanVL variants (e.g. OCR-style configs) have pad_token_id=-1; + # guard SpecialVocab so it doesn't try to emit an invalid pad id. token_types = None if (self.hparams.get("pad_token_id") or 0) < 0: token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask') @@ -250,7 +251,8 @@ class HunYuanModel(TextModel): self._fix_special_tokens() def set_gguf_parameters(self): - # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it + # Some HunYuanVL variants set num_experts=1 (not real MoE); + # prevent the parent class from emitting expert_count metadata in that case. saved_num_experts = self.hparams.pop("num_experts", None) super().set_gguf_parameters() if saved_num_experts is not None and saved_num_experts > 1: @@ -288,51 +290,21 @@ class HunYuanModel(TextModel): @ModelBase.register("HunYuanVLForConditionalGeneration") class HunyuanVLVisionModel(MmprojModel): - # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name - # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout. - # Each variant maps to a different projector type in clip.cpp so image - # preprocessing follows the correct code path. - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) assert self.hparams_vision is not None - # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size + # HunyuanVL uses max_image_size instead of image_size if "image_size" not in self.hparams_vision: self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) - @staticmethod - def is_ocr_variant(hparams: dict) -> bool: - """Return True for HunyuanOCR, False for HunyuanVL. - - The projector's output dim must equal the text model's hidden_size by - construction (that's what "projector" means). HunyuanOCR pairs a 1B text - backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the - ViT -> LLM projection dim is a hard architectural signature, not a - magic number. - """ - vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) - return vision_out == 1024 - def set_gguf_parameters(self): super().set_gguf_parameters() assert self.hparams_vision is not None vcfg = self.hparams_vision - - if self.is_ocr_variant(self.global_config): - # --- HunyuanOCR --- - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) - self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) - self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) - self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) - return - - # --- HunyuanVL --- self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) - self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu") - self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"])) - self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"])) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) + self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) @@ -353,7 +325,7 @@ class HunyuanVLVisionModel(MmprojModel): def tensor_force_quant(self, name, new_name, bid, n_dims): # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal - # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. + # HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 return super().tensor_force_quant(name, new_name, bid, n_dims) @@ -361,40 +333,18 @@ class HunyuanVLVisionModel(MmprojModel): @ModelBase.register("HunYuanVLForConditionalGeneration") class HunyuanVLTextModel(HunYuanModel): - # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR - # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE), - # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from - # the config and pick the matching GGUF architecture. model_arch = gguf.MODEL_ARCH.HUNYUAN_VL - @staticmethod - def _is_ocr_config(hparams: dict) -> bool: - # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that - # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with - # HunyuanVLVisionModel.is_ocr_variant. - return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024 - def __init__(self, dir_model: Path, *args, **kwargs): - raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False) - if self._is_ocr_config(raw_hparams): - self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE - else: - self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL super().__init__(dir_model, *args, **kwargs) def set_gguf_parameters(self): super().set_gguf_parameters() - # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses - # the HunYuan-Dense arch which already handles standard rope in super(). - if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL: - return - + # XD-RoPE metadata for the HunyuanVL; if self.rope_parameters.get("rope_type") != "xdrope": return - # defaults for HunyuanVL. The C++ side later computes: - # freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2)) self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"])) self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"])) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) diff --git a/conversion/llama.py b/conversion/llama.py index 41fde5143..fd6167bfd 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -51,6 +51,15 @@ class LlamaModel(TextModel): if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): self._set_vocab_mistral() + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if (add_prefix_space := tokenizer_config_json.get("add_prefix_space")) is not None: + self.gguf_writer.add_add_space_prefix(add_prefix_space) + if tokenizer_config_json.get("tokenizer_class") == "HybridDNATokenizer": + return self._set_vocab_hybriddna() + try: self._set_vocab_sentencepiece() except FileNotFoundError: @@ -72,13 +81,6 @@ class LlamaModel(TextModel): special_vocab._set_special_token("eot", 32010) special_vocab.add_to_gguf(self.gguf_writer) - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - # Apply to granite small models only if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 237dea176..9b506e26c 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -379,7 +379,7 @@ void ggml_backend_tensor_get_2d(const struct ggml_tensor * tensor, void * data, ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf != NULL && "tensor buffer not set"); - if (n_copies <= 1 || buf->iface.set_tensor_2d == NULL) { + if (n_copies <= 1 || buf->iface.get_tensor_2d == NULL) { for (size_t i = 0; i < n_copies; i++) { ggml_backend_tensor_get(tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size); } diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index adb4d5f0c..c25f42b32 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -2,6 +2,9 @@ #include #include +template +using type_for_index = T; + static __device__ __forceinline__ float op_repeat(const float a, const float b) { return b; GGML_UNUSED(a); @@ -52,6 +55,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const int s12, const int s13, src1_ptrs... src1s) { + ggml_cuda_pdl_lc(); const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x; const uint32_t i1 = (blockDim.y * blockIdx.y + threadIdx.y); const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3); @@ -72,6 +76,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; + ggml_cuda_pdl_sync(); for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) { const uint32_t i10 = fastmodulo(i0, ne10); @@ -141,6 +146,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const int i10 = fastmodulo(i0, ne10); + ggml_cuda_pdl_sync(); float result = src0_row ? (float) src0_row[i0*s00] : 0.0f; if constexpr (sizeof...(src1_ptrs) > 0) { result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10]))); @@ -282,35 +288,24 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1); const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2); - if constexpr (sizeof...(I) > 0) { - k_bin_bcast_unravel<<>>( + { + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)block_num, block_size, 0, stream); + ggml_cuda_kernel_launch(k_bin_bcast_unravel...>, launch_params, src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13, /*s0,*/ s1, s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); - } else { - k_bin_bcast_unravel - <<>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, - ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13, - /*s0,*/ s1, s2, s3, - s00, s01, s02, s03, - s10, s11, s12, s13); } } else { const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3); - if constexpr (sizeof...(I) > 0) { - k_bin_bcast<<>>( + { + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(k_bin_bcast...>, launch_params, src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, /*s0,*/ s1, s2, s3, - s00 ,s01, s02, s03, - s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); - } else { - k_bin_bcast<<>>( - src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, - /*s0,*/ s1, s2, s3, s00, s01, s02, s03, - s10, s11, s12, s13); + s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); } } } @@ -333,6 +328,7 @@ static __global__ void k_repeat_back( } T sum = 0; + ggml_cuda_pdl_sync(); for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) { for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) { for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9e7053720..32ad26261 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -5,6 +5,7 @@ #include "ggml-cuda.h" #include +#include #include #if defined(GGML_USE_HIP) @@ -27,6 +28,7 @@ #include #include #include +#include #include #if defined(GGML_USE_HIP) @@ -50,6 +52,7 @@ #define GGML_CUDA_CC_TURING 750 #define GGML_CUDA_CC_AMPERE 800 #define GGML_CUDA_CC_ADA_LOVELACE 890 +#define GGML_CUDA_CC_HOPPER 900 // While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see // https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms #define GGML_CUDA_CC_BLACKWELL 1200 @@ -107,6 +110,24 @@ # define GGML_CUDA_USE_CUB #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070 +// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8 and excludes HIP/MUSA. +// __CUDA_ARCH__ is undefined in host passes; GPU arch check happens in device-side code. +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080 +# define GGML_CUDA_USE_PDL +#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080 + +static __device__ __forceinline__ void ggml_cuda_pdl_sync() { +#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaGridDependencySynchronize(); +#endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER +} + +static __device__ __forceinline__ void ggml_cuda_pdl_lc() { +#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER + cudaTriggerProgrammaticLaunchCompletion(); +#endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER +} + #ifdef __CUDA_ARCH_LIST__ constexpr bool ggml_cuda_has_arch_impl(int) { return false; @@ -170,6 +191,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) + #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { return cublasGetStatusString(err); @@ -1494,3 +1516,67 @@ struct ggml_cuda_mm_fusion_args_device { const void * gate_bias = nullptr; ggml_glu_op glu_op; }; + +struct ggml_cuda_kernel_launch_params { + dim3 block_nums; + dim3 block_dims; + size_t shmem; + cudaStream_t stream; + + // size_t shmem + ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const size_t shmem_, const cudaStream_t stream_) + : block_nums(block_nums_), block_dims(block_dims_), shmem(shmem_), stream(stream_) {} + + // Some call sites pass ints instead of the required size_t. This 2nd constructor casts int->size_t to avoid these -Wnarrowing warnings. + ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const int shmem_, const cudaStream_t stream_) + : block_nums(block_nums_), block_dims(block_dims_), shmem((size_t)shmem_), stream(stream_) {} +}; + +#if defined(GGML_CUDA_USE_PDL) +struct ggml_cuda_pdl_config { + cudaLaunchAttribute attr; + cudaLaunchConfig_t cfg; + + ggml_cuda_pdl_config(const ggml_cuda_kernel_launch_params & params) { + attr.id = cudaLaunchAttributeProgrammaticStreamSerialization; + attr.val.programmaticStreamSerializationAllowed = 1; + + cfg = {}; + cfg.gridDim = params.block_nums; + cfg.blockDim = params.block_dims; + cfg.dynamicSmemBytes = params.shmem; + cfg.stream = params.stream; + cfg.attrs = &attr; + cfg.numAttrs = 1; + } + + // Delete due to &attr + ggml_cuda_pdl_config(const ggml_cuda_pdl_config&) = delete; + ggml_cuda_pdl_config& operator=(const ggml_cuda_pdl_config&) = delete; + ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete; + +}; +#endif //defined(GGML_CUDA_USE_PDL) + + +template +static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) { +#if defined(GGML_CUDA_USE_PDL) + + static const bool env_pdl_enabled = []() { + const char * env = getenv("GGML_CUDA_PDL"); + return env == nullptr || std::atoi(env) != 0; + }(); + + if (env_pdl_enabled && ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_HOPPER) { + auto pdl_cfg = ggml_cuda_pdl_config(launch_params); + + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward(args)... )); + return; + } +#endif //defined(GGML_CUDA_USE_PDL) + + kernel<<>>(std::forward(args)... ); + CUDA_CHECK(cudaGetLastError()); +} + diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index 102f944f9..adba4d522 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -15,6 +15,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont const int64_t n = ne0 * ne1 * ne2; + ggml_cuda_pdl_sync(); for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) { if constexpr (dim == 0) { const int64_t row = i / ne0; @@ -64,8 +65,8 @@ static void concat_f32_cuda(const float * x, const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE; if (dim == 0) { - concat_f32_cont<0> - <<>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); return; } if (dim == 1) { diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index d208acf2d..121472ec2 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -16,6 +16,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13) { + ggml_cuda_pdl_lc(); const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { @@ -36,6 +37,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; + ggml_cuda_pdl_sync(); cpy_1(cx + x_offset, cdst + dst_offset); } @@ -59,6 +61,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const __shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1]; int cur_tile_buf = 0; + ggml_cuda_pdl_sync(); #pragma unroll for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) { @@ -142,6 +145,7 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne, const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + ggml_cuda_pdl_sync(); cpy_blck(cx + x_offset, cdst + dst_offset); } @@ -168,6 +172,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne, const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + ggml_cuda_pdl_sync(); cpy_blck(cx + x_offset, cdst + dst_offset); } @@ -182,6 +187,7 @@ static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const const src_t * x = (const src_t *) cx; dst_t * dst = (dst_t *) cdst; + ggml_cuda_pdl_sync(); dst[i] = ggml_cuda_cast(x[i]); } @@ -192,8 +198,8 @@ cudaStream_t stream) { const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; GGML_ASSERT(num_blocks < UINT_MAX); - cpy_scalar_contiguous<<>> - (cx, cdst, ne); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar_contiguous, launch_params, cx, cdst, ne); } template @@ -223,13 +229,15 @@ static void ggml_cpy_scalar_cuda( GGML_ASSERT(grid_z < USHRT_MAX); dim3 dimGrid(grid_x, grid_y, grid_z); dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); - cpy_scalar_transpose<<>> - (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar_transpose, launch_params, + cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } else { const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; GGML_ASSERT(num_blocks < UINT_MAX); - cpy_scalar><<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar>, launch_params, + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } } diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index beeb52389..debcb6e54 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -636,6 +636,7 @@ static __global__ void flash_attn_mask_to_KV_max( if (tid < WARP_SIZE) { buf_iw[tid] = 1; } + ggml_cuda_pdl_sync(); __syncthreads(); int KV_max_sj = (ne30 - 1) * FATTN_KQ_STRIDE; @@ -687,6 +688,7 @@ static __global__ void flash_attn_stream_k_fixup_uniform( const uint3 fd_iter_j_z, const uint3 fd_iter_j) { constexpr int ncols = ncols1*ncols2; + ggml_cuda_pdl_lc(); const int tile_idx = blockIdx.x; // One block per output tile. const int j = blockIdx.y; @@ -718,6 +720,7 @@ static __global__ void flash_attn_stream_k_fixup_uniform( dst += sequence*ne02*ne01*D + jt*ne02*(ncols1*D) + zt_Q*D + (j*ne02 + c)*D + tid; + ggml_cuda_pdl_sync(); // Load the partial result that needs a fixup float dst_val = *dst; float max_val; @@ -809,6 +812,7 @@ static __global__ void flash_attn_stream_k_fixup_general( float dst_val = 0.0f; float max_val = 0.0f; float rowsum = 0.0f; + ggml_cuda_pdl_sync(); { dst_val = *dst; @@ -867,6 +871,7 @@ static __global__ void flash_attn_combine_results( const float2 * __restrict__ VKQ_meta, float * __restrict__ dst, const int parallel_blocks) { + ggml_cuda_pdl_lc(); // Dimension 0: threadIdx.x // Dimension 1: blockIdx.x // Dimension 2: blockIdx.y @@ -890,6 +895,7 @@ static __global__ void flash_attn_combine_results( __builtin_assume(tid < D); extern __shared__ float2 meta[]; + ggml_cuda_pdl_sync(); for (int i = tid; i < 2*parallel_blocks; i += D) { ((float *) meta)[i] = ((const float *)VKQ_meta) [i]; } @@ -1146,7 +1152,9 @@ void launch_fattn( const uint3 ne01 = init_fastdiv_values(Q->ne[1]); GGML_ASSERT(block_dim.x % warp_size == 0); - fattn_kernel<<>>( + + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num, block_dim, nbytes_shared, main_stream); + ggml_cuda_kernel_launch(fattn_kernel, launch_params, (const char *) Q->data, K_data, V_data, @@ -1176,9 +1184,9 @@ void launch_fattn( const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine = {(unsigned)ntiles_dst, ncols1, ncols2}; - flash_attn_stream_k_fixup_uniform - <<>> - ((float *) KQV->data, dst_tmp_meta.ptr, + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, 0, main_stream); + ggml_cuda_kernel_launch(flash_attn_stream_k_fixup_uniform, launch_params, + (float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[2], nblocks_sk, gqa_ratio, bpt, fd0, fd1, fd2); } else if (ntiles_dst % blocks_num.x != 0) { @@ -1193,9 +1201,9 @@ void launch_fattn( const dim3 block_dim_combine(DV, 1, 1); const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2}; - flash_attn_stream_k_fixup_general - <<>> - ((float *) KQV->data, dst_tmp_meta.ptr, + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, 0, main_stream); + ggml_cuda_kernel_launch(flash_attn_stream_k_fixup_general, launch_params, + (float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], gqa_ratio, total_work, fd_k_j_z_ne12, fd_k_j_z, fd_k_j, fd_k); } @@ -1204,9 +1212,9 @@ void launch_fattn( const dim3 blocks_num_combine(Q->ne[1], Q->ne[2], Q->ne[3]); const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2); - flash_attn_combine_results - <<>> - (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream); + ggml_cuda_kernel_launch(flash_attn_combine_results, launch_params, + dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks); } CUDA_CHECK(cudaGetLastError()); } diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index a25e912c4..4871b90df 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1724,6 +1724,7 @@ static __global__ void flash_attn_ext_f16( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { + ggml_cuda_pdl_sync(); // TODO optimize placement #if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE)) // Skip unused kernel variants for faster compilation: diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh index 7b0a5e5cf..fac76f135 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cuh +++ b/ggml/src/ggml-cuda/fattn-tile.cuh @@ -894,6 +894,8 @@ static __global__ void flash_attn_tile( } float KQ_sum[cpw] = {0.0f}; + ggml_cuda_pdl_sync(); + // Load Q data, convert to FP16 if fast: #pragma unroll for (int jc0 = 0; jc0 < cpw; ++jc0) { diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index f0bd42a57..b0a6cf67f 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -40,6 +40,7 @@ static __global__ void flash_attn_ext_vec( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { + ggml_cuda_pdl_lc(); #ifdef FLASH_ATTN_AVAILABLE // Skip unused kernel variants for faster compilation: @@ -136,6 +137,8 @@ static __global__ void flash_attn_ext_vec( #endif // V_DOT2_F32_F16_AVAILABLE int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)]; float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)]; + + ggml_cuda_pdl_sync(); if constexpr (Q_q8_1) { #pragma unroll for (int j0 = 0; j0 < ncols; j0 += nwarps) { diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index 7ef62ee2c..2539443ea 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -86,6 +86,7 @@ static __global__ void flash_attn_ext_f16( constexpr int kqs_padded = FATTN_KQ_STRIDE + 8; constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half); + ggml_cuda_pdl_sync(); const int sequence = blockIdx.z / ne02; const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu index b4c9845e7..018d5d37d 100644 --- a/ggml/src/ggml-cuda/gated_delta_net.cu +++ b/ggml/src/ggml-cuda/gated_delta_net.cu @@ -1,4 +1,5 @@ #include "gated_delta_net.cuh" +#include "ggml-cuda/common.cuh" template __global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2) @@ -53,6 +54,7 @@ gated_delta_net_cuda(const float * q, float s_shard[rows_per_lane]; // state is stored transposed: M[col][i] = S[i][col], row col is contiguous + ggml_cuda_pdl_sync(); #pragma unroll for (int r = 0; r < rows_per_lane; r++) { const int i = r * warp_size + lane; @@ -189,28 +191,29 @@ static void launch_gated_delta_net( int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream); switch (S_v) { case 16: - gated_delta_net_cuda<16, KDA, keep_rs_t><<>>( + ggml_cuda_kernel_launch(gated_delta_net_cuda<16, KDA, keep_rs_t>, launch_params, q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); break; case 32: - gated_delta_net_cuda<32, KDA, keep_rs_t><<>>( + ggml_cuda_kernel_launch(gated_delta_net_cuda<32, KDA, keep_rs_t>, launch_params, q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); break; case 64: { - gated_delta_net_cuda<64, KDA, keep_rs_t><<>>( + ggml_cuda_kernel_launch(gated_delta_net_cuda<64, KDA, keep_rs_t>, launch_params, q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); break; } case 128: { - gated_delta_net_cuda<128, KDA, keep_rs_t><<>>( + ggml_cuda_kernel_launch(gated_delta_net_cuda<128, KDA, keep_rs_t>, launch_params, q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K); diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 36b840e81..457b695eb 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -11,6 +11,7 @@ static __global__ void k_get_rows( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { + ggml_cuda_pdl_sync(); for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) { for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) { // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. @@ -48,6 +49,8 @@ static __global__ void k_get_rows_float( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { + ggml_cuda_pdl_lc(); + ggml_cuda_pdl_sync(); for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) { for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) { // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. @@ -83,6 +86,7 @@ static __global__ void k_get_rows_back_float( float sum = 0.0f; + ggml_cuda_pdl_sync(); for (int64_t i = 0; i < nrows_grad; ++i) { if (rows[i] != dst_row) { continue; @@ -156,7 +160,8 @@ static void get_rows_cuda_float( GGML_ASSERT(ne11 <= std::numeric_limits::max() / ne12); const uint3 ne12_fdv = init_fastdiv_values(ne12); - k_get_rows_float<<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{block_nums, block_dims, 0, stream}; + ggml_cuda_kernel_launch(k_get_rows_float, launch_params, src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10,*/ ne11, ne12_fdv, /*ne13,*/ diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu index 49af53899..a8f6046e4 100644 --- a/ggml/src/ggml-cuda/mean.cu +++ b/ggml/src/ggml-cuda/mean.cu @@ -67,9 +67,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132 if ((nrows / nsm) < 2) { const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } else { const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } } diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index d91472024..09d95f309 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -21,6 +21,7 @@ static __global__ void mul_mat_vec_f( int channel_y; int sample_dst; + ggml_cuda_pdl_sync(); if constexpr (is_multi_token_id) { // Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case token_idx = blockIdx.z; @@ -298,6 +299,7 @@ static __global__ void mul_mat_vec_f( static_assert(std::is_same_v, "unsupported type"); } + ggml_cuda_pdl_lc(); #pragma unroll for (int j = 0; j < ncols_dst; ++j) { sumf[j] = warp_reduce_sum(sumf[j]); @@ -382,11 +384,13 @@ static void mul_mat_vec_f_switch_fusion( const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) { + const ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, nbytes_shared, stream}; + const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_f<<>> - (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, + ggml_cuda_kernel_launch(mul_mat_vec_f, launch_params, + x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); return; @@ -395,8 +399,8 @@ static void mul_mat_vec_f_switch_fusion( GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_f<<>> - (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, + ggml_cuda_kernel_launch(mul_mat_vec_f, launch_params, + x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 73a0991e2..13b8b8552 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -424,6 +424,7 @@ static __global__ void mul_mat_vec_q( uint32_t channel_y; uint32_t sample_dst; + ggml_cuda_pdl_sync(); channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio); channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst; sample_dst = blockIdx.z; @@ -683,8 +684,9 @@ static void mul_mat_vec_q_switch_fusion( const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (c_ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_q<<>> - (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream); + ggml_cuda_kernel_launch(mul_mat_vec_q, launch_params, + vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); return; @@ -693,8 +695,9 @@ static void mul_mat_vec_q_switch_fusion( GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_q<<>> - (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream); + ggml_cuda_kernel_launch(mul_mat_vec_q, launch_params, + vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); } diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index ef98f675a..09d9f3a7d 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -18,6 +18,7 @@ static __global__ void norm_f32( float2 mean_var = make_float2(0.0f, 0.0f); + ggml_cuda_pdl_sync(); for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; mean_var.x += xi; @@ -46,6 +47,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr float tmp = 0.0f; // partial sum for thread in warp + ggml_cuda_pdl_sync(); for (int j = start; j < end; j += block_size) { tmp += x[j]; } @@ -95,6 +97,7 @@ static __global__ void rms_norm_f32(const float * x, const uint3 add_nrows_packed = make_uint3(0, 0, 0), const uint3 add_nchannels_packed = make_uint3(0, 0, 0), const uint3 add_nsamples_packed = make_uint3(0, 0, 0)) { + ggml_cuda_pdl_lc(); const int nrows = gridDim.x; const int nchannels = gridDim.y; @@ -124,6 +127,7 @@ static __global__ void rms_norm_f32(const float * x, float tmp = 0.0f; // partial sum for thread in warp + ggml_cuda_pdl_sync(); for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; tmp += xi * xi; @@ -163,6 +167,7 @@ static __global__ void rms_norm_back_f32( float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs + ggml_cuda_pdl_sync(); for (int col = tid; col < ncols; col += block_size) { const float xfi = xf[col]; sum_xx += xfi * xfi; @@ -253,6 +258,7 @@ static __global__ void l2_norm_f32( float tmp = 0.0f; // partial sum for thread in warp + ggml_cuda_pdl_sync(); for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; tmp += xi * xi; @@ -261,6 +267,7 @@ static __global__ void l2_norm_f32( // sum up partial sums extern __shared__ float s_sum[]; tmp = block_reduce(tmp, s_sum); + ggml_cuda_pdl_lc(); // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html const float scale = rsqrtf(fmaxf(tmp, eps * eps)); @@ -300,10 +307,19 @@ static void rms_norm_f32_cuda( const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + const ggml_cuda_kernel_launch_params launch_params = {blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, false>, launch_params, + x, dst, ncols, stride_row, stride_channel, stride_sample, eps, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<1024, false>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } } @@ -346,14 +362,20 @@ static void rms_norm_mul_f32_cuda(const float * x, const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, - mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed); + mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<1024, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, - mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed); + mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } } else { const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols); @@ -367,14 +389,16 @@ static void rms_norm_mul_f32_cuda(const float * x, const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, true, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims,block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, true, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add, add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed, add_nchannels_packed, add_nsamples_packed); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, true, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<1024, true, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add, add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed, @@ -399,10 +423,12 @@ static void l2_norm_f32_cuda( const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { const dim3 block_dims(WARP_SIZE, 1, 1); - l2_norm_f32<<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, 0, stream}; + ggml_cuda_kernel_launch(l2_norm_f32, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps); } else { const dim3 block_dims(1024, 1, 1); - l2_norm_f32<1024><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(l2_norm_f32<1024>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps); } } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 52f664719..49516965c 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -6,6 +6,7 @@ static __global__ void quantize_q8_1( const float * __restrict__ x, void * __restrict__ vy, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, const int64_t ne0, const uint32_t ne1, const uint3 ne2) { + ggml_cuda_pdl_lc(); const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= ne0) { @@ -28,6 +29,7 @@ static __global__ void quantize_q8_1( const int64_t ib = i_cont / QK8_1; // block index const int64_t iqs = i_cont % QK8_1; // quant index + ggml_cuda_pdl_sync(); const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -196,6 +198,7 @@ static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x, const int64_t i2 = blockIdx.z % ne2; const int64_t i3 = blockIdx.z / ne2; + ggml_cuda_pdl_sync(); const int64_t i01 = ids ? ids[i1] : i1; const int64_t i02 = i2; const int64_t i03 = i3; @@ -288,6 +291,7 @@ static __global__ void quantize_mmq_q8_1( const int64_t i3 = blockIdx.z / ne2; const int64_t i00 = i0; + ggml_cuda_pdl_sync(); const int64_t i01 = ids ? ids[i1] : i1; const int64_t i02 = i2; const int64_t i03 = i3; @@ -378,7 +382,8 @@ void quantize_row_q8_1_cuda( const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, block_size, 0, stream); + ggml_cuda_kernel_launch(quantize_q8_1, launch_params, x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv); GGML_UNUSED(type_src0); } diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh index de240fd44..5895d3bf8 100644 --- a/ggml/src/ggml-cuda/reduce_rows.cuh +++ b/ggml/src/ggml-cuda/reduce_rows.cuh @@ -10,6 +10,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r const int num_unroll = 8; float temp[num_unroll]; float sum_temp[num_unroll] = { 0.0f }; + + ggml_cuda_pdl_sync(); for (int i = col; i < ncols;) { for (int j = 0; j < num_unroll; ++j) { if (i < ncols) { diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 45a49a5dc..e20a5cb6b 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -134,6 +134,7 @@ static __global__ void rope_neox(const T * x, const float * freq_factors, const int64_t * row_indices, const int set_rows_stride) { + ggml_cuda_pdl_lc(); const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne00) { @@ -148,6 +149,7 @@ static __global__ void rope_neox(const T * x, int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; + ggml_cuda_pdl_sync(); // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices. @@ -216,6 +218,7 @@ static __global__ void rope_multi(const T * x, int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; + ggml_cuda_pdl_sync(); if (i0 >= n_dims) { dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; dst[idst + i0/2 + 1] = x[ix + i0/2 + 1]; @@ -300,6 +303,7 @@ static __global__ void rope_vision(const T * x, int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; + ggml_cuda_pdl_sync(); const int sect_dims = sections.v[0] + sections.v[1]; const int sec_w = sections.v[1] + sections.v[0]; const int sector = (i0 / 2) % sect_dims; @@ -399,13 +403,14 @@ static void rope_neox_cuda(const T * x, const dim3 block_nums(nr, n_blocks_x, 1); const float theta_scale = powf(freq_base, -2.0f / n_dims); + const ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, 0, stream}; if (freq_factors == nullptr) { - rope_neox<<>>( + ggml_cuda_kernel_launch(rope_neox, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } else { - rope_neox<<>>( + ggml_cuda_kernel_launch(rope_neox, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } @@ -443,11 +448,13 @@ static void rope_multi_cuda(const T * x, const float theta_scale = powf(freq_base, -2.0f / n_dims); if (freq_factors == nullptr) { - rope_multi<<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(rope_multi, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope); } else { - rope_multi<<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(rope_multi, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope); } diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu index 0ddeff6a1..7b2e59a43 100644 --- a/ggml/src/ggml-cuda/scale.cu +++ b/ggml/src/ggml-cuda/scale.cu @@ -3,9 +3,11 @@ #define MAX_GRIDDIM_X 0x7FFFFFFF static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) { + ggml_cuda_pdl_lc(); int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x; int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x; + ggml_cuda_pdl_sync(); for (int64_t i = tid; i < nelements; i += stride) { dst[i] = scale * x[i] + bias; } @@ -13,7 +15,8 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) { const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; - scale_f32<<>>(x, dst, scale, bias, nelements); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(scale_f32, launch_params, x, dst, scale, bias, nelements); } void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 631de7e8f..e14f96b82 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -53,6 +53,7 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0, const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd); const int64_t i10 = i01; + ggml_cuda_pdl_sync(); const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; @@ -157,7 +158,9 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0, const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd); const int64_t i10 = i01; + ggml_cuda_pdl_sync(); const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); + ggml_cuda_pdl_lc(); const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3; @@ -203,9 +206,11 @@ static void set_rows_cuda( const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11); const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12); - k_set_rows<<>>(src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, - s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd, - ne11_fd, ne12_fd); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_size, block_size, 0, stream); + ggml_cuda_kernel_launch(k_set_rows, launch_params, + src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, + s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd, + ne11_fd, ne12_fd); } } diff --git a/ggml/src/ggml-cuda/softcap.cu b/ggml/src/ggml-cuda/softcap.cu index 40dfe45d6..9f0fa1051 100644 --- a/ggml/src/ggml-cuda/softcap.cu +++ b/ggml/src/ggml-cuda/softcap.cu @@ -1,18 +1,21 @@ #include "softcap.cuh" static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) { + ggml_cuda_pdl_lc(); const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { return; } + ggml_cuda_pdl_sync(); dst[i] = tanhf(scale * x[i]) * softcap; } static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE; - softcap_f32<<>>(x, dst, scale, softcap, k); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(softcap_f32, launch_params, x, dst, scale, softcap, k); } // fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 4c4daf85d..48787b4b8 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -1,3 +1,4 @@ +#include "common.cuh" #include "ssm-conv.cuh" #include "unary.cuh" @@ -7,6 +8,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, const int64_t n_t) { + ggml_cuda_pdl_lc(); GGML_UNUSED(src0_nb0); const int tid = threadIdx.x; const int bidx = blockIdx.x; @@ -23,6 +25,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float float x[d_conv] = { 0.0f }; float w[d_conv] = { 0.0f }; + ggml_cuda_pdl_sync(); #pragma unroll for (size_t j = 0; j < d_conv; j++) { w[j] = w_block[tid * stride_w + j]; @@ -128,8 +131,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa constexpr int kNC = decltype(NC)::value; if (n_t <= 32) { const dim3 blocks(n_s, (nr + threads - 1) / threads, 1); - ssm_conv_f32<<>>(src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1, - dst, dst_nb0, dst_nb1, dst_nb2, n_t); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream); + ggml_cuda_kernel_launch(ssm_conv_f32, launch_params, src0, src1, bias, src0_nb0, src0_nb1, + src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { const int64_t split_n_t = 32; dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index c1d4e2bc8..412980376 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -26,6 +26,7 @@ __global__ void __launch_bounds__(splitD, 1) const int64_t s_off, const int64_t d_inner, const int64_t L_param) { const size_t L = L_template == 0 ? L_param : L_template; + ggml_cuda_pdl_sync(); const float *s0_block = (const float *)((const char *)src0 + src6[blockIdx.x] * src0_nb3 + blockIdx.y * splitD * src0_nb2); const float *x_block = (const float *)((const char *)src1 + (blockIdx.x * src1_nb3) + blockIdx.y * splitD * sizeof(float)); const float *dt_block = (const float *)((const char *)src2 + (blockIdx.x * src2_nb2) + blockIdx.y * splitD * sizeof(float)); @@ -135,6 +136,7 @@ __global__ void __launch_bounds__(d_state, 1) const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float); + ggml_cuda_pdl_sync(); // TODO: refactor strides to be in elements/floats instead of bytes to be cleaner and consistent with the rest of the codebase const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state); const float * x_warp = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float))); @@ -206,7 +208,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa constexpr int num_warps = threads/WARP_SIZE; const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1); - ssm_scan_f32_group<128/WARP_SIZE, 128><<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream); + ggml_cuda_kernel_launch(ssm_scan_f32_group<128/WARP_SIZE, 128>, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok); @@ -215,7 +218,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa constexpr int num_warps = threads/WARP_SIZE; const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1); - ssm_scan_f32_group<256/WARP_SIZE, 256><<>>( + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream); + ggml_cuda_kernel_launch(ssm_scan_f32_group<256/WARP_SIZE, 256>, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok); @@ -231,58 +235,59 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1); const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float); if (d_state == 16) { + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks, threads, smem_size, stream); switch (n_tok) { case 1: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 2: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 3: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 4: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 5: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 6: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 7: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; case 8: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); break; default: - ssm_scan_f32<<>>( + ggml_cuda_kernel_launch(ssm_scan_f32, launch_params, src0, src1, src2, src3, src4, src5, src6, dst, src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok); diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu index 4025771aa..0003658ca 100644 --- a/ggml/src/ggml-cuda/sumrows.cu +++ b/ggml/src/ggml-cuda/sumrows.cu @@ -7,10 +7,12 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int const dim3 block_nums(nrows, 1, 1); if ((nrows / nsm) < 2) { const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(x, dst, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, x, dst, ncols); } else { const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1); - reduce_rows_f32<<>>(x, dst, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, x, dst, ncols); } } @@ -34,10 +36,12 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if ((nrows / nsm) < 2) { // Increase num threads to 512 for small nrows to better hide the latency const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } else { // Enough active SMs to hide latency, use smaller blocks to allow better scheduling const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } } diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index 3020e5c74..da20c9aab 100644 --- a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -105,6 +105,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * wt[i] = -INFINITY; } + ggml_cuda_pdl_sync(); #pragma unroll for (int i = 0; i < n_experts; i += WARP_SIZE) { const int expert = i + threadIdx.x; @@ -161,6 +162,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * output_weights[i] = 0.f; } + ggml_cuda_pdl_lc(); for (int k = 0; k < n_expert_used; k++) { float max_val = wt[0]; int max_expert = threadIdx.x; @@ -271,51 +273,52 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx, dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1); dim3 block_dims(WARP_SIZE, rows_per_block, 1); cudaStream_t stream = ctx.stream(); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream); switch (n_expert) { case 1: - topk_moe_cuda<1, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<1, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 2: - topk_moe_cuda<2, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<2, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 4: - topk_moe_cuda<4, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<4, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 8: - topk_moe_cuda<8, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<8, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 16: - topk_moe_cuda<16, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<16, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 32: - topk_moe_cuda<32, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<32, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 64: - topk_moe_cuda<64, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<64, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 128: - topk_moe_cuda<128, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<128, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 256: - topk_moe_cuda<256, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<256, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 512: - topk_moe_cuda<512, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<512, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 576: - topk_moe_cuda<576, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + ggml_cuda_kernel_launch(topk_moe_cuda<576, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; default: GGML_ASSERT(false && "fatal error"); diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 2aeba26f4..4cb805fa6 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -116,19 +116,22 @@ static __device__ __forceinline__ float op_trunc(float x) { template static __global__ void unary_op_kernel(const T * x, T * dst, const int k) { + ggml_cuda_pdl_lc(); const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { return; } + ggml_cuda_pdl_sync(); dst[i] = (T)op((float)x[i]); } template static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE; - unary_op_kernel<<>>(x, dst, k); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(unary_op_kernel, launch_params, x, dst, k); } template @@ -258,6 +261,7 @@ void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { template static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) { + ggml_cuda_pdl_lc(); const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x; if (i >= k) { @@ -268,13 +272,15 @@ static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t j0 = (i / n) * o0 + (i % n); const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + ggml_cuda_pdl_sync(); dst[i] = (T)(op((float)x[j0]) * (float)g[j1]); } template static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, cudaStream_t stream) { const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE; - unary_gated_op_kernel<<>>(x, g, dst, k, n, o0, o1); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(unary_gated_op_kernel, launch_params, x, g, dst, k, n, o0, o1); } template diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp index ba4c2103f..f4130d223 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp @@ -44,36 +44,81 @@ void im2col(const uint ow, const uint z_idx) { const uint KHKW = p.KH * p.KW; + // Precompute base input coordinates + const int base_iw = int(ow * p.s0) - p.p0; + const int base_ih = int(oh * p.s1) - p.p1; + + // Precompute step deltas + const uint delta_ic = BLOCK_SIZE / KHKW; + const uint delta_rem = BLOCK_SIZE % KHKW; + + const uint delta_ky = delta_rem / p.KW; + const uint delta_kx = delta_rem % p.KW; + + const uint delta_ic_offset = delta_ic * p.offset_delta; + + // If using BDA mode, precompute the base pointer and step size +#if BDA + const BDA_STORAGE_T base_dst_addr = p.dst_addr + D_SIZE * dst_row; + const uint bda_step = D_SIZE * BLOCK_SIZE; +#endif + uint wg_x = gl_WorkGroupID.x; do { const uint wg_offset = wg_x * 512; - [[unroll]] for (uint i = 0; i < NUM_ITER; ++i) { - const uint chw_idx = wg_offset + gidx + i * BLOCK_SIZE; + uint chw_idx = wg_offset + gidx; + uint ic = chw_idx / KHKW; + uint rem = chw_idx % KHKW; + + uint ky = rem / p.KW; + uint kx = rem % p.KW; + + uint ic_offset = src_batch + ic * p.offset_delta; + + // Initialize running pointer/index for the destination buffer +#if BDA + BDA_STORAGE_T current_dst_addr = base_dst_addr + D_SIZE * chw_idx; +#else + uint current_dst_idx = dst_row + chw_idx; +#endif + + [[unroll]] for (uint i = 0; i < NUM_ITER; ++i) { if (chw_idx >= p.CHW) { return; } - const uint ic = chw_idx / KHKW; - const uint rem = chw_idx - ic * KHKW; - const uint ky = rem / p.KW; - const uint kx = rem - ky * p.KW; - - const uint iiw = ow * p.s0 + kx * p.d0 - p.p0; - const uint iih = oh * p.s1 + ky * p.d1 - p.p1; + const int iiw = base_iw + int(kx * p.d0); + const int iih = base_ih + int(ky * p.d1); A_TYPE val = A_TYPE(0); - if (iih < p.IH && iiw < p.IW) { - val = data_a[src_batch + ic * p.offset_delta + iih * p.IW + iiw]; + if (uint(iih) < p.IH && uint(iiw) < p.IW) { + val = data_a[ic_offset + uint(iih) * p.IW + uint(iiw)]; } #if BDA - D_ptr out_ptr = D_ptr(p.dst_addr + D_SIZE * (dst_row + chw_idx)); - out_ptr.d = D_TYPE(val); + D_ptr(current_dst_addr).d = D_TYPE(val); + current_dst_addr += bda_step; #else - data_d[dst_row + chw_idx] = D_TYPE(val); + data_d[current_dst_idx] = D_TYPE(val); + current_dst_idx += BLOCK_SIZE; #endif + + chw_idx += BLOCK_SIZE; + ic_offset += delta_ic_offset; + kx += delta_kx; + ky += delta_ky; + + // Handle X axis wrap + uint kx_wrap = uint(kx >= p.KW); + kx -= kx_wrap * p.KW; + ky += kx_wrap; + + // Handle Y axis wrap + uint ky_wrap = uint(ky >= p.KH); + ky -= ky_wrap * p.KH; + ic_offset += ky_wrap * p.offset_delta; } wg_x += gl_NumWorkGroups.x; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c25f217f9..7fdcf03d7 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -747,7 +747,7 @@ class MODEL_TENSOR(IntEnum): V_LAYER_OUT_SCALE = auto() V_PRE_NORM = auto() V_POST_NORM = auto() - V_MM_PRE_NORM = auto() # hunyuanocr + V_MM_PRE_NORM = auto() # hunyuanvl V_MM_POST_NORM = auto() V_MM_INP_NORM = auto() V_MM_INP_PROJ = auto() # gemma3 @@ -791,8 +791,8 @@ class MODEL_TENSOR(IntEnum): V_MM_GATE = auto() # cogvlm V_TOK_BOI = auto() # cogvlm V_TOK_EOI = auto() # cogvlm - V_TOK_IMG_BEGIN = auto() # hunyuanocr - V_TOK_IMG_END = auto() # hunyuanocr + V_TOK_IMG_BEGIN = auto() # hunyuanvl + V_TOK_IMG_END = auto() # hunyuanvl V_STD_BIAS = auto() # gemma4 V_STD_SCALE = auto() # gemma4 V_SAM_POS_EMBD = auto() # Deepseek-OCR @@ -4273,7 +4273,6 @@ class VisionProjectorType: GLM4V = "glm4v" YOUTUVL = "youtuvl" NEMOTRON_V2_VL = "nemotron_v2_vl" - HUNYUANOCR = "hunyuanocr" HUNYUANVL = "hunyuanvl" MINICPMV4_6 = "minicpmv4_6" GRANITE_SPEECH = "granite_speech" # audio diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f40cb8282..c2235cb3b 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1366,7 +1366,7 @@ class TensorNameMap: "mlp_AR.linear_{bid}", # PaddleOCR-VL "merger.mlp.{bid}", "vision_tower.merger.mlp.{bid}", # dots.ocr - "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2) + "vit.perceive.proj.{bid}", # HunyuanVL (proj.0 = conv1, proj.2 = conv2) ), MODEL_TENSOR.V_MMPROJ_FC: ( @@ -1374,7 +1374,7 @@ class TensorNameMap: "model.vision.linear_proj.linear_proj", # cogvlm "model.projector.layers", # Deepseek-OCR "visual.merger.proj", # glm4v - "vit.perceive.mlp", # HunyuanOCR + "vit.perceive.mlp", # HunyuanVL ), MODEL_TENSOR.V_MMPROJ_MLP: ( @@ -1403,7 +1403,7 @@ class TensorNameMap: "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 "vpm.embeddings.patch_embedding", "model.vision_model.embeddings.patch_embedding", # SmolVLM - "vit.embeddings.patch_embedding", # HunyuanOCR + "vit.embeddings.patch_embedding", # HunyuanVL "vision_tower.patch_conv", # pixtral-hf "vision_encoder.patch_conv", # pixtral "vision_model.patch_embedding.linear", # llama 4 @@ -1429,7 +1429,7 @@ class TensorNameMap: "model.vision_tower.embeddings.position_embeddings", # Intern-S1 "vpm.embeddings.position_embedding", "model.vision_model.embeddings.position_embedding", # SmolVLM - "vit.embeddings.position_embedding", # HunyuanOCR + "vit.embeddings.position_embedding", # HunyuanVL "vision_model.positional_embedding_vlm", # llama 4 "vision_tower.patch_embed.pos_emb", # kimi-vl "visual.pos_embed", # qwen3vl @@ -1442,12 +1442,12 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( "model.image_newline", # Deepseek-OCR - "vit.perceive.image_newline", # HunyuanOCR + "vit.perceive.image_newline", # HunyuanVL ), MODEL_TENSOR.V_ENC_EMBD_VSEP: ( "model.view_seperator", # Deepseek-OCR - "vit.perceive.image_sep", # HunyuanOCR + "vit.perceive.image_sep", # HunyuanVL ), MODEL_TENSOR.V_ENC_ATTN_QKV: ( @@ -1466,7 +1466,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM - "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.q_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral @@ -1490,7 +1490,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM - "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.k_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral @@ -1514,7 +1514,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM - "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.v_proj", # HunyuanVL "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral @@ -1532,7 +1532,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM - "vit.layers.{bid}.input_layernorm", # HunyuanOCR + "vit.layers.{bid}.input_layernorm", # HunyuanVL "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4 @@ -1553,7 +1553,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1 "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM - "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR + "vit.layers.{bid}.self_attn.o_proj", # HunyuanVL "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf @@ -1580,7 +1580,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM - "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR + "vit.layers.{bid}.post_attention_layernorm", # HunyuanVL "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral @@ -1601,7 +1601,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 - "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR + "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanVL "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral "vision_model.model.layers.{bid}.mlp.fc1", # llama4 @@ -1630,7 +1630,7 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 - "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR + "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanVL "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral "vision_model.model.layers.{bid}.mlp.fc2", # llama4 @@ -1694,7 +1694,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_POST_NORM: ( "visual.merger.post_projection_norm", # glm4v "vision_tower.post_trunk_norm", # dots.ocr - "vit.perceive.after_rms", # HunyuanOCR + "vit.perceive.after_rms", # HunyuanVL ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1899,15 +1899,15 @@ class TensorNameMap: ), MODEL_TENSOR.V_MM_PRE_NORM: ( - "vit.perceive.before_rms", # HunyuanOCR + "vit.perceive.before_rms", # HunyuanVL ), MODEL_TENSOR.V_TOK_IMG_BEGIN: ( - "vit.perceive.image_begin", # HunyuanOCR + "vit.perceive.image_begin", # HunyuanVL ), MODEL_TENSOR.V_TOK_IMG_END: ( - "vit.perceive.image_end", # HunyuanOCR + "vit.perceive.image_end", # HunyuanVL ), MODEL_TENSOR.V_STD_BIAS: ( diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 6554a89b2..f10397747 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -73,7 +73,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE }, { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE }, { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE }, - { "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR }, + { "hunyuan-vl", LLM_CHAT_TEMPLATE_HUNYUAN_VL }, { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 }, { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS }, { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, @@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) { return LLM_CHAT_TEMPLATE_OPENAI_MOE; } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) { - return LLM_CHAT_TEMPLATE_HUNYUAN_OCR; + return LLM_CHAT_TEMPLATE_HUNYUAN_VL; } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) { return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE; } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) { @@ -825,8 +825,8 @@ int32_t llm_chat_apply_template( ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>"; } } - } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) { - // tencent/HunyuanOCR + } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) { + // tencent/HunyuanOCR & tencent/HunyuanVL ss << "<|hy_begin▁of▁sentence|>"; for (size_t i = 0; i < chat.size(); i++) { std::string role(chat[i]->role); diff --git a/src/llama-chat.h b/src/llama-chat.h index 13f936a94..ea6540c0b 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -53,7 +53,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_HUNYUAN_MOE, LLM_CHAT_TEMPLATE_OPENAI_MOE, LLM_CHAT_TEMPLATE_HUNYUAN_DENSE, - LLM_CHAT_TEMPLATE_HUNYUAN_OCR, + LLM_CHAT_TEMPLATE_HUNYUAN_VL, LLM_CHAT_TEMPLATE_KIMI_K2, LLM_CHAT_TEMPLATE_SEED_OSS, LLM_CHAT_TEMPLATE_GROK_2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 14834781e..4d3adcbc7 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1147,6 +1147,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) { LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler); + if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) { + static bool warned = false; + if (!warned) { + LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__); + warned = true; + } + if (sampling.samplers.count(seq_id) > 0) { + sched_need_reserve = true; + } + sampling.samplers.erase(seq_id); + return false; + } + const bool can_offload = sampler && sampler->iface->backend_init && diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index a9a5a8e0c..8e57c64f4 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -500,15 +500,21 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) { } void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { - mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); - mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); + // base tensors may not be allocated if there are no non-SWA attention layers + if (self_k_idxs && self_k_idxs->buffer) { + mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); + mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + } - mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); - mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); + // swa tensors may not be allocated if there are no SWA attention layers + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); + mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + } if (self_k_rot) { mctx->get_base()->set_input_k_rot(self_k_rot); @@ -534,14 +540,21 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { bool res = true; - res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + // base tensors may not be allocated if there are no non-SWA attention layers + if (self_k_idxs && self_k_idxs->buffer) { + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + } - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + } return res; } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 7f2fec8ee..d35bd84d6 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -755,6 +755,8 @@ struct llm_tokenizer_bpe : llm_tokenizer { struct llm_tokenizer_bpe_session { llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} + virtual ~llm_tokenizer_bpe_session() = default; + static void append(const llama_token token_id, std::vector & output) { output.push_back(token_id); } @@ -792,7 +794,7 @@ struct llm_tokenizer_bpe_session { // } } - void tokenize(const std::string & text, std::vector & output) { + virtual void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode); @@ -1804,6 +1806,95 @@ private: const llm_tokenizer_plamo2 & tokenizer; }; +struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session { + llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} + + void tokenize(const std::string & text, std::vector & output) override { + static const std::string open_tag = ""; + static const std::string close_tag = ""; + + const auto dna_begin_id = vocab.text_to_token(open_tag); + const auto dna_end_id = vocab.text_to_token(close_tag); + const auto dna_oov_id = vocab.text_to_token(""); + + // Fall back to plain BPE if the DNA pieces aren't in the vocab. + if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) { + llm_tokenizer_bpe_session::tokenize(text, output); + return; + } + + const size_t k = 6; + size_t pos = 0; + + while (pos < text.size()) { + const size_t start = text.find(open_tag, pos); + if (start == std::string::npos) { + if (pos < text.size()) { + llm_tokenizer_bpe_session::tokenize(text.substr(pos), output); + } + break; + } + if (start > pos) { + llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output); + } + output.push_back(dna_begin_id); + + const size_t content_start = start + open_tag.size(); + const size_t end = text.find(close_tag, content_start); + const size_t content_end = (end == std::string::npos) ? text.size() : end; + + emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output); + + if (end == std::string::npos) { + break; + } + output.push_back(dna_end_id); + pos = end + close_tag.size(); + } + } + +private: + void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector & output) { + std::string seq = raw; + for (char & c : seq) { + if (c >= 'a' && c <= 'z') { + c = char(c - 32); + } + } + auto is_valid_kmer = [](const std::string & s) { + for (char c : s) { + if (c != 'A' && c != 'C' && c != 'G' && c != 'T') { + return false; + } + } + return true; + }; + + size_t i = 0; + for (; i + k <= seq.size(); i += k) { + const std::string kmer = seq.substr(i, k); + if (is_valid_kmer(kmer)) { + const auto tok = vocab.text_to_token(kmer); + output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id); + } else { + output.push_back(oov_id); + } + } + if (i < seq.size()) { + std::string kmer = seq.substr(i); + kmer.append(k - kmer.size(), 'A'); + if (is_valid_kmer(kmer)) { + const auto tok = vocab.text_to_token(kmer); + output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id); + } else { + output.push_back(oov_id); + } + } + } + + const llama_vocab & vocab; +}; + // // impl // @@ -2034,7 +2125,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { special_mask_id = 103; add_sep = true; - } else if (tokenizer_model == "gpt2") { + } else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") { type = LLAMA_VOCAB_TYPE_BPE; // read bpe merges and populate bpe ranks @@ -3421,12 +3512,19 @@ std::vector llama_vocab::impl::tokenize( } break; } - - llm_tokenizer_bpe_session session(vocab, *static_cast(tokenizer.get())); // it calls some other methods that are not exist in llm_tokenizer, // here just cast it to bpe tokenizer object + const llm_tokenizer_bpe * tok_bpe = static_cast(tokenizer.get()); + + std::unique_ptr session; + if (vocab.get_tokenizer_model() == "hybriddna") { + session = std::make_unique(vocab, *tok_bpe); + } else { + session = std::make_unique(vocab, *tok_bpe); + } + if (add_special) { - session.append_bos(output); + session->append_bos(output); } for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { @@ -3439,15 +3537,15 @@ std::vector llama_vocab::impl::tokenize( #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str()); #endif - session.tokenize(text, output); + session->tokenize(text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - session.append(fragment.token, output); + session->append(fragment.token, output); } } if (add_special) { - session.append_eos(output); - session.check_double_bos_eos(output); + session->append_eos(output); + session->check_double_bos_eos(output); } } break; case LLAMA_VOCAB_TYPE_WPM: diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 35a0158e8..adeb0c26e 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -525,8 +525,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr res->add_input(std::move(inp)); - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); @@ -615,6 +616,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 4f63c410d..e4512116d 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -588,8 +588,10 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm res->add_input(std::move(inp)); - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); @@ -710,6 +712,8 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm; diff --git a/tools/batched-bench/main.cpp b/tools/batched-bench/main.cpp new file mode 100644 index 000000000..958cfc5b3 --- /dev/null +++ b/tools/batched-bench/main.cpp @@ -0,0 +1,5 @@ +int llama_batched_bench(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_batched_bench(argc, argv); +} diff --git a/tools/cli/main.cpp b/tools/cli/main.cpp new file mode 100644 index 000000000..cb7d795b6 --- /dev/null +++ b/tools/cli/main.cpp @@ -0,0 +1,5 @@ +int llama_cli(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_cli(argc, argv); +} diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index a0765e76b..d328d15e6 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -85,7 +85,10 @@ static void sigint_handler(int signo) { } #endif -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_completion(int argc, char ** argv); + +int llama_completion(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); common_params params; diff --git a/tools/completion/main.cpp b/tools/completion/main.cpp new file mode 100644 index 000000000..bea9a0ec9 --- /dev/null +++ b/tools/completion/main.cpp @@ -0,0 +1,5 @@ +int llama_completion(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_completion(argc, argv); +} diff --git a/tools/fit-params/CMakeLists.txt b/tools/fit-params/CMakeLists.txt index 25c409663..207caf2ce 100644 --- a/tools/fit-params/CMakeLists.txt +++ b/tools/fit-params/CMakeLists.txt @@ -1,6 +1,18 @@ +# llama-fit-params-impl: fit-params logic, reusable by app + +set(TARGET llama-fit-params-impl) + +add_library(${TARGET} STATIC fit-params.cpp) + +target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +# llama-fit-params executable + set(TARGET llama-fit-params) -add_executable(${TARGET} fit-params.cpp) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) + +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE llama-fit-params-impl) target_compile_features(${TARGET} PRIVATE cxx_std_17) if(LLAMA_TOOLS_INSTALL) diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp index 20a5ff1eb..5d897bc46 100644 --- a/tools/fit-params/fit-params.cpp +++ b/tools/fit-params/fit-params.cpp @@ -12,7 +12,10 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_fit_params(int argc, char ** argv); + +int llama_fit_params(int argc, char ** argv) { common_params params; common_init(); diff --git a/tools/fit-params/main.cpp b/tools/fit-params/main.cpp new file mode 100644 index 000000000..b7271d475 --- /dev/null +++ b/tools/fit-params/main.cpp @@ -0,0 +1,5 @@ +int llama_fit_params(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_fit_params(argc, argv); +} diff --git a/tools/llama-bench/main.cpp b/tools/llama-bench/main.cpp new file mode 100644 index 000000000..0c18bb0c9 --- /dev/null +++ b/tools/llama-bench/main.cpp @@ -0,0 +1,5 @@ +int llama_bench(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_bench(argc, argv); +} diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index f6eab8a03..44aed295d 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -170,7 +170,7 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" -// hunyuanocr / hunyuanvl (shared GGUF tensor names) +// hunyuanvl (shared GGUF tensor names) #define TN_MM_PRE_NORM "mm.pre_norm.%s" #define TN_TOK_IMG_BEGIN "mm.image_begin" #define TN_TOK_IMG_END "mm.image_end" @@ -343,7 +343,6 @@ enum projector_type { PROJECTOR_TYPE_YASA2, PROJECTOR_TYPE_KIMIK25, PROJECTOR_TYPE_NEMOTRON_V2_VL, - PROJECTOR_TYPE_HUNYUANOCR, PROJECTOR_TYPE_HUNYUANVL, PROJECTOR_TYPE_MINICPMV4_6, PROJECTOR_TYPE_GRANITE_SPEECH, @@ -393,7 +392,6 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_YASA2, "yasa2"}, { PROJECTOR_TYPE_KIMIK25, "kimik25"}, { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, - { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"}, { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"}, { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index ce15dbcd1..e0de41e0b 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -35,6 +35,16 @@ enum resize_algo { // RESIZE_ALGO_LANCZOS, // TODO }; +// Padding style for img_tool::resize +// PAD_NONE - no padding; direct resize to target dimensions +// PAD_CEIL - aspect-preserving pad (default) +// PAD_NEAREST - aspect-preserving pad with nearest-integer rounding (Pillow byte-parity) +enum pad_style { + PAD_NONE, + PAD_CEIL, + PAD_NEAREST, +}; + struct clip_hparams { int32_t image_size = 0; int32_t patch_size = 0; @@ -52,7 +62,7 @@ struct clip_hparams { int32_t image_min_pixels = -1; int32_t image_max_pixels = -1; resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC; - bool image_resize_pad = true; // if false, center-crop will be applied when resizing + pad_style image_resize_pad = PAD_CEIL; // padding style when resizing std::array image_pad_color = {0, 0, 0}; // (preprocessor) for llava-uhd style models @@ -61,8 +71,8 @@ struct clip_hparams { int32_t preproc_max_tiles = 0; resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC; resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR; - bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6) - bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6) + pad_style image_pad_rf = PAD_CEIL; // padding style for the refined image (e.g. llava-1.6) + pad_style image_pad_ov = PAD_NONE; // padding style for the overview image (e.g. llava-1.6) std::array image_pad_color_rf = {0, 0, 0}; // padding color for refined image std::array image_pad_color_ov = {0, 0, 0}; // padding color for overview image @@ -510,7 +520,7 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; - // hunyuanocr perceiver + // hunyuanvl perceiver ggml_tensor * mm_pre_norm_w = nullptr; ggml_tensor * mm_img_begin = nullptr; ggml_tensor * mm_img_end = nullptr; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9340f0cda..979a6656a 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -58,7 +58,7 @@ #include "models/gemma4v.cpp" #include "models/glm4v.cpp" #include "models/granite-speech.cpp" -#include "models/hunyuanocr.cpp" +#include "models/hunyuanvl.cpp" #include "models/internvl.cpp" #include "models/kimivl.cpp" #include "models/kimik25.cpp" @@ -996,10 +996,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { - builder = std::make_unique(ctx, img); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP_NORM: @@ -1316,12 +1315,12 @@ struct clip_model_loader { hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM; hparams.image_pad_color = {122, 116, 104}; if (!hparams.image_res_candidates.empty()) { - hparams.image_resize_pad = true; + hparams.image_resize_pad = PAD_CEIL; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; } else { // llava-1.6 default params - hparams.image_pad_ov = false; - hparams.image_pad_rf = true; + hparams.image_pad_ov = PAD_NONE; + hparams.image_pad_rf = PAD_CEIL; hparams.image_pad_color_rf = {122, 116, 104}; hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC; hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR; @@ -1329,7 +1328,7 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_GLM_EDGE: { - hparams.image_resize_pad = true; + hparams.image_resize_pad = PAD_CEIL; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; } break; case PROJECTOR_TYPE_MINICPMV: @@ -1529,7 +1528,7 @@ struct clip_model_loader { { hparams.n_merge = 2; hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; - hparams.image_resize_pad = false; + hparams.image_resize_pad = PAD_NONE; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); std::vector wa_layer_indexes_vec; @@ -1549,7 +1548,7 @@ struct clip_model_loader { // reka model performs better when using resize_bicubic, which stretches // the image to fit fixed square size - hparams.image_resize_pad = false; + hparams.image_resize_pad = PAD_NONE; } break; case PROJECTOR_TYPE_GLM4V: { @@ -1604,31 +1603,23 @@ struct clip_model_loader { hparams.image_size = 1024; hparams.warmup_image_size = 1024; hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; - hparams.image_pad_color[0] = hparams.image_mean[0]; - hparams.image_pad_color[1] = hparams.image_mean[1]; - hparams.image_pad_color[2] = hparams.image_mean[2]; + hparams.image_pad_color = {127, 127, 127}; get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true); get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true); get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); } break; - case PROJECTOR_TYPE_HUNYUANOCR: - { - hparams.n_merge = 2; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); - get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); - get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); - hparams.set_warmup_n_tokens(28*28); - } break; case PROJECTOR_TYPE_HUNYUANVL: { hparams.n_merge = 2; hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; - hparams.image_resize_pad = false; + hparams.image_resize_pad = PAD_NONE; hparams.ffn_op = FFN_GELU; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); hparams.set_limit_image_tokens(256, 16384); + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels, false); + get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels, false); hparams.set_warmup_n_tokens(32*32); } break; case PROJECTOR_TYPE_LFM2A: @@ -2438,7 +2429,6 @@ struct clip_model_loader { model.mm_boi = get_tensor(TN_TOK_BOI); model.mm_eoi = get_tensor(TN_TOK_EOI); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear) @@ -3294,7 +3284,7 @@ void setup_init_vision_shim_kcpp(struct clip_ctx * ctx_v) { } break; case PROJECTOR_TYPE_MINICPMV: { - int minicpmv_version = clip_is_minicpmv(ctx_v); + int minicpmv_version = clip_get_hparams(ctx_v)->minicpmv_version; if (minicpmv_version == 2) { // minicpmv 2.5 format: // (overview) (slice) (slice) \n ... @@ -3503,7 +3493,6 @@ void setup_init_vision_shim_kcpp(struct clip_ctx * ctx_v) { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary @@ -3593,7 +3582,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_MIMOVL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->nx / params.patch_size) / 2; @@ -3810,7 +3798,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int h = static_cast(std::sqrt(static_cast(n_patches))); n_patches = h * (h + 1) + 1; } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; @@ -4446,7 +4433,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_COGVLM: - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_YASA2: { // do nothing @@ -4456,7 +4442,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // Compute the HunyuanVL 2D position embedding on CPU (with the // custom sf=(target+0.1)/n_grid bilinear sampling that the // reference implementation uses) and upload it to the graph - // input declared in clip_graph_hunyuanocr::build(). + // input declared in clip_graph_hunyuanvl::build(). GGML_ASSERT(model.position_embeddings != nullptr); ggml_tensor * src_t = model.position_embeddings; const int64_t n_embd = src_t->ne[0]; @@ -4974,7 +4960,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_KIMIK25: case PROJECTOR_TYPE_YASA2: return ctx->model.mm_2_w->ne[1]; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_COGVLM: diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp index b1f6ead5b..8419d496a 100644 --- a/tools/mtmd/models/deepseekocr.cpp +++ b/tools/mtmd/models/deepseekocr.cpp @@ -88,164 +88,168 @@ static ggml_tensor * get_rel_pos(ggml_context * ctx0, return cur; // [C, k_size, q_size] } + +ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { + // Building SAM + const int n_embd = hparams.sam_n_embd; + const int n_layer = hparams.sam_n_layer; + const int n_heads = hparams.sam_n_head; + const int d_heads = n_embd / n_heads; + const int window = hparams.attn_window_size; + + ggml_tensor * inpL; + + inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); + inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); + inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); + + ggml_tensor * rel_pos_indices_local; + ggml_tensor * rel_pos_indices_global; + + rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); + rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); + ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); + ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); + ggml_set_input(rel_pos_indices_local); + ggml_set_input(rel_pos_indices_global); + + ggml_tensor * cur; + const auto tgt_size = inpL->ne[1]; + const auto str_size = model.pos_embed->ne[1]; + + if (str_size != tgt_size) { + ggml_tensor * old_pos_embed = nullptr; + old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); + ggml_tensor * new_pos_embed = + ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC); + new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); + cur = ggml_add(ctx0, inpL, new_pos_embed); + } else { + cur = ggml_add(ctx0, inpL, model.pos_embed); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.sam_layers[il]; + ggml_tensor * shortcut = cur; + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + + const int64_t w0 = cur->ne[1]; + const int64_t h0 = cur->ne[2]; + + ggml_tensor * indices; + + if (hparams.is_global_attn(il)) { + indices = rel_pos_indices_global; + } else { + // local attention layer - apply window partition + cur = window_partition(ctx0, cur, window); + indices = rel_pos_indices_local; + } + + const int64_t W = cur->ne[1]; + const int64_t H = cur->ne[2]; + // self-attention + { + const int B = cur->ne[3]; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape + cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); + + ggml_tensor * Q; + ggml_tensor * K; + ggml_tensor * V; + + Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]); + Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B); + + K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]); + K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B); + + V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]); + V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B); + + ggml_tensor * mask; + ggml_tensor * rw; + ggml_tensor * rh; + ggml_tensor * qr; + + rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] + rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] + qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); + qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); + + rw = ggml_mul_mat(ctx0, rw, + ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] + rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] + rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B); + rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B); + rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H] + rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B); + mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W] + mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B); + // casting mask to F16 only required when flash-attn is enabled + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + mask = ggml_cast(ctx0, mask, GGML_TYPE_F16); + } + + const float scale = 1.0f / sqrtf(static_cast(d_heads)); + + cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, + il); // [B, H*W, n_embd] + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); + } + + if (hparams.is_global_attn(il) == false) { + // local attention layer - reverse window partition + cur = window_unpartition(ctx0, cur, w0, h0, window); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, shortcut); + + ggml_tensor * inpFF = cur; + + // layernorm2 + cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + + // ffn + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, cur, inpFF); + cb(cur, "sam_layer_out", il); + } + + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); + cb(cur, "sam_output", -1); + + ggml_build_forward_expand(gf, cur); + return cur; +} + ggml_cgraph * clip_graph_deepseekocr::build() { // patch embedding ggml_tensor * inp_raw = build_inp_raw(); - - ggml_tensor * sam_out; - // Building SAM - { - const int n_embd = hparams.sam_n_embd; - const int n_layer = hparams.sam_n_layer; - const int n_heads = hparams.sam_n_head; - const int d_heads = n_embd / n_heads; - const int window = hparams.attn_window_size; - - ggml_tensor * inpL; - - inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); - inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); - inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); - - ggml_tensor * rel_pos_indices_local; - ggml_tensor * rel_pos_indices_global; - - rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); - rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); - ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); - ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); - ggml_set_input(rel_pos_indices_local); - ggml_set_input(rel_pos_indices_global); - - ggml_tensor * cur; - const auto tgt_size = inpL->ne[1]; - const auto str_size = model.pos_embed->ne[1]; - - if (str_size != tgt_size) { - ggml_tensor * old_pos_embed = nullptr; - old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); - ggml_tensor * new_pos_embed = - ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC); - new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); - cur = ggml_add(ctx0, inpL, new_pos_embed); - } else { - cur = ggml_add(ctx0, inpL, model.pos_embed); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.sam_layers[il]; - ggml_tensor * shortcut = cur; - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - - const int64_t w0 = cur->ne[1]; - const int64_t h0 = cur->ne[2]; - - ggml_tensor * indices; - - if (hparams.is_global_attn(il)) { - indices = rel_pos_indices_global; - } else { - // local attention layer - apply window partition - cur = window_partition(ctx0, cur, window); - indices = rel_pos_indices_local; - } - - const int64_t W = cur->ne[1]; - const int64_t H = cur->ne[2]; - // self-attention - { - const int B = cur->ne[3]; - - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); - cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape - cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); - - ggml_tensor * Q; - ggml_tensor * K; - ggml_tensor * V; - - Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]); - Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B); - - K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]); - K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B); - - V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]); - V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B); - - ggml_tensor * mask; - ggml_tensor * rw; - ggml_tensor * rh; - ggml_tensor * qr; - - rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] - rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] - qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); - qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); - - rw = ggml_mul_mat(ctx0, rw, - ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] - rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] - rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B); - rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B); - rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H] - rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B); - mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W] - mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B); - mask = ggml_cast(ctx0, mask, GGML_TYPE_F16); - - const float scale = 1.0f / sqrtf(static_cast(d_heads)); - - cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, - il); // [B, H*W, n_embd] - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); - } - - if (hparams.is_global_attn(il) == false) { - // local attention layer - reverse window partition - cur = window_unpartition(ctx0, cur, w0, h0, window); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, shortcut); - - ggml_tensor * inpFF = cur; - - // layernorm2 - cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - - // ffn - cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - // residual 2 - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "sam_layer_out", il); - } - - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); - cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); - cb(cur, "sam_output", -1); - - ggml_build_forward_expand(gf, cur); - sam_out = cur; - } + ggml_tensor * sam_out = build_sam(inp_raw); ggml_tensor * clip_out; // Building DS-OCR CLIP diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanvl.cpp similarity index 70% rename from tools/mtmd/models/hunyuanocr.cpp rename to tools/mtmd/models/hunyuanvl.cpp index 45ed684f7..2c670979d 100644 --- a/tools/mtmd/models/hunyuanocr.cpp +++ b/tools/mtmd/models/hunyuanvl.cpp @@ -1,25 +1,15 @@ #include "models.h" -ggml_cgraph * clip_graph_hunyuanocr::build() { +ggml_cgraph * clip_graph_hunyuanvl::build() { const int merge = hparams.n_merge; const int pw = n_patches_x; const int ph = n_patches_y; - // Position embedding interpolation. - // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard - // ggml_interpolate cannot express. To avoid adding a new ggml op, the - // resize is computed on CPU in clip_image_batch_encode and uploaded here - // as a graph input (named "hunyuanvl_pos_embd"). - // HunyuanOCR uses the same square layout and the standard ratio-based - // interpolation provided by resize_position_embeddings(). - ggml_tensor * pos_embd = nullptr; - if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) { - pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw); - ggml_set_name(pos_embd, "hunyuanvl_pos_embd"); - ggml_set_input(pos_embd); - } else { - pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); - } + // position embedding: declared as a graph input, filled on CPU + // by clip_image_batch_encode (see PROJECTOR_TYPE_HUNYUANVL branch there). + ggml_tensor * pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw); + ggml_set_name(pos_embd, "hunyuanvl_pos_embd"); + ggml_set_input(pos_embd); ggml_tensor * inp = build_inp(); ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr); diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 955daa6d6..119c2d541 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -118,6 +118,7 @@ struct clip_graph_whisper_enc : clip_graph { struct clip_graph_deepseekocr : clip_graph { clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; + ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model }; struct clip_graph_conformer : clip_graph { @@ -141,8 +142,8 @@ struct clip_graph_glm4v : clip_graph { ggml_cgraph * build() override; }; -struct clip_graph_hunyuanocr : clip_graph { - clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} +struct clip_graph_hunyuanvl : clip_graph { + clip_graph_hunyuanvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; }; diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index 1b058e026..37c271d18 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -38,7 +38,7 @@ struct img_tool { clip_image_u8 & dst, const clip_image_size & target_resolution, resize_algo algo, - bool add_padding = true, // TODO: define the behavior for add_padding = false + pad_style padding = PAD_CEIL, std::array pad_color = {0, 0, 0}) { dst.nx = target_resolution.width; dst.ny = target_resolution.height; @@ -50,7 +50,7 @@ struct img_tool { return; } - if (!add_padding) { + if (padding == PAD_NONE) { // direct resize switch (algo) { case RESIZE_ALGO_BILINEAR: @@ -71,8 +71,15 @@ struct img_tool { float scale_w = static_cast(target_resolution.width) / src.nx; float scale_h = static_cast(target_resolution.height) / src.ny; float scale = std::min(scale_w, scale_h); - int new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); - int new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + + int new_width, new_height; + if (padding == PAD_NEAREST) { + new_width = std::min(static_cast(std::round(src.nx * scale)), target_resolution.width); + new_height = std::min(static_cast(std::round(src.ny * scale)), target_resolution.height); + } else { + new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); + new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + } switch (algo) { case RESIZE_ALGO_BILINEAR: @@ -91,9 +98,14 @@ struct img_tool { // fill dst with pad_color fill(dst, pad_color); - int offset_x = (target_resolution.width - new_width) / 2; - int offset_y = (target_resolution.height - new_height) / 2; - + int offset_x, offset_y; + if (padding == PAD_NEAREST) { + offset_x = static_cast(std::round((target_resolution.width - new_width) / 2.0f)); + offset_y = static_cast(std::round((target_resolution.height - new_height) / 2.0f)); + } else { + offset_x = (target_resolution.width - new_width) / 2; + offset_y = (target_resolution.height - new_height) / 2; + } composite(dst, resized_image, offset_x, offset_y); } } @@ -356,10 +368,10 @@ private: GGML_ASSERT(inSize > 0 && outSize > 0); double support, scale, filterscale; double center, ww, ss; - int xx, x, ksize, xmin, xmax, xcnt; + int xx, x, ksize, xmin, xmax; // Calculate scaling factor: ratio of input range to output size - filterscale = scale = (double)inSize / outSize; + filterscale = scale = static_cast(inSize) / outSize; // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness // For downsampling (scale > 1), widen filter to prevent aliasing if (filterscale < 1.0) { @@ -373,6 +385,7 @@ private: std::vector pre_weights(outSize * ksize); // Temporary weights bounds.resize(outSize * 2); + // For each output pixel, compute its filter coefficients for (xx = 0; xx < outSize; xx++) { // Calculate the center position in input space (pixel-center convention: +0.5) @@ -391,10 +404,10 @@ private: xmax = inSize; } - xcnt = xmax - xmin; + xmax -= xmin; // Compute filter weights for each contributing input pixel - for (x = 0; x < xcnt; x++) { + for (x = 0; x < xmax; x++) { // Distance from input pixel center to output pixel center in input space double w = bicubic_filter((x + xmin - center + 0.5) * ss); pre_weights[xx * ksize + x] = w; @@ -402,7 +415,7 @@ private: } // Normalize weights to sum to 1.0 (preserves brightness) - for (x = 0; x < xcnt; x++) { + for (x = 0; x < xmax; x++) { if (ww != 0.0) { pre_weights[xx * ksize + x] /= ww; } @@ -415,18 +428,27 @@ private: // Store input pixel range for this output pixel bounds[xx * 2 + 0] = xmin; - bounds[xx * 2 + 1] = xcnt; + bounds[xx * 2 + 1] = xmax; } // Convert floating-point coefficients to fixed-point integers // Formula: int32 = round(float * 2^PRECISION_BITS) weights.resize(outSize * ksize); + + const double fxp_scale = std::ldexp(1.0, PRECISION_BITS); // 1.0 * 2^PRECISION_BITS + for (int i = 0; i < outSize * ksize; i++) { + double tmp_val = pre_weights[i] * fxp_scale; if (pre_weights[i] < 0) { - weights[i] = static_cast(-0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + tmp_val -= 0.5; } else { - weights[i] = static_cast(0.5 + pre_weights[i] * (1 << PRECISION_BITS)); + tmp_val += 0.5; } + tmp_val = std::round(tmp_val); + tmp_val = std::clamp(tmp_val, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max())); + weights[i] = static_cast(tmp_val); } return ksize; @@ -1083,35 +1105,31 @@ bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, cli // bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { - const std::vector native_resolutions = { - /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */ - }; - // original image size - const clip_image_size original_size{img.nx, img.ny}; - const int orig_w = original_size.width; - const int orig_h = original_size.height; - const int orig_area = orig_h * orig_w; + static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; + // TODO: support 512 (tiny) and 640 (small) once we have eval data for them - size_t mode_i = 0; - int min_diff = orig_area; + const int64_t orig_area = static_cast(img.nx) * img.ny; - for (size_t i = 0; i < native_resolutions.size(); i++) { - int r = native_resolutions[i]; - if (std::abs(orig_area - r * r) < min_diff) { - mode_i = i; - min_diff = std::abs(orig_area - r * r); + size_t mode_i = 0; + int64_t min_diff = std::numeric_limits::max(); + for (size_t i = 0; i < std::size(native_resolutions); i++) { + const int64_t r = native_resolutions[i]; + const int64_t diff = std::abs(orig_area - r * r); + if (diff < min_diff) { + mode_i = i; + min_diff = diff; } } - - /* Native Resolution (Base/Large) */ const int image_size = native_resolutions[mode_i]; - // scaled and padded image - clip_image_u8_ptr scaled_img(clip_image_u8_init()); - img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, hparams.image_resize_algo); + // Aspect-preserving fit-and-pad. Pillow bicubic + PAD_NEAREST for + // byte-parity with the upstream deepseek-ai/DeepSeek-OCR HF preprocessor. + clip_image_u8 padded; + img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW, + PAD_NEAREST, hparams.image_pad_color); clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(*scaled_img, *res, hparams.image_mean, hparams.image_std); + img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std); output.entries.push_back(std::move(res)); output.grid_x = 1; @@ -1246,7 +1264,7 @@ clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 std::max(1, static_cast(std::floor(resized.ny * scale))), }; clip_image_u8 scaled; - img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false); + img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, PAD_NONE); resized = std::move(scaled); } @@ -1347,7 +1365,7 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip clip_image_u8 img_for_crop = prepared; if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) { clip_image_u8 refined; - img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false); + img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, PAD_NONE); img_for_crop = std::move(refined); } diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index ce047d114..63b7e4d05 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -493,7 +493,6 @@ struct mtmd_context { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); } break; - case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_HUNYUANVL: { // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary diff --git a/tools/mtmd/tests/test-1-ground-truth.txt b/tools/mtmd/tests/test-1-ground-truth.txt new file mode 100644 index 000000000..fd85b6485 --- /dev/null +++ b/tools/mtmd/tests/test-1-ground-truth.txt @@ -0,0 +1,24 @@ + + A Powdery Surface + Is Closely Explored + +By JOHN NOBLE WILFORD +Special to The New York Times + +HOUSTON, Monday, July 21—Men have landed and walked on the moon. + +Two Americans, astronauts of Apollo 11, steered their fragile four-legged lunar module safely and smoothly to the historic landing yesterday at 4:17:40 P.M., Eastern daylight time. + +Neil A. Armstrong, the 38-year-old civilian commander, radioed to earth and the mission control room here: + +"Houston, Tranquility Base here. The Eagle has landed." + +The first men to reach the moon—Mr. Armstrong and his co-pilot, Col. Edwin E. Aldrin Jr. of the Air Force—brought their ship to rest on a level, rock-strewn plain near the southwestern shore of the arid Sea of Tranquility. + +About six and a half hours later, Mr. Armstrong opened the landing craft's hatch, stepped slowly down the ladder and declared as he planted the first human footprint on the lunar crust: + +"That's one small step for man, one giant leap for mankind." + +His first step on the moon came at 10:56:20 P.M., as a television camera outside the craft transmitted his every move to an awed and excited audience of hundreds of millions of people on earth. + +Tentative Steps Test Soil diff --git a/tools/perplexity/main.cpp b/tools/perplexity/main.cpp new file mode 100644 index 000000000..13a9940e9 --- /dev/null +++ b/tools/perplexity/main.cpp @@ -0,0 +1,5 @@ +int llama_perplexity(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_perplexity(argc, argv); +} diff --git a/tools/quantize/main.cpp b/tools/quantize/main.cpp new file mode 100644 index 000000000..fc247190c --- /dev/null +++ b/tools/quantize/main.cpp @@ -0,0 +1,5 @@ +int llama_quantize(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_quantize(argc, argv); +} diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index f2709497e..154b2a9ae 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -489,7 +489,10 @@ static bool parse_layer_prune(const char * data, std::vector & prune_layers return true; } -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_quantize(int argc, char ** argv); + +int llama_quantize(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); if (argc < 3) { usage(argv[0]); diff --git a/tools/server/main.cpp b/tools/server/main.cpp new file mode 100644 index 000000000..7f17c56a8 --- /dev/null +++ b/tools/server/main.cpp @@ -0,0 +1,5 @@ +int llama_server(int argc, char ** argv); + +int main(int argc, char ** argv) { + return llama_server(argc, argv); +} diff --git a/tools/server/server.cpp b/tools/server/server.cpp index c82f11794..4d56d45e8 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -71,7 +71,10 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t }; } -int main(int argc, char ** argv) { +// satisfies -Wmissing-declarations +int llama_server(int argc, char ** argv); + +int llama_server(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); // own arguments required by this example diff --git a/tools/ui/scripts/dev.sh b/tools/ui/scripts/dev.sh index 9256f255a..7e1d3c15e 100644 --- a/tools/ui/scripts/dev.sh +++ b/tools/ui/scripts/dev.sh @@ -11,24 +11,28 @@ cd ../../ +# Ensure node_modules are installed +if [ ! -d "tools/ui/node_modules" ]; then + echo "📦 Installing npm dependencies..." + cd tools/ui && npm install && cd ../../ +fi + # Check and install git hooks if missing check_and_install_hooks() { local hooks_missing=false # Check for required hooks - if [ ! -f ".git/hooks/pre-commit" ] || [ ! -f ".git/hooks/pre-push" ] || [ ! -f ".git/hooks/post-push" ]; then + if [ ! -f ".git/hooks/pre-commit" ] || [ ! -f ".git/hooks/pre-push" ]; then hooks_missing=true fi if [ "$hooks_missing" = true ]; then echo "🔧 Git hooks missing, installing them..." - cd tools/ui - if bash scripts/install-git-hooks.sh; then + if bash "$(dirname "$0")/git-hooks/install.sh"; then echo "✅ Git hooks installed successfully" else echo "⚠️ Failed to install git hooks, continuing anyway..." fi - cd ../../ else echo "✅ Git hooks already installed" fi diff --git a/tools/ui/scripts/git-hooks/install.sh b/tools/ui/scripts/git-hooks/install.sh new file mode 100755 index 000000000..1a17dcad6 --- /dev/null +++ b/tools/ui/scripts/git-hooks/install.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# +# Install git hooks for llama-ui +# Copies pre-commit and pre-push hooks into the repo's .git/hooks directory. + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)" +HOOKS_DIR="$REPO_ROOT/$(cd "$REPO_ROOT" && git rev-parse --git-path hooks)" + +# Verify package.json exists +if [ ! -f "$REPO_ROOT/tools/ui/package.json" ]; then + echo "❌ package.json not found in tools/ui" + exit 1 +fi + +echo "Installing git hooks for llama-ui..." + +for hook in pre-commit pre-push; do + src="$SCRIPT_DIR/${hook}.sh" + dst="$HOOKS_DIR/$hook" + + if cp "$src" "$dst" && chmod +x "$dst"; then + echo " ✅ $hook" + else + echo " ❌ Failed to install $hook" + exit 1 + fi +done + +echo "" +echo "Pre-commit: format (staged) + type-check" +echo "Pre-push: lint + test" +echo "" +echo "Hooks stash unstaged changes temporarily and restore them after." +echo "Skip with: git commit --no-verify / git push --no-verify" diff --git a/tools/ui/scripts/git-hooks/pre-commit.sh b/tools/ui/scripts/git-hooks/pre-commit.sh new file mode 100755 index 000000000..1fa83efde --- /dev/null +++ b/tools/ui/scripts/git-hooks/pre-commit.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# +# Pre-commit hook for llama-ui +# Runs: format (staged files only) + type-check +# Stashes unstaged changes temporarily and restores them after. + +# Only run when there are staged changes in tools/ui/ +if ! git diff --cached --name-only | grep -q "^tools/ui/"; then + exit 0 +fi + +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT/tools/ui" + +# Check that node_modules exists +if [ ! -d "node_modules" ]; then + echo "❌ node_modules not found. Run 'npm install' first." + exit 1 +fi + +# Stash unstaged changes in tools/ui/ so they don't interfere +stash_name="pi-ui-precommit" +git stash push --keep-index -u -m "$stash_name" -- tools/ui/ 2>/dev/null || true + +echo "Running pre-commit checks for llama-ui..." + +# Format only staged files +staged_ui=$(git diff --cached --name-only -- tools/ui/) +if [ -n "$staged_ui" ]; then + echo "$staged_ui" | xargs npx --no-install prettier --write + format_ok=$? + # Re-stage formatted files + git add tools/ui/ +else + format_ok=0 +fi + +# Type-check the clean tree +npm run check +check_ok=$? + +# Restore stashed changes +if git stash list | grep -q "$stash_name"; then + git stash pop 2>/dev/null || true +fi + +if [ $format_ok -ne 0 ]; then + echo "❌ Format failed" + exit 1 +fi +if [ $check_ok -ne 0 ]; then + echo "❌ Type check failed" + exit 1 +fi + +echo "✅ Pre-commit checks passed" +exit 0 diff --git a/tools/ui/scripts/git-hooks/pre-push.sh b/tools/ui/scripts/git-hooks/pre-push.sh new file mode 100755 index 000000000..953d3a224 --- /dev/null +++ b/tools/ui/scripts/git-hooks/pre-push.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Pre-push hook for llama-ui +# Runs: lint + test +# Ignores unstaged changes (stashes them temporarily and restores after). + +needs_check=false + +# Read refs from stdin: local_ref local_sha remote_ref remote_sha +while read local_ref local_sha remote_ref remote_sha; do + # New branch or force-push — always check + if [ "$local_sha" = "0000000000000000000000000000000000000000" ] || \ + [ "$remote_sha" = "0000000000000000000000000000000000000000" ]; then + needs_check=true + continue + fi + + # Check for changes in tools/ui/ between remote and local + if git diff --name-only "$remote_sha...$local_sha" -- tools/ui/ | grep -q .; then + needs_check=true + fi +done + +if [ "$needs_check" = false ]; then + exit 0 +fi + +REPO_ROOT=$(git rev-parse --show-toplevel) +cd "$REPO_ROOT/tools/ui" + +# Check that node_modules exists +if [ ! -d "node_modules" ]; then + echo "❌ node_modules not found. Run 'npm install' first." + exit 1 +fi + +# Stash unstaged changes so they don't interfere with checks +stash_name="pi-ui-prepush" +git stash push -u -m "$stash_name" -- tools/ui/ 2>/dev/null || true + +echo "Running pre-push checks for llama-ui..." + +# Lint +npm run lint +lint_ok=$? + +# Test +npm test +test_ok=$? + +# Restore stashed changes +if git stash list | grep -q "$stash_name"; then + git stash pop 2>/dev/null || true +fi + +if [ $lint_ok -ne 0 ]; then + echo "❌ Lint failed" + exit 1 +fi +if [ $test_ok -ne 0 ]; then + echo "❌ Tests failed" + exit 1 +fi + +echo "✅ Pre-push checks passed" +exit 0 diff --git a/tools/ui/scripts/install-git-hooks.sh b/tools/ui/scripts/install-git-hooks.sh deleted file mode 100755 index 213feb08d..000000000 --- a/tools/ui/scripts/install-git-hooks.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# Script to install pre-commit hook for llama-ui -# Pre-commit: formats, checks, and builds the UI app - -REPO_ROOT=$(git rev-parse --show-toplevel) -PRE_COMMIT_HOOK="$REPO_ROOT/.git/hooks/pre-commit" - -echo "Installing pre-commit hook for llama-ui..." - -# Create the pre-commit hook -cat > "$PRE_COMMIT_HOOK" << 'EOF' -#!/bin/bash - -# Check if there are any changes in the tools/ui directory -if git diff --cached --name-only | grep -q "^tools/ui/"; then - REPO_ROOT=$(git rev-parse --show-toplevel) - cd "$REPO_ROOT/tools/ui" - - # Check if package.json exists - if [ ! -f "package.json" ]; then - echo "Error: package.json not found in tools/ui" - exit 1 - fi - - echo "Formatting and checking llama-ui code..." - - # Run the format command - npm run format - if [ $? -ne 0 ]; then - echo "Error: npm run format failed" - exit 1 - fi - - # Run the lint command - npm run lint - if [ $? -ne 0 ]; then - echo "Error: npm run lint failed" - exit 1 - fi - - # Run the check command - npm run check - if [ $? -ne 0 ]; then - echo "Error: npm run check failed" - exit 1 - fi - - echo "✅ llama-ui code formatted and checked successfully" - - # Build the llama-ui - echo "Building llama-ui..." - npm run build - if [ $? -ne 0 ]; then - echo "❌ npm run build failed" - exit 1 - fi - - echo "✅ llama-ui built successfully" -fi - -exit 0 -EOF - -# Make hook executable -chmod +x "$PRE_COMMIT_HOOK" - -if [ $? -eq 0 ]; then - echo "✅ Git hook installed successfully!" - echo " Pre-commit: $PRE_COMMIT_HOOK" - echo "" - echo "The hook will automatically:" - echo " • Format, lint and check llama-ui code before commits" - echo " • Build llama-ui" -else - echo "❌ Failed to make hook executable" - exit 1 -fi diff --git a/tools/ui/src/lib/constants/image-size.ts b/tools/ui/src/lib/constants/image-size.ts new file mode 100644 index 000000000..0d0c4e484 --- /dev/null +++ b/tools/ui/src/lib/constants/image-size.ts @@ -0,0 +1 @@ +export const MEGAPIXELS_TO_PIXELS = 1_000_000; \ No newline at end of file diff --git a/tools/ui/src/lib/constants/settings-keys.ts b/tools/ui/src/lib/constants/settings-keys.ts index 92a57f88a..53243992f 100644 --- a/tools/ui/src/lib/constants/settings-keys.ts +++ b/tools/ui/src/lib/constants/settings-keys.ts @@ -18,6 +18,7 @@ export const SETTINGS_KEYS = { TITLE_GENERATION_USE_FIRST_LINE: 'titleGenerationUseFirstLine', TITLE_GENERATION_USE_LLM: 'titleGenerationUseLLM', TITLE_GENERATION_PROMPT: 'titleGenerationPrompt', + MAX_IMAGE_RESOLUTION: 'maxImageMPixels', // Display SHOW_MESSAGE_STATS: 'showMessageStats', SHOW_THOUGHT_IN_PROGRESS: 'showThoughtInProgress', diff --git a/tools/ui/src/lib/constants/settings-registry.ts b/tools/ui/src/lib/constants/settings-registry.ts index 93b3cd5ed..efef18fde 100644 --- a/tools/ui/src/lib/constants/settings-registry.ts +++ b/tools/ui/src/lib/constants/settings-registry.ts @@ -193,6 +193,14 @@ const SETTINGS_REGISTRY: Record = { defaultValue: TITLE_GENERATION.DEFAULT_PROMPT, type: SettingsFieldType.TEXTAREA, section: SETTINGS_SECTION_SLUGS.GENERAL + }, + { + key: SETTINGS_KEYS.MAX_IMAGE_RESOLUTION, + label: 'Maximum image resolution (megapixels)', + help: 'Images larger than this will be resized before sending to server. Set to 0 to disable.', + defaultValue: 0, + type: SettingsFieldType.INPUT, + section: SETTINGS_SECTION_SLUGS.GENERAL } ] }, diff --git a/tools/ui/src/lib/constants/uri-template.ts b/tools/ui/src/lib/constants/uri-template.ts index dc834aca2..9b44d6ed3 100644 --- a/tools/ui/src/lib/constants/uri-template.ts +++ b/tools/ui/src/lib/constants/uri-template.ts @@ -55,3 +55,6 @@ export const VARIABLE_PREFIX_MODIFIER_REGEX = /:[\d]+$/; /** Regex to strip one or more leading slashes */ export const LEADING_SLASHES_REGEX = /^\/+/; + +/** Regex to match base64-encoded image URIs (format: "data:image/[media type];base64,[data]")*/ +export const BASE64_IMAGE_URI_REGEX = /^data:(image\/[a-z0-9.\-+]+);base64/; diff --git a/tools/ui/src/lib/enums/files.enums.ts b/tools/ui/src/lib/enums/files.enums.ts index 5aef3955e..2f583d52e 100644 --- a/tools/ui/src/lib/enums/files.enums.ts +++ b/tools/ui/src/lib/enums/files.enums.ts @@ -183,6 +183,10 @@ export enum MimeTypeAudio { MP3 = 'audio/mp3', MP4 = 'audio/mp4', WAV = 'audio/wav', + WAVE = 'audio/wave', + X_WAV = 'audio/x-wav', + X_WAVE = 'audio/x-wave', + X_PN_WAV = 'audio/x-pn-wav', WEBM = 'audio/webm', WEBM_OPUS = 'audio/webm;codecs=opus' } diff --git a/tools/ui/src/lib/services/chat.service.ts b/tools/ui/src/lib/services/chat.service.ts index 0a2b795b8..ae29206e2 100644 --- a/tools/ui/src/lib/services/chat.service.ts +++ b/tools/ui/src/lib/services/chat.service.ts @@ -5,12 +5,15 @@ import { ATTACHMENT_LABEL_PDF_FILE, ATTACHMENT_LABEL_MCP_PROMPT, ATTACHMENT_LABEL_MCP_RESOURCE, - LEGACY_AGENTIC_REGEX + LEGACY_AGENTIC_REGEX, + SETTINGS_KEYS } from '$lib/constants'; import { AttachmentType, ContentPartType, + FileTypeAudio, MessageRole, + MimeTypeAudio, ReasoningFormat, UrlProtocol } from '$lib/enums'; @@ -19,8 +22,31 @@ import type { ApiChatMessageData, ApiChatCompletionToolCall } from '$lib/types/api'; -import type { DatabaseMessageExtraMcpPrompt, DatabaseMessageExtraMcpResource } from '$lib/types'; +import type { + AudioInputFormat, + DatabaseMessageExtraMcpPrompt, + DatabaseMessageExtraMcpResource +} from '$lib/types'; import { modelsStore } from '$lib/stores/models.svelte'; +import { settingsStore } from '../stores/settings.svelte'; +import { capImageDataURLSize } from '../utils/cap-img-size'; +import { MEGAPIXELS_TO_PIXELS } from '$lib/constants/image-size'; + +function getAudioInputFormat(mimeType: string): AudioInputFormat { + const normalizedMimeType = mimeType.trim().toLowerCase(); + + if ( + normalizedMimeType === MimeTypeAudio.WAV || + normalizedMimeType === MimeTypeAudio.WAVE || + normalizedMimeType === MimeTypeAudio.X_WAV || + normalizedMimeType === MimeTypeAudio.X_WAVE || + normalizedMimeType === MimeTypeAudio.X_PN_WAV + ) { + return FileTypeAudio.WAV; + } + + return FileTypeAudio.MP3; +} export class ChatService { /** @@ -134,26 +160,28 @@ export class ChatService { continueFinalMessage } = options; - const normalizedMessages: ApiChatMessageData[] = messages - .map((msg) => { - if ('id' in msg && 'convId' in msg && 'timestamp' in msg) { - const dbMsg = msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }; + const normalizedMessages: ApiChatMessageData[] = ( + await Promise.all( + messages.map((msg) => { + if ('id' in msg && 'convId' in msg && 'timestamp' in msg) { + const dbMsg = msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }; - return ChatService.convertDbMessageToApiChatMessageData(dbMsg); - } else { - return msg as ApiChatMessageData; - } - }) - .filter((msg) => { - // Filter out empty system messages - if (msg.role === MessageRole.SYSTEM) { - const content = typeof msg.content === 'string' ? msg.content : ''; + return ChatService.convertDbMessageToApiChatMessageData(dbMsg); + } else { + return msg as ApiChatMessageData; + } + }) + ) + ).filter((msg: { role: ChatRole; content: string | ApiChatMessageContentPart[] }) => { + // Filter out empty system messages + if (msg.role === MessageRole.SYSTEM) { + const content = typeof msg.content === 'string' ? msg.content : ''; - return content.trim().length > 0; - } + return content.trim().length > 0; + } - return true; - }); + return true; + }); // Filter out image attachments if the model doesn't support vision if (options.model && !modelsStore.modelSupportsVision(options.model)) { @@ -382,25 +410,27 @@ export class ChatService { excludeReasoning?: boolean, signal?: AbortSignal ): Promise { - const normalizedMessages: ApiChatMessageData[] = messages - .map((msg) => { - if ('id' in msg && 'convId' in msg && 'timestamp' in msg) { - return ChatService.convertDbMessageToApiChatMessageData( - msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] } - ); - } + const normalizedMessages: ApiChatMessageData[] = ( + await Promise.all( + messages.map((msg) => { + if ('id' in msg && 'convId' in msg && 'timestamp' in msg) { + return ChatService.convertDbMessageToApiChatMessageData( + msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] } + ); + } - return msg as ApiChatMessageData; - }) - .filter((msg) => { - if (msg.role === MessageRole.SYSTEM) { - const content = typeof msg.content === 'string' ? msg.content : ''; + return msg as ApiChatMessageData; + }) + ) + ).filter((msg: { role: ChatRole; content: string | ApiChatMessageContentPart[] }) => { + if (msg.role === MessageRole.SYSTEM) { + const content = typeof msg.content === 'string' ? msg.content : ''; - return content.trim().length > 0; - } + return content.trim().length > 0; + } - return true; - }); + return true; + }); const requestBody: Record = { messages: normalizedMessages.map((msg: ApiChatMessageData) => { @@ -787,9 +817,9 @@ export class ChatService { * @returns {ApiChatMessageData} object formatted for the chat completion API * @static */ - static convertDbMessageToApiChatMessageData( + static async convertDbMessageToApiChatMessageData( message: DatabaseMessage & { extra?: DatabaseMessageExtra[] } - ): ApiChatMessageData { + ): Promise { // Handle tool result messages (role: 'tool') if (message.role === MessageRole.TOOL && message.toolCallId) { return { @@ -867,9 +897,14 @@ export class ChatService { ); for (const image of imageFiles) { + const maxImageResolution = settingsStore.getConfig(SETTINGS_KEYS.MAX_IMAGE_RESOLUTION); + let base64Url = image.base64Url; + if (maxImageResolution > 1 / MEGAPIXELS_TO_PIXELS) { + base64Url = await capImageDataURLSize(image.base64Url, maxImageResolution); + } contentParts.push({ type: ContentPartType.IMAGE_URL, - image_url: { url: image.base64Url } + image_url: { url: base64Url } }); } @@ -883,7 +918,7 @@ export class ChatService { type: ContentPartType.INPUT_AUDIO, input_audio: { data: audio.base64Data, - format: audio.mimeType.includes('wav') ? 'wav' : 'mp3' + format: getAudioInputFormat(audio.mimeType) } }); } diff --git a/tools/ui/src/lib/stores/agentic.svelte.ts b/tools/ui/src/lib/stores/agentic.svelte.ts index e8c0cc523..4866995b4 100644 --- a/tools/ui/src/lib/stores/agentic.svelte.ts +++ b/tools/ui/src/lib/stores/agentic.svelte.ts @@ -416,21 +416,23 @@ class AgenticStore { console.log(`[AgenticStore] Starting agentic flow with ${tools.length} tools`); - const normalizedMessages: ApiChatMessageData[] = messages - .map((msg) => { - if ('id' in msg && 'convId' in msg && 'timestamp' in msg) - return ChatService.convertDbMessageToApiChatMessageData( - msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] } - ); - return msg as ApiChatMessageData; - }) - .filter((msg) => { - if (msg.role === MessageRole.SYSTEM) { - const content = typeof msg.content === 'string' ? msg.content : ''; - return content.trim().length > 0; - } - return true; - }); + const normalizedMessages: ApiChatMessageData[] = ( + await Promise.all( + messages.map((msg) => { + if ('id' in msg && 'convId' in msg && 'timestamp' in msg) + return ChatService.convertDbMessageToApiChatMessageData( + msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] } + ); + return msg as ApiChatMessageData; + }) + ) + ).filter((msg: { role: ChatRole; content: string | ApiChatMessageContentPart[] }) => { + if (msg.role === MessageRole.SYSTEM) { + const content = typeof msg.content === 'string' ? msg.content : ''; + return content.trim().length > 0; + } + return true; + }); this.updateSession(conversationId, { isRunning: true, diff --git a/tools/ui/src/lib/types/api.d.ts b/tools/ui/src/lib/types/api.d.ts index 5f0a38dd3..c78035180 100644 --- a/tools/ui/src/lib/types/api.d.ts +++ b/tools/ui/src/lib/types/api.d.ts @@ -1,6 +1,8 @@ -import type { ContentPartType, ServerModelStatus, ServerRole } from '$lib/enums'; +import type { ContentPartType, FileTypeAudio, ServerModelStatus, ServerRole } from '$lib/enums'; import type { ChatMessagePromptProgress, ChatRole } from './chat'; +export type AudioInputFormat = FileTypeAudio.WAV | FileTypeAudio.MP3; + export interface ApiChatCompletionToolFunction { name: string; description?: string; @@ -20,7 +22,7 @@ export interface ApiChatMessageContentPart { }; input_audio?: { data: string; - format: 'wav' | 'mp3'; + format: AudioInputFormat; }; input_video?: { data: string; diff --git a/tools/ui/src/lib/types/index.ts b/tools/ui/src/lib/types/index.ts index d704a4b3c..0eb1e6701 100644 --- a/tools/ui/src/lib/types/index.ts +++ b/tools/ui/src/lib/types/index.ts @@ -29,7 +29,8 @@ export type { ApiRouterModelsStatusResponse, ApiRouterModelsListResponse, ApiRouterModelsUnloadRequest, - ApiRouterModelsUnloadResponse + ApiRouterModelsUnloadResponse, + AudioInputFormat } from './api'; // Chat types diff --git a/tools/ui/src/lib/utils/cap-img-size.ts b/tools/ui/src/lib/utils/cap-img-size.ts new file mode 100644 index 000000000..fa2af53f6 --- /dev/null +++ b/tools/ui/src/lib/utils/cap-img-size.ts @@ -0,0 +1,73 @@ +import { MEGAPIXELS_TO_PIXELS } from '$lib/constants/image-size'; +import { BASE64_IMAGE_URI_REGEX } from '$lib/constants/uri-template'; +import { MimeTypeImage } from '$lib/enums'; + +/** + * Converts an Image base64 data URL to another Image data URL with capped dimensions to reduce file size. + * @param base64UrlImage - The Image base64 data URL to convert + * @param maxMegapixels - The maximum image size in megapixels for the output Image + * @returns Promise resolving to Image data URL + */ +export function capImageDataURLSize( + base64UrlImage: string, + maxMegapixels: number +): Promise { + return new Promise((resolve, reject) => { + try { + + const mimeMatch = base64UrlImage.match(BASE64_IMAGE_URI_REGEX); + + if (!mimeMatch) { + return reject(new Error('Invalid data URL format.')); + } + + const mimeType = mimeMatch[1] as MimeTypeImage; + + if (!Object.values(MimeTypeImage).includes(mimeType)) { + return reject(new Error(`Unsupported image MIME type: ${mimeType}`)); + } + + const img = new Image(); + + img.onload = () => { + try { + const canvas = document.createElement('canvas'); + const ctx = canvas.getContext('2d'); + + if (!ctx) { + throw new Error('Failed to get 2D canvas context.'); + } + + const targetWidth = img.naturalWidth; + const targetHeight = img.naturalHeight; + const totalPixels = targetWidth * targetHeight; + const maxPixels = Math.floor(maxMegapixels * MEGAPIXELS_TO_PIXELS); + + if (maxPixels > 0 && totalPixels > maxPixels) { + const scaleFactor = Math.sqrt(maxPixels / totalPixels); + canvas.width = Math.floor(targetWidth * scaleFactor); + canvas.height = Math.floor(targetHeight * scaleFactor); + } else { + return resolve(base64UrlImage); + } + + ctx.drawImage(img, 0, 0, canvas.width, canvas.height); + resolve(canvas.toDataURL(mimeType)); + } catch (err) { + reject(err instanceof Error ? err : new Error(String(err))); + } + }; + + img.onerror = () => { + reject(new Error('Failed to load image.')); + }; + + img.src = base64UrlImage; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + const errorMessage = `Error resizing image: ${message}`; + console.error(errorMessage, error); + reject(new Error(errorMessage)); + } + }); +} diff --git a/tools/ui/src/lib/utils/file-type.ts b/tools/ui/src/lib/utils/file-type.ts index ae814e805..7495163d1 100644 --- a/tools/ui/src/lib/utils/file-type.ts +++ b/tools/ui/src/lib/utils/file-type.ts @@ -18,8 +18,12 @@ import { MimeTypeText } from '$lib/enums'; +function normalizeMimeType(mimeType: string): string { + return mimeType.trim().toLowerCase(); +} + export function getFileTypeCategory(mimeType: string): FileTypeCategory | null { - switch (mimeType) { + switch (normalizeMimeType(mimeType)) { // Images case MimeTypeImage.JPEG: case MimeTypeImage.PNG: @@ -33,6 +37,10 @@ export function getFileTypeCategory(mimeType: string): FileTypeCategory | null { case MimeTypeAudio.MP3: case MimeTypeAudio.MP4: case MimeTypeAudio.WAV: + case MimeTypeAudio.WAVE: + case MimeTypeAudio.X_WAV: + case MimeTypeAudio.X_WAVE: + case MimeTypeAudio.X_PN_WAV: case MimeTypeAudio.WEBM: case MimeTypeAudio.WEBM_OPUS: return FileTypeCategory.AUDIO;