diff --git a/.editorconfig b/.editorconfig index 316448c7e..47111c72d 100644 --- a/.editorconfig +++ b/.editorconfig @@ -49,6 +49,6 @@ charset = unset trim_trailing_whitespace = unset insert_final_newline = unset -[tools/mtmd/miniaudio.h] +[tools/mtmd/vendor/miniaudio.h] trim_trailing_whitespace = unset insert_final_newline = unset diff --git a/CMakeLists.txt b/CMakeLists.txt index 90ecd1f04..ad677a60b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -440,7 +440,7 @@ add_library(ggml ggml/include/gguf.h ggml/src/gguf.cpp ${GGML_SOURCES_CUDA}) -target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools) +target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) @@ -449,7 +449,7 @@ target_compile_options(ggml PRIVATE $<$:-use_fast_math -e add_library(ggml_v1 otherarch/ggml_v1.c otherarch/ggml_v1.h) -target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools) +target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor) target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON) @@ -459,7 +459,7 @@ add_library(ggml_v2 otherarch/ggml_v2.h ${GGML_V2_CUDA_SOURCES} ${GGML_V2_LEGACY_CUDA_SOURCES}) -target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools) +target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor) target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON) @@ -468,7 +468,7 @@ add_library(ggml_v3 otherarch/ggml_v3.c otherarch/ggml_v3.h ${GGML_V3_CUDA_SOURCES}) -target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools) +target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor) target_compile_features(ggml_v3 PUBLIC c_std_11) # don't bump target_link_libraries(ggml_v3 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) set_target_properties(ggml_v3 PROPERTIES POSITION_INDEPENDENT_CODE ON) @@ -487,42 +487,42 @@ add_library(common2 src/unicode-data.cpp otherarch/utils.cpp otherarch/utils.h) -target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) +target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) target_compile_features(common2 PUBLIC cxx_std_17) # don't bump target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(sdtype_adapter otherarch/sdcpp/sdtype_adapter.cpp) -target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) +target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) target_compile_features(sdtype_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(whisper_adapter otherarch/whispercpp/whisper_adapter.cpp) -target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./tools ./common) +target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/whispercpp ./tools ./common) target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(tts_adapter otherarch/tts_adapter.cpp) -target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common) +target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common) target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(embeddings_adapter otherarch/embeddings_adapter.cpp) -target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common) +target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common) target_compile_features(embeddings_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(embeddings_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(embeddings_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(gpttype_adapter gpttype_adapter.cpp) -target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) +target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS}) set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) @@ -530,7 +530,7 @@ set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) if (LLAMA_CUBLAS) set(TARGET koboldcpp_cublas) add_library(${TARGET} SHARED expose.cpp expose.h) - target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) + target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") @@ -550,7 +550,7 @@ endif() if (LLAMA_HIPBLAS) set(TARGET koboldcpp_hipblas) add_library(${TARGET} SHARED expose.cpp expose.h) - target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) + target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump set_target_properties(${TARGET} PROPERTIES PREFIX "") set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas") diff --git a/Makefile b/Makefile index cffbb4773..84c84a61b 100644 --- a/Makefile +++ b/Makefile @@ -51,8 +51,8 @@ ifdef KCPP_DEBUG CFLAGS = -g -O0 CXXFLAGS = -g -O0 endif -CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64 -CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64 +CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64 +CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64 ifndef KCPP_DEBUG CFLAGS += -DNDEBUG -s CXXFLAGS += -DNDEBUG -s diff --git a/common/common.cpp b/common/common.cpp index 1499e301d..09682d5f5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -911,13 +911,16 @@ struct common_init_result common_init_from_params(common_params & params) { ok = false; } - if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__); - ok = false; - } + bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; + bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL; - if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__); + if (!has_eos && !has_sep) { + LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__); + ok = false; + } else if (!has_eos) { + LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__); + } else if (!has_sep) { + LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__); ok = false; } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a015ecee0..227ae7bc2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -423,19 +423,19 @@ class ModelBase: try: # for security reason, we don't allow loading remote code by default # if a model need remote code, we will fallback to config.json - return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict() + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict() except Exception as e: logger.warning(f"Failed to load model config from {dir_model}: {e}") logger.warning("Trying to load config.json instead") with open(dir_model / "config.json", "r", encoding="utf-8") as f: config = json.load(f) - if "llm_config" in config: - # rename for InternVL - config["text_config"] = config["llm_config"] - if "thinker_config" in config: - # rename for Qwen2.5-Omni - config["text_config"] = config["thinker_config"]["text_config"] - return config + if "llm_config" in config: + # rename for InternVL + config["text_config"] = config["llm_config"] + if "thinker_config" in config: + # rename for Qwen2.5-Omni + config["text_config"] = config["thinker_config"]["text_config"] + return config @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: @@ -1207,7 +1207,7 @@ class MmprojModel(ModelBase): self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys)) self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"])) - else: + if not self.has_vision_encoder and not self.has_audio_encoder: raise ValueError("MmprojModel must have either vision or audio encoder") def write_vocab(self): @@ -1841,7 +1841,8 @@ class StableLMModel(TextModel): "MistralForCausalLM", "MixtralForCausalLM", "VLlama3ForCausalLM", - "LlavaForConditionalGeneration") + "LlavaForConditionalGeneration", + "LlamaModel") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True @@ -1921,6 +1922,8 @@ class LlamaModel(TextModel): if is_vision_tensor: return [] # skip vision tensors + elif self.hf_arch == "LlamaModel": + name = "model." + name elif name.startswith("model.text_model"): name = name.replace("text_model.", "") # for SmolVLM elif name.startswith("language_model."): @@ -2169,6 +2172,9 @@ class Llama4VisionModel(MmprojModel): # process vision tensors if "positional_embedding_vlm" in name and ".weight" not in name: name += ".weight" + if "multi_modal_projector.linear_1" in name: + # despite the name with number postfix, this is a single fully connected layer + return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)] return [(self.map_tensor_name(name), data_torch)] return [] @@ -3676,7 +3682,7 @@ class InternLM3Model(TextModel): return [(self.map_tensor_name(name), data_torch)] -@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel") +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification") class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3739,6 +3745,13 @@ class BertModel(TextModel): if name.startswith("cls.seq_relationship"): return [] + # For BertForSequenceClassification (direct projection layer) + if name == "classifier.weight": + name = "classifier.out_proj.weight" + + if name == "classifier.bias": + name = "classifier.out_proj.bias" + return [(self.map_tensor_name(name), data_torch)] def _xlmroberta_tokenizer_init(self) -> None: diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 5993a4c98..c66ec547b 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -288,7 +288,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated") tests = [ "ied 4 ½ months", - "Führer", + "Äpfel", "", " ", " ", diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index a4fbd8236..cfab2b5eb 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results( __builtin_assume(tid < D); extern __shared__ float2 meta[]; - if (tid < 2*parallel_blocks) { - ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid]; + for (int i = tid; i < 2*parallel_blocks; i += D) { + ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i]; } __syncthreads(); diff --git a/ggml/src/ggml-opencl/kernels/argsort.cl b/ggml/src/ggml-opencl/kernels/argsort.cl new file mode 100644 index 000000000..af4adc7b8 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/argsort.cl @@ -0,0 +1,86 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; } + +enum ggml_sort_order { + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, +}; + +kernel void kernel_argsort_f32_i32( + global float * src0, + ulong offset0, + global int * dst, + ulong offsetd, + const int ne00, + const int ne00_pad, + const int order, + local int * dst_row +) { + // bitonic sort + int col = get_local_id(0); + int row = get_group_id(1); + + if (col >= ne00_pad) { + return; + } + + src0 = (global char *)((global char *)src0 + offset0); + dst = (global float *)((global char *)dst + offsetd); + + global float * x_row = src0 + row * ne00; + + // initialize indices + dst_row[col] = col; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 2; k <= ne00_pad; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (dst_row[col] >= ne00 || + (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] > x_row[dst_row[ixj]] : + x_row[dst_row[col]] < x_row[dst_row[ixj]])) + ) { + SWAP(dst_row[col], dst_row[ixj], int); + } + } else { + if (dst_row[ixj] >= ne00 || + (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] < x_row[dst_row[ixj]] : + x_row[dst_row[col]] > x_row[dst_row[ixj]])) + ) { + SWAP(dst_row[col], dst_row[ixj], int); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // copy the result to dst without the padding + if (col < ne00) { + dst[row * ne00 + col] = dst_row[col]; + } +} diff --git a/ggml/src/ggml-opencl/kernels/div.cl b/ggml/src/ggml-opencl/kernels/div.cl new file mode 100644 index 000000000..d453ad99b --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/div.cl @@ -0,0 +1,72 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +//------------------------------------------------------------------------------ +// div +//------------------------------------------------------------------------------ +kernel void kernel_div( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int i03 = get_group_id(2); + int i02 = get_group_id(1); + int i01 = get_group_id(0); + + int i13 = i03 % ne13; + int i12 = i02 % ne12; + int i11 = i01 % ne11; + + global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const int i10 = i0 % ne10; + *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10)); + } +} + +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_div_row( + global float4 * src0, + ulong offset0, + global float4 * src1, + ulong offset1, + global float4 * dst, + ulong offsetd, + int ne +) { + src0 = (global float4*)((global char*)src0 + offset0); + src1 = (global float4*)((global char*)src1 + offset1); + dst = (global float4*)((global char*)dst + offsetd); + + // This performs better than using %. + uint gid = get_global_id(0); + uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne + dst[gid] = src0[gid] / src1[idx1]; +} diff --git a/ggml/src/ggml-opencl/kernels/group_norm.cl b/ggml/src/ggml-opencl/kernels/group_norm.cl new file mode 100644 index 000000000..57c9df4d3 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/group_norm.cl @@ -0,0 +1,72 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +// Workgroup must be a subgroup +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_32 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_group_norm( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd, + int ne, + int group_size, + float eps +) { + src0 = (global float *)((global char *)src0 + offset0); + dst = (global float *)((global char *)dst + offsetd); + + int start = get_group_id(0) * group_size; + int end = start + group_size; + + start += get_local_id(0); + + if (end >= ne) { + end = ne; + } + + float tmp = 0.0f; + + for (int j = start; j < end; j += get_local_size(0)) { + tmp += src0[j]; + } + + tmp = sub_group_reduce_add(tmp); + + const float mean = tmp / group_size; + tmp = 0.0f; + + for (int j = start; j < end; j += get_local_size(0)) { + float xi = src0[j] - mean; + dst[j] = xi; + tmp += xi * xi; + } + + tmp = sub_group_reduce_add(tmp); + + const float variance = tmp / group_size; + const float scale = 1.0f/sqrt(variance + eps); + for (int j = start; j < end; j += get_local_size(0)) { + dst[j] *= scale; + } +} diff --git a/ggml/src/ggml-opencl/kernels/sigmoid.cl b/ggml/src/ggml-opencl/kernels/sigmoid.cl new file mode 100644 index 000000000..e3f669dde --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/sigmoid.cl @@ -0,0 +1,29 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +//------------------------------------------------------------------------------ +// sigmoid +//------------------------------------------------------------------------------ + +kernel void kernel_sigmoid_f32( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd +) { + src0 = (global float*)((global char*)src0 + offset0); + dst = (global float*)((global char*)dst + offsetd); + + dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)])); +} + +kernel void kernel_sigmoid_f16( + global half * src0, + ulong offset0, + global half * dst, + ulong offsetd +) { + src0 = (global half*)((global char*)src0 + offset0); + dst = (global half*)((global char*)dst + offsetd); + + dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)])); +} diff --git a/ggml/src/ggml-opencl/kernels/sub.cl b/ggml/src/ggml-opencl/kernels/sub.cl new file mode 100644 index 000000000..041e88ad3 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/sub.cl @@ -0,0 +1,72 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +//------------------------------------------------------------------------------ +// div +//------------------------------------------------------------------------------ +kernel void kernel_sub( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int i03 = get_group_id(2); + int i02 = get_group_id(1); + int i01 = get_group_id(0); + + int i13 = i03 % ne13; + int i12 = i02 % ne12; + int i11 = i01 % ne11; + + global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const int i10 = i0 % ne10; + *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10)); + } +} + +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_sub_row( + global float4 * src0, + ulong offset0, + global float4 * src1, + ulong offset1, + global float4 * dst, + ulong offsetd, + int ne +) { + src0 = (global float4*)((global char*)src0 + offset0); + src1 = (global float4*)((global char*)src1 + offset1); + dst = (global float4*)((global char*)dst + offsetd); + + // This performs better than using %. + uint gid = get_global_id(0); + uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne + dst[gid] = src0[gid] - src1[idx1]; +} diff --git a/ggml/src/ggml-opencl/kernels/sum_rows.cl b/ggml/src/ggml-opencl/kernels/sum_rows.cl new file mode 100644 index 000000000..c5f7c570f --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/sum_rows.cl @@ -0,0 +1,39 @@ + +kernel void kernel_sum_rows_f32( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb01, + ulong nb02, + ulong nb03, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = (global float *)((global char *)src0 + offset0); + dst = (global float *)((global char *)dst + offsetd); + + int i3 = get_global_id(2); + int i2 = get_global_id(1); + int i1 = get_global_id(0); + + if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { + return; + } + + global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03); + global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3); + + float row_sum = 0; + + for (int i0 = 0; i0 < ne00; i0++) { + row_sum += src_row[i0]; + } + + dst_row[0] = row_sum; +} diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 06c26c9b1..2c5ac809e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -12,7 +12,7 @@ #include "ggml-vulkan.h" #include -#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS) +#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) #include #include "ggml-cpu.h" #endif @@ -200,9 +200,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { #ifdef GGML_VULKAN_MEMORY_DEBUG class vk_memory_logger; #endif -#ifdef GGML_VULKAN_PERF class vk_perf_logger; -#endif static void ggml_vk_destroy_buffer(vk_buffer& buf); static constexpr uint32_t mul_mat_vec_max_cols = 8; @@ -458,9 +456,11 @@ struct vk_device_struct { #ifdef GGML_VULKAN_MEMORY_DEBUG std::unique_ptr memory_logger; #endif -#ifdef GGML_VULKAN_PERF + + // for GGML_VK_PERF_LOGGER std::unique_ptr perf_logger; -#endif + vk::QueryPool query_pool; + uint32_t num_queries; ~vk_device_struct() { VK_LOG_DEBUG("destroy device " << name); @@ -844,8 +844,6 @@ private: #define VK_LOG_MEMORY(msg) ((void) 0) #endif // GGML_VULKAN_MEMORY_DEBUG -#if defined(GGML_VULKAN_PERF) - class vk_perf_logger { public: void print_timings() { @@ -855,7 +853,7 @@ public: for (const auto& time : t.second) { total += time; } - std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl; + std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl; } timings.clear(); @@ -884,7 +882,6 @@ public: private: std::map> timings; }; -#endif // GGML_VULKAN_PERF struct ggml_backend_vk_context { std::string name; @@ -974,6 +971,8 @@ struct vk_instance_t { static bool vk_instance_initialized = false; static vk_instance_t vk_instance; +static bool vk_perf_logger_enabled = false; + #ifdef GGML_VULKAN_CHECK_RESULTS static size_t vk_skip_checks; static size_t vk_output_tensor; @@ -2773,9 +2772,9 @@ static vk_device ggml_vk_get_device(size_t idx) { #ifdef GGML_VULKAN_MEMORY_DEBUG device->memory_logger = std::unique_ptr(new vk_memory_logger()); #endif -#ifdef GGML_VULKAN_PERF - device->perf_logger = std::unique_ptr(new vk_perf_logger()); -#endif + if (vk_perf_logger_enabled) { + device->perf_logger = std::unique_ptr(new vk_perf_logger()); + } size_t dev_num = vk_instance.device_indices[idx]; @@ -3571,6 +3570,8 @@ static void ggml_vk_instance_init() { vk_instance.instance = vk::createInstance(instance_create_info); vk_instance_initialized = true; + vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; + size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan @@ -8909,7 +8910,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod ctx->tensor_ctxs[node_idx] = compute_ctx; -#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF) +#if defined(GGML_VULKAN_CHECK_RESULTS) // Force context reset on each node so that each tensor ends up in its own context // and can be run and compared to its CPU equivalent separately last_node = true; @@ -9529,6 +9530,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool first_node_in_batch = true; // true if next node will be first node in a batch int submit_node_idx = 0; // index to first node in a batch + vk_context compute_ctx; + if (vk_perf_logger_enabled) { + // allocate/resize the query pool + if (ctx->device->num_queries < cgraph->n_nodes + 1) { + if (ctx->device->query_pool) { + ctx->device->device.destroyQueryPool(ctx->device->query_pool); + } + VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO }; + query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP; + query_create_info.queryCount = cgraph->n_nodes + 100; + ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info); + ctx->device->num_queries = query_create_info.queryCount; + } + + ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1); + + GGML_ASSERT(ctx->compute_ctx.expired()); + compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ctx->compute_ctx = compute_ctx; + ggml_vk_ctx_begin(ctx->device, compute_ctx); + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0); + } + // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution. // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB // (and scaled down based on model size, so smaller models submit earlier). @@ -9556,6 +9580,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit); + if (vk_perf_logger_enabled) { + if (ctx->compute_ctx.expired()) { + compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ctx->compute_ctx = compute_ctx; + ggml_vk_ctx_begin(ctx->device, compute_ctx); + } else { + compute_ctx = ctx->compute_ctx.lock(); + } + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1); + } + if (enqueued) { ++submitted_nodes; @@ -9577,9 +9612,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } } -#ifdef GGML_VULKAN_PERF - ctx->device->perf_logger->print_timings(); -#endif + if (vk_perf_logger_enabled) { + // End the command buffer and submit/wait + GGML_ASSERT(!ctx->compute_ctx.expired()); + compute_ctx = ctx->compute_ctx.lock(); + ggml_vk_ctx_end(compute_ctx); + + ggml_vk_submit(compute_ctx, ctx->device->fence); + VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences"); + ctx->device->device.resetFences({ ctx->device->fence }); + + // Get the results and pass them to the logger + std::vector timestamps(cgraph->n_nodes + 1); + ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait); + for (int i = 0; i < cgraph->n_nodes; i++) { + if (!ggml_vk_is_empty(cgraph->nodes[i])) { + ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod)); + } + } + + ctx->device->perf_logger->print_timings(); + } ggml_vk_graph_cleanup(ctx); diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 000ffd006..48167dd64 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -902,7 +902,6 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ_FC: ( "model.connector.modality_projection.proj", # SmolVLM - "multi_modal_projector.linear_1", # llama 4 ), MODEL_TENSOR.V_MMPROJ_MLP: ( diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 86641d660..a1d95b953 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : } int64_t llm_graph_context::n_pos_per_embd() const { - return arch == LLM_ARCH_QWEN2VL ? 4 : 1; + return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1; } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { @@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling( ggml_tensor * inp_cls = build_inp_cls(); inp = ggml_get_rows(ctx0, inp, inp_cls); - // classification head - // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 - GGML_ASSERT(cls != nullptr); - GGML_ASSERT(cls_b != nullptr); + if (cls != nullptr && cls_b != nullptr) { + // classification head + // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b); + cur = ggml_tanh(ctx0, cur); - cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b); - cur = ggml_tanh(ctx0, cur); - - // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 - if (cls_out) { + // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 + if (cls_out) { + GGML_ASSERT(cls_out_b != nullptr); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b); + } + } else if (cls_out) { + // Single layer classification head (direct projection) + // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476 GGML_ASSERT(cls_out_b != nullptr); - - cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b); + } else { + GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b"); } } break; default: diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 56c1939a2..9397ac773 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift( const auto & yarn_beta_slow = cparams.yarn_beta_slow; const auto & n_rot = hparams.n_rot; - const auto & rope_type = hparams.rope_type; + const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE + // @ngxson : this is a workaround + // for M-RoPE, we want to rotate the whole vector when doing KV shift + // a normal RoPE should work, we just need to use the correct ordering + // ref: https://github.com/ggml-org/llama.cpp/pull/13870 + ? LLAMA_ROPE_TYPE_NEOX + : hparams.rope_type; // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. - const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; + const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 + ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) + : cparams.yarn_attn_factor; ggml_tensor * tmp; diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index ae06a695d..4d053895c 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -1,28 +1,5 @@ -// fix problem with std::min and std::max -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include -#endif - #include "mtmd-audio.h" -//#define MTMD_AUDIO_DEBUG - -#define MINIAUDIO_IMPLEMENTATION -#ifndef MTMD_AUDIO_DEBUG -# define MA_NO_ENCODING -#endif -#define MA_NO_DEVICE_IO -#define MA_NO_RESOURCE_MANAGER -#define MA_NO_NODE_GRAPH -#define MA_NO_ENGINE -#define MA_NO_GENERATION -#define MA_API static -#include "miniaudio.h" - #define _USE_MATH_DEFINES // for M_PI #include #include @@ -359,69 +336,6 @@ bool preprocess_audio( } // namespace whisper_preprocessor -namespace audio_helpers { - -bool is_audio_file(const char * buf, size_t len) { - if (len < 12) { - return false; - } - - // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format - // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html - bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0; - bool is_mp3 = len >= 3 && ( - memcmp(buf, "ID3", 3) == 0 || - // Check for MPEG sync word (simplified check) - ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0) - ); - bool is_flac = memcmp(buf, "fLaC", 4) == 0; - - return is_wav || is_mp3 || is_flac; -} - -// returns true if the buffer is a valid audio file -bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector & pcmf32_mono) { - ma_result result; - const int channels = 1; - ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate); - ma_decoder decoder; - - result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder); - if (result != MA_SUCCESS) { - return false; - } - - ma_uint64 frame_count; - ma_uint64 frames_read; - result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count); - if (result != MA_SUCCESS) { - ma_decoder_uninit(&decoder); - return false; - } - - pcmf32_mono.resize(frame_count); - result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read); - if (result != MA_SUCCESS) { - ma_decoder_uninit(&decoder); - return false; - } - -#ifdef MTMD_AUDIO_DEBUG - // save audio to wav file - ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate); - ma_encoder encoder; - ma_encoder_init_file("output.wav", &config, &encoder); - ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read); - ma_encoder_uninit(&encoder); -#endif - - ma_decoder_uninit(&decoder); - return true; -} - -} // namespace wav_utils - - // precalculated mel filter banks // values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function // diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 348d11dca..b7b940aff 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -32,7 +32,7 @@ struct whisper_filters { std::vector data; }; -extern bool preprocess_audio( +bool preprocess_audio( const float * samples, size_t n_samples, const whisper_filters & filters, @@ -40,23 +40,8 @@ extern bool preprocess_audio( } // namespace whisper_preprocessor - -// TODO @ngxson : move this helper to mtmd-helpers.cpp -namespace audio_helpers { - -extern bool is_audio_file(const char * buf, size_t len); - -extern bool decode_audio_from_buf( - const unsigned char * buf_in, - size_t len, - int target_sampler_rate, - std::vector & pcmf32_mono); - -} // namespace audio_helpers - - namespace whisper_precalc_filters { -extern whisper_preprocessor::whisper_filters get_128_bins(); +whisper_preprocessor::whisper_filters get_128_bins(); } // namespace whisper_precalc_filters diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index a70f11ca9..508a64c58 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -7,6 +7,7 @@ #include "console.h" #include "chat.h" #include "mtmd.h" +#include "mtmd-helper.h" #include #include @@ -143,7 +144,7 @@ struct mtmd_cli_context { } bool load_media(const std::string & fname) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str())); if (!bmp.ptr) { return false; } diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index e6c926080..058323818 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -1,10 +1,37 @@ +// fix problem with std::min and std::max +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + #include "mtmd.h" +#include "mtmd-helper.h" #include "llama.h" #include #include #include +//#define MTMD_AUDIO_DEBUG + +#define MINIAUDIO_IMPLEMENTATION +#ifndef MTMD_AUDIO_DEBUG +# define MA_NO_ENCODING +#endif +#define MA_NO_DEVICE_IO +#define MA_NO_RESOURCE_MANAGER +#define MA_NO_NODE_GRAPH +#define MA_NO_ENGINE +#define MA_NO_GENERATION +#define MA_API static +#include "vendor/miniaudio.h" + +#define STB_IMAGE_IMPLEMENTATION +#include "vendor/stb_image.h" + #define LOG_INF(...) fprintf(stdout, __VA_ARGS__) #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__) @@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, return 0; } + +namespace audio_helpers { + +static bool is_audio_file(const char * buf, size_t len) { + if (len < 12) { + return false; + } + + // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format + // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0; + bool is_mp3 = len >= 3 && ( + memcmp(buf, "ID3", 3) == 0 || + // Check for MPEG sync word (simplified check) + ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0) + ); + bool is_flac = memcmp(buf, "fLaC", 4) == 0; + + return is_wav || is_mp3 || is_flac; +} + +// returns true if the buffer is a valid audio file +static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector & pcmf32_mono) { + ma_result result; + const int channels = 1; + ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate); + ma_decoder decoder; + + result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder); + if (result != MA_SUCCESS) { + return false; + } + + ma_uint64 frame_count; + ma_uint64 frames_read; + result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count); + if (result != MA_SUCCESS) { + ma_decoder_uninit(&decoder); + return false; + } + + pcmf32_mono.resize(frame_count); + result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read); + if (result != MA_SUCCESS) { + ma_decoder_uninit(&decoder); + return false; + } + +#ifdef MTMD_AUDIO_DEBUG + // save audio to wav file + ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate); + ma_encoder encoder; + ma_encoder_init_file("output.wav", &config, &encoder); + ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read); + ma_encoder_uninit(&encoder); +#endif + + ma_decoder_uninit(&decoder); + return true; +} + +} // namespace audio_helpers + +mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) { + if (audio_helpers::is_audio_file((const char *)buf, len)) { + std::vector pcmf32; + int bitrate = mtmd_get_audio_bitrate(ctx); + if (bitrate < 0) { + LOG_ERR("This model does not support audio input\n"); + return nullptr; + } + if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) { + LOG_ERR("Unable to read WAV audio file from buffer\n"); + return nullptr; + } + return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); + } + + // otherwise, we assume it's an image + mtmd_bitmap * result = nullptr; + { + int nx, ny, nc; + auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3); + if (!data) { + LOG_ERR("%s: failed to decode image bytes\n", __func__); + return nullptr; + } + result = mtmd_bitmap_init(nx, ny, data); + stbi_image_free(data); + } + return result; +} + +mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) { + std::vector buf; + FILE * f = fopen(fname, "rb"); + if (!f) { + LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); + return nullptr; + } + + fseek(f, 0, SEEK_END); + long file_size = ftell(f); + fseek(f, 0, SEEK_SET); + buf.resize(file_size); + + size_t n_read = fread(buf.data(), 1, file_size, f); + fclose(f); + if (n_read != (size_t)file_size) { + LOG_ERR("Failed to read entire file %s", fname); + return nullptr; + } + + return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); +} diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h new file mode 100644 index 000000000..5c0edc693 --- /dev/null +++ b/tools/mtmd/mtmd-helper.h @@ -0,0 +1,91 @@ +#ifndef MTMD_HELPER_H +#define MTMD_HELPER_H + +#include "ggml.h" +#include "llama.h" +#include "mtmd.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// +// libmtmd helper functions +// +// Please note that these helpers are not guaranteed to be stable. +// BREAKING CHANGES are expected. +// + +// helper function to construct a mtmd_bitmap from a file +// it calls mtmd_helper_bitmap_init_from_buf() internally +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); + +// helper function to construct a mtmd_bitmap from a buffer containing a file +// supported formats: +// image: formats supported by stb_image: jpg, png, bmp, gif, etc. +// audio: formats supported by miniaudio: wav, mp3, flac +// note: audio files will be auto-detected based on magic bytes +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); + +// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache +MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); + +// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past +// normally, n_pos is equal to n_tokens, but for M-RoPE it is different +MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); + +// helper function that automatically: +// 1. run llama_decode() on text chunks +// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() +// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +// otherwise, returns 0 on success +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunks * chunks, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// works like mtmd_helper_eval_chunks(), but only for a single chunk +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// helper function to decode an image whose embeddings have already been calculated +// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention) +// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure +MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + float * encoded_embd, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + llama_pos * new_n_past); + +#ifdef __cplusplus +} // extern "C" +#endif + +// +// C++ wrappers +// + +#endif diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 33bdca078..8573f1143 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) { return ctx->ctx_a != nullptr; } -// these 2 helpers below use internal clip_image_u8_ptr, -// so unfortunately they cannot moved to mtmd-helper.h -// however, in theory, user can decode image file to bitmap using -// whichever library they want, and then use mtmd_bitmap_init() to create bitmap - -mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) { - if (audio_helpers::is_audio_file((const char *)buf, len)) { - std::vector pcmf32; - if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) { - LOG_ERR("Unable to read WAV audio file from buffer\n"); - return nullptr; - } - return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); +int mtmd_get_audio_bitrate(mtmd_context * ctx) { + if (!ctx->ctx_a) { + return -1; } - - clip_image_u8_ptr img_u8(clip_image_u8_init()); - bool ok = clip_image_load_from_bytes(buf, len, img_u8.get(),2048); - if (!ok) { - LOG_ERR("Unable to load image from buffer\n"); - return nullptr; - } - uint32_t nx, ny; - unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny); - return mtmd_bitmap_init(nx, ny, data); -} - -mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) { - std::vector buf; - FILE * f = fopen(fname, "rb"); - if (!f) { - LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); - return nullptr; - } - - fseek(f, 0, SEEK_END); - long file_size = ftell(f); - fseek(f, 0, SEEK_SET); - buf.resize(file_size); - - size_t n_read = fread(buf.data(), 1, file_size, f); - fclose(f); - if (n_read != (size_t)file_size) { - LOG_ERR("Failed to read entire file %s", fname); - return nullptr; - } - - return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size()); + // for now, we assume that all audio models have the same bitrate + return 16000; // 16kHz } // diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index b53f215a2..541918e09 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx); // whether the current model supports audio input MTMD_API bool mtmd_support_audio(mtmd_context * ctx); +// get audio bitrate in Hz, for example 16000 for Whisper +// return -1 if audio is not supported +MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx); + // mtmd_bitmap // // if bitmap is image: @@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); ///////////////////////////////////////// -// -// Helper functions (can be implemented based on other functions) -// -// Please note that these helpers are not guaranteed to be stable. -// BREAKING CHANGES are expected. -// - -// helper function to construct a mtmd_bitmap from a file -// it calls mtmd_helper_bitmap_init_from_buf() internally -// returns nullptr on failure -// this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname); - -// helper function to construct a mtmd_bitmap from a buffer containing a file -// supported formats: -// image: formats supported by stb_image: jpg, png, bmp, gif, etc. -// audio: formats supported by miniaudio: wav, mp3, flac -// note: audio files will be auto-detected based on magic bytes -// returns nullptr on failure -// this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len); - -// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache -MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); - -// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past -// normally, n_pos is equal to n_tokens, but for M-RoPE it is different -MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); - -// helper function that automatically: -// 1. run llama_decode() on text chunks -// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() -// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error -// otherwise, returns 0 on success -// this function is NOT thread-safe -MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, - struct llama_context * lctx, - const mtmd_input_chunks * chunks, - llama_pos n_past, - llama_seq_id seq_id, - int32_t n_batch, - bool logits_last, - llama_pos * new_n_past); - -// works like mtmd_helper_eval_chunks(), but only for a single chunk -// this function is NOT thread-safe -MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, - struct llama_context * lctx, - const mtmd_input_chunk * chunk, - llama_pos n_past, - llama_seq_id seq_id, - int32_t n_batch, - bool logits_last, - llama_pos * new_n_past); - -// helper function to decode an image whose embeddings have already been calculated -// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention) -// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure -MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, - struct llama_context * lctx, - const mtmd_input_chunk * chunk, - float * encoded_embd, - llama_pos n_past, - llama_seq_id seq_id, - int32_t n_batch, - llama_pos * new_n_past); - -///////////////////////////////////////// - // test function, to be used in test-mtmd-c-api.c MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); diff --git a/tools/mtmd/miniaudio.h b/tools/mtmd/vendor/miniaudio.h similarity index 100% rename from tools/mtmd/miniaudio.h rename to tools/mtmd/vendor/miniaudio.h diff --git a/common/stb_image.h b/tools/mtmd/vendor/stb_image.h similarity index 100% rename from common/stb_image.h rename to tools/mtmd/vendor/stb_image.h diff --git a/tools/server/server.cpp b/tools/server/server.cpp index fe6c685ec..96d1e4adf 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -9,6 +9,7 @@ #include "sampling.h" #include "speculative.h" #include "mtmd.h" +#include "mtmd-helper.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) { throw std::runtime_error("This server does not support multimodal"); } for (auto & file : files) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size())); if (!bmp.ptr) { throw std::runtime_error("Failed to load image or audio file"); } diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 8456a02e6..58b679d75 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -6,6 +6,7 @@ #include "arg.h" // common_remote_get_content #include "base64.hpp" #include "mtmd.h" +#include "mtmd-helper.h" // increase max payload length to allow use of larger context size #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 @@ -264,13 +265,19 @@ static size_t validate_utf8(const std::string& text) { static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) { llama_tokens result; + // Get EOS token - use SEP token as fallback if EOS is not available + llama_token eos_token = llama_vocab_eos(vocab); + if (eos_token == LLAMA_TOKEN_NULL) { + eos_token = llama_vocab_sep(vocab); + } + result.reserve(doc.size() + query.size() + 4); result.push_back(llama_vocab_bos(vocab)); result.insert(result.end(), query.begin(), query.end()); - result.push_back(llama_vocab_eos(vocab)); + result.push_back(eos_token); result.push_back(llama_vocab_sep(vocab)); result.insert(result.end(), doc.begin(), doc.end()); - result.push_back(llama_vocab_eos(vocab)); + result.push_back(eos_token); return result; } @@ -573,7 +580,7 @@ struct oaicompat_parser_options { // used by /chat/completions endpoint static json oaicompat_chat_params_parse( - const json & body, /* openai api json semantics */ + json & body, /* openai api json semantics */ const oaicompat_parser_options & opt, std::vector & out_files) { @@ -624,7 +631,7 @@ static json oaicompat_chat_params_parse( if (!body.contains("messages")) { throw std::runtime_error("'messages' is required"); } - json messages = body.at("messages"); + json & messages = body.at("messages"); if (!messages.is_array()) { throw std::runtime_error("Expected 'messages' to be an array"); }