Merge commit '763d06edb7' into concedo_experimental

# Conflicts:
#	.github/workflows/build-linux-cross.yml
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/clip.cpp
#	tools/mtmd/mtmd.cpp
#	tools/server/CMakeLists.txt
Author: Concedo
Date:   2025-05-31 12:44:18 +08:00
Commit: c987abf9f5

28 changed files with 777 additions and 291 deletions


@@ -49,6 +49,6 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
-[tools/mtmd/miniaudio.h]
+[tools/mtmd/vendor/miniaudio.h]
 trim_trailing_whitespace = unset
 insert_final_newline = unset


@@ -440,7 +440,7 @@ add_library(ggml
             ggml/include/gguf.h
             ggml/src/gguf.cpp
             ${GGML_SOURCES_CUDA})
-target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -449,7 +449,7 @@ target_compile_options(ggml PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math -e
 add_library(ggml_v1
             otherarch/ggml_v1.c
             otherarch/ggml_v1.h)
-target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
 target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -459,7 +459,7 @@ add_library(ggml_v2
             otherarch/ggml_v2.h
             ${GGML_V2_CUDA_SOURCES}
             ${GGML_V2_LEGACY_CUDA_SOURCES})
-target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
 target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -468,7 +468,7 @@ add_library(ggml_v3
             otherarch/ggml_v3.c
             otherarch/ggml_v3.h
             ${GGML_V3_CUDA_SOURCES})
-target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
 target_compile_features(ggml_v3 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v3 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v3 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -487,42 +487,42 @@ add_library(common2
             src/unicode-data.cpp
             otherarch/utils.cpp
             otherarch/utils.h)
-target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
 target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(sdtype_adapter
             otherarch/sdcpp/sdtype_adapter.cpp)
-target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(sdtype_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(whisper_adapter
             otherarch/whispercpp/whisper_adapter.cpp)
-target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./tools ./common)
+target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/whispercpp ./tools ./common)
 target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(tts_adapter
             otherarch/tts_adapter.cpp)
-target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common)
 target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(embeddings_adapter
             otherarch/embeddings_adapter.cpp)
-target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common)
+target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common)
 target_compile_features(embeddings_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(embeddings_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(embeddings_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(gpttype_adapter
             gpttype_adapter.cpp)
-target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
 set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -530,7 +530,7 @@ set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 if (LLAMA_CUBLAS)
     set(TARGET koboldcpp_cublas)
     add_library(${TARGET} SHARED expose.cpp expose.h)
-    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
     target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
@@ -550,7 +550,7 @@ endif()
 if (LLAMA_HIPBLAS)
     set(TARGET koboldcpp_hipblas)
     add_library(${TARGET} SHARED expose.cpp expose.h)
-    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
     target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")


@@ -51,8 +51,8 @@ ifdef KCPP_DEBUG
 CFLAGS = -g -O0
 CXXFLAGS = -g -O0
 endif
-CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
-CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
+CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
+CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
 ifndef KCPP_DEBUG
 CFLAGS += -DNDEBUG -s
 CXXFLAGS += -DNDEBUG -s


@@ -911,13 +911,16 @@ struct common_init_result common_init_from_params(common_params & params) {
         ok = false;
     }
 
-    if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-        ok = false;
-    }
-    if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+    bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+    bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+    if (!has_eos && !has_sep) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        ok = false;
+    } else if (!has_eos) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+    } else if (!has_sep) {
+        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
         ok = false;
     }
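Note: the new check boils down to a small truth table: EOS and SEP both present is fine, SEP alone works with SEP doubling as EOS, and anything without SEP disables reranking. A minimal C++ sketch of that decision (hypothetical helper, not part of the patch):

    #include <cstdio>

    static bool rerank_vocab_ok(bool has_eos, bool has_sep) {
        if (!has_eos && !has_sep) {
            fprintf(stderr, "no EOS or SEP token, reranking will not work\n");
            return false;
        }
        if (!has_eos) {
            fprintf(stderr, "no EOS token, using SEP token as fallback\n");
        }
        if (!has_sep) {
            fprintf(stderr, "no SEP token, reranking will not work\n");
            return false;
        }
        return true;
    }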


@@ -423,19 +423,19 @@ class ModelBase:
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 config = json.load(f)
-                if "llm_config" in config:
-                    # rename for InternVL
-                    config["text_config"] = config["llm_config"]
-                if "thinker_config" in config:
-                    # rename for Qwen2.5-Omni
-                    config["text_config"] = config["thinker_config"]["text_config"]
-                return config
+        if "llm_config" in config:
+            # rename for InternVL
+            config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
+        return config
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1207,7 +1207,7 @@ class MmprojModel(ModelBase):
             self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
             self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
 
-        else:
+        if not self.has_vision_encoder and not self.has_audio_encoder:
             raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
@@ -1841,7 +1841,8 @@ class StableLMModel(TextModel):
     "MistralForCausalLM",
    "MixtralForCausalLM",
     "VLlama3ForCausalLM",
-    "LlavaForConditionalGeneration")
+    "LlavaForConditionalGeneration",
+    "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
@@ -1921,6 +1922,8 @@ class LlamaModel(TextModel):
         if is_vision_tensor:
             return []  # skip vision tensors
+        elif self.hf_arch == "LlamaModel":
+            name = "model." + name
         elif name.startswith("model.text_model"):
             name = name.replace("text_model.", "")  # for SmolVLM
         elif name.startswith("language_model."):
@@ -2169,6 +2172,9 @@ class Llama4VisionModel(MmprojModel):
             # process vision tensors
             if "positional_embedding_vlm" in name and ".weight" not in name:
                 name += ".weight"
+            if "multi_modal_projector.linear_1" in name:
+                # despite the name with number postfix, this is a single fully connected layer
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
 
         return []
@@ -3676,7 +3682,7 @@ class InternLM3Model(TextModel):
         return [(self.map_tensor_name(name), data_torch)]
 
-@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
+@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
 class BertModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BERT
@@ -3739,6 +3745,13 @@ class BertModel(TextModel):
         if name.startswith("cls.seq_relationship"):
             return []
 
+        # For BertForSequenceClassification (direct projection layer)
+        if name == "classifier.weight":
+            name = "classifier.out_proj.weight"
+
+        if name == "classifier.bias":
+            name = "classifier.out_proj.bias"
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def _xlmroberta_tokenizer_init(self) -> None:


@@ -288,7 +288,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated")
 tests = [
     "ied 4 ½ months",
-    "Führer",
+    "Äpfel",
     "",
     " ",
     "  ",


@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
     __builtin_assume(tid < D);
 
     extern __shared__ float2 meta[];
-    if (tid < 2*parallel_blocks) {
-        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
+    for (int i = tid; i < 2*parallel_blocks; i += D) {
+        ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
    }
 
     __syncthreads();
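Note: the old guard copied at most D meta values per block, so entries past index D were silently dropped once 2*parallel_blocks exceeded the block size D; the replacement is the usual block-stride idiom. A plain C++ rendering of the same access pattern (names are illustrative):

    #include <cstddef>

    // Each of D "threads" copies elements tid, tid + D, tid + 2*D, ...
    // so the copy completes for any count, not just count <= D.
    void block_stride_copy(const float * src, float * dst, size_t count, size_t tid, size_t D) {
        for (size_t i = tid; i < count; i += D) {
            dst[i] = src[i];
        }
    }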


@@ -0,0 +1,86 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
+
+enum ggml_sort_order {
+    GGML_SORT_ORDER_ASC,
+    GGML_SORT_ORDER_DESC,
+};
+
+kernel void kernel_argsort_f32_i32(
+        global float * src0,
+        ulong offset0,
+        global int * dst,
+        ulong offsetd,
+        const int ne00,
+        const int ne00_pad,
+        const int order,
+        local int * dst_row
+) {
+    // bitonic sort
+    int col = get_local_id(0);
+    int row = get_group_id(1);
+
+    if (col >= ne00_pad) {
+        return;
+    }
+
+    // note: casts adjusted here so the pointer types match the declarations
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst  = (global int   *)((global char *)dst  + offsetd);
+
+    global float * x_row = src0 + row * ne00;
+
+    // initialize indices
+    dst_row[col] = col;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int k = 2; k <= ne00_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ne00 ||
+                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj], int);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ne00 ||
+                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj], int);
+                    }
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ne00) {
+        dst[row * ne00 + col] = dst_row[col];
+    }
+}
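Note: this is a textbook bitonic sorting network over the padded row width, with out-of-range indices ordered last so the padding falls off the end. A sequential C++ sketch of the same compare-exchange schedule (assumes n is a power of two and skips the padding handling):

    #include <utility>
    #include <vector>

    std::vector<int> bitonic_argsort_asc(const std::vector<float> & x) {
        const int n = (int) x.size();                // must be a power of two, like ne00_pad
        std::vector<int> idx(n);
        for (int i = 0; i < n; i++) idx[i] = i;
        for (int k = 2; k <= n; k *= 2) {            // bitonic subsequence size
            for (int j = k / 2; j > 0; j /= 2) {     // compare-exchange distance
                for (int col = 0; col < n; col++) {  // each col runs in parallel on the GPU
                    int ixj = col ^ j;
                    if (ixj <= col) continue;
                    bool up = (col & k) == 0;        // sort direction of this subsequence
                    bool out_of_order = up ? x[idx[col]] > x[idx[ixj]]
                                           : x[idx[col]] < x[idx[ixj]];
                    if (out_of_order) std::swap(idx[col], idx[ixj]);
                }
            }
        }
        return idx;                                  // indices of x in ascending order
    }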


@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// div
+//------------------------------------------------------------------------------
+kernel void kernel_div(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst  + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_div_row(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * src1,
+        ulong offset1,
+        global float4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float4*)((global char*)dst  + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] / src1[idx1];
+}
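Note: the "performs better than %" trick in kernel_div_row is unsigned remainder spelled out as a divide and multiply, which some GPU compilers schedule better than the % operator. A quick C++ equivalence check:

    #include <cassert>

    // gid - (gid/ne)*ne is the definition of the unsigned remainder gid % ne
    unsigned mod_free(unsigned gid, unsigned ne) {
        return gid - (gid / ne) * ne;
    }

    int main() {
        for (unsigned ne = 1; ne < 64; ne++) {
            for (unsigned gid = 0; gid < 4096; gid++) {
                assert(mod_free(gid, ne) == gid % ne);
            }
        }
    }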


@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+// Workgroup must be a subgroup
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_32
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_group_norm(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne,
+        int group_size,
+        float eps
+) {
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst  = (global float *)((global char *)dst  + offsetd);
+
+    int start = get_group_id(0) * group_size;
+    int end   = start + group_size;
+
+    start += get_local_id(0);
+
+    if (end >= ne) {
+        end = ne;
+    }
+
+    float tmp = 0.0f;
+
+    for (int j = start; j < end; j += get_local_size(0)) {
+        tmp += src0[j];
+    }
+
+    tmp = sub_group_reduce_add(tmp);
+
+    const float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += get_local_size(0)) {
+        float xi = src0[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = sub_group_reduce_add(tmp);
+
+    const float variance = tmp / group_size;
+    const float scale = 1.0f/sqrt(variance + eps);
+
+    for (int j = start; j < end; j += get_local_size(0)) {
+        dst[j] *= scale;
+    }
+}
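Note: the arithmetic the kernel spreads across a subgroup is ordinary group normalization: per group, subtract the mean, then scale by the inverse standard deviation. A sequential C++ reference of the same math (a sketch, assuming ne divides evenly into groups):

    #include <cmath>
    #include <cstddef>

    void group_norm_ref(const float * src, float * dst, size_t ne, size_t group_size, float eps) {
        for (size_t start = 0; start < ne; start += group_size) {
            float mean = 0.0f;
            for (size_t j = start; j < start + group_size; j++) mean += src[j];
            mean /= (float) group_size;

            float var = 0.0f;
            for (size_t j = start; j < start + group_size; j++) {
                float xi = src[j] - mean;
                dst[j] = xi;                 // stash the centered value, as the kernel does
                var += xi * xi;
            }
            var /= (float) group_size;

            float scale = 1.0f / std::sqrt(var + eps);
            for (size_t j = start; j < start + group_size; j++) dst[j] *= scale;
        }
    }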


@@ -0,0 +1,29 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// sigmoid
+//------------------------------------------------------------------------------
+
+kernel void kernel_sigmoid_f32(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst  = (global float*)((global char*)dst  + offsetd);
+
+    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
+}
+
+kernel void kernel_sigmoid_f16(
+        global half * src0,
+        ulong offset0,
+        global half * dst,
+        ulong offsetd
+) {
+    src0 = (global half*)((global char*)src0 + offset0);
+    dst  = (global half*)((global char*)dst  + offsetd);
+
+    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
+}


@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// sub (comment fixed; the original header said "div", a copy-paste slip)
+//------------------------------------------------------------------------------
+kernel void kernel_sub(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst  + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_sub_row(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * src1,
+        ulong offset1,
+        global float4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float4*)((global char*)dst  + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] - src1[idx1];
+}


@@ -0,0 +1,39 @@
+kernel void kernel_sum_rows_f32(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst  = (global float *)((global char *)dst  + offsetd);
+
+    int i3 = get_global_id(2);
+    int i2 = get_global_id(1);
+    int i1 = get_global_id(0);
+
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
+        return;
+    }
+
+    global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
+    global float * dst_row = (global float *) ((global char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
+
+    float row_sum = 0;
+    for (int i0 = 0; i0 < ne00; i0++) {
+        row_sum += src_row[i0];
+    }
+
+    dst_row[0] = row_sum;
+}


@@ -12,7 +12,7 @@
 #include "ggml-vulkan.h"
 #include <vulkan/vulkan_core.h>
-#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
+#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
 #include <chrono>
 #include "ggml-cpu.h"
 #endif
@@ -200,9 +200,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 class vk_memory_logger;
 #endif
-#ifdef GGML_VULKAN_PERF
 class vk_perf_logger;
-#endif
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
@@ -458,9 +456,11 @@ struct vk_device_struct {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
     std::unique_ptr<vk_memory_logger> memory_logger;
 #endif
-#ifdef GGML_VULKAN_PERF
+
+    // for GGML_VK_PERF_LOGGER
     std::unique_ptr<vk_perf_logger> perf_logger;
-#endif
+    vk::QueryPool query_pool;
+    uint32_t num_queries;
 
     ~vk_device_struct() {
         VK_LOG_DEBUG("destroy device " << name);
@@ -844,8 +844,6 @@ private:
 #define VK_LOG_MEMORY(msg) ((void) 0)
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
-#if defined(GGML_VULKAN_PERF)
-
 class vk_perf_logger {
 public:
     void print_timings() {
@@ -855,7 +853,7 @@ public:
             for (const auto& time : t.second) {
                 total += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
         }
 
         timings.clear();
@@ -884,7 +882,6 @@ public:
 private:
     std::map<std::string, std::vector<uint64_t>> timings;
 };
-#endif // GGML_VULKAN_PERF
 
 struct ggml_backend_vk_context {
     std::string name;
@@ -974,6 +971,8 @@ struct vk_instance_t {
 static bool vk_instance_initialized = false;
 static vk_instance_t vk_instance;
 
+static bool vk_perf_logger_enabled = false;
+
 #ifdef GGML_VULKAN_CHECK_RESULTS
 static size_t vk_skip_checks;
 static size_t vk_output_tensor;
@@ -2773,9 +2772,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
         device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
 #endif
-#ifdef GGML_VULKAN_PERF
-        device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
-#endif
+        if (vk_perf_logger_enabled) {
+            device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
+        }
 
         size_t dev_num = vk_instance.device_indices[idx];
@@ -3571,6 +3570,8 @@ static void ggml_vk_instance_init() {
     vk_instance.instance = vk::createInstance(instance_create_info);
     vk_instance_initialized = true;
 
+    vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
+
     size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
 
     // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -8909,7 +8910,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     ctx->tensor_ctxs[node_idx] = compute_ctx;
 
-#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
+#if defined(GGML_VULKAN_CHECK_RESULTS)
     // Force context reset on each node so that each tensor ends up in its own context
     // and can be run and compared to its CPU equivalent separately
     last_node = true;
@@ -9529,6 +9530,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     bool first_node_in_batch = true; // true if next node will be first node in a batch
     int submit_node_idx = 0; // index to first node in a batch
 
+    vk_context compute_ctx;
+
+    if (vk_perf_logger_enabled) {
+        // allocate/resize the query pool
+        if (ctx->device->num_queries < cgraph->n_nodes + 1) {
+            if (ctx->device->query_pool) {
+                ctx->device->device.destroyQueryPool(ctx->device->query_pool);
+            }
+            VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
+            query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
+            query_create_info.queryCount = cgraph->n_nodes + 100;
+            ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
+            ctx->device->num_queries = query_create_info.queryCount;
+        }
+
+        ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
+
+        GGML_ASSERT(ctx->compute_ctx.expired());
+        compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+        ctx->compute_ctx = compute_ctx;
+        ggml_vk_ctx_begin(ctx->device, compute_ctx);
+        compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
+    }
+
     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
     // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
     // (and scaled down based on model size, so smaller models submit earlier).
@@ -9556,6 +9580,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
 
+        if (vk_perf_logger_enabled) {
+            if (ctx->compute_ctx.expired()) {
+                compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+                ctx->compute_ctx = compute_ctx;
+                ggml_vk_ctx_begin(ctx->device, compute_ctx);
+            } else {
+                compute_ctx = ctx->compute_ctx.lock();
+            }
+            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
+        }
+
         if (enqueued) {
             ++submitted_nodes;
@@ -9577,9 +9612,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         }
     }
 
-#ifdef GGML_VULKAN_PERF
-    ctx->device->perf_logger->print_timings();
-#endif
+    if (vk_perf_logger_enabled) {
+        // End the command buffer and submit/wait
+        GGML_ASSERT(!ctx->compute_ctx.expired());
+        compute_ctx = ctx->compute_ctx.lock();
+        ggml_vk_ctx_end(compute_ctx);
+
+        ggml_vk_submit(compute_ctx, ctx->device->fence);
+
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
+        ctx->device->device.resetFences({ ctx->device->fence });
+
+        // Get the results and pass them to the logger
+        std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
+        ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (!ggml_vk_is_empty(cgraph->nodes[i])) {
+                ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
+            }
+        }
+
+        ctx->device->perf_logger->print_timings();
+    }
 
     ggml_vk_graph_cleanup(ctx);
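Note: per-op timings no longer require a special GGML_VULKAN_PERF build; setting the GGML_VK_PERF_LOGGER environment variable at runtime is enough. The conversion from query results to wall time is a subtraction scaled by the device tick length, as in this sketch (timestamp_period is VkPhysicalDeviceLimits::timestampPeriod, in nanoseconds per tick):

    #include <cstdint>

    // Convert two raw timestamp queries into elapsed nanoseconds,
    // mirroring (timestamps[i+1] - timestamps[i]) * timestampPeriod above.
    uint64_t ticks_to_ns(uint64_t t_begin, uint64_t t_end, float timestamp_period) {
        return (uint64_t) ((t_end - t_begin) * timestamp_period);
    }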


@@ -902,7 +902,6 @@ class TensorNameMap:
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
-            "multi_modal_projector.linear_1", # llama 4
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (


@@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
 }
 
 int64_t llm_graph_context::n_pos_per_embd() const {
-    return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
+    return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
 }
 
 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling(
                 ggml_tensor * inp_cls = build_inp_cls();
                 inp = ggml_get_rows(ctx0, inp, inp_cls);
 
-                // classification head
-                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                GGML_ASSERT(cls != nullptr);
-                GGML_ASSERT(cls_b != nullptr);
-
-                cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
-                cur = ggml_tanh(ctx0, cur);
-
-                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
-                if (cls_out) {
+                if (cls != nullptr && cls_b != nullptr) {
+                    // classification head
+                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
+                    cur = ggml_tanh(ctx0, cur);
+
+                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                    if (cls_out) {
+                        GGML_ASSERT(cls_out_b != nullptr);
+                        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    }
+                } else if (cls_out) {
+                    // Single layer classification head (direct projection)
+                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
                     GGML_ASSERT(cls_out_b != nullptr);
-
-                    cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
+                } else {
+                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
                 }
             } break;
         default:


@@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
     const auto & yarn_beta_slow = cparams.yarn_beta_slow;
 
     const auto & n_rot     = hparams.n_rot;
-    const auto & rope_type = hparams.rope_type;
+    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
+                                // @ngxson : this is a workaround
+                                // for M-RoPE, we want to rotate the whole vector when doing KV shift
+                                // a normal RoPE should work, we just need to use the correct ordering
+                                // ref: https://github.com/ggml-org/llama.cpp/pull/13870
+                                ? LLAMA_ROPE_TYPE_NEOX
+                                : hparams.rope_type;
 
     // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
+    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
+                                        ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
+                                        : cparams.yarn_attn_factor;
 
     ggml_tensor * tmp;


@ -1,28 +1,5 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include "mtmd-audio.h" #include "mtmd-audio.h"
//#define MTMD_AUDIO_DEBUG
#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
# define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "miniaudio.h"
#define _USE_MATH_DEFINES // for M_PI #define _USE_MATH_DEFINES // for M_PI
#include <cmath> #include <cmath>
#include <cstdint> #include <cstdint>
@ -359,69 +336,6 @@ bool preprocess_audio(
} // namespace whisper_preprocessor } // namespace whisper_preprocessor
namespace audio_helpers {
bool is_audio_file(const char * buf, size_t len) {
if (len < 12) {
return false;
}
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
bool is_mp3 = len >= 3 && (
memcmp(buf, "ID3", 3) == 0 ||
// Check for MPEG sync word (simplified check)
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
);
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
return is_wav || is_mp3 || is_flac;
}
// returns true if the buffer is a valid audio file
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
ma_result result;
const int channels = 1;
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
ma_decoder decoder;
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
if (result != MA_SUCCESS) {
return false;
}
ma_uint64 frame_count;
ma_uint64 frames_read;
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
pcmf32_mono.resize(frame_count);
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
#ifdef MTMD_AUDIO_DEBUG
// save audio to wav file
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
ma_encoder encoder;
ma_encoder_init_file("output.wav", &config, &encoder);
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
ma_encoder_uninit(&encoder);
#endif
ma_decoder_uninit(&decoder);
return true;
}
} // namespace wav_utils
// precalculated mel filter banks // precalculated mel filter banks
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function // values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
// //


@@ -32,7 +32,7 @@ struct whisper_filters {
     std::vector<float> data;
 };
 
-extern bool preprocess_audio(
+bool preprocess_audio(
     const float * samples,
     size_t n_samples,
     const whisper_filters & filters,
@@ -40,23 +40,8 @@ extern bool preprocess_audio(
 
 } // namespace whisper_preprocessor
 
-// TODO @ngxson : move this helper to mtmd-helpers.cpp
-namespace audio_helpers {
-
-extern bool is_audio_file(const char * buf, size_t len);
-
-extern bool decode_audio_from_buf(
-    const unsigned char * buf_in,
-    size_t len,
-    int target_sampler_rate,
-    std::vector<float> & pcmf32_mono);
-
-} // namespace audio_helpers
-
 namespace whisper_precalc_filters {
 
-extern whisper_preprocessor::whisper_filters get_128_bins();
+whisper_preprocessor::whisper_filters get_128_bins();
 
 } // namespace whisper_precalc_filters


@@ -7,6 +7,7 @@
 #include "console.h"
 #include "chat.h"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 #include <vector>
 #include <limits.h>
@@ -143,7 +144,7 @@ struct mtmd_cli_context {
     }
 
     bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
         if (!bmp.ptr) {
             return false;
         }


@@ -1,10 +1,37 @@
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include "mtmd.h"
+#include "mtmd-helper.h"
 #include "llama.h"
 
 #include <algorithm>
 #include <cinttypes>
 #include <vector>
 
+//#define MTMD_AUDIO_DEBUG
+
+#define MINIAUDIO_IMPLEMENTATION
+#ifndef MTMD_AUDIO_DEBUG
+#   define MA_NO_ENCODING
+#endif
+#define MA_NO_DEVICE_IO
+#define MA_NO_RESOURCE_MANAGER
+#define MA_NO_NODE_GRAPH
+#define MA_NO_ENGINE
+#define MA_NO_GENERATION
+#define MA_API static
+#include "vendor/miniaudio.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "vendor/stb_image.h"
+
 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
 #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
@@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
 
     return 0;
 }
+
+namespace audio_helpers {
+
+static bool is_audio_file(const char * buf, size_t len) {
+    if (len < 12) {
+        return false;
+    }
+
+    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
+    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
+    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
+    bool is_mp3 = len >= 3 && (
+        memcmp(buf, "ID3", 3) == 0 ||
+        // Check for MPEG sync word (simplified check)
+        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
+    );
+    bool is_flac = memcmp(buf, "fLaC", 4) == 0;
+
+    return is_wav || is_mp3 || is_flac;
+}
+
+// returns true if the buffer is a valid audio file
+static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
+    ma_result result;
+    const int channels = 1;
+    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
+    ma_decoder decoder;
+
+    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
+    if (result != MA_SUCCESS) {
+        return false;
+    }
+
+    ma_uint64 frame_count;
+    ma_uint64 frames_read;
+    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
+    if (result != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        return false;
+    }
+
+    pcmf32_mono.resize(frame_count);
+    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
+    if (result != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        return false;
+    }
+
+#ifdef MTMD_AUDIO_DEBUG
+    // save audio to wav file
+    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
+    ma_encoder encoder;
+    ma_encoder_init_file("output.wav", &config, &encoder);
+    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
+    ma_encoder_uninit(&encoder);
+#endif
+
+    ma_decoder_uninit(&decoder);
+    return true;
+}
+
+} // namespace audio_helpers
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
+    if (audio_helpers::is_audio_file((const char *)buf, len)) {
+        std::vector<float> pcmf32;
+        int bitrate = mtmd_get_audio_bitrate(ctx);
+        if (bitrate < 0) {
+            LOG_ERR("This model does not support audio input\n");
+            return nullptr;
+        }
+        if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
+            LOG_ERR("Unable to read WAV audio file from buffer\n");
+            return nullptr;
+        }
+        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+    }
+
+    // otherwise, we assume it's an image
+    mtmd_bitmap * result = nullptr;
+    {
+        int nx, ny, nc;
+        auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
+        if (!data) {
+            LOG_ERR("%s: failed to decode image bytes\n", __func__);
+            return nullptr;
+        }
+        result = mtmd_bitmap_init(nx, ny, data);
+        stbi_image_free(data);
+    }
+    return result;
+}
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+    std::vector<unsigned char> buf;
+    FILE * f = fopen(fname, "rb");
+    if (!f) {
+        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+        return nullptr;
+    }
+
+    fseek(f, 0, SEEK_END);
+    long file_size = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    buf.resize(file_size);
+
+    size_t n_read = fread(buf.data(), 1, file_size, f);
+    fclose(f);
+    if (n_read != (size_t)file_size) {
+        LOG_ERR("Failed to read entire file %s", fname);
+        return nullptr;
+    }
+
+    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
+}

tools/mtmd/mtmd-helper.h (new file, 91 lines)

@@ -0,0 +1,91 @@
+#ifndef MTMD_HELPER_H
+#define MTMD_HELPER_H
+
+#include "ggml.h"
+#include "llama.h"
+#include "mtmd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// libmtmd helper functions
+//
+// Please note that these helpers are not guaranteed to be stable.
+// BREAKING CHANGES are expected.
+//
+
+// helper function to construct a mtmd_bitmap from a file
+// it calls mtmd_helper_bitmap_init_from_buf() internally
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+
+// helper function to construct a mtmd_bitmap from a buffer containing a file
+// supported formats:
+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+//     audio: formats supported by miniaudio: wav, mp3, flac
+// note: audio files will be auto-detected based on magic bytes
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
+MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+
+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
+MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
+
+// helper function that automatically:
+// 1. run llama_decode() on text chunks
+// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// otherwise, returns 0 on success
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+                                         struct llama_context * lctx,
+                                         const mtmd_input_chunks * chunks,
+                                         llama_pos n_past,
+                                         llama_seq_id seq_id,
+                                         int32_t n_batch,
+                                         bool logits_last,
+                                         llama_pos * new_n_past);
+
+// works like mtmd_helper_eval_chunks(), but only for a single chunk
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+                                               struct llama_context * lctx,
+                                               const mtmd_input_chunk * chunk,
+                                               llama_pos n_past,
+                                               llama_seq_id seq_id,
+                                               int32_t n_batch,
+                                               bool logits_last,
+                                               llama_pos * new_n_past);
+
+// helper function to decode an image whose embeddings have already been calculated
+// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
+// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                struct llama_context * lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                float * encoded_embd,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                llama_pos * new_n_past);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+//
+// C++ wrappers
+//
+
+#endif
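A usage sketch of the new context-aware helpers, modeled on the mtmd-cli change above (assumes an initialized mtmd_context * mctx):

    #include "mtmd.h"
    #include "mtmd-helper.h"

    bool load_media_example(mtmd_context * mctx, const char * fname) {
        // the helper now takes the context so it can query mtmd_get_audio_bitrate()
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mctx, fname));
        if (!bmp.ptr) {
            return false; // unreadable file, or audio input on a non-audio model
        }
        // ... hand bmp off to tokenization/eval as before ...
        return true;
    }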


@@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) {
     return ctx->ctx_a != nullptr;
 }
 
-// these 2 helpers below use internal clip_image_u8_ptr,
-// so unfortunately they cannot moved to mtmd-helper.h
-// however, in theory, user can decode image file to bitmap using
-// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
-    if (audio_helpers::is_audio_file((const char *)buf, len)) {
-        std::vector<float> pcmf32;
-        if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
-            LOG_ERR("Unable to read WAV audio file from buffer\n");
-            return nullptr;
-        }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
-    }
-
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get(), 2048);
-    if (!ok) {
-        LOG_ERR("Unable to load image from buffer\n");
-        return nullptr;
-    }
-    uint32_t nx, ny;
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
-    return mtmd_bitmap_init(nx, ny, data);
-}
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
-    std::vector<unsigned char> buf;
-    FILE * f = fopen(fname, "rb");
-    if (!f) {
-        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
-        return nullptr;
-    }
-
-    fseek(f, 0, SEEK_END);
-    long file_size = ftell(f);
-    fseek(f, 0, SEEK_SET);
-    buf.resize(file_size);
-
-    size_t n_read = fread(buf.data(), 1, file_size, f);
-    fclose(f);
-    if (n_read != (size_t)file_size) {
-        LOG_ERR("Failed to read entire file %s", fname);
-        return nullptr;
-    }
-
-    return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
-}
+int mtmd_get_audio_bitrate(mtmd_context * ctx) {
+    if (!ctx->ctx_a) {
+        return -1;
+    }
+    // for now, we assume that all audio models have the same bitrate
+    return 16000; // 16kHz
+}
 
 //
// //


@@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 // whether the current model supports audio input
 MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 
+// get audio bitrate in Hz, for example 16000 for Whisper
+// return -1 if audio is not supported
+MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
+
 // mtmd_bitmap
 //
 // if bitmap is image:
@@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
 /////////////////////////////////////////
 
-//
-// Helper functions (can be implemented based on other functions)
-//
-// Please note that these helpers are not guaranteed to be stable.
-// BREAKING CHANGES are expected.
-//
-
-// helper function to construct a mtmd_bitmap from a file
-// it calls mtmd_helper_bitmap_init_from_buf() internally
-// returns nullptr on failure
-// this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
-
-// helper function to construct a mtmd_bitmap from a buffer containing a file
-// supported formats:
-//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
-//     audio: formats supported by miniaudio: wav, mp3, flac
-// note: audio files will be auto-detected based on magic bytes
-// returns nullptr on failure
-// this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
-
-// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
-MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
-
-// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
-// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
-MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
-
-// helper function that automatically:
-// 1. run llama_decode() on text chunks
-// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
-// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
-// otherwise, returns 0 on success
-// this function is NOT thread-safe
-MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
-                                         struct llama_context * lctx,
-                                         const mtmd_input_chunks * chunks,
-                                         llama_pos n_past,
-                                         llama_seq_id seq_id,
-                                         int32_t n_batch,
-                                         bool logits_last,
-                                         llama_pos * new_n_past);
-
-// works like mtmd_helper_eval_chunks(), but only for a single chunk
-// this function is NOT thread-safe
-MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
-                                               struct llama_context * lctx,
-                                               const mtmd_input_chunk * chunk,
-                                               llama_pos n_past,
-                                               llama_seq_id seq_id,
-                                               int32_t n_batch,
-                                               bool logits_last,
-                                               llama_pos * new_n_past);
-
-// helper function to decode an image whose embeddings have already been calculated
-// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
-// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
-MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
-                                                struct llama_context * lctx,
-                                                const mtmd_input_chunk * chunk,
-                                                float * encoded_embd,
-                                                llama_pos n_past,
-                                                llama_seq_id seq_id,
-                                                int32_t n_batch,
-                                                llama_pos * new_n_past);
-
-/////////////////////////////////////////
-
 // test function, to be used in test-mtmd-c-api.c
 MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);


@@ -9,6 +9,7 @@
 #include "sampling.h"
 #include "speculative.h"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) {
                     throw std::runtime_error("This server does not support multimodal");
                 }
                 for (auto & file : files) {
-                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
+                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
                     if (!bmp.ptr) {
                         throw std::runtime_error("Failed to load image or audio file");
                     }


@@ -6,6 +6,7 @@
 #include "arg.h" // common_remote_get_content
 #include "base64.hpp"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -264,13 +265,19 @@ static size_t validate_utf8(const std::string& text) {
 static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
     llama_tokens result;
 
+    // Get EOS token - use SEP token as fallback if EOS is not available
+    llama_token eos_token = llama_vocab_eos(vocab);
+    if (eos_token == LLAMA_TOKEN_NULL) {
+        eos_token = llama_vocab_sep(vocab);
+    }
+
     result.reserve(doc.size() + query.size() + 4);
     result.push_back(llama_vocab_bos(vocab));
     result.insert(result.end(), query.begin(), query.end());
-    result.push_back(llama_vocab_eos(vocab));
+    result.push_back(eos_token);
     result.push_back(llama_vocab_sep(vocab));
     result.insert(result.end(), doc.begin(), doc.end());
-    result.push_back(llama_vocab_eos(vocab));
+    result.push_back(eos_token);
 
     return result;
 }
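The resulting rerank prompt layout is [BOS] query [EOS] [SEP] doc [EOS], where EOS silently becomes SEP when the vocab lacks a true EOS. A worked example with made-up token ids (purely illustrative):

    #include <cstdint>
    #include <vector>

    typedef int32_t llama_token;

    int main() {
        // hypothetical ids: bos = 1, eos = 2, sep = 3
        std::vector<llama_token> query = {10, 11};
        std::vector<llama_token> doc   = {20, 21, 22};

        std::vector<llama_token> result;
        result.push_back(1);                                      // BOS
        result.insert(result.end(), query.begin(), query.end());
        result.push_back(2);                                      // EOS (or SEP fallback)
        result.push_back(3);                                      // SEP
        result.insert(result.end(), doc.begin(), doc.end());
        result.push_back(2);                                      // EOS (or SEP fallback)
        // result == {1, 10, 11, 2, 3, 20, 21, 22, 2}
    }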
@@ -573,7 +580,7 @@ struct oaicompat_parser_options {
 
 // used by /chat/completions endpoint
 static json oaicompat_chat_params_parse(
-    const json & body, /* openai api json semantics */
+    json & body, /* openai api json semantics */
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files)
 {
@@ -624,7 +631,7 @@ static json oaicompat_chat_params_parse(
     if (!body.contains("messages")) {
        throw std::runtime_error("'messages' is required");
     }
-    json messages = body.at("messages");
+    json & messages = body.at("messages");
     if (!messages.is_array()) {
         throw std::runtime_error("Expected 'messages' to be an array");
     }