mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
Merge commit '763d06edb7
' into concedo_experimental
# Conflicts: # .github/workflows/build-linux-cross.yml # ggml/CMakeLists.txt # ggml/src/ggml-cann/CMakeLists.txt # ggml/src/ggml-opencl/CMakeLists.txt # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-vulkan/CMakeLists.txt # tools/mtmd/CMakeLists.txt # tools/mtmd/clip.cpp # tools/mtmd/mtmd.cpp # tools/server/CMakeLists.txt
This commit is contained in:
commit
c987abf9f5
28 changed files with 777 additions and 291 deletions
|
@ -49,6 +49,6 @@ charset = unset
|
||||||
trim_trailing_whitespace = unset
|
trim_trailing_whitespace = unset
|
||||||
insert_final_newline = unset
|
insert_final_newline = unset
|
||||||
|
|
||||||
[tools/mtmd/miniaudio.h]
|
[tools/mtmd/vendor/miniaudio.h]
|
||||||
trim_trailing_whitespace = unset
|
trim_trailing_whitespace = unset
|
||||||
insert_final_newline = unset
|
insert_final_newline = unset
|
||||||
|
|
|
@ -440,7 +440,7 @@ add_library(ggml
|
||||||
ggml/include/gguf.h
|
ggml/include/gguf.h
|
||||||
ggml/src/gguf.cpp
|
ggml/src/gguf.cpp
|
||||||
${GGML_SOURCES_CUDA})
|
${GGML_SOURCES_CUDA})
|
||||||
target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
|
target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
|
||||||
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||||
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
@ -449,7 +449,7 @@ target_compile_options(ggml PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math -e
|
||||||
add_library(ggml_v1
|
add_library(ggml_v1
|
||||||
otherarch/ggml_v1.c
|
otherarch/ggml_v1.c
|
||||||
otherarch/ggml_v1.h)
|
otherarch/ggml_v1.h)
|
||||||
target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
|
target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
|
||||||
target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
|
target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
|
||||||
target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
@ -459,7 +459,7 @@ add_library(ggml_v2
|
||||||
otherarch/ggml_v2.h
|
otherarch/ggml_v2.h
|
||||||
${GGML_V2_CUDA_SOURCES}
|
${GGML_V2_CUDA_SOURCES}
|
||||||
${GGML_V2_LEGACY_CUDA_SOURCES})
|
${GGML_V2_LEGACY_CUDA_SOURCES})
|
||||||
target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
|
target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
|
||||||
target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
|
target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
|
||||||
target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
@ -468,7 +468,7 @@ add_library(ggml_v3
|
||||||
otherarch/ggml_v3.c
|
otherarch/ggml_v3.c
|
||||||
otherarch/ggml_v3.h
|
otherarch/ggml_v3.h
|
||||||
${GGML_V3_CUDA_SOURCES})
|
${GGML_V3_CUDA_SOURCES})
|
||||||
target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
|
target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
|
||||||
target_compile_features(ggml_v3 PUBLIC c_std_11) # don't bump
|
target_compile_features(ggml_v3 PUBLIC c_std_11) # don't bump
|
||||||
target_link_libraries(ggml_v3 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(ggml_v3 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(ggml_v3 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(ggml_v3 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
@ -487,42 +487,42 @@ add_library(common2
|
||||||
src/unicode-data.cpp
|
src/unicode-data.cpp
|
||||||
otherarch/utils.cpp
|
otherarch/utils.cpp
|
||||||
otherarch/utils.h)
|
otherarch/utils.h)
|
||||||
target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
||||||
target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
|
target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
add_library(sdtype_adapter
|
add_library(sdtype_adapter
|
||||||
otherarch/sdcpp/sdtype_adapter.cpp)
|
otherarch/sdcpp/sdtype_adapter.cpp)
|
||||||
target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
||||||
target_compile_features(sdtype_adapter PUBLIC cxx_std_17) # don't bump
|
target_compile_features(sdtype_adapter PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
add_library(whisper_adapter
|
add_library(whisper_adapter
|
||||||
otherarch/whispercpp/whisper_adapter.cpp)
|
otherarch/whispercpp/whisper_adapter.cpp)
|
||||||
target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./tools ./common)
|
target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/whispercpp ./tools ./common)
|
||||||
target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
|
target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
add_library(tts_adapter
|
add_library(tts_adapter
|
||||||
otherarch/tts_adapter.cpp)
|
otherarch/tts_adapter.cpp)
|
||||||
target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common)
|
target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common)
|
||||||
target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
|
target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
add_library(embeddings_adapter
|
add_library(embeddings_adapter
|
||||||
otherarch/embeddings_adapter.cpp)
|
otherarch/embeddings_adapter.cpp)
|
||||||
target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common)
|
target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common)
|
||||||
target_compile_features(embeddings_adapter PUBLIC cxx_std_17) # don't bump
|
target_compile_features(embeddings_adapter PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(embeddings_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(embeddings_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(embeddings_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(embeddings_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
add_library(gpttype_adapter
|
add_library(gpttype_adapter
|
||||||
gpttype_adapter.cpp)
|
gpttype_adapter.cpp)
|
||||||
target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
||||||
target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
|
target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
|
||||||
set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
@ -530,7 +530,7 @@ set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
if (LLAMA_CUBLAS)
|
if (LLAMA_CUBLAS)
|
||||||
set(TARGET koboldcpp_cublas)
|
set(TARGET koboldcpp_cublas)
|
||||||
add_library(${TARGET} SHARED expose.cpp expose.h)
|
add_library(${TARGET} SHARED expose.cpp expose.h)
|
||||||
target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
||||||
target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
|
target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
|
||||||
set_target_properties(${TARGET} PROPERTIES PREFIX "")
|
set_target_properties(${TARGET} PROPERTIES PREFIX "")
|
||||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
|
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
|
||||||
|
@ -550,7 +550,7 @@ endif()
|
||||||
if (LLAMA_HIPBLAS)
|
if (LLAMA_HIPBLAS)
|
||||||
set(TARGET koboldcpp_hipblas)
|
set(TARGET koboldcpp_hipblas)
|
||||||
add_library(${TARGET} SHARED expose.cpp expose.h)
|
add_library(${TARGET} SHARED expose.cpp expose.h)
|
||||||
target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
||||||
target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
|
target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
|
||||||
set_target_properties(${TARGET} PROPERTIES PREFIX "")
|
set_target_properties(${TARGET} PROPERTIES PREFIX "")
|
||||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
|
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -51,8 +51,8 @@ ifdef KCPP_DEBUG
|
||||||
CFLAGS = -g -O0
|
CFLAGS = -g -O0
|
||||||
CXXFLAGS = -g -O0
|
CXXFLAGS = -g -O0
|
||||||
endif
|
endif
|
||||||
CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
|
CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
|
||||||
CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
|
CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
|
||||||
ifndef KCPP_DEBUG
|
ifndef KCPP_DEBUG
|
||||||
CFLAGS += -DNDEBUG -s
|
CFLAGS += -DNDEBUG -s
|
||||||
CXXFLAGS += -DNDEBUG -s
|
CXXFLAGS += -DNDEBUG -s
|
||||||
|
|
|
@ -911,13 +911,16 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
ok = false;
|
ok = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||||
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
|
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
||||||
ok = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
|
if (!has_eos && !has_sep) {
|
||||||
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
||||||
|
ok = false;
|
||||||
|
} else if (!has_eos) {
|
||||||
|
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
||||||
|
} else if (!has_sep) {
|
||||||
|
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
||||||
ok = false;
|
ok = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -423,19 +423,19 @@ class ModelBase:
|
||||||
try:
|
try:
|
||||||
# for security reason, we don't allow loading remote code by default
|
# for security reason, we don't allow loading remote code by default
|
||||||
# if a model need remote code, we will fallback to config.json
|
# if a model need remote code, we will fallback to config.json
|
||||||
return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
|
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to load model config from {dir_model}: {e}")
|
logger.warning(f"Failed to load model config from {dir_model}: {e}")
|
||||||
logger.warning("Trying to load config.json instead")
|
logger.warning("Trying to load config.json instead")
|
||||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||||
config = json.load(f)
|
config = json.load(f)
|
||||||
if "llm_config" in config:
|
if "llm_config" in config:
|
||||||
# rename for InternVL
|
# rename for InternVL
|
||||||
config["text_config"] = config["llm_config"]
|
config["text_config"] = config["llm_config"]
|
||||||
if "thinker_config" in config:
|
if "thinker_config" in config:
|
||||||
# rename for Qwen2.5-Omni
|
# rename for Qwen2.5-Omni
|
||||||
config["text_config"] = config["thinker_config"]["text_config"]
|
config["text_config"] = config["thinker_config"]["text_config"]
|
||||||
return config
|
return config
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
|
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
|
||||||
|
@ -1207,7 +1207,7 @@ class MmprojModel(ModelBase):
|
||||||
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
|
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
|
||||||
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
|
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
|
||||||
|
|
||||||
else:
|
if not self.has_vision_encoder and not self.has_audio_encoder:
|
||||||
raise ValueError("MmprojModel must have either vision or audio encoder")
|
raise ValueError("MmprojModel must have either vision or audio encoder")
|
||||||
|
|
||||||
def write_vocab(self):
|
def write_vocab(self):
|
||||||
|
@ -1841,7 +1841,8 @@ class StableLMModel(TextModel):
|
||||||
"MistralForCausalLM",
|
"MistralForCausalLM",
|
||||||
"MixtralForCausalLM",
|
"MixtralForCausalLM",
|
||||||
"VLlama3ForCausalLM",
|
"VLlama3ForCausalLM",
|
||||||
"LlavaForConditionalGeneration")
|
"LlavaForConditionalGeneration",
|
||||||
|
"LlamaModel")
|
||||||
class LlamaModel(TextModel):
|
class LlamaModel(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||||
undo_permute = True
|
undo_permute = True
|
||||||
|
@ -1921,6 +1922,8 @@ class LlamaModel(TextModel):
|
||||||
|
|
||||||
if is_vision_tensor:
|
if is_vision_tensor:
|
||||||
return [] # skip vision tensors
|
return [] # skip vision tensors
|
||||||
|
elif self.hf_arch == "LlamaModel":
|
||||||
|
name = "model." + name
|
||||||
elif name.startswith("model.text_model"):
|
elif name.startswith("model.text_model"):
|
||||||
name = name.replace("text_model.", "") # for SmolVLM
|
name = name.replace("text_model.", "") # for SmolVLM
|
||||||
elif name.startswith("language_model."):
|
elif name.startswith("language_model."):
|
||||||
|
@ -2169,6 +2172,9 @@ class Llama4VisionModel(MmprojModel):
|
||||||
# process vision tensors
|
# process vision tensors
|
||||||
if "positional_embedding_vlm" in name and ".weight" not in name:
|
if "positional_embedding_vlm" in name and ".weight" not in name:
|
||||||
name += ".weight"
|
name += ".weight"
|
||||||
|
if "multi_modal_projector.linear_1" in name:
|
||||||
|
# despite the name with number postfix, this is a single fully connected layer
|
||||||
|
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
@ -3676,7 +3682,7 @@ class InternLM3Model(TextModel):
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
|
@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
|
||||||
class BertModel(TextModel):
|
class BertModel(TextModel):
|
||||||
model_arch = gguf.MODEL_ARCH.BERT
|
model_arch = gguf.MODEL_ARCH.BERT
|
||||||
|
|
||||||
|
@ -3739,6 +3745,13 @@ class BertModel(TextModel):
|
||||||
if name.startswith("cls.seq_relationship"):
|
if name.startswith("cls.seq_relationship"):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# For BertForSequenceClassification (direct projection layer)
|
||||||
|
if name == "classifier.weight":
|
||||||
|
name = "classifier.out_proj.weight"
|
||||||
|
|
||||||
|
if name == "classifier.bias":
|
||||||
|
name = "classifier.out_proj.bias"
|
||||||
|
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
def _xlmroberta_tokenizer_init(self) -> None:
|
def _xlmroberta_tokenizer_init(self) -> None:
|
||||||
|
|
|
@ -288,7 +288,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated")
|
||||||
|
|
||||||
tests = [
|
tests = [
|
||||||
"ied 4 ½ months",
|
"ied 4 ½ months",
|
||||||
"Führer",
|
"Äpfel",
|
||||||
"",
|
"",
|
||||||
" ",
|
" ",
|
||||||
" ",
|
" ",
|
||||||
|
|
|
@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
|
||||||
__builtin_assume(tid < D);
|
__builtin_assume(tid < D);
|
||||||
|
|
||||||
extern __shared__ float2 meta[];
|
extern __shared__ float2 meta[];
|
||||||
if (tid < 2*parallel_blocks) {
|
for (int i = tid; i < 2*parallel_blocks; i += D) {
|
||||||
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
|
((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
86
ggml/src/ggml-opencl/kernels/argsort.cl
Normal file
86
ggml/src/ggml-opencl/kernels/argsort.cl
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
#ifdef cl_intel_subgroups
|
||||||
|
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||||
|
#else
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef cl_intel_required_subgroup_size
|
||||||
|
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||||
|
#define INTEL_GPU 1
|
||||||
|
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||||
|
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||||
|
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||||
|
#define ADRENO_GPU 1
|
||||||
|
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||||
|
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
|
||||||
|
|
||||||
|
enum ggml_sort_order {
|
||||||
|
GGML_SORT_ORDER_ASC,
|
||||||
|
GGML_SORT_ORDER_DESC,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Argsort of one row of f32 values, producing i32 indices, via a bitonic
// sorting network that permutes indices held in local memory.
//
// One work-item handles one (padded) column: col = get_local_id(0); the row is
// selected by get_group_id(1).
//
//   src0      f32 input; row r starts at element r * ne00 (after offset0 bytes)
//   offset0   byte offset applied to src0
//   dst       i32 output; row r starts at element r * ne00 (after offsetd bytes)
//   offsetd   byte offset applied to dst
//   ne00      number of real elements per row
//   ne00_pad  padded row length driving the sorting network
//             (presumably a power of two >= ne00 - confirm against the host code)
//   order     GGML_SORT_ORDER_ASC or GGML_SORT_ORDER_DESC
//   dst_row   local scratch of indices, indexed by col and col ^ j
//             (assumed to hold at least ne00_pad ints - confirm against the host code)
//
// NOTE(review): work-items with col >= ne00_pad return before the barriers
// below. That is only safe if the host never launches such work-items inside a
// work-group - confirm that the local size equals ne00_pad.
kernel void kernel_argsort_f32_i32(
    global float * src0,
    ulong          offset0,
    global int   * dst,
    ulong          offsetd,
    const int      ne00,
    const int      ne00_pad,
    const int      order,
    local int    * dst_row
) {
    // bitonic sort
    int col = get_local_id(0);
    int row = get_group_id(1);

    if (col >= ne00_pad) {
        return;
    }

    // Apply the byte offsets, then cast back to the declared element types.
    // (The casts previously targeted `global char *` / `global float *`, which
    // do not match the `global float *` / `global int *` parameters.)
    src0 = (global float *)((global char *)src0 + offset0);
    dst  = (global int   *)((global char *)dst  + offsetd);

    global float * x_row = src0 + row * ne00;

    // initialize indices
    dst_row[col] = col;

    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 2; k <= ne00_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            // Only the lower work-item of each (col, ixj) pair does the
            // compare-exchange, so each pair is touched by exactly one work-item.
            if (ixj > col) {
                if ((col & k) == 0) {
                    // Indices >= ne00 are padding: a padding index at `col` is
                    // always swapped; real values are compared per `order`.
                    if (dst_row[col] >= ne00 ||
                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
                    ) {
                        SWAP(dst_row[col], dst_row[ixj], int);
                    }
                } else {
                    // Opposite direction: a padding index at `ixj` is always
                    // swapped; real values are compared with the test inverted.
                    if (dst_row[ixj] >= ne00 ||
                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
                    ) {
                        SWAP(dst_row[col], dst_row[ixj], int);
                    }
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }

    // copy the result to dst without the padding
    if (col < ne00) {
        dst[row * ne00 + col] = dst_row[col];
    }
}
|
72
ggml/src/ggml-opencl/kernels/div.cl
Normal file
72
ggml/src/ggml-opencl/kernels/div.cl
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// div
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// Element-wise f32 division dst = src0 / src1 over byte-strided tensors.
//
// Each work-group handles one row, selected by its group ids (dim 0 -> i01,
// dim 1 -> i02, dim 2 -> i03); the work-items of the group stride across the
// ne0 elements of that row. src1 indices are wrapped modulo ne10..ne13, so a
// smaller src1 is repeated along every dimension.
//
//   src0/src1/dst            base pointers, addressed in bytes
//   offset0/offset1/offsetd  byte offsets added to the base pointers
//   nb0x / nb1x / nbx        byte strides of src0 / src1 / dst per dimension
//   ne10..ne13               extents of src1 used for the modulo wrap
//   ne0                      number of elements per dst row
kernel void kernel_div(
    global char * src0,
    ulong         offset0,
    global char * src1,
    ulong         offset1,
    global char * dst,
    ulong         offsetd,
    ulong         nb00,
    ulong         nb01,
    ulong         nb02,
    ulong         nb03,
    int           ne10,
    int           ne11,
    int           ne12,
    int           ne13,
    ulong         nb10,
    ulong         nb11,
    ulong         nb12,
    ulong         nb13,
    int           ne0,
    ulong         nb0,
    ulong         nb1,
    ulong         nb2,
    ulong         nb3
) {
    src0 += offset0;
    src1 += offset1;
    dst  += offsetd;

    // Row coordinates of this work-group in src0/dst.
    int i01 = get_group_id(0);
    int i02 = get_group_id(1);
    int i03 = get_group_id(2);

    // Matching src1 coordinates, wrapped to src1's extents.
    int i11 = i01 % ne11;
    int i12 = i02 % ne12;
    int i13 = i03 % ne13;

    global char * lhs_row = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * rhs_row = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * out_row = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    const size_t stride = get_local_size(0);

    for (int i0 = get_local_id(0); i0 < ne0; i0 += stride) {
        const int i10 = i0 % ne10;

        global float * lhs = (global float *)(lhs_row + i0*nb00);
        global float * rhs = (global float *)(rhs_row + i10*nb10);
        global float * out = (global float *)(out_row + i0*nb0);

        *out = *lhs / *rhs;
    }
}
|
||||||
|
|
||||||
|
// Row-broadcast division on float4 vectors: one work-item per float4 of dst.
// src1 is assumed to be a single row of `ne` float4 values, which is reused
// for every consecutive run of `ne` float4 values of src0.
//
//   src0/src1/dst            float4 buffers
//   offset0/offset1/offsetd  byte offsets applied to the buffers
//   ne                       length of the src1 row, counted in float4 elements
kernel void kernel_div_row(
    global float4 * src0,
    ulong           offset0,
    global float4 * src1,
    ulong           offset1,
    global float4 * dst,
    ulong           offsetd,
    int             ne
) {
    dst  = (global float4*)((global char*)dst  + offsetd);
    src1 = (global float4*)((global char*)src1 + offset1);
    src0 = (global float4*)((global char*)src0 + offset0);

    const uint gid = get_global_id(0);

    // Equivalent to gid % ne; the subtract/multiply form performs better than
    // using %.
    const uint src1_idx = gid - (gid/ne)*ne;

    dst[gid] = src0[gid] / src1[src1_idx];
}
|
72
ggml/src/ggml-opencl/kernels/group_norm.cl
Normal file
72
ggml/src/ggml-opencl/kernels/group_norm.cl
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
#ifdef cl_intel_subgroups
|
||||||
|
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||||
|
#else
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef cl_intel_required_subgroup_size
|
||||||
|
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||||
|
#define INTEL_GPU 1
|
||||||
|
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||||
|
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||||
|
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||||
|
#define ADRENO_GPU 1
|
||||||
|
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||||
|
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Group normalization over a flat f32 buffer.
//
// Each work-group normalizes one contiguous span of `group_size` elements
// (clamped to `ne` at the end of the buffer): it subtracts the mean and scales
// by 1/sqrt(variance + eps). The sums are combined with sub_group_reduce_add,
// so the work-group must consist of exactly one subgroup.
//
//   src0 / dst         f32 input / output
//   offset0 / offsetd  byte offsets applied to src0 / dst
//   ne                 total number of elements
//   group_size         number of elements per group; also the divisor used for
//                      the mean and variance
//   eps                added to the variance before the square root
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_group_norm(
    global float * src0,
    ulong          offset0,
    global float * dst,
    ulong          offsetd,
    int            ne,
    int            group_size,
    float          eps
) {
    src0 = (global float *)((global char *)src0 + offset0);
    dst  = (global float *)((global char *)dst  + offsetd);

    // Span owned by this work-group, with the upper bound clamped to ne.
    const int group_begin = get_group_id(0) * group_size;
    const int group_limit = group_begin + group_size;
    const int end         = (group_limit >= ne) ? ne : group_limit;

    // First element of this work-item; it then advances by the local size.
    const int    start  = group_begin + get_local_id(0);
    const size_t stride = get_local_size(0);

    // Pass 1: sum of the inputs -> mean.
    float acc = 0.0f;
    for (int j = start; j < end; j += stride) {
        acc += src0[j];
    }
    acc = sub_group_reduce_add(acc);

    const float mean = acc / group_size;

    // Pass 2: write the centered values and accumulate their squares.
    acc = 0.0f;
    for (int j = start; j < end; j += stride) {
        float centered = src0[j] - mean;
        dst[j] = centered;
        acc += centered * centered;
    }
    acc = sub_group_reduce_add(acc);

    const float variance = acc / group_size;
    const float scale    = 1.0f/sqrt(variance + eps);

    // Pass 3: scale the centered values in place.
    for (int j = start; j < end; j += stride) {
        dst[j] *= scale;
    }
}
|
29
ggml/src/ggml-opencl/kernels/sigmoid.cl
Normal file
29
ggml/src/ggml-opencl/kernels/sigmoid.cl
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// sigmoid
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Element-wise logistic sigmoid on f32 data: dst[i] = 1 / (1 + exp(-src0[i])).
// Each work-item handles the single element at index get_global_id(0).
// offset0 / offsetd are byte offsets applied to src0 / dst before use.
kernel void kernel_sigmoid_f32(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd
) {
    global float * in  = (global float *)((global char *)src0 + offset0);
    global float * out = (global float *)((global char *)dst  + offsetd);

    const size_t gid = get_global_id(0);
    const float  x   = in[gid];

    out[gid] = 1.0f / (1.0f + exp(-x));
}
|
||||||
|
|
||||||
|
// Element-wise logistic sigmoid on f16 data: dst[i] = 1 / (1 + exp(-src0[i])).
// Each work-item handles the single element at index get_global_id(0).
// offset0 / offsetd are byte offsets applied to src0 / dst before use.
kernel void kernel_sigmoid_f16(
        global half * src0,
        ulong offset0,
        global half * dst,
        ulong offsetd
) {
    global half * in  = (global half *)((global char *)src0 + offset0);
    global half * out = (global half *)((global char *)dst  + offsetd);

    const size_t gid = get_global_id(0);
    const half   x   = in[gid];

    // exp() is called with a half argument, exactly as in the one-line form
    out[gid] = 1.0f / (1.0f + exp(-x));
}
|
72
ggml/src/ggml-opencl/kernels/sub.cl
Normal file
72
ggml/src/ggml-opencl/kernels/sub.cl
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// sub
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// Element-wise f32 subtraction dst = src0 - src1 over strided tensors, with
// src1 wrapped (modulo its extents ne10..ne13) so it can be smaller than src0.
//
// All tensors are addressed as raw bytes:
//   offset0/offset1/offsetd : byte offsets applied to the base pointers
//   nb0x / nb1x / nbx       : byte strides of src0 / src1 / dst per dimension
//   ne0                     : number of elements along dim 0 of dst
//
// One work-group handles one row; get_group_id(0..2) select dims 1..3 and the
// work-items of the group stride over dim 0.
kernel void kernel_sub(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb00,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne10,
        int ne11,
        int ne12,
        int ne13,
        ulong nb10,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        ulong nb0,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    // row coordinates of this work-group in src0 / dst
    const int i01 = get_group_id(0);
    const int i02 = get_group_id(1);
    const int i03 = get_group_id(2);

    // matching coordinates in src1, wrapped to its extents
    const int i11 = i01 % ne11;
    const int i12 = i02 % ne12;
    const int i13 = i03 % ne13;

    global char * lhs_row = (src0 + offset0) + i03*nb03 + i02*nb02 + i01*nb01;
    global char * rhs_row = (src1 + offset1) + i13*nb13 + i12*nb12 + i11*nb11;
    global char * out_row = (dst  + offsetd) + i03*nb3  + i02*nb2  + i01*nb1;

    const int step = get_local_size(0);
    for (int col = get_local_id(0); col < ne0; col += step) {
        const int col1 = col % ne10; // wrapped column in src1

        global float * lhs = (global float *)(lhs_row + col*nb00);
        global float * rhs = (global float *)(rhs_row + col1*nb10);
        global float * out = (global float *)(out_row + col*nb0);

        *out = *lhs - *rhs;
    }
}
|
||||||
|
|
||||||
|
// Fast path for dst = src0 - src1 when src1 is a single row that is broadcast
// across src0. Data is processed as float4; `ne` is the wrap-around length of
// src1 in float4 elements. Each work-item handles one float4 at index
// get_global_id(0).
//
// offset0 / offset1 / offsetd are byte offsets applied to the base pointers.
kernel void kernel_sub_row(
        global float4 * src0,
        ulong offset0,
        global float4 * src1,
        ulong offset1,
        global float4 * dst,
        ulong offsetd,
        int ne
) {
    global float4 * lhs = (global float4 *)((global char *)src0 + offset0);
    global float4 * row = (global float4 *)((global char *)src1 + offset1);
    global float4 * out = (global float4 *)((global char *)dst  + offsetd);

    const uint gid = get_global_id(0);

    // Equivalent to gid % ne; written as a divide/multiply/subtract because
    // this performs better than using %.
    const uint quot = gid/ne;
    const uint col  = gid - quot*ne;

    out[gid] = lhs[gid] - row[col];
}
|
39
ggml/src/ggml-opencl/kernels/sum_rows.cl
Normal file
39
ggml/src/ggml-opencl/kernels/sum_rows.cl
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
|
||||||
|
// Sums each row (dim 0, ne00 elements) of an f32 tensor and writes the result
// to the first element of the matching row in dst.
//
// One work-item handles one row; get_global_id(0..2) select dims 1..3 and
// work-items outside [ne01, ne02, ne03] exit immediately.
//   offset0 / offsetd : byte offsets applied to the base pointers
//   nb01..nb03        : byte strides of src0 for dims 1..3
//   nb1..nb3          : byte strides of dst  for dims 1..3
kernel void kernel_sum_rows_f32(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        ulong nb1,
        ulong nb2,
        ulong nb3
) {
    const int i1 = get_global_id(0);
    const int i2 = get_global_id(1);
    const int i3 = get_global_id(2);

    // padding work-items beyond the tensor extents have nothing to do
    if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
        return;
    }

    global char * src_base = (global char *)src0 + offset0;
    global char * dst_base = (global char *)dst  + offsetd;

    global float * row = (global float *)(src_base + i1*nb01 + i2*nb02 + i3*nb03);
    global float * out = (global float *)(dst_base + i1*nb1  + i2*nb2  + i3*nb3);

    // plain left-to-right accumulation over the row
    float acc = 0;
    for (int k = 0; k < ne00; ++k) {
        acc += row[k];
    }

    out[0] = acc;
}
|
|
@ -12,7 +12,7 @@
|
||||||
|
|
||||||
#include "ggml-vulkan.h"
|
#include "ggml-vulkan.h"
|
||||||
#include <vulkan/vulkan_core.h>
|
#include <vulkan/vulkan_core.h>
|
||||||
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
|
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
#include "ggml-cpu.h"
|
#include "ggml-cpu.h"
|
||||||
#endif
|
#endif
|
||||||
|
@ -200,9 +200,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
||||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||||
class vk_memory_logger;
|
class vk_memory_logger;
|
||||||
#endif
|
#endif
|
||||||
#ifdef GGML_VULKAN_PERF
|
|
||||||
class vk_perf_logger;
|
class vk_perf_logger;
|
||||||
#endif
|
|
||||||
static void ggml_vk_destroy_buffer(vk_buffer& buf);
|
static void ggml_vk_destroy_buffer(vk_buffer& buf);
|
||||||
|
|
||||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||||
|
@ -458,9 +456,11 @@ struct vk_device_struct {
|
||||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||||
std::unique_ptr<vk_memory_logger> memory_logger;
|
std::unique_ptr<vk_memory_logger> memory_logger;
|
||||||
#endif
|
#endif
|
||||||
#ifdef GGML_VULKAN_PERF
|
|
||||||
|
// for GGML_VK_PERF_LOGGER
|
||||||
std::unique_ptr<vk_perf_logger> perf_logger;
|
std::unique_ptr<vk_perf_logger> perf_logger;
|
||||||
#endif
|
vk::QueryPool query_pool;
|
||||||
|
uint32_t num_queries;
|
||||||
|
|
||||||
~vk_device_struct() {
|
~vk_device_struct() {
|
||||||
VK_LOG_DEBUG("destroy device " << name);
|
VK_LOG_DEBUG("destroy device " << name);
|
||||||
|
@ -844,8 +844,6 @@ private:
|
||||||
#define VK_LOG_MEMORY(msg) ((void) 0)
|
#define VK_LOG_MEMORY(msg) ((void) 0)
|
||||||
#endif // GGML_VULKAN_MEMORY_DEBUG
|
#endif // GGML_VULKAN_MEMORY_DEBUG
|
||||||
|
|
||||||
#if defined(GGML_VULKAN_PERF)
|
|
||||||
|
|
||||||
class vk_perf_logger {
|
class vk_perf_logger {
|
||||||
public:
|
public:
|
||||||
void print_timings() {
|
void print_timings() {
|
||||||
|
@ -855,7 +853,7 @@ public:
|
||||||
for (const auto& time : t.second) {
|
for (const auto& time : t.second) {
|
||||||
total += time;
|
total += time;
|
||||||
}
|
}
|
||||||
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
|
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
timings.clear();
|
timings.clear();
|
||||||
|
@ -884,7 +882,6 @@ public:
|
||||||
private:
|
private:
|
||||||
std::map<std::string, std::vector<uint64_t>> timings;
|
std::map<std::string, std::vector<uint64_t>> timings;
|
||||||
};
|
};
|
||||||
#endif // GGML_VULKAN_PERF
|
|
||||||
|
|
||||||
struct ggml_backend_vk_context {
|
struct ggml_backend_vk_context {
|
||||||
std::string name;
|
std::string name;
|
||||||
|
@ -974,6 +971,8 @@ struct vk_instance_t {
|
||||||
static bool vk_instance_initialized = false;
|
static bool vk_instance_initialized = false;
|
||||||
static vk_instance_t vk_instance;
|
static vk_instance_t vk_instance;
|
||||||
|
|
||||||
|
static bool vk_perf_logger_enabled = false;
|
||||||
|
|
||||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||||
static size_t vk_skip_checks;
|
static size_t vk_skip_checks;
|
||||||
static size_t vk_output_tensor;
|
static size_t vk_output_tensor;
|
||||||
|
@ -2773,9 +2772,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||||
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
|
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
|
||||||
#endif
|
#endif
|
||||||
#ifdef GGML_VULKAN_PERF
|
if (vk_perf_logger_enabled) {
|
||||||
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
|
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
|
||||||
#endif
|
}
|
||||||
|
|
||||||
size_t dev_num = vk_instance.device_indices[idx];
|
size_t dev_num = vk_instance.device_indices[idx];
|
||||||
|
|
||||||
|
@ -3571,6 +3570,8 @@ static void ggml_vk_instance_init() {
|
||||||
vk_instance.instance = vk::createInstance(instance_create_info);
|
vk_instance.instance = vk::createInstance(instance_create_info);
|
||||||
vk_instance_initialized = true;
|
vk_instance_initialized = true;
|
||||||
|
|
||||||
|
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
||||||
|
|
||||||
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
||||||
|
|
||||||
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
|
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
|
||||||
|
@ -8909,7 +8910,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||||
|
|
||||||
ctx->tensor_ctxs[node_idx] = compute_ctx;
|
ctx->tensor_ctxs[node_idx] = compute_ctx;
|
||||||
|
|
||||||
#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
|
#if defined(GGML_VULKAN_CHECK_RESULTS)
|
||||||
// Force context reset on each node so that each tensor ends up in its own context
|
// Force context reset on each node so that each tensor ends up in its own context
|
||||||
// and can be run and compared to its CPU equivalent separately
|
// and can be run and compared to its CPU equivalent separately
|
||||||
last_node = true;
|
last_node = true;
|
||||||
|
@ -9529,6 +9530,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
bool first_node_in_batch = true; // true if next node will be first node in a batch
|
bool first_node_in_batch = true; // true if next node will be first node in a batch
|
||||||
int submit_node_idx = 0; // index to first node in a batch
|
int submit_node_idx = 0; // index to first node in a batch
|
||||||
|
|
||||||
|
vk_context compute_ctx;
|
||||||
|
if (vk_perf_logger_enabled) {
|
||||||
|
// allocate/resize the query pool
|
||||||
|
if (ctx->device->num_queries < cgraph->n_nodes + 1) {
|
||||||
|
if (ctx->device->query_pool) {
|
||||||
|
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
|
||||||
|
}
|
||||||
|
VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
|
||||||
|
query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
|
||||||
|
query_create_info.queryCount = cgraph->n_nodes + 100;
|
||||||
|
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
|
||||||
|
ctx->device->num_queries = query_create_info.queryCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
|
||||||
|
|
||||||
|
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||||
|
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||||
|
ctx->compute_ctx = compute_ctx;
|
||||||
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||||
|
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
|
||||||
|
}
|
||||||
|
|
||||||
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
|
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
|
||||||
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
|
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
|
||||||
// (and scaled down based on model size, so smaller models submit earlier).
|
// (and scaled down based on model size, so smaller models submit earlier).
|
||||||
|
@ -9556,6 +9580,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
|
|
||||||
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
|
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
|
||||||
|
|
||||||
|
if (vk_perf_logger_enabled) {
|
||||||
|
if (ctx->compute_ctx.expired()) {
|
||||||
|
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||||
|
ctx->compute_ctx = compute_ctx;
|
||||||
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||||
|
} else {
|
||||||
|
compute_ctx = ctx->compute_ctx.lock();
|
||||||
|
}
|
||||||
|
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
|
||||||
|
}
|
||||||
|
|
||||||
if (enqueued) {
|
if (enqueued) {
|
||||||
++submitted_nodes;
|
++submitted_nodes;
|
||||||
|
|
||||||
|
@ -9577,9 +9612,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_VULKAN_PERF
|
if (vk_perf_logger_enabled) {
|
||||||
ctx->device->perf_logger->print_timings();
|
// End the command buffer and submit/wait
|
||||||
#endif
|
GGML_ASSERT(!ctx->compute_ctx.expired());
|
||||||
|
compute_ctx = ctx->compute_ctx.lock();
|
||||||
|
ggml_vk_ctx_end(compute_ctx);
|
||||||
|
|
||||||
|
ggml_vk_submit(compute_ctx, ctx->device->fence);
|
||||||
|
VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
|
||||||
|
ctx->device->device.resetFences({ ctx->device->fence });
|
||||||
|
|
||||||
|
// Get the results and pass them to the logger
|
||||||
|
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
||||||
|
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
|
||||||
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
|
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
|
||||||
|
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->device->perf_logger->print_timings();
|
||||||
|
}
|
||||||
|
|
||||||
ggml_vk_graph_cleanup(ctx);
|
ggml_vk_graph_cleanup(ctx);
|
||||||
|
|
||||||
|
|
|
@ -902,7 +902,6 @@ class TensorNameMap:
|
||||||
|
|
||||||
MODEL_TENSOR.V_MMPROJ_FC: (
|
MODEL_TENSOR.V_MMPROJ_FC: (
|
||||||
"model.connector.modality_projection.proj", # SmolVLM
|
"model.connector.modality_projection.proj", # SmolVLM
|
||||||
"multi_modal_projector.linear_1", # llama 4
|
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_MMPROJ_MLP: (
|
MODEL_TENSOR.V_MMPROJ_MLP: (
|
||||||
|
|
|
@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t llm_graph_context::n_pos_per_embd() const {
|
int64_t llm_graph_context::n_pos_per_embd() const {
|
||||||
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
|
return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
|
void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
|
||||||
|
@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling(
|
||||||
ggml_tensor * inp_cls = build_inp_cls();
|
ggml_tensor * inp_cls = build_inp_cls();
|
||||||
inp = ggml_get_rows(ctx0, inp, inp_cls);
|
inp = ggml_get_rows(ctx0, inp, inp_cls);
|
||||||
|
|
||||||
// classification head
|
if (cls != nullptr && cls_b != nullptr) {
|
||||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
// classification head
|
||||||
GGML_ASSERT(cls != nullptr);
|
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||||
GGML_ASSERT(cls_b != nullptr);
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
|
||||||
|
cur = ggml_tanh(ctx0, cur);
|
||||||
|
|
||||||
cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
|
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||||
cur = ggml_tanh(ctx0, cur);
|
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
||||||
|
if (cls_out) {
|
||||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
GGML_ASSERT(cls_out_b != nullptr);
|
||||||
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
|
||||||
if (cls_out) {
|
}
|
||||||
|
} else if (cls_out) {
|
||||||
|
// Single layer classification head (direct projection)
|
||||||
|
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
|
||||||
GGML_ASSERT(cls_out_b != nullptr);
|
GGML_ASSERT(cls_out_b != nullptr);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
|
||||||
cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
|
} else {
|
||||||
|
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
|
|
|
@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
|
||||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||||
|
|
||||||
const auto & n_rot = hparams.n_rot;
|
const auto & n_rot = hparams.n_rot;
|
||||||
const auto & rope_type = hparams.rope_type;
|
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
|
||||||
|
// @ngxson : this is a workaround
|
||||||
|
// for M-RoPE, we want to rotate the whole vector when doing KV shift
|
||||||
|
// a normal RoPE should work, we just need to use the correct ordering
|
||||||
|
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
|
||||||
|
? LLAMA_ROPE_TYPE_NEOX
|
||||||
|
: hparams.rope_type;
|
||||||
|
|
||||||
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
|
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
|
||||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||||
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
|
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
|
||||||
|
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
|
||||||
|
: cparams.yarn_attn_factor;
|
||||||
|
|
||||||
ggml_tensor * tmp;
|
ggml_tensor * tmp;
|
||||||
|
|
||||||
|
|
|
@ -1,28 +1,5 @@
|
||||||
// fix problem with std::min and std::max
|
|
||||||
#if defined(_WIN32)
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
# define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "mtmd-audio.h"
|
#include "mtmd-audio.h"
|
||||||
|
|
||||||
//#define MTMD_AUDIO_DEBUG
|
|
||||||
|
|
||||||
#define MINIAUDIO_IMPLEMENTATION
|
|
||||||
#ifndef MTMD_AUDIO_DEBUG
|
|
||||||
# define MA_NO_ENCODING
|
|
||||||
#endif
|
|
||||||
#define MA_NO_DEVICE_IO
|
|
||||||
#define MA_NO_RESOURCE_MANAGER
|
|
||||||
#define MA_NO_NODE_GRAPH
|
|
||||||
#define MA_NO_ENGINE
|
|
||||||
#define MA_NO_GENERATION
|
|
||||||
#define MA_API static
|
|
||||||
#include "miniaudio.h"
|
|
||||||
|
|
||||||
#define _USE_MATH_DEFINES // for M_PI
|
#define _USE_MATH_DEFINES // for M_PI
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
@ -359,69 +336,6 @@ bool preprocess_audio(
|
||||||
} // namespace whisper_preprocessor
|
} // namespace whisper_preprocessor
|
||||||
|
|
||||||
|
|
||||||
namespace audio_helpers {
|
|
||||||
|
|
||||||
bool is_audio_file(const char * buf, size_t len) {
|
|
||||||
if (len < 12) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
|
|
||||||
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
|
|
||||||
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
|
|
||||||
bool is_mp3 = len >= 3 && (
|
|
||||||
memcmp(buf, "ID3", 3) == 0 ||
|
|
||||||
// Check for MPEG sync word (simplified check)
|
|
||||||
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
|
|
||||||
);
|
|
||||||
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
|
|
||||||
|
|
||||||
return is_wav || is_mp3 || is_flac;
|
|
||||||
}
|
|
||||||
|
|
||||||
// returns true if the buffer is a valid audio file
|
|
||||||
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
|
|
||||||
ma_result result;
|
|
||||||
const int channels = 1;
|
|
||||||
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
|
|
||||||
ma_decoder decoder;
|
|
||||||
|
|
||||||
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
|
|
||||||
if (result != MA_SUCCESS) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
ma_uint64 frame_count;
|
|
||||||
ma_uint64 frames_read;
|
|
||||||
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
|
|
||||||
if (result != MA_SUCCESS) {
|
|
||||||
ma_decoder_uninit(&decoder);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
pcmf32_mono.resize(frame_count);
|
|
||||||
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
|
|
||||||
if (result != MA_SUCCESS) {
|
|
||||||
ma_decoder_uninit(&decoder);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef MTMD_AUDIO_DEBUG
|
|
||||||
// save audio to wav file
|
|
||||||
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
|
|
||||||
ma_encoder encoder;
|
|
||||||
ma_encoder_init_file("output.wav", &config, &encoder);
|
|
||||||
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
|
|
||||||
ma_encoder_uninit(&encoder);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ma_decoder_uninit(&decoder);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace wav_utils
|
|
||||||
|
|
||||||
|
|
||||||
// precalculated mel filter banks
|
// precalculated mel filter banks
|
||||||
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
|
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
|
||||||
//
|
//
|
||||||
|
|
|
@ -32,7 +32,7 @@ struct whisper_filters {
|
||||||
std::vector<float> data;
|
std::vector<float> data;
|
||||||
};
|
};
|
||||||
|
|
||||||
extern bool preprocess_audio(
|
bool preprocess_audio(
|
||||||
const float * samples,
|
const float * samples,
|
||||||
size_t n_samples,
|
size_t n_samples,
|
||||||
const whisper_filters & filters,
|
const whisper_filters & filters,
|
||||||
|
@ -40,23 +40,8 @@ extern bool preprocess_audio(
|
||||||
|
|
||||||
} // namespace whisper_preprocessor
|
} // namespace whisper_preprocessor
|
||||||
|
|
||||||
|
|
||||||
// TODO @ngxson : move this helper to mtmd-helpers.cpp
|
|
||||||
namespace audio_helpers {
|
|
||||||
|
|
||||||
extern bool is_audio_file(const char * buf, size_t len);
|
|
||||||
|
|
||||||
extern bool decode_audio_from_buf(
|
|
||||||
const unsigned char * buf_in,
|
|
||||||
size_t len,
|
|
||||||
int target_sampler_rate,
|
|
||||||
std::vector<float> & pcmf32_mono);
|
|
||||||
|
|
||||||
} // namespace audio_helpers
|
|
||||||
|
|
||||||
|
|
||||||
namespace whisper_precalc_filters {
|
namespace whisper_precalc_filters {
|
||||||
|
|
||||||
extern whisper_preprocessor::whisper_filters get_128_bins();
|
whisper_preprocessor::whisper_filters get_128_bins();
|
||||||
|
|
||||||
} // namespace whisper_precalc_filters
|
} // namespace whisper_precalc_filters
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
#include "chat.h"
|
#include "chat.h"
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
|
#include "mtmd-helper.h"
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
|
@ -143,7 +144,7 @@ struct mtmd_cli_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool load_media(const std::string & fname) {
|
bool load_media(const std::string & fname) {
|
||||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
|
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
|
||||||
if (!bmp.ptr) {
|
if (!bmp.ptr) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,37 @@
|
||||||
|
// fix problem with std::min and std::max
|
||||||
|
#if defined(_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
# define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
|
#include "mtmd-helper.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
//#define MTMD_AUDIO_DEBUG
|
||||||
|
|
||||||
|
#define MINIAUDIO_IMPLEMENTATION
|
||||||
|
#ifndef MTMD_AUDIO_DEBUG
|
||||||
|
# define MA_NO_ENCODING
|
||||||
|
#endif
|
||||||
|
#define MA_NO_DEVICE_IO
|
||||||
|
#define MA_NO_RESOURCE_MANAGER
|
||||||
|
#define MA_NO_NODE_GRAPH
|
||||||
|
#define MA_NO_ENGINE
|
||||||
|
#define MA_NO_GENERATION
|
||||||
|
#define MA_API static
|
||||||
|
#include "vendor/miniaudio.h"
|
||||||
|
|
||||||
|
#define STB_IMAGE_IMPLEMENTATION
|
||||||
|
#include "vendor/stb_image.h"
|
||||||
|
|
||||||
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
|
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
|
||||||
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
|
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
|
||||||
|
|
||||||
|
@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace audio_helpers {
|
||||||
|
|
||||||
|
// Sniffs the leading bytes of `buf` and reports whether they look like the
// start of a WAV, MP3 or FLAC file. Only magic numbers are inspected; the
// payload is not validated. Buffers shorter than 12 bytes are never audio.
static bool is_audio_file(const char * buf, size_t len) {
    // the WAV probe reads bytes [0, 12), so reject anything shorter up front
    if (len < 12) {
        return false;
    }

    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
    if (memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0) {
        return true;
    }

    // MP3 with a leading ID3 tag
    if (memcmp(buf, "ID3", 3) == 0) {
        return true;
    }

    // MP3 without a tag: check for MPEG sync word (simplified check)
    const unsigned char b0 = (unsigned char)buf[0];
    const unsigned char b1 = (unsigned char)buf[1];
    if (b0 == 0xFF && (b1 & 0xE0) == 0xE0) {
        return true;
    }

    // FLAC stream marker
    return memcmp(buf, "fLaC", 4) == 0;
}
|
||||||
|
|
||||||
|
// returns true if the buffer is a valid audio file
|
||||||
|
static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
|
||||||
|
ma_result result;
|
||||||
|
const int channels = 1;
|
||||||
|
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
|
||||||
|
ma_decoder decoder;
|
||||||
|
|
||||||
|
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ma_uint64 frame_count;
|
||||||
|
ma_uint64 frames_read;
|
||||||
|
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
pcmf32_mono.resize(frame_count);
|
||||||
|
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef MTMD_AUDIO_DEBUG
|
||||||
|
// save audio to wav file
|
||||||
|
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
|
||||||
|
ma_encoder encoder;
|
||||||
|
ma_encoder_init_file("output.wav", &config, &encoder);
|
||||||
|
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
|
||||||
|
ma_encoder_uninit(&encoder);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace audio_helpers
|
||||||
|
|
||||||
|
// Builds a mtmd_bitmap from an encoded media file held in memory.
//
// If the buffer starts with a known audio signature (see
// audio_helpers::is_audio_file) it is decoded as audio at the rate reported by
// mtmd_get_audio_bitrate(ctx). Otherwise it is decoded as an image via
// stb_image with 3 channels requested.
//
// Returns nullptr (after logging) when the model reports no audio support
// (negative bitrate), when decoding fails, or when the buffer is too large
// for stb_image's int length parameter.
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
    if (audio_helpers::is_audio_file((const char *)buf, len)) {
        std::vector<float> pcmf32;
        int bitrate = mtmd_get_audio_bitrate(ctx);
        if (bitrate < 0) {
            LOG_ERR("This model does not support audio input\n");
            return nullptr;
        }
        if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
            // the signature check also accepts MP3 and FLAC, not only WAV
            LOG_ERR("Unable to decode audio data from buffer\n");
            return nullptr;
        }
        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
    }

    // otherwise, we assume it's an image

    // stbi_load_from_memory takes the length as int; refuse buffers whose size
    // does not survive the round trip instead of passing a truncated length
    const int len_i = (int)len;
    if (len_i < 0 || (size_t)len_i != len) {
        LOG_ERR("%s: image buffer is too large\n", __func__);
        return nullptr;
    }

    int nx, ny, nc;
    auto * data = stbi_load_from_memory(buf, len_i, &nx, &ny, &nc, 3);
    if (!data) {
        LOG_ERR("%s: failed to decode image bytes\n", __func__);
        return nullptr;
    }

    // the pixel buffer is released right after this call, so
    // mtmd_bitmap_init presumably copies it (confirm against mtmd.h)
    mtmd_bitmap * result = mtmd_bitmap_init(nx, ny, data);
    stbi_image_free(data);
    return result;
}
|
||||||
|
|
||||||
|
// Build a mtmd_bitmap from a file on disk.
//
// The whole file is read into memory and handed to
// mtmd_helper_bitmap_init_from_buf(), which decides whether the content is
// audio or an image.
//
// ctx   : context, forwarded unchanged to mtmd_helper_bitmap_init_from_buf()
// fname : path of the file to load
//
// Returns the new bitmap, or nullptr if the file cannot be opened, its size
// cannot be determined, it cannot be read completely, or decoding fails.
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
    FILE * f = fopen(fname, "rb");
    if (!f) {
        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
        return nullptr;
    }

    // Determine the file size. Both fseek() and ftell() can fail (ftell()
    // returns -1 in that case); a negative size must never reach
    // buf.resize(), where it would turn into a huge size_t and throw.
    long file_size = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        file_size = ftell(f);
    }
    if (file_size < 0 || fseek(f, 0, SEEK_SET) != 0) {
        LOG_ERR("Unable to determine size of file %s: %s\n", fname, strerror(errno));
        fclose(f);
        return nullptr;
    }

    const size_t n_bytes = (size_t) file_size;

    std::vector<unsigned char> buf;
    buf.resize(n_bytes);

    // the file is closed before the result is checked so every exit path below is leak-free
    const size_t n_read = fread(buf.data(), 1, n_bytes, f);
    fclose(f);
    if (n_read != n_bytes) {
        LOG_ERR("Failed to read entire file %s\n", fname);
        return nullptr;
    }

    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
}
|
||||||
|
|
91
tools/mtmd/mtmd-helper.h
Normal file
91
tools/mtmd/mtmd-helper.h
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
#ifndef MTMD_HELPER_H
|
||||||
|
#define MTMD_HELPER_H
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "mtmd.h"
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// libmtmd helper functions
|
||||||
|
//
|
||||||
|
// Please note that these helpers are not guaranteed to be stable.
|
||||||
|
// BREAKING CHANGES are expected.
|
||||||
|
//
|
||||||
|
|
||||||
|
// helper function to construct a mtmd_bitmap from a file
|
||||||
|
// it calls mtmd_helper_bitmap_init_from_buf() internally
|
||||||
|
// returns nullptr on failure
|
||||||
|
// this function is thread-safe
|
||||||
|
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
|
||||||
|
|
||||||
|
// helper function to construct a mtmd_bitmap from a buffer containing a file
|
||||||
|
// supported formats:
|
||||||
|
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
|
||||||
|
// audio: formats supported by miniaudio: wav, mp3, flac
|
||||||
|
// note: audio files will be auto-detected based on magic bytes
|
||||||
|
// returns nullptr on failure
|
||||||
|
// this function is thread-safe
|
||||||
|
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
|
||||||
|
|
||||||
|
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
||||||
|
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
||||||
|
|
||||||
|
// helper to count the total number of positions occupied by a list of chunks, useful to keep track of n_past
|
||||||
|
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
|
||||||
|
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
||||||
|
|
||||||
|
// helper function that automatically:
|
||||||
|
// 1. run llama_decode() on text chunks
|
||||||
|
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||||
|
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
||||||
|
// otherwise, returns 0 on success
|
||||||
|
// this function is NOT thread-safe
|
||||||
|
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||||
|
struct llama_context * lctx,
|
||||||
|
const mtmd_input_chunks * chunks,
|
||||||
|
llama_pos n_past,
|
||||||
|
llama_seq_id seq_id,
|
||||||
|
int32_t n_batch,
|
||||||
|
bool logits_last,
|
||||||
|
llama_pos * new_n_past);
|
||||||
|
|
||||||
|
// works like mtmd_helper_eval_chunks(), but only for a single chunk
|
||||||
|
// this function is NOT thread-safe
|
||||||
|
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||||
|
struct llama_context * lctx,
|
||||||
|
const mtmd_input_chunk * chunk,
|
||||||
|
llama_pos n_past,
|
||||||
|
llama_seq_id seq_id,
|
||||||
|
int32_t n_batch,
|
||||||
|
bool logits_last,
|
||||||
|
llama_pos * new_n_past);
|
||||||
|
|
||||||
|
// helper function to decode an image whose embeddings have already been calculated
|
||||||
|
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
||||||
|
// returns 0 on success, -1 if the chunk is not a valid image chunk, 1 on decode failure
|
||||||
|
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
||||||
|
struct llama_context * lctx,
|
||||||
|
const mtmd_input_chunk * chunk,
|
||||||
|
float * encoded_embd,
|
||||||
|
llama_pos n_past,
|
||||||
|
llama_seq_id seq_id,
|
||||||
|
int32_t n_batch,
|
||||||
|
llama_pos * new_n_past);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// C++ wrappers
|
||||||
|
//
|
||||||
|
|
||||||
|
#endif
|
|
@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) {
|
||||||
return ctx->ctx_a != nullptr;
|
return ctx->ctx_a != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
// these 2 helpers below use internal clip_image_u8_ptr,
|
int mtmd_get_audio_bitrate(mtmd_context * ctx) {
|
||||||
// so unfortunately they cannot moved to mtmd-helper.h
|
if (!ctx->ctx_a) {
|
||||||
// however, in theory, user can decode image file to bitmap using
|
return -1;
|
||||||
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
|
|
||||||
|
|
||||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
|
|
||||||
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
|
||||||
std::vector<float> pcmf32;
|
|
||||||
if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
|
|
||||||
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
|
|
||||||
}
|
}
|
||||||
|
// for now, we assume that all audio models have the same bitrate
|
||||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
return 16000; // 16kHz
|
||||||
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get(),2048);
|
|
||||||
if (!ok) {
|
|
||||||
LOG_ERR("Unable to load image from buffer\n");
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
uint32_t nx, ny;
|
|
||||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
|
|
||||||
return mtmd_bitmap_init(nx, ny, data);
|
|
||||||
}
|
|
||||||
|
|
||||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
|
|
||||||
std::vector<unsigned char> buf;
|
|
||||||
FILE * f = fopen(fname, "rb");
|
|
||||||
if (!f) {
|
|
||||||
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
fseek(f, 0, SEEK_END);
|
|
||||||
long file_size = ftell(f);
|
|
||||||
fseek(f, 0, SEEK_SET);
|
|
||||||
buf.resize(file_size);
|
|
||||||
|
|
||||||
size_t n_read = fread(buf.data(), 1, file_size, f);
|
|
||||||
fclose(f);
|
|
||||||
if (n_read != (size_t)file_size) {
|
|
||||||
LOG_ERR("Failed to read entire file %s", fname);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
|
@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
|
||||||
// whether the current model supports audio input
|
// whether the current model supports audio input
|
||||||
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
||||||
|
|
||||||
|
// get the audio sample rate in Hz (despite the "bitrate" in the name), for example 16000 for Whisper
|
||||||
|
// return -1 if audio is not supported
|
||||||
|
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
|
||||||
|
|
||||||
// mtmd_bitmap
|
// mtmd_bitmap
|
||||||
//
|
//
|
||||||
// if bitmap is image:
|
// if bitmap is image:
|
||||||
|
@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
||||||
|
|
||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
|
|
||||||
//
|
|
||||||
// Helper functions (can be implemented based on other functions)
|
|
||||||
//
|
|
||||||
// Please note that these helpers are not guaranteed to be stable.
|
|
||||||
// BREAKING CHANGES are expected.
|
|
||||||
//
|
|
||||||
|
|
||||||
// helper function to construct a mtmd_bitmap from a file
|
|
||||||
// it calls mtmd_helper_bitmap_init_from_buf() internally
|
|
||||||
// returns nullptr on failure
|
|
||||||
// this function is thread-safe
|
|
||||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
|
|
||||||
|
|
||||||
// helper function to construct a mtmd_bitmap from a buffer containing a file
|
|
||||||
// supported formats:
|
|
||||||
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
|
|
||||||
// audio: formats supported by miniaudio: wav, mp3, flac
|
|
||||||
// note: audio files will be auto-detected based on magic bytes
|
|
||||||
// returns nullptr on failure
|
|
||||||
// this function is thread-safe
|
|
||||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
|
|
||||||
|
|
||||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
|
||||||
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
|
||||||
|
|
||||||
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
|
|
||||||
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
|
|
||||||
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
|
||||||
|
|
||||||
// helper function that automatically:
|
|
||||||
// 1. run llama_decode() on text chunks
|
|
||||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
|
||||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
|
||||||
// otherwise, returns 0 on success
|
|
||||||
// this function is NOT thread-safe
|
|
||||||
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
|
||||||
struct llama_context * lctx,
|
|
||||||
const mtmd_input_chunks * chunks,
|
|
||||||
llama_pos n_past,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
int32_t n_batch,
|
|
||||||
bool logits_last,
|
|
||||||
llama_pos * new_n_past);
|
|
||||||
|
|
||||||
// works like mtmd_helper_eval_chunks(), but only for a single chunk
|
|
||||||
// this function is NOT thread-safe
|
|
||||||
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
|
||||||
struct llama_context * lctx,
|
|
||||||
const mtmd_input_chunk * chunk,
|
|
||||||
llama_pos n_past,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
int32_t n_batch,
|
|
||||||
bool logits_last,
|
|
||||||
llama_pos * new_n_past);
|
|
||||||
|
|
||||||
// helper function to decode an image whose embeddings have already been calculated
|
|
||||||
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
|
||||||
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
|
|
||||||
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
|
||||||
struct llama_context * lctx,
|
|
||||||
const mtmd_input_chunk * chunk,
|
|
||||||
float * encoded_embd,
|
|
||||||
llama_pos n_past,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
int32_t n_batch,
|
|
||||||
llama_pos * new_n_past);
|
|
||||||
|
|
||||||
/////////////////////////////////////////
|
|
||||||
|
|
||||||
// test function, to be used in test-mtmd-c-api.c
|
// test function, to be used in test-mtmd-c-api.c
|
||||||
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
|
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
#include "speculative.h"
|
#include "speculative.h"
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
|
#include "mtmd-helper.h"
|
||||||
|
|
||||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||||
#define JSON_ASSERT GGML_ASSERT
|
#define JSON_ASSERT GGML_ASSERT
|
||||||
|
@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) {
|
||||||
throw std::runtime_error("This server does not support multimodal");
|
throw std::runtime_error("This server does not support multimodal");
|
||||||
}
|
}
|
||||||
for (auto & file : files) {
|
for (auto & file : files) {
|
||||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
|
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
|
||||||
if (!bmp.ptr) {
|
if (!bmp.ptr) {
|
||||||
throw std::runtime_error("Failed to load image or audio file");
|
throw std::runtime_error("Failed to load image or audio file");
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
#include "arg.h" // common_remote_get_content
|
#include "arg.h" // common_remote_get_content
|
||||||
#include "base64.hpp"
|
#include "base64.hpp"
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
|
#include "mtmd-helper.h"
|
||||||
|
|
||||||
// increase max payload length to allow use of larger context size
|
// increase max payload length to allow use of larger context size
|
||||||
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
||||||
|
@ -264,13 +265,19 @@ static size_t validate_utf8(const std::string& text) {
|
||||||
static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
|
static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
|
||||||
llama_tokens result;
|
llama_tokens result;
|
||||||
|
|
||||||
|
// Get EOS token - use SEP token as fallback if EOS is not available
|
||||||
|
llama_token eos_token = llama_vocab_eos(vocab);
|
||||||
|
if (eos_token == LLAMA_TOKEN_NULL) {
|
||||||
|
eos_token = llama_vocab_sep(vocab);
|
||||||
|
}
|
||||||
|
|
||||||
result.reserve(doc.size() + query.size() + 4);
|
result.reserve(doc.size() + query.size() + 4);
|
||||||
result.push_back(llama_vocab_bos(vocab));
|
result.push_back(llama_vocab_bos(vocab));
|
||||||
result.insert(result.end(), query.begin(), query.end());
|
result.insert(result.end(), query.begin(), query.end());
|
||||||
result.push_back(llama_vocab_eos(vocab));
|
result.push_back(eos_token);
|
||||||
result.push_back(llama_vocab_sep(vocab));
|
result.push_back(llama_vocab_sep(vocab));
|
||||||
result.insert(result.end(), doc.begin(), doc.end());
|
result.insert(result.end(), doc.begin(), doc.end());
|
||||||
result.push_back(llama_vocab_eos(vocab));
|
result.push_back(eos_token);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -573,7 +580,7 @@ struct oaicompat_parser_options {
|
||||||
|
|
||||||
// used by /chat/completions endpoint
|
// used by /chat/completions endpoint
|
||||||
static json oaicompat_chat_params_parse(
|
static json oaicompat_chat_params_parse(
|
||||||
const json & body, /* openai api json semantics */
|
json & body, /* openai api json semantics */
|
||||||
const oaicompat_parser_options & opt,
|
const oaicompat_parser_options & opt,
|
||||||
std::vector<raw_buffer> & out_files)
|
std::vector<raw_buffer> & out_files)
|
||||||
{
|
{
|
||||||
|
@ -624,7 +631,7 @@ static json oaicompat_chat_params_parse(
|
||||||
if (!body.contains("messages")) {
|
if (!body.contains("messages")) {
|
||||||
throw std::runtime_error("'messages' is required");
|
throw std::runtime_error("'messages' is required");
|
||||||
}
|
}
|
||||||
json messages = body.at("messages");
|
json & messages = body.at("messages");
|
||||||
if (!messages.is_array()) {
|
if (!messages.is_array()) {
|
||||||
throw std::runtime_error("Expected 'messages' to be an array");
|
throw std::runtime_error("Expected 'messages' to be an array");
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue