diff --git a/CMakeLists.txt b/CMakeLists.txt index 523233fb7..4ea472bdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -449,7 +449,9 @@ add_library(common2 src/unicode.cpp src/unicode-data.cpp otherarch/utils.cpp - otherarch/utils.h) + otherarch/utils.h + tools/mtmd/mtmd-audio.cpp + tools/mtmd/mtmd-audio.h) target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common) target_compile_features(common2 PUBLIC cxx_std_17) # don't bump target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS}) diff --git a/Makefile b/Makefile index 2f4c38ac2..dd93602e0 100644 --- a/Makefile +++ b/Makefile @@ -90,10 +90,10 @@ endif CUBLASLD_FLAGS = CUBLAS_OBJS = -OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o kcpp-quantmapper.o ggml-repack.o kcpp-repackmapper.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o -OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants.o kcpp-quantmapper_noavx2.o ggml-repack.o kcpp-repackmapper_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o -OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants.o kcpp-quantmapper_noavx1.o ggml-repack.o kcpp-repackmapper_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o -OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants.o kcpp-quantmapper_failsafe.o ggml-repack.o kcpp-repackmapper_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o +OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o kcpp-quantmapper.o ggml-repack.o kcpp-repackmapper.o unicode.o 
unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o mtmdaudio.o +OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants.o kcpp-quantmapper_noavx2.o ggml-repack.o kcpp-repackmapper_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o mtmdaudio.o +OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants.o kcpp-quantmapper_noavx1.o ggml-repack.o kcpp-repackmapper_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o mtmdaudio.o +OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants.o kcpp-quantmapper_failsafe.o ggml-repack.o kcpp-repackmapper_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o mtmdaudio.o # OS specific ifeq ($(UNAME_S),Linux) @@ -566,6 +566,8 @@ gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h $(CXX) $(CXXFLAGS) -c $< -o $@ kcpputils.o: otherarch/utils.cpp otherarch/utils.h $(CXX) $(CXXFLAGS) -c $< -o $@ +mtmdaudio.o: tools/mtmd/mtmd-audio.cpp tools/mtmd/mtmd-audio.h + $(CXX) $(CXXFLAGS) -c $< -o $@ #these have special gpu defines ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h diff --git a/expose.h b/expose.h index cafa4bf90..e968d0a95 100644 --- a/expose.h +++ b/expose.h @@ -3,6 +3,7 @@ const int tensor_split_max = 16; const int images_max = 8; +const int audio_max = 4; const int logprobs_max = 5; // match kobold's sampler list and order @@ -83,6 +84,7 @@ struct generation_inputs const char * negative_prompt = nullptr; const float guidance_scale = 1; const char * images[images_max] = {}; + const char * audio[audio_max] = {}; const int max_context_length = 0; const int max_length = 0; const float temperature = 0.0f; diff --git 
a/gpttype_adapter.cpp b/gpttype_adapter.cpp index eabc7d0c4..20ca0483a 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -42,12 +42,13 @@ #include "mpt_v3.cpp" #include "tools/mtmd/clip.h" #include "tools/mtmd/llava.h" +#include "tools/mtmd/mtmd-audio.h" #include "common/common.h" //const const int extra_context_handle_fragmentation = 128; -const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes -const int LLAVA_TOKEN_IDENTIFIER_B = -999; +const int MEDIA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes +const int MEDIA_TOKEN_IDENTIFIER_B = -999; //shared std::string executable_path = ""; @@ -100,12 +101,14 @@ static llama_context * llama_ctx_v4 = nullptr; static llama_context * draft_ctx = nullptr; //will remain null if speculative is unused static llama_context * guidance_ctx = nullptr; //for classifier free guidance, will be null if unused -static clip_ctx * clp_ctx = nullptr; //for llava +static clip_ctx * clp_ctx_v = nullptr; //for llava static clip_image_u8 * clp_img_data = nullptr; //most recent image -static std::vector llava_images; -static std::vector last_llava_mem; //for storing dummy tokens that will be consumed by llava -static std::string llava_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache -static int current_llava_identifier = LLAVA_TOKEN_IDENTIFIER_A; +static clip_ctx * clp_ctx_a = nullptr; //for audio multimodal +static whisper_preprocessor::whisper_filters w_filters; //for audio processing +static std::vector media_objects; +static std::vector last_media_mem; //for storing dummy tokens that will be consumed by llava +static std::string media_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache +static int current_media_identifier = MEDIA_TOKEN_IDENTIFIER_A; static int vision_max_res = 2048; static kcpp_params * kcpp_data = nullptr; @@ 
-1803,8 +1806,8 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num n_eval = n_batch; } float * embd = img_embd+i*n_embd; - kcpp_embd_batch llava_batch = kcpp_embd_batch(embd, n_eval, *n_past, use_mrope); - if (llama_decode(ctx_llama, llava_batch.batch)) { + kcpp_embd_batch media_batch = kcpp_embd_batch(embd, n_eval, *n_past, use_mrope); + if (llama_decode(ctx_llama, media_batch.batch)) { fprintf(stderr, "\n%s : failed to eval image\n", __func__); return false; } @@ -2431,17 +2434,32 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in /* use_gpu */ true, /* verbosity */ static_cast(1), }); - clp_ctx = cres.ctx_v; - if(clp_ctx == nullptr) { + clp_ctx_v = cres.ctx_v; + clp_ctx_a = cres.ctx_a; + if(clp_ctx_v == nullptr && clp_ctx_a == nullptr) { fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__); return ModelLoadResult::FAIL; } - const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); + const int n_embd_clip = clip_n_mmproj_embd(clp_ctx_v ? clp_ctx_v : clp_ctx_a); //audio-only mmproj leaves clp_ctx_v null - avoid null deref const int n_embd_llm = llama_n_embd(llamamodel); + if (clp_ctx_v && clp_ctx_a) { + int n_embd_a = clip_n_mmproj_embd(clp_ctx_a); + if (n_embd_clip != n_embd_a) { + fprintf(stderr, "%s: mmproj embedding mismatch between Audio and Vision (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_a); + return ModelLoadResult::FAIL; + } + } if (n_embd_clip != n_embd_llm) { fprintf(stderr, "%s: mmproj embedding mismatch (%d and %d)! 
Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_llm); return ModelLoadResult::FAIL; } + if(clp_ctx_a) //init audio + { + if (clip_has_whisper_encoder(clp_ctx_a)) { + // TODO @ngxson : check if model n_mel is 128 or 80 + w_filters = whisper_precalc_filters::get_128_bins(); + } + } clp_img_data = clip_image_u8_init(); } @@ -2454,7 +2472,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in { printf("Error: Speculative decoding cannot be used with Recurrent models!\n"); } - else if(clp_ctx!=nullptr) + else if(clp_ctx_v!=nullptr) { printf("Error: Speculative decoding cannot be used with multimodal vision projectors!\n"); } @@ -2988,54 +3006,113 @@ int GetThreadsToUse(bool blasmode) } //this function prepares the clip embds for llava. it's only needed when images change -static void PrepareLlavaEmbds(const int nctx, const std::vector & llava_sep, const std::vector & llava_intro) +static void PrepareMediaEmbds(const int nctx, const std::vector & media_sep, const std::vector & media_intro) { - if(clp_ctx!=nullptr && clp_img_data!=nullptr) + bool vision_on = (clp_ctx_v != nullptr && clp_img_data != nullptr); + bool audio_on = (clp_ctx_a != nullptr); + if (vision_on || audio_on) { - int sepsize = llava_sep.size(); - int introsize = llava_intro.size(); - last_llava_mem.clear(); + int sepsize = media_sep.size(); + int introsize = media_intro.size(); + last_media_mem.clear(); - for(int i=0;i image_buffer = kcpp_base64_decode(llava_image); - if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), clp_img_data, vision_max_res)) + std::string media_obj = media_objects[i].b64data; + const std::vector media_data_buffer = kcpp_base64_decode(media_obj); + if(!media_objects[i].is_audio && vision_on) { - //failed to load image - printf("\nError: Clip image %d failed to load!",i); - } - else - { - if(debugmode==1 && !is_quiet) + //images + if (!clip_image_load_from_bytes(media_data_buffer.data(), 
media_data_buffer.size(), clp_img_data, vision_max_res)) { - printf("\nCreating clip image embed..."); + //failed to load image + printf("\nError: Clip image %d failed to load!",i); } - llava_images[i].clp_image_tokens = 0; - if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) { - printf("\nError: Clip image %d failed to create embd!",i); - } - if(debugmode==1 && !is_quiet) + else { - printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens); + if(debugmode==1 && !is_quiet) + { + printf("\nCreating clip image embed..."); + } + media_chunk chunk; + if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens)) { + printf("\nError: Clip image %d failed to create embd!",i); + } + if(debugmode==1 && !is_quiet) + { + printf("\nVision Clip Embed %i used Tokens: %d",i,chunk.clp_image_tokens); + } + int cliptokensneeded = chunk.clp_image_tokens; + if(cliptokensneeded>0 && cliptokensneeded < nctx) + { + int tokcnt = (i==0?(chunk.clp_image_tokens):(chunk.clp_image_tokens+sepsize)); + if(i==0) + { + tokcnt += introsize; + } + for(int n=0;n pcmf32; + bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), 16000, pcmf32); + if (!ok) { + printf("\nError: Clip audio %d failed to convert!",i); + continue; + } + + std::vector mel_spec_chunks; + ok = whisper_preprocessor::preprocess_audio(pcmf32.data(), pcmf32.size(), w_filters, mel_spec_chunks); + if (!ok) { + printf("\nError: Clip audio %d failed to load!",i); + continue; + } + + // consider each mel_spec as a separate audio chunk + int total_chunk_tokens = 0; + for (auto & mel_spec : mel_spec_chunks) { + media_chunk chunk; + bool ok = audio_embd_make_with_clip_img(clp_ctx_a, kcpp_data->n_threads, mel_spec, &chunk.clp_img_embd, &chunk.clp_image_tokens); + if (!ok) { + printf("\nError: Clip audio chunk 
in %d failed to make embd!",i); + } else { + if(debugmode==1 && !is_quiet) + { + printf("\nAudio Clip Embed Chunk %i used Tokens: %d",i,chunk.clp_image_tokens); + } + total_chunk_tokens += chunk.clp_image_tokens; + media_objects[i].mediachunks.push_back(chunk); + } + } + int cliptokensneeded = total_chunk_tokens; if(cliptokensneeded>0 && cliptokensneeded < nctx) { - int tokcnt = (i==0?(llava_images[i].clp_image_tokens):(llava_images[i].clp_image_tokens+sepsize)); + int tokcnt = (i==0?(cliptokensneeded):(cliptokensneeded+sepsize)); if(i==0) { tokcnt += introsize; } for(int n=0;nprompt = inputs.prompt; @@ -3373,26 +3470,26 @@ generation_outputs gpttype_generate(const generation_inputs inputs) // tokenize the prompt std::vector embd_inp; std::vector embd_inp_mem; //for storing added memory - std::vector llava_sep; //to separate between different llava images - std::vector llava_intro; //to separate between different llava images + std::vector media_sep; //to separate between different llava images + std::vector media_intro; //to separate between different llava images std::vector guidance_embd; //holds the guidance prompt - bool llava_embds_built = false; + bool media_embds_built = false; int32_t nctx = kcpp_data->n_ctx; TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token); bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL); - TokenizeString("\n\n", llava_sep, file_format, false); - TokenizeString("\nImages:\n", llava_intro, file_format, false); + TokenizeString("\n\n", media_sep, file_format, false); + TokenizeString("\nImages:\n", media_intro, file_format, false); - if(llava_composite_image_signature=="") + if(media_composite_image_signature=="") { - last_llava_mem.clear(); + last_media_mem.clear(); } - if(llava_images_changed) + if(media_data_changed) { - PrepareLlavaEmbds(nctx, llava_sep, llava_intro); - llava_embds_built = true; + PrepareMediaEmbds(nctx, media_sep, 
media_intro); + media_embds_built = true; } if(addedmemory!="") @@ -3415,9 +3512,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs) } } - if(last_llava_mem.size()>0) //stick the llava mem before the added mem + if(last_media_mem.size()>0) //stick the llava mem before the added mem { - if(last_llava_mem.size() + kcpp_data->n_predict + 4 > nctx) + if(last_media_mem.size() + kcpp_data->n_predict + 4 > nctx) { printf("\nWarning: Too many LLaVA tokens, max context exceeded! They will be ignored!\n"); } @@ -3433,7 +3530,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs) } //append llava dummy tokens - embd_inp_mem.insert(embd_inp_mem.begin(), last_llava_mem.begin(), last_llava_mem.end()); + embd_inp_mem.insert(embd_inp_mem.begin(), last_media_mem.begin(), last_media_mem.end()); if (bos.size() > 0 && embd_inp_mem.size() > 0) { embd_inp_mem.insert(embd_inp_mem.begin(), bos[0]); //insert bos at front @@ -4159,12 +4256,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs) while ((int)embd_inp.size() > input_consumed) { int currtoken = embd_inp[input_consumed]; - if(currtoken==LLAVA_TOKEN_IDENTIFIER_A || currtoken==LLAVA_TOKEN_IDENTIFIER_B) //special llava token hit + if(currtoken==MEDIA_TOKEN_IDENTIFIER_A || currtoken==MEDIA_TOKEN_IDENTIFIER_B) //special llava token hit { - if(!llava_embds_built) //this should never happen! however, handle it anyway + if(!media_embds_built) //this should never happen! 
however, handle it anyway { - PrepareLlavaEmbds(nctx, llava_sep, llava_intro); - llava_embds_built = true; + PrepareMediaEmbds(nctx, media_sep, media_intro); + media_embds_built = true; printf("\nSomehow vision embd was not prepared (maybe no fast forward), rebuilding it...\n"); } @@ -4178,9 +4275,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs) //batch is empty, do image processing int llavatokenscounted = 0; int llavatokensevaled = 0; - int sepsize = llava_sep.size(); - int introsize = llava_intro.size(); - while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_B)) + int sepsize = media_sep.size(); + int introsize = media_intro.size(); + while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_B)) { if (!last_n_tokens.empty()) { @@ -4191,13 +4288,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs) ++input_consumed; ++llavatokenscounted; } - for(int i=0;i0 && i==0) { //added at the start of everything - kcpp_embd_batch batch = kcpp_embd_batch(llava_intro, n_past, use_mrope, false); + kcpp_embd_batch batch = kcpp_embd_batch(media_intro, n_past, use_mrope, false); auto evr = llama_decode(llama_ctx_v4, batch.batch); if(evr!=0) { @@ -4213,41 +4310,45 @@ generation_outputs gpttype_generate(const generation_inputs inputs) if(sepsize>0 && i>0) { //add a separator between each image - kcpp_embd_batch batch = kcpp_embd_batch(llava_sep, n_past, use_mrope, false); + kcpp_embd_batch batch = kcpp_embd_batch(media_sep, n_past, use_mrope, false); auto evr = llama_decode(llama_ctx_v4, batch.batch); if(evr!=0) { - printf("\nError when appending llava separator: %d\n",evr); + printf("\nError when appending media separator: %d\n",evr); } else { - printf("\rProcessing LLaVa Separator (%d tokens)",sepsize); + printf("\rProcessing Media Separator (%d 
tokens)",sepsize); } n_past += sepsize; llavatokensevaled += sepsize; } - if(allow_regular_prints) + for(int j=0;jn_batch,&n_past); - llavatokensevaled += llava_images[i].clp_image_tokens; - if(!err) - { - llava_composite_image_signature = ""; //force invalidate - fprintf(stderr, "\nFailed to eval llava image at %d!\n",n_past); - output.text = nullptr; - output.status = 0; - output.prompt_tokens = output.completion_tokens = 0; - output.stopreason = stop_reason::INVALID; - generation_finished = true; - return output; + media_chunk chunk = media_objects[i].mediachunks[j]; + if(allow_regular_prints) + { + printf("\rProcessing Media Embedding %d (%d tokens)",(i+1), chunk.clp_image_tokens); + } + bool err = kcpp_eval_image(llama_ctx_v4,chunk.clp_img_embd,chunk.clp_image_tokens,kcpp_data->n_batch,&n_past); + llavatokensevaled += chunk.clp_image_tokens; + if(!err) + { + media_composite_image_signature = ""; //force invalidate + fprintf(stderr, "\nFailed to eval llava image at %d!\n",n_past); + output.text = nullptr; + output.status = 0; + output.prompt_tokens = output.completion_tokens = 0; + output.stopreason = stop_reason::INVALID; + generation_finished = true; + return output; + } } } if(llavatokenscounted!=llavatokensevaled) { - llava_composite_image_signature = ""; //force invalidate + media_composite_image_signature = ""; //force invalidate fprintf(stderr, "\nLLAVA image tokens mismatch at %d! (%d vs %d tokens)\n",n_past,llavatokenscounted,llavatokensevaled); output.text = nullptr; output.status = 0; diff --git a/klite.embd b/klite.embd index a556364e4..36c8239ed 100644 --- a/klite.embd +++ b/klite.embd @@ -17874,9 +17874,9 @@ Current version indicated by LITEVER below. 
{ return render_audio_html(data); } - else if(data.startsWith("data:image")) + else //also handles ALL pending items { - return render_image_html(data, pend_txt, siclass) + return render_image_html(data, pend_txt, siclass); } return ""; } diff --git a/koboldcpp.py b/koboldcpp.py index b2241797c..ab858ea53 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -44,6 +44,7 @@ import subprocess sampler_order_max = 7 tensor_split_max = 16 images_max = 8 +audio_max = 4 bias_min_value = -100.0 bias_max_value = 100.0 logprobs_max = 5 @@ -215,6 +216,7 @@ class generation_inputs(ctypes.Structure): ("negative_prompt", ctypes.c_char_p), ("guidance_scale", ctypes.c_float), ("images", ctypes.c_char_p * images_max), + ("audio", ctypes.c_char_p * audio_max), ("max_context_length", ctypes.c_int), ("max_length", ctypes.c_int), ("temperature", ctypes.c_float), @@ -1402,6 +1404,7 @@ def generate(genparams, stream_flag=False): negative_prompt = genparams.get('negative_prompt', "") guidance_scale = tryparsefloat(genparams.get('guidance_scale', 1.0),1.0) images = genparams.get('images', []) + audio = genparams.get('audio', []) max_context_length = tryparseint(genparams.get('max_context_length', maxctx),maxctx) max_length = tryparseint(genparams.get('max_length', args.defaultgenamt),args.defaultgenamt) temperature = tryparsefloat(genparams.get('temperature', adapter_obj.get("temperature", 0.75)),0.75) @@ -1468,6 +1471,11 @@ def generate(genparams, stream_flag=False): inputs.images[n] = "".encode("UTF-8") else: inputs.images[n] = images[n].encode("UTF-8") + for n in range(audio_max): + if not audio or n >= len(audio): + inputs.audio[n] = "".encode("UTF-8") + else: + inputs.audio[n] = audio[n].encode("UTF-8") global showmaxctxwarning if max_context_length > maxctx: if showmaxctxwarning: diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index be7d04da6..456fd808e 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -502,11 +502,16 @@ struct mpt_model { std::map tensors; }; 
-struct llava_image +struct media_chunk +{ + int32_t clp_image_tokens = 0; //holds number of tokens llava used in this chunk + float * clp_img_embd = nullptr; //this holds dynamic memory and must be freed each use! +}; +struct media_object { std::string b64data = ""; - int32_t clp_image_tokens = 0; //holds number of tokens llava used - float * clp_img_embd = nullptr; //this holds dynamic memory and must be freed each use! + std::vector mediachunks; + bool is_audio = false; //if true its audio, otherwise its vision }; struct speculative_draft_result diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 897b5243a..7e7ed1ef2 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -11,6 +11,17 @@ #include #include +#define MINIAUDIO_IMPLEMENTATION +#ifndef MTMD_AUDIO_DEBUG +# define MA_NO_ENCODING +#endif +#define MA_NO_DEVICE_IO +#define MA_NO_RESOURCE_MANAGER +#define MA_NO_NODE_GRAPH +#define MA_NO_ENGINE +#define MA_NO_GENERATION +#define MA_API static +#include "miniaudio/miniaudio.h" void utreplace(std::string & str, const std::string & needle, const std::string & replacement) { size_t pos = 0; @@ -501,47 +512,47 @@ kcpp_embd_batch::kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, kcpp_embd_batch::kcpp_embd_batch(std::vector & tokens, int32_t npast, bool use_mrope, bool return_all_logits) { - int32_t seq_id = 0; - int32_t n_tokens = tokens.size(); - pos.resize(n_tokens * (use_mrope?4:1)); - std::fill(pos.begin(), pos.end(), 0); - n_seq_id.resize(n_tokens); - seq_ids.resize(n_tokens + 1); - logits.resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids[n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ tokens.data(), - /*embd =*/ nullptr, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; + int32_t seq_id = 0; + int32_t n_tokens = tokens.size(); + pos.resize(n_tokens * (use_mrope?4:1)); + std::fill(pos.begin(), pos.end(), 
0); + n_seq_id.resize(n_tokens); + seq_ids.resize(n_tokens + 1); + logits.resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids[n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ tokens.data(), + /*embd =*/ nullptr, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; - if(!use_mrope) - { - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = npast + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = (return_all_logits?true:false); - } + if(!use_mrope) + { + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = npast + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = (return_all_logits?true:false); } - else - { - for (int i = 0; i < n_tokens; i++) { - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = (return_all_logits?true:false); - } - for (int j = 0; j < batch.n_tokens * 3; j++) { - batch.pos[j] = npast + (j % batch.n_tokens); - } + } + else + { + for (int i = 0; i < n_tokens; i++) { + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = (return_all_logits?true:false); } - batch.logits[n_tokens - 1] = true; + for (int j = 0; j < batch.n_tokens * 3; j++) { + batch.pos[j] = npast + (j % batch.n_tokens); + } + } + batch.logits[n_tokens - 1] = true; } std::vector split_string(const std::string& input, const std::string& separator) { @@ -559,4 +570,59 @@ std::vector split_string(const std::string& input, const std::strin result.push_back(input.substr(start)); return result; +} + + +static bool buf_is_audio_file(const char * buf, size_t len) { + if (len < 12) { + return false; + } + + // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format + // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0; 
+ bool is_mp3 = len >= 3 && ( + memcmp(buf, "ID3", 3) == 0 || + // Check for MPEG sync word (simplified check) + ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0) + ); + bool is_flac = memcmp(buf, "fLaC", 4) == 0; + + return is_wav || is_mp3 || is_flac; +} + +// returns true if the buffer is a valid audio file +bool kcpp_decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector & pcmf32_mono) { + if (!buf_is_audio_file((const char *)buf_in, len)) + { + return false; + } + + ma_result result; + const int channels = 1; + ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate); + ma_decoder decoder; + + result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder); + if (result != MA_SUCCESS) { + return false; + } + + ma_uint64 frame_count; + ma_uint64 frames_read; + result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count); + if (result != MA_SUCCESS) { + ma_decoder_uninit(&decoder); + return false; + } + + pcmf32_mono.resize(frame_count); + result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read); + if (result != MA_SUCCESS) { + ma_decoder_uninit(&decoder); + return false; + } + + ma_decoder_uninit(&decoder); + return true; } \ No newline at end of file diff --git a/otherarch/utils.h b/otherarch/utils.h index 83aa7648f..4cfb829f1 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -66,6 +66,7 @@ std::vector resample_wav(const std::vector& input, uint32_t input_ int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng); std::vector split_string(const std::string& input, const std::string& separator); +bool kcpp_decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector & pcmf32_mono); struct kcpp_embd_batch { //duplcated from llava_embd_batch 
std::vector pos; diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp index a50552d48..dbd7e64a8 100644 --- a/tools/mtmd/llava.cpp +++ b/tools/mtmd/llava.cpp @@ -1,5 +1,7 @@ #include "clip.h" +#include "clip-impl.h" #include "llava.h" +#include "mtmd-audio.h" #include "llama.h" #include "ggml-cpp.h" @@ -13,35 +15,6 @@ #include #include -#if defined(LLAVA_LOG_OFF) -# define LOG_INF(...) -# define LOG_WRN(...) -# define LOG_ERR(...) -# define LOG_DBG(...) -#else // defined(LLAVA_LOG_OFF) -# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -#endif // defined(LLAVA_LOG_OFF) - -// RGB uint8 image -struct clip_image_u8 { - int nx; - int ny; - - std::vector buf; -}; - -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... -struct clip_image_f32 { - int nx; - int ny; - - std::vector buf; -}; - struct clip_image_grid_shape { int first; int second; @@ -53,11 +26,6 @@ struct clip_image_f32_batch_deleter { }; typedef std::unique_ptr clip_image_f32_batch_ptr; -struct clip_image_size_deleter { - void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); } -}; -typedef std::unique_ptr clip_image_size_ptr; - /** * Selects the best resolution from a list of possible resolutions based on the original size. 
* @@ -471,3 +439,28 @@ void llava_image_embed_free(struct llava_image_embed * embed) { free(embed->embed); free(embed); } + +//kcpp helper function +bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whisper_preprocessor::whisper_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out) +{ + clip_image_f32_ptr mel_f32(clip_image_f32_init()); + mel_f32->nx = mel_spec.n_len; + mel_f32->ny = mel_spec.n_mel; + mel_f32->buf = mel_spec.data; //mel_spec is const, so this is a copy either way - std::move here was misleading + size_t n_tokens = clip_n_output_tokens(ctx_clip, mel_f32.get()); + + clip_image_f32_batch batch_f32; + batch_f32.is_audio = true; + batch_f32.entries.push_back(std::move(mel_f32)); + + int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); + float * audio_embd = (float *)malloc(n_tokens * n_mmproj_embd * sizeof(float)); //must scale by element size or the encoder overruns the buffer + bool ok = clip_image_batch_encode( + ctx_clip, + n_threads, + &batch_f32, + audio_embd); + *image_embd_out = audio_embd; + *n_img_pos_out = n_tokens; + return ok; //true = success, matching the bool signature and the callers' if(!ok) checks +} \ No newline at end of file diff --git a/tools/mtmd/llava.h b/tools/mtmd/llava.h index b6feb3027..05b97ebb3 100644 --- a/tools/mtmd/llava.h +++ b/tools/mtmd/llava.h @@ -26,6 +26,9 @@ struct llava_image_embed { float * embed; int n_image_pos; }; +namespace whisper_preprocessor { + struct whisper_mel; +} /** sanity check for clip <-> llava embed size match */ LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip); @@ -42,6 +45,9 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); +LLAVA_API bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whisper_preprocessor::whisper_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out); + + #ifdef __cplusplus } #endif