mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
wip2 (+1 squashed commits)
Squashed commits: [4628777b6] wip
This commit is contained in:
parent
8cd72ea924
commit
e9473305d0
11 changed files with 362 additions and 176 deletions
|
@ -449,7 +449,9 @@ add_library(common2
|
||||||
src/unicode.cpp
|
src/unicode.cpp
|
||||||
src/unicode-data.cpp
|
src/unicode-data.cpp
|
||||||
otherarch/utils.cpp
|
otherarch/utils.cpp
|
||||||
otherarch/utils.h)
|
otherarch/utils.h
|
||||||
|
tools/mtmd/mtmd-audio.cpp
|
||||||
|
tools/mtmd/mtmd-audio.h)
|
||||||
target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
|
||||||
target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
|
target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
|
||||||
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
|
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
|
||||||
|
|
10
Makefile
10
Makefile
|
@ -90,10 +90,10 @@ endif
|
||||||
CUBLASLD_FLAGS =
|
CUBLASLD_FLAGS =
|
||||||
CUBLAS_OBJS =
|
CUBLAS_OBJS =
|
||||||
|
|
||||||
OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o kcpp-quantmapper.o ggml-repack.o kcpp-repackmapper.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o
|
OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o kcpp-quantmapper.o ggml-repack.o kcpp-repackmapper.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o kcpputils.o mtmdaudio.o
|
||||||
OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants.o kcpp-quantmapper_noavx2.o ggml-repack.o kcpp-repackmapper_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o
|
OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants.o kcpp-quantmapper_noavx2.o ggml-repack.o kcpp-repackmapper_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o kcpputils.o mtmdaudio.o
|
||||||
OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants.o kcpp-quantmapper_noavx1.o ggml-repack.o kcpp-repackmapper_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o
|
OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants.o kcpp-quantmapper_noavx1.o ggml-repack.o kcpp-repackmapper_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o kcpputils.o mtmdaudio.o
|
||||||
OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants.o kcpp-quantmapper_failsafe.o ggml-repack.o kcpp-repackmapper_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o
|
OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants.o kcpp-quantmapper_failsafe.o ggml-repack.o kcpp-repackmapper_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o kcpputils.o mtmdaudio.o
|
||||||
|
|
||||||
# OS specific
|
# OS specific
|
||||||
ifeq ($(UNAME_S),Linux)
|
ifeq ($(UNAME_S),Linux)
|
||||||
|
@ -566,6 +566,8 @@ gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
kcpputils.o: otherarch/utils.cpp otherarch/utils.h
|
kcpputils.o: otherarch/utils.cpp otherarch/utils.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
mtmdaudio.o: tools/mtmd/mtmd-audio.cpp tools/mtmd/mtmd-audio.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
#these have special gpu defines
|
#these have special gpu defines
|
||||||
ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
|
ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h
|
||||||
|
|
2
expose.h
2
expose.h
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
const int tensor_split_max = 16;
|
const int tensor_split_max = 16;
|
||||||
const int images_max = 8;
|
const int images_max = 8;
|
||||||
|
const int audio_max = 4;
|
||||||
const int logprobs_max = 5;
|
const int logprobs_max = 5;
|
||||||
|
|
||||||
// match kobold's sampler list and order
|
// match kobold's sampler list and order
|
||||||
|
@ -83,6 +84,7 @@ struct generation_inputs
|
||||||
const char * negative_prompt = nullptr;
|
const char * negative_prompt = nullptr;
|
||||||
const float guidance_scale = 1;
|
const float guidance_scale = 1;
|
||||||
const char * images[images_max] = {};
|
const char * images[images_max] = {};
|
||||||
|
const char * audio[audio_max] = {};
|
||||||
const int max_context_length = 0;
|
const int max_context_length = 0;
|
||||||
const int max_length = 0;
|
const int max_length = 0;
|
||||||
const float temperature = 0.0f;
|
const float temperature = 0.0f;
|
||||||
|
|
|
@ -42,12 +42,13 @@
|
||||||
#include "mpt_v3.cpp"
|
#include "mpt_v3.cpp"
|
||||||
#include "tools/mtmd/clip.h"
|
#include "tools/mtmd/clip.h"
|
||||||
#include "tools/mtmd/llava.h"
|
#include "tools/mtmd/llava.h"
|
||||||
|
#include "tools/mtmd/mtmd-audio.h"
|
||||||
#include "common/common.h"
|
#include "common/common.h"
|
||||||
|
|
||||||
//const
|
//const
|
||||||
const int extra_context_handle_fragmentation = 128;
|
const int extra_context_handle_fragmentation = 128;
|
||||||
const int LLAVA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
|
const int MEDIA_TOKEN_IDENTIFIER_A = -998; //alternate between both, changing when image changes
|
||||||
const int LLAVA_TOKEN_IDENTIFIER_B = -999;
|
const int MEDIA_TOKEN_IDENTIFIER_B = -999;
|
||||||
|
|
||||||
//shared
|
//shared
|
||||||
std::string executable_path = "";
|
std::string executable_path = "";
|
||||||
|
@ -100,12 +101,14 @@ static llama_context * llama_ctx_v4 = nullptr;
|
||||||
static llama_context * draft_ctx = nullptr; //will remain null if speculative is unused
|
static llama_context * draft_ctx = nullptr; //will remain null if speculative is unused
|
||||||
static llama_context * guidance_ctx = nullptr; //for classifier free guidance, will be null if unused
|
static llama_context * guidance_ctx = nullptr; //for classifier free guidance, will be null if unused
|
||||||
|
|
||||||
static clip_ctx * clp_ctx = nullptr; //for llava
|
static clip_ctx * clp_ctx_v = nullptr; //for llava
|
||||||
static clip_image_u8 * clp_img_data = nullptr; //most recent image
|
static clip_image_u8 * clp_img_data = nullptr; //most recent image
|
||||||
static std::vector<llava_image> llava_images;
|
static clip_ctx * clp_ctx_a = nullptr; //for audio multimodal
|
||||||
static std::vector<int> last_llava_mem; //for storing dummy tokens that will be consumed by llava
|
static whisper_preprocessor::whisper_filters w_filters; //for audio processing
|
||||||
static std::string llava_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
|
static std::vector<media_object> media_objects;
|
||||||
static int current_llava_identifier = LLAVA_TOKEN_IDENTIFIER_A;
|
static std::vector<int> last_media_mem; //for storing dummy tokens that will be consumed by llava
|
||||||
|
static std::string media_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
|
||||||
|
static int current_media_identifier = MEDIA_TOKEN_IDENTIFIER_A;
|
||||||
static int vision_max_res = 2048;
|
static int vision_max_res = 2048;
|
||||||
|
|
||||||
static kcpp_params * kcpp_data = nullptr;
|
static kcpp_params * kcpp_data = nullptr;
|
||||||
|
@ -1803,8 +1806,8 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num
|
||||||
n_eval = n_batch;
|
n_eval = n_batch;
|
||||||
}
|
}
|
||||||
float * embd = img_embd+i*n_embd;
|
float * embd = img_embd+i*n_embd;
|
||||||
kcpp_embd_batch llava_batch = kcpp_embd_batch(embd, n_eval, *n_past, use_mrope);
|
kcpp_embd_batch media_batch = kcpp_embd_batch(embd, n_eval, *n_past, use_mrope);
|
||||||
if (llama_decode(ctx_llama, llava_batch.batch)) {
|
if (llama_decode(ctx_llama, media_batch.batch)) {
|
||||||
fprintf(stderr, "\n%s : failed to eval image\n", __func__);
|
fprintf(stderr, "\n%s : failed to eval image\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -2431,17 +2434,32 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
/* use_gpu */ true,
|
/* use_gpu */ true,
|
||||||
/* verbosity */ static_cast<ggml_log_level>(1),
|
/* verbosity */ static_cast<ggml_log_level>(1),
|
||||||
});
|
});
|
||||||
clp_ctx = cres.ctx_v;
|
clp_ctx_v = cres.ctx_v;
|
||||||
if(clp_ctx == nullptr) {
|
clp_ctx_a = cres.ctx_a;
|
||||||
|
if(clp_ctx_v == nullptr && clp_ctx_a == nullptr) {
|
||||||
fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);
|
fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);
|
||||||
return ModelLoadResult::FAIL;
|
return ModelLoadResult::FAIL;
|
||||||
}
|
}
|
||||||
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
|
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx_v);
|
||||||
const int n_embd_llm = llama_n_embd(llamamodel);
|
const int n_embd_llm = llama_n_embd(llamamodel);
|
||||||
|
if (clp_ctx_v && clp_ctx_a) {
|
||||||
|
int n_embd_a = clip_n_mmproj_embd(clp_ctx_a);
|
||||||
|
if (n_embd_clip != n_embd_a) {
|
||||||
|
fprintf(stderr, "%s: mmproj embedding mismatch between Audio and Vision (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_a);
|
||||||
|
return ModelLoadResult::FAIL;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (n_embd_clip != n_embd_llm) {
|
if (n_embd_clip != n_embd_llm) {
|
||||||
fprintf(stderr, "%s: mmproj embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_llm);
|
fprintf(stderr, "%s: mmproj embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_llm);
|
||||||
return ModelLoadResult::FAIL;
|
return ModelLoadResult::FAIL;
|
||||||
}
|
}
|
||||||
|
if(clp_ctx_a) //init audio
|
||||||
|
{
|
||||||
|
if (clip_has_whisper_encoder(clp_ctx_a)) {
|
||||||
|
// TODO @ngxson : check if model n_mel is 128 or 80
|
||||||
|
w_filters = whisper_precalc_filters::get_128_bins();
|
||||||
|
}
|
||||||
|
}
|
||||||
clp_img_data = clip_image_u8_init();
|
clp_img_data = clip_image_u8_init();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2454,7 +2472,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
||||||
{
|
{
|
||||||
printf("Error: Speculative decoding cannot be used with Recurrent models!\n");
|
printf("Error: Speculative decoding cannot be used with Recurrent models!\n");
|
||||||
}
|
}
|
||||||
else if(clp_ctx!=nullptr)
|
else if(clp_ctx_v!=nullptr)
|
||||||
{
|
{
|
||||||
printf("Error: Speculative decoding cannot be used with multimodal vision projectors!\n");
|
printf("Error: Speculative decoding cannot be used with multimodal vision projectors!\n");
|
||||||
}
|
}
|
||||||
|
@ -2988,54 +3006,113 @@ int GetThreadsToUse(bool blasmode)
|
||||||
}
|
}
|
||||||
|
|
||||||
//this function prepares the clip embds for llava. it's only needed when images change
|
//this function prepares the clip embds for llava. it's only needed when images change
|
||||||
static void PrepareLlavaEmbds(const int nctx, const std::vector<int> & llava_sep, const std::vector<int> & llava_intro)
|
static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep, const std::vector<int> & media_intro)
|
||||||
{
|
{
|
||||||
if(clp_ctx!=nullptr && clp_img_data!=nullptr)
|
bool vision_on = (clp_ctx_v != nullptr && clp_img_data != nullptr);
|
||||||
|
bool audio_on = (clp_ctx_a != nullptr);
|
||||||
|
if (vision_on || audio_on)
|
||||||
{
|
{
|
||||||
int sepsize = llava_sep.size();
|
int sepsize = media_sep.size();
|
||||||
int introsize = llava_intro.size();
|
int introsize = media_intro.size();
|
||||||
last_llava_mem.clear();
|
last_media_mem.clear();
|
||||||
|
|
||||||
for(int i=0;i<llava_images.size();++i)
|
for(int i=0;i<media_objects.size();++i)
|
||||||
{
|
{
|
||||||
std::string llava_image = llava_images[i].b64data;
|
std::string media_obj = media_objects[i].b64data;
|
||||||
const std::vector<uint8_t> image_buffer = kcpp_base64_decode(llava_image);
|
const std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(media_obj);
|
||||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), clp_img_data, vision_max_res))
|
if(!media_objects[i].is_audio && vision_on)
|
||||||
{
|
{
|
||||||
//failed to load image
|
//images
|
||||||
printf("\nError: Clip image %d failed to load!",i);
|
if (!clip_image_load_from_bytes(media_data_buffer.data(), media_data_buffer.size(), clp_img_data, vision_max_res))
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if(debugmode==1 && !is_quiet)
|
|
||||||
{
|
{
|
||||||
printf("\nCreating clip image embed...");
|
//failed to load image
|
||||||
|
printf("\nError: Clip image %d failed to load!",i);
|
||||||
}
|
}
|
||||||
llava_images[i].clp_image_tokens = 0;
|
else
|
||||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
|
|
||||||
printf("\nError: Clip image %d failed to create embd!",i);
|
|
||||||
}
|
|
||||||
if(debugmode==1 && !is_quiet)
|
|
||||||
{
|
{
|
||||||
printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
|
if(debugmode==1 && !is_quiet)
|
||||||
|
{
|
||||||
|
printf("\nCreating clip image embed...");
|
||||||
|
}
|
||||||
|
media_chunk chunk;
|
||||||
|
if (!llava_image_embed_make_with_clip_img(clp_ctx_v, kcpp_data->n_threads, clp_img_data, &chunk.clp_img_embd, &chunk.clp_image_tokens)) {
|
||||||
|
printf("\nError: Clip image %d failed to create embd!",i);
|
||||||
|
}
|
||||||
|
if(debugmode==1 && !is_quiet)
|
||||||
|
{
|
||||||
|
printf("\nVision Clip Embed %i used Tokens: %d",i,chunk.clp_image_tokens);
|
||||||
|
}
|
||||||
|
int cliptokensneeded = chunk.clp_image_tokens;
|
||||||
|
if(cliptokensneeded>0 && cliptokensneeded < nctx)
|
||||||
|
{
|
||||||
|
int tokcnt = (i==0?(chunk.clp_image_tokens):(chunk.clp_image_tokens+sepsize));
|
||||||
|
if(i==0)
|
||||||
|
{
|
||||||
|
tokcnt += introsize;
|
||||||
|
}
|
||||||
|
for(int n=0;n<tokcnt;++n)
|
||||||
|
{
|
||||||
|
last_media_mem.push_back(current_media_identifier);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
printf("\nWarning: LLAVA Image excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded);
|
||||||
|
}
|
||||||
|
media_objects[i].mediachunks.push_back(chunk);
|
||||||
}
|
}
|
||||||
int cliptokensneeded = llava_images[i].clp_image_tokens;
|
} else if(media_objects[i].is_audio && audio_on) {
|
||||||
|
// audio
|
||||||
|
GGML_ASSERT(w_filters.n_mel); // make sure we have filter preloaded
|
||||||
|
|
||||||
|
std::vector<float> pcmf32;
|
||||||
|
bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), 16000, pcmf32);
|
||||||
|
if (!ok) {
|
||||||
|
printf("\nError: Clip audio %d failed to convert!",i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
||||||
|
ok = whisper_preprocessor::preprocess_audio(pcmf32.data(), pcmf32.size(), w_filters, mel_spec_chunks);
|
||||||
|
if (!ok) {
|
||||||
|
printf("\nError: Clip audio %d failed to load!",i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// consider each mel_spec as a separate audio chunk
|
||||||
|
int total_chunk_tokens = 0;
|
||||||
|
for (auto & mel_spec : mel_spec_chunks) {
|
||||||
|
media_chunk chunk;
|
||||||
|
bool ok = audio_embd_make_with_clip_img(clp_ctx_a, kcpp_data->n_threads, mel_spec, &chunk.clp_img_embd, &chunk.clp_image_tokens);
|
||||||
|
if (!ok) {
|
||||||
|
printf("\nError: Clip audio chunk in %d failed to make embd!",i);
|
||||||
|
} else {
|
||||||
|
if(debugmode==1 && !is_quiet)
|
||||||
|
{
|
||||||
|
printf("\nAudio Clip Embed Chunk %i used Tokens: %d",i,chunk.clp_image_tokens);
|
||||||
|
}
|
||||||
|
total_chunk_tokens += chunk.clp_image_tokens;
|
||||||
|
media_objects[i].mediachunks.push_back(chunk);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int cliptokensneeded = total_chunk_tokens;
|
||||||
if(cliptokensneeded>0 && cliptokensneeded < nctx)
|
if(cliptokensneeded>0 && cliptokensneeded < nctx)
|
||||||
{
|
{
|
||||||
int tokcnt = (i==0?(llava_images[i].clp_image_tokens):(llava_images[i].clp_image_tokens+sepsize));
|
int tokcnt = (i==0?(cliptokensneeded):(cliptokensneeded+sepsize));
|
||||||
if(i==0)
|
if(i==0)
|
||||||
{
|
{
|
||||||
tokcnt += introsize;
|
tokcnt += introsize;
|
||||||
}
|
}
|
||||||
for(int n=0;n<tokcnt;++n)
|
for(int n=0;n<tokcnt;++n)
|
||||||
{
|
{
|
||||||
last_llava_mem.push_back(current_llava_identifier);
|
last_media_mem.push_back(current_media_identifier);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("\nWarning: LLAVA Image excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded);
|
printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\n",cliptokensneeded);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3082,7 +3159,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
double time0 = 0, time1 = 0, time2 = 0;
|
double time0 = 0, time1 = 0, time2 = 0;
|
||||||
timer_start();
|
timer_start();
|
||||||
|
|
||||||
bool llava_images_changed = false;
|
bool media_data_changed = false;
|
||||||
|
|
||||||
for(int x=0;x<inputs.stop_sequence_len;++x)
|
for(int x=0;x<inputs.stop_sequence_len;++x)
|
||||||
{
|
{
|
||||||
|
@ -3184,37 +3261,57 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
std::string negative_prompt = inputs.negative_prompt;
|
std::string negative_prompt = inputs.negative_prompt;
|
||||||
|
|
||||||
//clear previous run llava embd memory, just-in-time free
|
//clear previous run llava embd memory, just-in-time free
|
||||||
for(int i=0;i<llava_images.size();++i)
|
for(int i=0;i<media_objects.size();++i)
|
||||||
{
|
{
|
||||||
if(llava_images[i].b64data!="" && llava_images[i].clp_img_embd!=nullptr)
|
if(media_objects[i].b64data!="")
|
||||||
{
|
{
|
||||||
free(llava_images[i].clp_img_embd);
|
for(int j=0;j<media_objects[i].mediachunks.size();++j)
|
||||||
llava_images[i].clp_img_embd = nullptr;
|
{
|
||||||
|
if(media_objects[i].mediachunks[j].clp_img_embd!=nullptr)
|
||||||
|
{
|
||||||
|
free(media_objects[i].mediachunks[j].clp_img_embd);
|
||||||
|
media_objects[i].mediachunks[j].clp_img_embd = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
media_objects[i].mediachunks.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
llava_images.clear();
|
media_objects.clear();
|
||||||
std::string new_llava_composite = "";
|
std::string new_media_composite = "";
|
||||||
for(int x=0;x<images_max;++x)
|
for(int x=0;x<images_max;++x)
|
||||||
{
|
{
|
||||||
std::string item = inputs.images[x];
|
std::string item = inputs.images[x];
|
||||||
if(item!="")
|
if(item!="")
|
||||||
{
|
{
|
||||||
llava_image lv;
|
media_object lv;
|
||||||
lv.b64data = item;
|
lv.b64data = item;
|
||||||
llava_images.push_back(lv);
|
lv.is_audio = false;
|
||||||
new_llava_composite += item;
|
media_objects.push_back(lv);
|
||||||
|
new_media_composite += item;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(llava_composite_image_signature!=new_llava_composite)
|
for(int x=0;x<audio_max;++x)
|
||||||
|
{
|
||||||
|
std::string item = inputs.audio[x];
|
||||||
|
if(item!="")
|
||||||
|
{
|
||||||
|
media_object lv;
|
||||||
|
lv.b64data = item;
|
||||||
|
lv.is_audio = true;
|
||||||
|
media_objects.push_back(lv);
|
||||||
|
new_media_composite += item;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(media_composite_image_signature!=new_media_composite)
|
||||||
{
|
{
|
||||||
//images have changed. swap identifiers to force reprocessing
|
//images have changed. swap identifiers to force reprocessing
|
||||||
current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
|
current_media_identifier = (current_media_identifier==MEDIA_TOKEN_IDENTIFIER_A?MEDIA_TOKEN_IDENTIFIER_B:MEDIA_TOKEN_IDENTIFIER_A);
|
||||||
llava_composite_image_signature = new_llava_composite;
|
media_composite_image_signature = new_media_composite;
|
||||||
if(debugmode==1 && !is_quiet)
|
if(debugmode==1 && !is_quiet)
|
||||||
{
|
{
|
||||||
printf("\nLLAVA images changed, existing cache invalidated");
|
printf("\nLLAVA images changed, existing cache invalidated");
|
||||||
}
|
}
|
||||||
llava_images_changed = true;
|
media_data_changed = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
kcpp_data->prompt = inputs.prompt;
|
kcpp_data->prompt = inputs.prompt;
|
||||||
|
@ -3373,26 +3470,26 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<int> embd_inp;
|
std::vector<int> embd_inp;
|
||||||
std::vector<int> embd_inp_mem; //for storing added memory
|
std::vector<int> embd_inp_mem; //for storing added memory
|
||||||
std::vector<int> llava_sep; //to separate between different llava images
|
std::vector<int> media_sep; //to separate between different llava images
|
||||||
std::vector<int> llava_intro; //to separate between different llava images
|
std::vector<int> media_intro; //to separate between different llava images
|
||||||
std::vector<int> guidance_embd; //holds the guidance prompt
|
std::vector<int> guidance_embd; //holds the guidance prompt
|
||||||
bool llava_embds_built = false;
|
bool media_embds_built = false;
|
||||||
|
|
||||||
int32_t nctx = kcpp_data->n_ctx;
|
int32_t nctx = kcpp_data->n_ctx;
|
||||||
|
|
||||||
TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
|
TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
|
||||||
bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
|
bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
|
||||||
TokenizeString("\n\n", llava_sep, file_format, false);
|
TokenizeString("\n\n", media_sep, file_format, false);
|
||||||
TokenizeString("\nImages:\n", llava_intro, file_format, false);
|
TokenizeString("\nImages:\n", media_intro, file_format, false);
|
||||||
|
|
||||||
if(llava_composite_image_signature=="")
|
if(media_composite_image_signature=="")
|
||||||
{
|
{
|
||||||
last_llava_mem.clear();
|
last_media_mem.clear();
|
||||||
}
|
}
|
||||||
if(llava_images_changed)
|
if(media_data_changed)
|
||||||
{
|
{
|
||||||
PrepareLlavaEmbds(nctx, llava_sep, llava_intro);
|
PrepareMediaEmbds(nctx, media_sep, media_intro);
|
||||||
llava_embds_built = true;
|
media_embds_built = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(addedmemory!="")
|
if(addedmemory!="")
|
||||||
|
@ -3415,9 +3512,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(last_llava_mem.size()>0) //stick the llava mem before the added mem
|
if(last_media_mem.size()>0) //stick the llava mem before the added mem
|
||||||
{
|
{
|
||||||
if(last_llava_mem.size() + kcpp_data->n_predict + 4 > nctx)
|
if(last_media_mem.size() + kcpp_data->n_predict + 4 > nctx)
|
||||||
{
|
{
|
||||||
printf("\nWarning: Too many LLaVA tokens, max context exceeded! They will be ignored!\n");
|
printf("\nWarning: Too many LLaVA tokens, max context exceeded! They will be ignored!\n");
|
||||||
}
|
}
|
||||||
|
@ -3433,7 +3530,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
}
|
}
|
||||||
|
|
||||||
//append llava dummy tokens
|
//append llava dummy tokens
|
||||||
embd_inp_mem.insert(embd_inp_mem.begin(), last_llava_mem.begin(), last_llava_mem.end());
|
embd_inp_mem.insert(embd_inp_mem.begin(), last_media_mem.begin(), last_media_mem.end());
|
||||||
if (bos.size() > 0 && embd_inp_mem.size() > 0)
|
if (bos.size() > 0 && embd_inp_mem.size() > 0)
|
||||||
{
|
{
|
||||||
embd_inp_mem.insert(embd_inp_mem.begin(), bos[0]); //insert bos at front
|
embd_inp_mem.insert(embd_inp_mem.begin(), bos[0]); //insert bos at front
|
||||||
|
@ -4159,12 +4256,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
while ((int)embd_inp.size() > input_consumed)
|
while ((int)embd_inp.size() > input_consumed)
|
||||||
{
|
{
|
||||||
int currtoken = embd_inp[input_consumed];
|
int currtoken = embd_inp[input_consumed];
|
||||||
if(currtoken==LLAVA_TOKEN_IDENTIFIER_A || currtoken==LLAVA_TOKEN_IDENTIFIER_B) //special llava token hit
|
if(currtoken==MEDIA_TOKEN_IDENTIFIER_A || currtoken==MEDIA_TOKEN_IDENTIFIER_B) //special llava token hit
|
||||||
{
|
{
|
||||||
if(!llava_embds_built) //this should never happen! however, handle it anyway
|
if(!media_embds_built) //this should never happen! however, handle it anyway
|
||||||
{
|
{
|
||||||
PrepareLlavaEmbds(nctx, llava_sep, llava_intro);
|
PrepareMediaEmbds(nctx, media_sep, media_intro);
|
||||||
llava_embds_built = true;
|
media_embds_built = true;
|
||||||
printf("\nSomehow vision embd was not prepared (maybe no fast forward), rebuilding it...\n");
|
printf("\nSomehow vision embd was not prepared (maybe no fast forward), rebuilding it...\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4178,9 +4275,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
//batch is empty, do image processing
|
//batch is empty, do image processing
|
||||||
int llavatokenscounted = 0;
|
int llavatokenscounted = 0;
|
||||||
int llavatokensevaled = 0;
|
int llavatokensevaled = 0;
|
||||||
int sepsize = llava_sep.size();
|
int sepsize = media_sep.size();
|
||||||
int introsize = llava_intro.size();
|
int introsize = media_intro.size();
|
||||||
while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_B))
|
while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_B))
|
||||||
{
|
{
|
||||||
if (!last_n_tokens.empty())
|
if (!last_n_tokens.empty())
|
||||||
{
|
{
|
||||||
|
@ -4191,13 +4288,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
++input_consumed;
|
++input_consumed;
|
||||||
++llavatokenscounted;
|
++llavatokenscounted;
|
||||||
}
|
}
|
||||||
for(int i=0;i<llava_images.size();++i)
|
for(int i=0;i<media_objects.size();++i)
|
||||||
{
|
{
|
||||||
//note: no handling for draft_ctx as we don't support vision for it
|
//note: no handling for draft_ctx as we don't support vision for it
|
||||||
if(introsize>0 && i==0)
|
if(introsize>0 && i==0)
|
||||||
{
|
{
|
||||||
//added at the start of everything
|
//added at the start of everything
|
||||||
kcpp_embd_batch batch = kcpp_embd_batch(llava_intro, n_past, use_mrope, false);
|
kcpp_embd_batch batch = kcpp_embd_batch(media_intro, n_past, use_mrope, false);
|
||||||
auto evr = llama_decode(llama_ctx_v4, batch.batch);
|
auto evr = llama_decode(llama_ctx_v4, batch.batch);
|
||||||
if(evr!=0)
|
if(evr!=0)
|
||||||
{
|
{
|
||||||
|
@ -4213,41 +4310,45 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
|
||||||
if(sepsize>0 && i>0)
|
if(sepsize>0 && i>0)
|
||||||
{
|
{
|
||||||
//add a separator between each image
|
//add a separator between each image
|
||||||
kcpp_embd_batch batch = kcpp_embd_batch(llava_sep, n_past, use_mrope, false);
|
kcpp_embd_batch batch = kcpp_embd_batch(media_sep, n_past, use_mrope, false);
|
||||||
auto evr = llama_decode(llama_ctx_v4, batch.batch);
|
auto evr = llama_decode(llama_ctx_v4, batch.batch);
|
||||||
if(evr!=0)
|
if(evr!=0)
|
||||||
{
|
{
|
||||||
printf("\nError when appending llava separator: %d\n",evr);
|
printf("\nError when appending media separator: %d\n",evr);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
printf("\rProcessing LLaVa Separator (%d tokens)",sepsize);
|
printf("\rProcessing Media Separator (%d tokens)",sepsize);
|
||||||
}
|
}
|
||||||
n_past += sepsize;
|
n_past += sepsize;
|
||||||
llavatokensevaled += sepsize;
|
llavatokensevaled += sepsize;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(allow_regular_prints)
|
for(int j=0;j<media_objects[i].mediachunks.size();++j)
|
||||||
{
|
{
|
||||||
printf("\rProcessing LLaVa Embedding %d (%d tokens)",(i+1), llava_images[i].clp_image_tokens);
|
media_chunk chunk = media_objects[i].mediachunks[j];
|
||||||
}
|
if(allow_regular_prints)
|
||||||
bool err = kcpp_eval_image(llama_ctx_v4,llava_images[i].clp_img_embd,llava_images[i].clp_image_tokens,kcpp_data->n_batch,&n_past);
|
{
|
||||||
llavatokensevaled += llava_images[i].clp_image_tokens;
|
printf("\rProcessing Media Embedding %d (%d tokens)",(i+1), chunk.clp_image_tokens);
|
||||||
if(!err)
|
}
|
||||||
{
|
bool err = kcpp_eval_image(llama_ctx_v4,chunk.clp_img_embd,chunk.clp_image_tokens,kcpp_data->n_batch,&n_past);
|
||||||
llava_composite_image_signature = ""; //force invalidate
|
llavatokensevaled += chunk.clp_image_tokens;
|
||||||
fprintf(stderr, "\nFailed to eval llava image at %d!\n",n_past);
|
if(!err)
|
||||||
output.text = nullptr;
|
{
|
||||||
output.status = 0;
|
media_composite_image_signature = ""; //force invalidate
|
||||||
output.prompt_tokens = output.completion_tokens = 0;
|
fprintf(stderr, "\nFailed to eval llava image at %d!\n",n_past);
|
||||||
output.stopreason = stop_reason::INVALID;
|
output.text = nullptr;
|
||||||
generation_finished = true;
|
output.status = 0;
|
||||||
return output;
|
output.prompt_tokens = output.completion_tokens = 0;
|
||||||
|
output.stopreason = stop_reason::INVALID;
|
||||||
|
generation_finished = true;
|
||||||
|
return output;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(llavatokenscounted!=llavatokensevaled)
|
if(llavatokenscounted!=llavatokensevaled)
|
||||||
{
|
{
|
||||||
llava_composite_image_signature = ""; //force invalidate
|
media_composite_image_signature = ""; //force invalidate
|
||||||
fprintf(stderr, "\nLLAVA image tokens mismatch at %d! (%d vs %d tokens)\n",n_past,llavatokenscounted,llavatokensevaled);
|
fprintf(stderr, "\nLLAVA image tokens mismatch at %d! (%d vs %d tokens)\n",n_past,llavatokenscounted,llavatokensevaled);
|
||||||
output.text = nullptr;
|
output.text = nullptr;
|
||||||
output.status = 0;
|
output.status = 0;
|
||||||
|
|
|
@ -17874,9 +17874,9 @@ Current version indicated by LITEVER below.
|
||||||
{
|
{
|
||||||
return render_audio_html(data);
|
return render_audio_html(data);
|
||||||
}
|
}
|
||||||
else if(data.startsWith("data:image"))
|
else //also handles ALL pending items
|
||||||
{
|
{
|
||||||
return render_image_html(data, pend_txt, siclass)
|
return render_image_html(data, pend_txt, siclass);
|
||||||
}
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,6 +44,7 @@ import subprocess
|
||||||
sampler_order_max = 7
|
sampler_order_max = 7
|
||||||
tensor_split_max = 16
|
tensor_split_max = 16
|
||||||
images_max = 8
|
images_max = 8
|
||||||
|
audio_max = 4
|
||||||
bias_min_value = -100.0
|
bias_min_value = -100.0
|
||||||
bias_max_value = 100.0
|
bias_max_value = 100.0
|
||||||
logprobs_max = 5
|
logprobs_max = 5
|
||||||
|
@ -215,6 +216,7 @@ class generation_inputs(ctypes.Structure):
|
||||||
("negative_prompt", ctypes.c_char_p),
|
("negative_prompt", ctypes.c_char_p),
|
||||||
("guidance_scale", ctypes.c_float),
|
("guidance_scale", ctypes.c_float),
|
||||||
("images", ctypes.c_char_p * images_max),
|
("images", ctypes.c_char_p * images_max),
|
||||||
|
("audio", ctypes.c_char_p * audio_max),
|
||||||
("max_context_length", ctypes.c_int),
|
("max_context_length", ctypes.c_int),
|
||||||
("max_length", ctypes.c_int),
|
("max_length", ctypes.c_int),
|
||||||
("temperature", ctypes.c_float),
|
("temperature", ctypes.c_float),
|
||||||
|
@ -1402,6 +1404,7 @@ def generate(genparams, stream_flag=False):
|
||||||
negative_prompt = genparams.get('negative_prompt', "")
|
negative_prompt = genparams.get('negative_prompt', "")
|
||||||
guidance_scale = tryparsefloat(genparams.get('guidance_scale', 1.0),1.0)
|
guidance_scale = tryparsefloat(genparams.get('guidance_scale', 1.0),1.0)
|
||||||
images = genparams.get('images', [])
|
images = genparams.get('images', [])
|
||||||
|
audio = genparams.get('audio', [])
|
||||||
max_context_length = tryparseint(genparams.get('max_context_length', maxctx),maxctx)
|
max_context_length = tryparseint(genparams.get('max_context_length', maxctx),maxctx)
|
||||||
max_length = tryparseint(genparams.get('max_length', args.defaultgenamt),args.defaultgenamt)
|
max_length = tryparseint(genparams.get('max_length', args.defaultgenamt),args.defaultgenamt)
|
||||||
temperature = tryparsefloat(genparams.get('temperature', adapter_obj.get("temperature", 0.75)),0.75)
|
temperature = tryparsefloat(genparams.get('temperature', adapter_obj.get("temperature", 0.75)),0.75)
|
||||||
|
@ -1468,6 +1471,11 @@ def generate(genparams, stream_flag=False):
|
||||||
inputs.images[n] = "".encode("UTF-8")
|
inputs.images[n] = "".encode("UTF-8")
|
||||||
else:
|
else:
|
||||||
inputs.images[n] = images[n].encode("UTF-8")
|
inputs.images[n] = images[n].encode("UTF-8")
|
||||||
|
for n in range(audio_max):
|
||||||
|
if not audio or n >= len(audio):
|
||||||
|
inputs.audio[n] = "".encode("UTF-8")
|
||||||
|
else:
|
||||||
|
inputs.audio[n] = audio[n].encode("UTF-8")
|
||||||
global showmaxctxwarning
|
global showmaxctxwarning
|
||||||
if max_context_length > maxctx:
|
if max_context_length > maxctx:
|
||||||
if showmaxctxwarning:
|
if showmaxctxwarning:
|
||||||
|
|
|
@ -502,11 +502,16 @@ struct mpt_model {
|
||||||
std::map<std::string, struct ggml_v3_tensor *> tensors;
|
std::map<std::string, struct ggml_v3_tensor *> tensors;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llava_image
|
struct media_chunk
|
||||||
|
{
|
||||||
|
int32_t clp_image_tokens = 0; //holds number of tokens llava used in this chunk
|
||||||
|
float * clp_img_embd = nullptr; //this holds dynamic memory and must be freed each use!
|
||||||
|
};
|
||||||
|
struct media_object
|
||||||
{
|
{
|
||||||
std::string b64data = "";
|
std::string b64data = "";
|
||||||
int32_t clp_image_tokens = 0; //holds number of tokens llava used
|
std::vector<media_chunk> mediachunks;
|
||||||
float * clp_img_embd = nullptr; //this holds dynamic memory and must be freed each use!
|
bool is_audio = false; //if true its audio, otherwise its vision
|
||||||
};
|
};
|
||||||
|
|
||||||
struct speculative_draft_result
|
struct speculative_draft_result
|
||||||
|
|
|
@ -11,6 +11,17 @@
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
|
|
||||||
|
#define MINIAUDIO_IMPLEMENTATION
|
||||||
|
#ifndef MTMD_AUDIO_DEBUG
|
||||||
|
# define MA_NO_ENCODING
|
||||||
|
#endif
|
||||||
|
#define MA_NO_DEVICE_IO
|
||||||
|
#define MA_NO_RESOURCE_MANAGER
|
||||||
|
#define MA_NO_NODE_GRAPH
|
||||||
|
#define MA_NO_ENGINE
|
||||||
|
#define MA_NO_GENERATION
|
||||||
|
#define MA_API static
|
||||||
|
#include "miniaudio/miniaudio.h"
|
||||||
|
|
||||||
void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
|
void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
|
@ -501,47 +512,47 @@ kcpp_embd_batch::kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast,
|
||||||
|
|
||||||
kcpp_embd_batch::kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits)
|
kcpp_embd_batch::kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits)
|
||||||
{
|
{
|
||||||
int32_t seq_id = 0;
|
int32_t seq_id = 0;
|
||||||
int32_t n_tokens = tokens.size();
|
int32_t n_tokens = tokens.size();
|
||||||
pos.resize(n_tokens * (use_mrope?4:1));
|
pos.resize(n_tokens * (use_mrope?4:1));
|
||||||
std::fill(pos.begin(), pos.end(), 0);
|
std::fill(pos.begin(), pos.end(), 0);
|
||||||
n_seq_id.resize(n_tokens);
|
n_seq_id.resize(n_tokens);
|
||||||
seq_ids.resize(n_tokens + 1);
|
seq_ids.resize(n_tokens + 1);
|
||||||
logits.resize(n_tokens);
|
logits.resize(n_tokens);
|
||||||
seq_id_0.resize(1);
|
seq_id_0.resize(1);
|
||||||
seq_id_0[0] = seq_id;
|
seq_id_0[0] = seq_id;
|
||||||
seq_ids[n_tokens] = nullptr;
|
seq_ids[n_tokens] = nullptr;
|
||||||
batch = {
|
batch = {
|
||||||
/*n_tokens =*/ n_tokens,
|
/*n_tokens =*/ n_tokens,
|
||||||
/*tokens =*/ tokens.data(),
|
/*tokens =*/ tokens.data(),
|
||||||
/*embd =*/ nullptr,
|
/*embd =*/ nullptr,
|
||||||
/*pos =*/ pos.data(),
|
/*pos =*/ pos.data(),
|
||||||
/*n_seq_id =*/ n_seq_id.data(),
|
/*n_seq_id =*/ n_seq_id.data(),
|
||||||
/*seq_id =*/ seq_ids.data(),
|
/*seq_id =*/ seq_ids.data(),
|
||||||
/*logits =*/ logits.data(),
|
/*logits =*/ logits.data(),
|
||||||
};
|
};
|
||||||
|
|
||||||
if(!use_mrope)
|
if(!use_mrope)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < n_tokens; i++) {
|
for (int i = 0; i < n_tokens; i++) {
|
||||||
batch.pos [i] = npast + i;
|
batch.pos [i] = npast + i;
|
||||||
batch.n_seq_id[i] = 1;
|
batch.n_seq_id[i] = 1;
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
batch.seq_id [i] = seq_id_0.data();
|
||||||
batch.logits [i] = (return_all_logits?true:false);
|
batch.logits [i] = (return_all_logits?true:false);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
{
|
else
|
||||||
for (int i = 0; i < n_tokens; i++) {
|
{
|
||||||
batch.n_seq_id[i] = 1;
|
for (int i = 0; i < n_tokens; i++) {
|
||||||
batch.seq_id [i] = seq_id_0.data();
|
batch.n_seq_id[i] = 1;
|
||||||
batch.logits [i] = (return_all_logits?true:false);
|
batch.seq_id [i] = seq_id_0.data();
|
||||||
}
|
batch.logits [i] = (return_all_logits?true:false);
|
||||||
for (int j = 0; j < batch.n_tokens * 3; j++) {
|
|
||||||
batch.pos[j] = npast + (j % batch.n_tokens);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
batch.logits[n_tokens - 1] = true;
|
for (int j = 0; j < batch.n_tokens * 3; j++) {
|
||||||
|
batch.pos[j] = npast + (j % batch.n_tokens);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
batch.logits[n_tokens - 1] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> split_string(const std::string& input, const std::string& separator) {
|
std::vector<std::string> split_string(const std::string& input, const std::string& separator) {
|
||||||
|
@ -559,4 +570,59 @@ std::vector<std::string> split_string(const std::string& input, const std::strin
|
||||||
result.push_back(input.substr(start));
|
result.push_back(input.substr(start));
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Sniffs the leading magic bytes of `buf` to decide whether it looks like a
// WAV, MP3 or FLAC file. This is only a cheap pre-filter: it inspects at most
// the first 12 bytes and does not validate the rest of the stream.
//
//   buf - start of the candidate file contents (may be null)
//   len - number of readable bytes at `buf`
//
// Returns true when one of the known signatures matches, false otherwise
// (including for a null buffer or fewer than 12 bytes).
static bool buf_is_audio_file(const char * buf, size_t len) {
    // 12 bytes are needed for the RIFF....WAVE check; a null buffer must not reach memcmp.
    if (buf == nullptr || len < 12) {
        return false;
    }

    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
    const bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;

    // MP3: either an ID3v2 tag header or an MPEG frame sync word (simplified check:
    // first byte 0xFF and the top three bits of the second byte set).
    const unsigned char b0 = (unsigned char)buf[0];
    const unsigned char b1 = (unsigned char)buf[1];
    const bool is_mp3 = memcmp(buf, "ID3", 3) == 0 || (b0 == 0xFF && (b1 & 0xE0) == 0xE0);

    const bool is_flac = memcmp(buf, "fLaC", 4) == 0;

    return is_wav || is_mp3 || is_flac;
}
|
||||||
|
|
||||||
|
// returns true if the buffer is a valid audio file
|
||||||
|
bool kcpp_decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
|
||||||
|
if (!buf_is_audio_file((const char *)buf_in, len))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ma_result result;
|
||||||
|
const int channels = 1;
|
||||||
|
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
|
||||||
|
ma_decoder decoder;
|
||||||
|
|
||||||
|
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ma_uint64 frame_count;
|
||||||
|
ma_uint64 frames_read;
|
||||||
|
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
pcmf32_mono.resize(frame_count);
|
||||||
|
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return true;
|
||||||
}
|
}
|
|
@ -66,6 +66,7 @@ std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_
|
||||||
int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int32_t> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng);
|
int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int32_t> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng);
|
||||||
|
|
||||||
std::vector<std::string> split_string(const std::string& input, const std::string& separator);
|
std::vector<std::string> split_string(const std::string& input, const std::string& separator);
|
||||||
|
bool kcpp_decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono);
|
||||||
|
|
||||||
struct kcpp_embd_batch { //duplcated from llava_embd_batch
|
struct kcpp_embd_batch { //duplcated from llava_embd_batch
|
||||||
std::vector<int32_t> pos;
|
std::vector<int32_t> pos;
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
|
#include "clip-impl.h"
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
|
#include "mtmd-audio.h"
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "ggml-cpp.h"
|
#include "ggml-cpp.h"
|
||||||
|
@ -13,35 +15,6 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#if defined(LLAVA_LOG_OFF)
|
|
||||||
# define LOG_INF(...)
|
|
||||||
# define LOG_WRN(...)
|
|
||||||
# define LOG_ERR(...)
|
|
||||||
# define LOG_DBG(...)
|
|
||||||
#else // defined(LLAVA_LOG_OFF)
|
|
||||||
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
|
||||||
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
||||||
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
||||||
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
|
||||||
#endif // defined(LLAVA_LOG_OFF)
|
|
||||||
|
|
||||||
// RGB uint8 image
|
|
||||||
struct clip_image_u8 {
|
|
||||||
int nx;
|
|
||||||
int ny;
|
|
||||||
|
|
||||||
std::vector<uint8_t> buf;
|
|
||||||
};
|
|
||||||
|
|
||||||
// RGB float32 image (NHWC)
|
|
||||||
// Memory layout: RGBRGBRGB...
|
|
||||||
struct clip_image_f32 {
|
|
||||||
int nx;
|
|
||||||
int ny;
|
|
||||||
|
|
||||||
std::vector<float> buf;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct clip_image_grid_shape {
|
struct clip_image_grid_shape {
|
||||||
int first;
|
int first;
|
||||||
int second;
|
int second;
|
||||||
|
@ -53,11 +26,6 @@ struct clip_image_f32_batch_deleter {
|
||||||
};
|
};
|
||||||
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
|
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
|
||||||
|
|
||||||
struct clip_image_size_deleter {
|
|
||||||
void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
|
|
||||||
};
|
|
||||||
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Selects the best resolution from a list of possible resolutions based on the original size.
|
* Selects the best resolution from a list of possible resolutions based on the original size.
|
||||||
*
|
*
|
||||||
|
@ -471,3 +439,28 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
|
||||||
free(embed->embed);
|
free(embed->embed);
|
||||||
free(embed);
|
free(embed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//kcpp helper function
|
||||||
|
bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whisper_preprocessor::whisper_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out)
|
||||||
|
{
|
||||||
|
clip_image_f32_ptr mel_f32(clip_image_f32_init());
|
||||||
|
mel_f32->nx = mel_spec.n_len;
|
||||||
|
mel_f32->ny = mel_spec.n_mel;
|
||||||
|
mel_f32->buf = std::move(mel_spec.data);
|
||||||
|
size_t n_tokens = clip_n_output_tokens(ctx_clip, mel_f32.get());
|
||||||
|
|
||||||
|
clip_image_f32_batch batch_f32;
|
||||||
|
batch_f32.is_audio = true;
|
||||||
|
batch_f32.entries.push_back(std::move(mel_f32));
|
||||||
|
|
||||||
|
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
|
||||||
|
float * audio_embd = (float *)malloc(n_tokens * n_mmproj_embd);
|
||||||
|
bool ok = clip_image_batch_encode(
|
||||||
|
ctx_clip,
|
||||||
|
n_threads,
|
||||||
|
&batch_f32,
|
||||||
|
audio_embd);
|
||||||
|
*image_embd_out = audio_embd;
|
||||||
|
*n_img_pos_out = n_tokens;
|
||||||
|
return ok ? 0 : 1;
|
||||||
|
}
|
|
@ -26,6 +26,9 @@ struct llava_image_embed {
|
||||||
float * embed;
|
float * embed;
|
||||||
int n_image_pos;
|
int n_image_pos;
|
||||||
};
|
};
|
||||||
|
namespace whisper_preprocessor {
|
||||||
|
struct whisper_mel;
|
||||||
|
}
|
||||||
|
|
||||||
/** sanity check for clip <-> llava embed size match */
|
/** sanity check for clip <-> llava embed size match */
|
||||||
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
|
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
|
||||||
|
@ -42,6 +45,9 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
||||||
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
|
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
|
||||||
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
|
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
|
||||||
|
|
||||||
|
LLAVA_API bool audio_embd_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const whisper_preprocessor::whisper_mel & mel_spec, float ** image_embd_out, int * n_img_pos_out);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue