diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e15b8cb5..1115088cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -463,6 +463,8 @@ add_library(common2 src/unicode-data.cpp otherarch/utils.cpp otherarch/utils.h + otherarch/llmutils.cpp + otherarch/llmutils.h common/reasoning-budget.cpp common/reasoning-budget.h tools/mtmd/mtmd-audio.cpp diff --git a/Makefile b/Makefile index 24b43e945..73b11a1ae 100644 --- a/Makefile +++ b/Makefile @@ -110,10 +110,10 @@ endif CUBLASLD_FLAGS = CUBLAS_OBJS = -OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o kcpp-quantmapper.o kcpp-repackmapper.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o llama-impl.o sampling.o budget.o kcpputils.o mtmdaudio.o -OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants.o kcpp-quantmapper_noavx2.o kcpp-repackmapper_noavx2.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o llama-impl.o sampling.o budget.o kcpputils.o mtmdaudio.o -OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants.o kcpp-quantmapper_noavx1.o kcpp-repackmapper_noavx1.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o llama-impl.o sampling.o budget.o kcpputils.o mtmdaudio.o -OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants.o kcpp-quantmapper_failsafe.o kcpp-repackmapper_failsafe.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o llama-impl.o sampling.o budget.o kcpputils.o mtmdaudio.o +OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o kcpp-quantmapper.o kcpp-repackmapper.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o llama-impl.o sampling.o budget.o kcpputils.o kcppllmutils.o mtmdaudio.o +OBJS_SIMPLE += 
ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants.o kcpp-quantmapper_noavx2.o kcpp-repackmapper_noavx2.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o llama-impl.o sampling.o budget.o kcpputils.o kcppllmutils.o mtmdaudio.o +OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants.o kcpp-quantmapper_noavx1.o kcpp-repackmapper_noavx1.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o llama-impl.o sampling.o budget.o kcpputils.o kcppllmutils.o mtmdaudio.o +OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants.o kcpp-quantmapper_failsafe.o kcpp-repackmapper_failsafe.o unicode.o unicode-common.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o llama-impl.o sampling.o budget.o kcpputils.o kcppllmutils.o mtmdaudio.o # OS specific ifeq ($(UNAME_S),Linux) @@ -602,6 +602,8 @@ gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h $(CXX) $(CXXFLAGS) -c $< -o $@ kcpputils.o: otherarch/utils.cpp otherarch/utils.h $(CXX) $(CXXFLAGS) -c $< -o $@ +kcppllmutils.o: otherarch/llmutils.cpp otherarch/llmutils.h + $(CXX) $(CXXFLAGS) -c $< -o $@ mtmdaudio.o: tools/mtmd/mtmd-audio.cpp tools/mtmd/mtmd-audio.h $(CXX) $(CXXFLAGS) -c $< -o $@ ggml-backend.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 9b9f2365c..d2473f082 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -24,6 +24,7 @@ #include #include "utils.h" +#include "llmutils.h" //for easier compilation //concat source files into one file for compilation purposes diff --git a/otherarch/llmutils.cpp b/otherarch/llmutils.cpp new file mode 100644 index 000000000..46835e985 --- /dev/null +++ b/otherarch/llmutils.cpp @@ -0,0 +1,164 @@ + +#include "llmutils.h" + +void 
kcpp_embd_batch::init_kcpp_batch(int32_t n_tokens, + int32_t npast, + bool use_mrope, + bool return_all_logits, + bool mrope_is_image, + int img_nx, + int img_ny) { + const int n_pos_per_embd = use_mrope ? 4 : 1; + const llama_seq_id seq_id = 0; + + if (use_mrope && mrope_is_image) { + GGML_ASSERT(img_nx > 0 && img_ny > 0); + GGML_ASSERT(img_nx * img_ny == n_tokens); + } + + pos.resize(n_tokens * n_pos_per_embd); + std::fill(pos.begin(), pos.end(), 0); + + n_seq_id.resize(n_tokens); + seq_ids.resize(n_tokens + 1); + logits.resize(n_tokens); + seq_id_0.resize(1); + + seq_id_0[0] = seq_id; + seq_ids[n_tokens] = nullptr; + + batch.pos = pos.data(); + batch.n_seq_id = n_seq_id.data(); + batch.seq_id = seq_ids.data(); + batch.logits = logits.data(); + + for (int i = 0; i < n_tokens; ++i) { + n_seq_id[i] = 1; + seq_ids[i] = seq_id_0.data(); + logits[i] = return_all_logits; + } + + // ---- position encoding ---- + if (!use_mrope) { + for (int i = 0; i < n_tokens; ++i) { + pos[i] = npast + i; + } + } else if (!mrope_is_image) { + // 1D M-RoPE (audio / embedding stream) + for (int i = 0; i < n_tokens; ++i) { + pos[i + 0 * n_tokens] = npast + i; + pos[i + 1 * n_tokens] = npast + i; + pos[i + 2 * n_tokens] = npast + i; + pos[i + 3 * n_tokens] = 0; + } + } else { + // 2D image M-RoPE + int idx = 0; + for (int y = 0; y < img_ny; ++y) { + for (int x = 0; x < img_nx; ++x) { + pos[idx + 0 * n_tokens] = npast; + pos[idx + 1 * n_tokens] = npast + y; + pos[idx + 2 * n_tokens] = npast + x; + pos[idx + 3 * n_tokens] = 0; + ++idx; + } + } + } + + // Always request logits for last token + logits[n_tokens - 1] = true; +} + +//for embeddings +kcpp_embd_batch::kcpp_embd_batch(float * embd, + int32_t n_tokens, + int32_t npast, + bool use_mrope, + bool mrope_is_image, + int img_nx, + int img_ny) { + batch = { + /* n_tokens = */ n_tokens, + /* tokens = */ nullptr, + /* embd = */ embd, + /* pos = */ nullptr, + /* n_seq_id = */ nullptr, + /* seq_id = */ nullptr, + /* logits = */ nullptr, + }; + 
+ init_kcpp_batch(n_tokens, npast, use_mrope, + /*return_all_logits=*/false, mrope_is_image, img_nx, img_ny); +} + +// for tokens +kcpp_embd_batch::kcpp_embd_batch(std::vector & tokens, + int32_t npast, + bool use_mrope, + bool return_all_logits, + bool mrope_is_image, + int img_nx, + int img_ny) { + batch = { + /* n_tokens = */ (int32_t) tokens.size(), + /* tokens = */ tokens.data(), + /* embd = */ nullptr, + /* pos = */ nullptr, + /* n_seq_id = */ nullptr, + /* seq_id = */ nullptr, + /* logits = */ nullptr, + }; + + init_kcpp_batch(batch.n_tokens, npast, use_mrope, return_all_logits, mrope_is_image, img_nx, img_ny); +} + +llama_batch kcpp_embd_batch::get_view(int offset, int n_tokens, int n_embd_mmproj) { + GGML_ASSERT(offset >= 0); + GGML_ASSERT(n_tokens > 0); + GGML_ASSERT(offset + n_tokens <= batch.n_tokens); + + const int total_tokens = batch.n_tokens; + llama_pos * pos_ptr = nullptr; + + // Detect M-RoPE vs normal RoPE + const bool is_mrope = (pos.size() > (size_t)total_tokens); + + pos_view.clear(); + + if (is_mrope) { + const int n_pos_per_embd = pos.size() / total_tokens; + GGML_ASSERT(n_pos_per_embd == 4); + + // Layout: + // src: [dim0_all_tokens][dim1_all_tokens][dim2_all_tokens][dim3_all_tokens] + // dst: same layout, but only [offset : offset + n_tokens] + pos_view.reserve(n_tokens * n_pos_per_embd); + + for (int dim = 0; dim < n_pos_per_embd; ++dim) { + const llama_pos * src = + pos.data() + dim * total_tokens + offset; + + pos_view.insert( + pos_view.end(), + src, + src + n_tokens + ); + } + + pos_ptr = pos_view.data(); + } + else { + // Normal RoPE: contiguous slice + pos_ptr = pos.data() + offset; + } + + return { + /* n_tokens = */ n_tokens, + /* tokens = */ nullptr, + /* embd = */ batch.embd ? 
batch.embd + offset*n_embd_mmproj : nullptr, + /* pos = */ pos_ptr, + /* n_seq_id = */ batch.n_seq_id + offset, + /* seq_id = */ batch.seq_id + offset, + /* logits = */ batch.logits + offset, + }; +} \ No newline at end of file diff --git a/otherarch/llmutils.h b/otherarch/llmutils.h new file mode 100644 index 000000000..e295362c0 --- /dev/null +++ b/otherarch/llmutils.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include +#include +#include +#include +#include "llama.h" + +//duplicated and modified from llava_embd_batch +struct kcpp_embd_batch { + std::vector pos; + std::vector pos_view; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + + llama_batch get_view(int offset, int n_tokens, int n_embd_mmproj); + + // Embedding constructor + kcpp_embd_batch( + float * embd, + int32_t n_tokens, + int32_t npast, + bool use_mrope, + bool mrope_is_image = false, + int img_nx = 0, + int img_ny = 0 + ); + + // Token constructor + kcpp_embd_batch( + std::vector & tokens, + int32_t npast, + bool use_mrope, + bool return_all_logits, + bool mrope_is_image = false, + int img_nx = 0, + int img_ny = 0 + ); + +private: + void init_kcpp_batch( + int32_t n_tokens, + int32_t npast, + bool use_mrope, + bool return_all_logits, + bool mrope_is_image, + int img_nx, + int img_ny + ); +}; \ No newline at end of file diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index d7c66ab7d..587f453c9 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -14,6 +14,8 @@ #include #include +#include "otherarch/utils.h" + #include "model_adapter.h" #include "tokenizers/vocab/vocab.h" #include "flux.hpp" @@ -54,10 +56,6 @@ using namespace torch_zip; #include "tokenizers/tokenizer.cpp" #include "tokenizers/tokenize_util.cpp" -// FIXME: llama.h errors out if included (through utils.h) -std::vector kcpp_base64_decode(const std::string & encoded_string); -std::string 
kcpp_base64_encode(const unsigned char* data, unsigned int data_length); -std::string get_timestamp_str(); // #include "preprocessing.hpp" #include "stable-diffusion.h" diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 408c4da81..5284ad716 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -1,5 +1,6 @@ #include "model_adapter.h" #include "otherarch/utils.h" +#include "otherarch/llmutils.h" #include "common.h" #include "sampling.h" diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 71d648faa..32f0e4652 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -760,167 +760,6 @@ int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector< return logits_id[idx].second; } -void kcpp_embd_batch::init_kcpp_batch(int32_t n_tokens, - int32_t npast, - bool use_mrope, - bool return_all_logits, - bool mrope_is_image, - int img_nx, - int img_ny) { - const int n_pos_per_embd = use_mrope ? 4 : 1; - const llama_seq_id seq_id = 0; - - if (use_mrope && mrope_is_image) { - GGML_ASSERT(img_nx > 0 && img_ny > 0); - GGML_ASSERT(img_nx * img_ny == n_tokens); - } - - pos.resize(n_tokens * n_pos_per_embd); - std::fill(pos.begin(), pos.end(), 0); - - n_seq_id.resize(n_tokens); - seq_ids.resize(n_tokens + 1); - logits.resize(n_tokens); - seq_id_0.resize(1); - - seq_id_0[0] = seq_id; - seq_ids[n_tokens] = nullptr; - - batch.pos = pos.data(); - batch.n_seq_id = n_seq_id.data(); - batch.seq_id = seq_ids.data(); - batch.logits = logits.data(); - - for (int i = 0; i < n_tokens; ++i) { - n_seq_id[i] = 1; - seq_ids[i] = seq_id_0.data(); - logits[i] = return_all_logits; - } - - // ---- position encoding ---- - if (!use_mrope) { - for (int i = 0; i < n_tokens; ++i) { - pos[i] = npast + i; - } - } else if (!mrope_is_image) { - // 1D M-RoPE (audio / embedding stream) - for (int i = 0; i < n_tokens; ++i) { - pos[i + 0 * n_tokens] = npast + i; - pos[i + 1 * n_tokens] = npast + i; - pos[i + 2 * n_tokens] = npast + i; - pos[i 
+ 3 * n_tokens] = 0; - } - } else { - // 2D image M-RoPE - int idx = 0; - for (int y = 0; y < img_ny; ++y) { - for (int x = 0; x < img_nx; ++x) { - pos[idx + 0 * n_tokens] = npast; - pos[idx + 1 * n_tokens] = npast + y; - pos[idx + 2 * n_tokens] = npast + x; - pos[idx + 3 * n_tokens] = 0; - ++idx; - } - } - } - - // Always request logits for last token - logits[n_tokens - 1] = true; -} - -//for embeddings -kcpp_embd_batch::kcpp_embd_batch(float * embd, - int32_t n_tokens, - int32_t npast, - bool use_mrope, - bool mrope_is_image, - int img_nx, - int img_ny) { - batch = { - /* n_tokens = */ n_tokens, - /* tokens = */ nullptr, - /* embd = */ embd, - /* pos = */ nullptr, - /* n_seq_id = */ nullptr, - /* seq_id = */ nullptr, - /* logits = */ nullptr, - }; - - init_kcpp_batch(n_tokens, npast, use_mrope, - /*return_all_logits=*/false, mrope_is_image, img_nx, img_ny); -} - -// for tokens -kcpp_embd_batch::kcpp_embd_batch(std::vector & tokens, - int32_t npast, - bool use_mrope, - bool return_all_logits, - bool mrope_is_image, - int img_nx, - int img_ny) { - batch = { - /* n_tokens = */ (int32_t) tokens.size(), - /* tokens = */ tokens.data(), - /* embd = */ nullptr, - /* pos = */ nullptr, - /* n_seq_id = */ nullptr, - /* seq_id = */ nullptr, - /* logits = */ nullptr, - }; - - init_kcpp_batch(batch.n_tokens, npast, use_mrope, return_all_logits, mrope_is_image, img_nx, img_ny); -} - -llama_batch kcpp_embd_batch::get_view(int offset, int n_tokens, int n_embd_mmproj) { - GGML_ASSERT(offset >= 0); - GGML_ASSERT(n_tokens > 0); - GGML_ASSERT(offset + n_tokens <= batch.n_tokens); - - const int total_tokens = batch.n_tokens; - llama_pos * pos_ptr = nullptr; - - // Detect M-RoPE vs normal RoPE - const bool is_mrope = (pos.size() > (size_t)total_tokens); - - pos_view.clear(); - - if (is_mrope) { - const int n_pos_per_embd = pos.size() / total_tokens; - GGML_ASSERT(n_pos_per_embd == 4); - - // Layout: - // src: [dim0_all_tokens][dim1_all_tokens][dim2_all_tokens][dim3_all_tokens] - // 
dst: same layout, but only [offset : offset + n_tokens] - pos_view.reserve(n_tokens * n_pos_per_embd); - - for (int dim = 0; dim < n_pos_per_embd; ++dim) { - const llama_pos * src = - pos.data() + dim * total_tokens + offset; - - pos_view.insert( - pos_view.end(), - src, - src + n_tokens - ); - } - - pos_ptr = pos_view.data(); - } - else { - // Normal RoPE: contiguous slice - pos_ptr = pos.data() + offset; - } - - return { - /* n_tokens = */ n_tokens, - /* tokens = */ nullptr, - /* embd = */ batch.embd ? batch.embd + offset*n_embd_mmproj : nullptr, - /* pos = */ pos_ptr, - /* n_seq_id = */ batch.n_seq_id + offset, - /* seq_id = */ batch.seq_id + offset, - /* logits = */ batch.logits + offset, - }; -} std::vector split_string(const std::string& input, const std::string& separator) { std::vector result; diff --git a/otherarch/utils.h b/otherarch/utils.h index 3a5d00544..1c292db3e 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -8,16 +8,6 @@ #include #include #include "ggml_v3.h" -#include "llama.h" - -// -// CLI argument parsing -// - - -// -// Vocab utils -// struct gpt_vocab { using id = int32_t; @@ -73,6 +63,7 @@ std::vector split_string(const std::string& input, const std::strin bool kcpp_decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector & pcmf32_mono); bool kcpp_decode_audio_to_f32_stereo_48k(const uint8_t * data, size_t data_size, std::vector & pcm, int & T_audio); +typedef struct ggml_backend_device * ggml_backend_dev_t; std::vector kcpp_parse_device_list(const std::string & value); bool kcpp_string_ends_with(const std::string& str, const std::string& suffix); @@ -81,52 +72,6 @@ int ComputeSharedPrefixLength(const std::vector &tokens_a,const std::vector float ComputePrefixMatchPercent(const std::vector &tokens_a,const std::vector &tokens_b); bool FullyContainedPrefix(std::vector &sequence1, std::vector &sequence2); -//duplcated and modified from llava_embd_batch -struct kcpp_embd_batch { - 
std::vector pos; - std::vector pos_view; - std::vector n_seq_id; - std::vector seq_id_0; - std::vector seq_ids; - std::vector logits; - llama_batch batch; - - llama_batch get_view(int offset, int n_tokens, int n_embd_mmproj); - - // Embedding constructor - kcpp_embd_batch( - float * embd, - int32_t n_tokens, - int32_t npast, - bool use_mrope, - bool mrope_is_image = false, - int img_nx = 0, - int img_ny = 0 - ); - - // Token constructor - kcpp_embd_batch( - std::vector & tokens, - int32_t npast, - bool use_mrope, - bool return_all_logits, - bool mrope_is_image = false, - int img_nx = 0, - int img_ny = 0 - ); - -private: - void init_kcpp_batch( - int32_t n_tokens, - int32_t npast, - bool use_mrope, - bool return_all_logits, - bool mrope_is_image, - int img_nx, - int img_ny - ); -}; - #pragma pack(push, 1) struct wav16_header { char riff[4] = {'R', 'I', 'F', 'F'};