Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)

Commit: 9935ac093f
Parent: cfc1a0d4ef
Message: standardize tts linting and formatting
24 changed files with 371 additions and 355 deletions
@@ -474,7 +474,7 @@ set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)

add_library(tts_adapter
otherarch/tts_adapter.cpp)
-target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./tools ./common)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/ttscpp/include ./otherarch/ttscpp/src ./tools ./common)
target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)

Makefile (2 changes)
@@ -729,7 +729,7 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

ggml/src/ggml-vulkan-shaders.cpp:
@@ -25,6 +25,22 @@
#define M_PI 3.14159265358979323846
#endif

+//imports required for tts.cpp to work
+#include "tts.cpp"
+#include "ttstokenizer.cpp"
+#include "ttssampler.cpp"
+#include "parler_model.cpp"
+#include "dac_model.cpp"
+#include "ttsutil.cpp"
+#include "ttst5_encoder_model.cpp"
+#include "phonemizer.cpp"
+#include "tts_model.cpp"
+#include "kokoro_model.cpp"
+#include "dia_model.cpp"
+#include "orpheus_model.cpp"
+#include "snac_model.cpp"
+#include "general_neural_audio_codec.cpp"
+
enum TTS_VER
{
TTS_VER_2,
@@ -9,8 +9,8 @@ float energy(float * chunk, int count) {
}

void apply_energy_voice_inactivity_detection(
tts_response & data,
float sample_rate,
int ms_per_frame,
int frame_threshold,
float normalized_energy_threshold,
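The signature above outlines an energy-based voice inactivity detector. To make the idea concrete, the following is an illustrative sketch and not code from this commit: per-frame energy is compared against a normalized threshold and the length of the quiet tail is reported. The helper name and trimming policy are assumptions.

#include <algorithm>
#include <vector>

// Illustrative energy-based VAD sketch: count trailing frames whose normalized
// energy stays below a threshold, mirroring the parameters shown above.
static size_t trailing_inactive_samples(const std::vector<float> & samples,
                                        float sample_rate, int ms_per_frame,
                                        int frame_threshold, float normalized_energy_threshold) {
    const size_t frame_size = (size_t)(sample_rate * ms_per_frame / 1000.0f);
    if (frame_size == 0 || samples.size() < frame_size) return 0;

    // per-frame mean squared energy, tracking the peak for normalization
    std::vector<float> energies;
    float peak = 1e-9f;
    for (size_t i = 0; i + frame_size <= samples.size(); i += frame_size) {
        float e = 0.0f;
        for (size_t j = i; j < i + frame_size; j++) e += samples[j] * samples[j];
        e /= (float) frame_size;
        energies.push_back(e);
        peak = std::max(peak, e);
    }

    // walk backwards over the frames and count the quiet run at the end
    int quiet = 0;
    for (auto it = energies.rbegin(); it != energies.rend(); ++it) {
        if (*it / peak < normalized_energy_threshold) quiet++;
        else break;
    }
    return quiet >= frame_threshold ? (size_t) quiet * frame_size : 0;
}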
File diff suppressed because it is too large
@@ -12,7 +12,7 @@
#include <unordered_map>
#include <map>
#include <unordered_set>
-#include "tokenizer.h"
+#include "ttstokenizer.h"
#include <algorithm>
#include <mutex>

@@ -33,16 +33,16 @@ static const std::unordered_set<std::string> ONE_LETTER_WORDS = {
"i",
};
/*
* The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words
* via several criteria:
* 1. All non-EN-US words have been removed
* 2. All three letter acronyms have been removed (as these lists are used to identify acronyms)
* 3. All archaic, deprecated, or poetic words have been removed.
* 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the
* last 10 years).
*
* After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US
* vernacular but was not identified as of American origin was reintroduced into the sets below.
*/
static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
"ab", "ah", "am", "an", "as", "at", "aw", "ax", "ay", "be", "bo", "br",

@@ -50,7 +50,7 @@ static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
"id", "if", "in", "is", "it", "la", "lo", "ma", "me", "mm", "my", "na",
"no", "of", "oh", "oi", "on", "oo", "or", "ow", "ox", "oy", "pa", "qi",
"re", "sh", "so", "to", "uh", "um", "un", "up", "us", "we", "wo", "ya",
"ye", "yo",
};
static const std::unordered_set<std::string> THREE_LETTER_WORDS = {
"aah", "abs", "aby", "ace", "ach", "ack", "act", "add", "ado", "ads", "aft", "age",

@@ -292,7 +292,7 @@ static std::string STOPPING_TOKENS = ".,:;!?";

#ifdef ESPEAK_INSTALL
/**
* espeak-ng uses globals to persist and manage its state so it is not compatible with
* threaded parallelism (https://github.com/espeak-ng/espeak-ng/issues/1527).
* This singleton acts as a mutex wrapped provider for all espeak phonemization methods such
* that multiple instances of the kokoro_runner can be initialized and called in parallel.
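The comment in the hunk above explains that espeak-ng keeps its state in globals and therefore has to be called behind a mutex. A minimal sketch of that mutex-wrapped singleton pattern, with hypothetical names rather than the actual class shipped in this commit, looks like:

#include <mutex>
#include <string>

// Hedged sketch of a mutex-wrapped espeak provider; the real class in the
// repository may differ in naming and in the espeak-ng calls it makes.
class espeak_provider {
public:
    static espeak_provider & instance() {
        static espeak_provider inst;   // initialized once, thread-safe since C++11
        return inst;
    }

    // All espeak-backed phonemization funnels through one lock because
    // espeak-ng is not reentrant.
    std::string phonemize(const std::string & text) {
        std::lock_guard<std::mutex> guard(lock);
        return call_espeak(text);      // placeholder for the actual espeak-ng calls
    }

private:
    espeak_provider() = default;
    std::string call_espeak(const std::string & text) { return text; } // stub
    std::mutex lock;
};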
@@ -323,7 +323,7 @@ public:
#endif

enum lookup_code {
-SUCCESS = 100,
+SUCCESS_TOTAL = 100,
SUCCESS_PARTIAL = 101,
FAILURE_UNFOUND = 200,
FAILURE_PHONETIC = 201,

@@ -368,7 +368,7 @@ struct conditions {
void update_for_word(std::string word,bool allow_for_upper_check = true);
};

/*
* The corpus struct is simply a small wrapper class that is used to perform simple look forward and backwards in the text
* which is being phonemized. This can be used to discern how to convert chunks of text in a consistent and protective fashion
* in order to accurately phonemize complicated text.

@@ -376,7 +376,7 @@ struct conditions {
struct corpus {
corpus(const char * text, size_t size): size(size), text(text) {};
size_t location = 0;
size_t size;
const char * text;

/*

@@ -397,9 +397,9 @@ struct corpus {
std::string after_until(int after, std::string val);
};

/*
* The TTS phonemizer works by splitting each word into distinct graphemes, and for each grapheme the phonemizer will look at the grapheme that came
* before, after, and for any word specific exceptions in order to compile a
*/
struct phonemizer_rule {
~phonemizer_rule() {

@@ -436,10 +436,10 @@ private:

struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta);

/*
* The general translation approach that espeak uses is to lookup words in the dictionary and return a list of possible matches per lookup.
* Each match contains flags which describe the match's conditions and limitations and optionally a pronunciation. When a pronunciation is not returned,
* it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a
* token representation of a different word (e.g. with numbers).
*
* Since it does not make sense to have the core lexer reperform this lookup operation with represented words or via distinct languages, those behaviors

@@ -470,7 +470,7 @@ struct phoneme_dictionary {

struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);

/*
* In general, I would like to avoid requiring the installation of otherwise broad and technically complicated libraries,
* like espeak, especially when they are only being used for a small portion of their overall functionality. While avoiding these
* requirements will keep the default installation cost of TTS.cpp down, it is also unlikely that TTS.cpp will support

@@ -478,8 +478,8 @@ struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);
* espeak. As such, the phonemizer struct described below will support simple text to IPA phoneme functionality out of the box,
* while also optionally acting as an interface for espeak phonemization.
*
* Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context
* views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves
* effecively like a simple router lexer. It will only support utf-8 encoded text and english IPA conversion.
*/
struct phonemizer {
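To illustrate the "simple router lexer" behaviour described in the comment above, here is a hedged sketch rather than the actual TTS.cpp phonemizer: each word is routed to a dictionary hit first and otherwise falls through to a rule-based conversion. The helper names are assumptions.

#include <sstream>
#include <string>
#include <unordered_map>

// Illustrative only: route each word to a lookup source, falling back from an
// exception dictionary to grapheme rules (here a passthrough stub).
static std::string phonemize_word_stub(const std::string & word,
                                       const std::unordered_map<std::string, std::string> & dict) {
    auto hit = dict.find(word);
    if (hit != dict.end()) {
        return hit->second;          // dictionary pronunciation wins
    }
    return word;                     // fall back to rule-based conversion
}

static std::string phonemize_text_stub(const std::string & text,
                                       const std::unordered_map<std::string, std::string> & dict) {
    std::istringstream in(text);
    std::string word, out;
    while (in >> word) {             // single pass over the text, word by word
        if (!out.empty()) out += ' ';
        out += phonemize_word_stub(word, dict);
    }
    return out;
}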
@@ -28,7 +28,7 @@ const std::map<std::string, tts_arch> SUPPORTED_ARCHITECTURES = {
{ "orpheus", ORPHEUS_ARCH }
};

/// Given a map from keys to values, creates a new map from values to keys
template<typename K, typename V>
static std::map<V, K> reverse_map(const std::map<K, V>& m) {
std::map<V, K> r;
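Only the opening lines of reverse_map appear in this hunk. The following self-contained sketch shows the value-to-key inversion that the doc comment describes, under the assumption that the unshown body is a straightforward loop.

#include <iostream>
#include <map>
#include <string>

// Assumed implementation of the inversion described above; later duplicates of
// a value overwrite earlier keys.
template<typename K, typename V>
static std::map<V, K> reverse_map_sketch(const std::map<K, V> & m) {
    std::map<V, K> r;
    for (const auto & kv : m) {
        r[kv.second] = kv.first;
    }
    return r;
}

int main() {
    std::map<std::string, int> arch = {{"parler", 0}, {"kokoro", 1}};
    for (const auto & kv : reverse_map_sketch(arch)) {
        std::cout << kv.first << " -> " << kv.second << "\n";  // 0 -> parler, 1 -> kokoro
    }
}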
@@ -43,10 +43,10 @@ const std::map<tts_arch, std::string> ARCHITECTURE_NAMES = reverse_map(SUPPORTED
struct generation_configuration {
generation_configuration(
std::string voice = "",
int top_k = 50,
float temperature = 1.0,
float repetition_penalty = 1.0,
bool use_cross_attn = true,
std::string espeak_voice_id = "",
int max_tokens = 0,
float top_p = 1.0,
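Because every constructor parameter shown above carries a default, a caller can override just the leading arguments. A hedged call-site sketch, assuming any parameters beyond top_p are likewise defaulted:

// Hypothetical call site: only voice, top_k and temperature are overridden;
// every other parameter keeps the default shown in the declaration above.
generation_configuration config("af_alloy", 40, 0.9f);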
@@ -22,13 +22,13 @@ struct dac_quantize_layer {
// this struct maintains the static tensors for the dac audio decoder graph.
// As such, this is designed to contain basic configuration and ggml tensor support for DAC.
// The dac_runner describes how the graph is built and run.
struct dac_model : tts_model {
// These configs are essentially built for the 44khZ 8kbps standard DAC model audio encoder and decoder
uint32_t n_layers = 4;
uint32_t n_heads = 9;
uint32_t up_sampling_factor = 512;
uint32_t max_generation_size = 2580;

struct ggml_tensor * in_conv_kernel;
struct ggml_tensor * in_conv_bias;
struct ggml_tensor * out_conv_kernel;

@@ -53,11 +53,11 @@ void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor *
// the context used for running the dac model
struct dac_context : runner_context {
dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {};

struct dac_model * model;

struct ggml_tensor * inp_tokens;

void build_schedule() {
runner_context::build_schedule(model->max_nodes());
}

@@ -85,11 +85,11 @@ struct dac_runner : tts_runner {
}
dac_model * model;
dac_context * dctx;

void init_build() {
tts_runner::init_build(&dctx->buf_compute_meta);
}

void prepare_post_load();
struct ggml_cgraph * build_dac_graph(dac_ubatch & batch);
void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);
@@ -119,7 +119,7 @@ void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * la
set_tensor(layer->self_attn_norm, tensor);
} else if (part == "pre_mlp_norm") {
layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->mlp_norm, tensor);
} else if (part == "pre_ca_norm") {
layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->cross_attn_norm, tensor);

@@ -151,7 +151,7 @@ void dia_model::prep_layers() {
dia_decoder_layer * l = new dia_decoder_layer;
decoder->layers.push_back(l);
}

decoder->embds.reserve((size_t) n_output_heads);
decoder->heads.reserve((size_t) n_output_heads);
for (int i = 0; i < n_output_heads; i++) {

@@ -196,7 +196,7 @@ void dia_model::prep_constants(gguf_context * meta) {
int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads");
if (encoder_attn_heads_key != -1) {
encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key);
}

int head_size_key = gguf_find_key(meta, "dia.attn_head_size");
if (head_size_key != -1) {

@@ -271,7 +271,7 @@ struct dia_context * build_new_dia_context(struct dia_model * model, int n_threa
return dctx;
}

static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
ggml_backend_buffer_type_t buft = nullptr;
// this will only really support cpu or metal for the time being;
if (dctx->backend != nullptr) {

@@ -382,7 +382,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo
struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
for (auto layer : model->encoder->layers) {
struct ggml_tensor * residual = cur;

cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
// self-attention
{

@@ -402,7 +402,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);

// It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension
// then down project back the the encoder embedding dimension.
cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
cur = ggml_mul_mat(ctx, layer->o, cur);
}
@@ -443,10 +443,10 @@ static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct gg
static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
int64_t attn_size = model->head_size * model->decoder_attn_heads;

struct ggml_tensor * k_cache_view =
ggml_view_2d(
ctx, kv->k_l[layer_index], attn_size, 2,
attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index]));

k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);

@@ -461,8 +461,8 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_
struct ggml_tensor * v_cache_view = nullptr;

v_cache_view = ggml_view_2d(
ctx, kv->v_l[layer_index], attn_size, 2,
attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index]));

// Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.

@@ -476,11 +476,11 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_
static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
dia_decoder_layer * layer = model->decoder->layers[layer_index];
struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
ctx,
encoder_hidden_states,
model->encoder_hidden_size,
dctx->prompt_size,
2,
model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0));

struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);

@@ -491,8 +491,8 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

struct ggml_tensor * k_cache_view =
ggml_view_4d(
ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
0);

@@ -504,10 +504,10 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

struct ggml_tensor * v_cache_view =
ggml_view_4d(
ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
0);

ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
@@ -515,11 +515,11 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

static struct ggml_tensor * build_dia_decoder(
ggml_cgraph * gf,
ggml_context * ctx,
dia_model * model,
dia_context * dctx,
dia_kv_cache * cache,
dia_ubatch & batch,
struct ggml_tensor * encoder_hidden_states) {
dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
ggml_set_input(dctx->positions);

@@ -528,7 +528,7 @@ static struct ggml_tensor * build_dia_decoder(
for (int l = 0; l < model->decoder->layers.size(); l++){
dia_decoder_layer * layer = model->decoder->layers[l];
struct ggml_tensor * residual = cur;

cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
// self-attention
{

@@ -546,13 +546,13 @@ static struct ggml_tensor * build_dia_decoder(
0);
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));

struct ggml_tensor * v =
ggml_view_3d(ctx, cache->v_l[l],
model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
0);
v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);

// As noted in the encoder Dia uses the Neo-X protocol for RoPE.
Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);

@@ -583,22 +583,22 @@ static struct ggml_tensor * build_dia_decoder(
build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
}

struct ggml_tensor * cross_k =
ggml_view_4d(
ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
0);
// the double permute operation shouldn't be necessary here, but it seems that currently ggml permute only currently alows for a single
// axis pair to be transposed.
cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));

struct ggml_tensor * cross_v =
ggml_cont(ctx, ggml_view_4d(
ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
0));
@@ -637,10 +637,10 @@ static struct ggml_tensor * build_dia_decoder(
}

void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
// Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
// a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
// generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
// proper adjustments can be perfored at each generation step. This means that we need to pad the end of our tokens to the
// max context size for both the conditional and unconditional sequence.

// if the sentence isn't prepended by dialogue start tokens, [S1] or [S2], then append one.
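The comment above describes byte-level tokenization plus padding of both a conditional and an unconditional sequence to the same length. A rough sketch of that idea follows; the pad token id and context length are placeholders, not Dia's real constants.

#include <cstdint>
#include <string>
#include <vector>

// Illustrative only: turn each character's byte value into a token and pad the
// sequence out to one fixed length.
static std::vector<uint32_t> byte_tokens_padded(const std::string & text,
                                                size_t max_context,
                                                uint32_t pad_token) {
    std::vector<uint32_t> tokens;
    tokens.reserve(max_context);
    for (unsigned char c : text) {
        if (tokens.size() >= max_context) break;
        tokens.push_back((uint32_t) c);        // byte value used directly as the token id
    }
    tokens.resize(max_context, pad_token);      // pad out to the full context
    return tokens;
}

// Usage sketch: the conditional batch gets the text, the unconditional batch an
// empty string, so both end up the same length for the cfg-scale step.
// auto cond   = byte_tokens_padded("[S1] hello there.", 1024, 0);
// auto uncond = byte_tokens_padded("",                  1024, 0);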
@@ -699,7 +699,7 @@ dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
* 1. Dia cleans its output generation by adding the difference between its text based output (its conditional output) and its unconditional output
* to the conditional ouput before sampling. This is why the batch is set to two throughout the graph.
*
* 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
* encoder sequence is always max length.
*/
struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
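Point 1 above is a classifier-free-guidance style cleanup. In a hedged sketch (the guidance scale and exact formula are assumptions, not taken from this commit), the adjustment amounts to pushing the conditional logits away from the unconditional ones before sampling:

// Illustrative sketch of the conditional/unconditional mix described above.
// 'scale' is a placeholder guidance factor.
static void apply_cfg_adjustment(float * cond_logits, const float * uncond_logits,
                                 int n_vocab, float scale) {
    for (int i = 0; i < n_vocab; i++) {
        cond_logits[i] = cond_logits[i] + scale * (cond_logits[i] - uncond_logits[i]);
    }
}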
@@ -716,7 +716,7 @@ struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
ggml_set_name(cur, "decoder_output");
ggml_build_forward_expand(gf, cur);
free_build();

return gf;
}

@@ -758,11 +758,11 @@ int dia_runner::decode(dia_ubatch & batch) {
dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
}
ggml_backend_sched_reset(dctx->sched);

const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
const size_t new_size = logits_size * sizeof(float);

if (!dctx->buf_output || prev_size < new_size) {
if (dctx->buf_output) {
ggml_backend_buffer_free(dctx->buf_output);

@@ -772,7 +772,7 @@ int dia_runner::decode(dia_ubatch & batch) {

dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
}

dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);

ggml_cgraph * gf = build_dia_graph(batch);

@@ -817,7 +817,7 @@ bool dia_runner::check_stopping(dia_ubatch & batch) {
if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
dctx->delay_steps = model->max_delay;
}

if (dctx->delay_steps > 0) {
int step_after_eos = model->max_delay - dctx->delay_steps;
for (int i = 0; i < model->delay_pattern.size(); i++) {

@@ -907,5 +907,5 @@ void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
dac_runner->model->assign_weight(name.substr(14), tensor);
} else {
model->assign_weight(name, tensor);
}
}
@@ -1,7 +1,7 @@
#pragma once

#include "dac_model.h"
-#include "sampler.h"
+#include "ttssampler.h"

struct dia_encoder_layer {
struct ggml_tensor * k;

@@ -22,7 +22,7 @@ struct dia_decoder_layer {
struct ggml_tensor * self_attn_v;
struct ggml_tensor * self_attn_o;
struct ggml_tensor * self_attn_norm;

struct ggml_tensor * cross_attn_k;
struct ggml_tensor * cross_attn_q;
struct ggml_tensor * cross_attn_v;

@@ -76,7 +76,7 @@ struct dia_model : tts_model {

dia_encoder * encoder;
dia_decoder * decoder;

void assign_weight(std::string name, ggml_tensor * tensor);
void assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
void assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);

@@ -103,15 +103,15 @@ struct dia_context : runner_context {
uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model.

std::vector<uint32_t> output_tokens;
struct dia_model * model;

struct ggml_tensor * inp_tokens;
struct ggml_tensor * audio_inp_tokens;
struct ggml_tensor * positions;
struct ggml_tensor * encode_positions;
struct ggml_tensor * encode_attn_mask;
struct ggml_tensor * cross_attn_mask;

void build_schedule() {
runner_context::build_schedule(model->max_nodes());
}

@@ -126,11 +126,11 @@ struct dia_kv_cache {

std::vector<struct ggml_tensor *> k_l;
std::vector<struct ggml_tensor *> v_l;

struct ggml_context * ctx;
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_t buf;

void free() {
ggml_free(ctx);
ggml_backend_buffer_free(buf);
@@ -53,7 +53,7 @@ namespace general_neural_audio_codec {

uint32_t padding;
uint32_t stride;

std::vector<residual_unit> residual_blocks;
};
@@ -3,11 +3,11 @@

#include <stdlib.h>
#include "tts_model.h"
-#include "tokenizer.h"
+#include "ttstokenizer.h"
#include "phonemizer.h"

// Rather than using ISO 639-2 language codes, Kokoro voice pack specify their corresponding language via their first letter.
// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
// appropriate phonemization protocol can inferred from the Kokoro voice.
static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
{'a', "gmw/en-US"},
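To show how a first-letter keyed map like the one above can resolve the espeak-ng voice for a Kokoro voice pack name, here is an illustrative sketch; the lookup helper and its fallback are assumptions, not code from this hunk.

#include <map>
#include <string>

// Sketch: resolve an espeak-ng voice id from a Kokoro voice name such as
// "af_alloy" by looking at its first character.
static std::string espeak_id_for_voice_sketch(const std::string & voice,
                                              const std::map<char, std::string> & lang_map) {
    if (!voice.empty()) {
        auto it = lang_map.find(voice[0]);   // 'a' -> "gmw/en-US" in the map above
        if (it != lang_map.end()) {
            return it->second;
        }
    }
    return "gmw/en-US";                      // assumed default, not from the diff
}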
@@ -22,7 +22,7 @@ static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
};

struct lstm_cell {
std::vector<ggml_tensor*> weights;
std::vector<ggml_tensor*> biases;
std::vector<ggml_tensor*> reverse_weights;
std::vector<ggml_tensor*> reverse_biases;

@@ -197,8 +197,8 @@ struct kokoro_model : tts_model {
// standard configuration for duration prediction
uint32_t f0_n_blocks = 3;
uint32_t n_duration_prediction_layers = 3;
// while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to
// allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each
// allocation increases node allocation size by O(N)
uint32_t max_duration_per_token = 20;
uint32_t style_half_size = 128;

@@ -221,7 +221,7 @@ struct kokoro_model : tts_model {
float noise_std = 0.003f;
float voice_threshold = 10.0f;
float sample_rate = 24000.0f;
std::string window = "hann";

// It is really annoying that ggml doesn't allow using non ggml tensors as the operator for simple math ops.
// This is just the constant defined above as a tensor.

@@ -259,7 +259,7 @@ struct kokoro_model : tts_model {
// Decoding and Generation portion of the model
struct kokoro_decoder * decoder;

// the default hidden states need to be initialized
std::vector<lstm*> lstms;

size_t duration_node_counter = 0;

@@ -317,15 +317,15 @@ struct kokoro_duration_context : runner_context {
~kokoro_duration_context() {
ggml_backend_buffer_free(buf_len_output);
}

std::string voice = "af_alloy";
struct kokoro_model * model;
ggml_backend_buffer_t buf_len_output = nullptr;


size_t logits_size = 0; // capacity (of floats) for logits
float * lens = nullptr;

struct ggml_tensor * inp_tokens;
struct ggml_tensor * positions;
struct ggml_tensor * attn_mask;

@@ -356,7 +356,7 @@ struct kokoro_duration_response {
};

// This struct is intended to manage graph and compute for the duration prediction portion of the kokoro model.
// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
// support the tensor dependent views that would otherwise be necessary.
struct kokoro_duration_runner : tts_runner {
kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};

@@ -375,7 +375,7 @@ struct kokoro_duration_runner : tts_runner {
void init_build() {
tts_runner::init_build(&kctx->buf_compute_meta);
}

void prepare_post_load();
struct kokoro_ubatch build_worst_case_batch();
void set_inputs(kokoro_ubatch & batch);

@@ -397,7 +397,7 @@ struct kokoro_context : runner_context {
}

std::string voice = "af_alloy";

struct kokoro_model * model;

uint32_t total_duration;

@@ -408,7 +408,7 @@ struct kokoro_context : runner_context {
struct ggml_tensor * duration_mask;
struct ggml_tensor * window_sq_sum; // needs to be calculatd from the generator window.
struct ggml_tensor * uv_noise_data;

void build_schedule() {
runner_context::build_schedule(model->max_gen_nodes()*30);
}
@ -150,7 +150,7 @@ orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads
|
||||||
return octx;
|
return octx;
|
||||||
}
|
}
|
||||||
|
|
||||||
void orpheus_runner::orpheus_kv_cache_init() {
|
void orpheus_runner::orpheus_kv_cache_init() {
|
||||||
ggml_backend_buffer_type_t buft = nullptr;
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
if (octx->backend != nullptr) {
|
if (octx->backend != nullptr) {
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
|

@@ -192,21 +192,21 @@ void orpheus_runner::orpheus_kv_cache_init() {
}

void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) {
    k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
        model->head_size, 2, 0, 500000.0f,
        1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

    // To better optimize this function, a performance comparison is still needed between this method (performing 3 incremental copy operations
    // to achieve repeat_interleave) and performing the repeat operation upfront followed by a single copy.
    // Additionally, it might be more performant to transpose the values before appending them to the cache, as that would save us
    // from incrementally larger transpositions as generation proceeds.
    for (int i = 0; i < repeat; i++) {
        struct ggml_tensor * k_cache_view = ggml_view_3d(
            ctx,
            kv_self->k_l[index],
            model->head_size,
            model->n_kv_attn_heads,
            n_tokens,
            ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
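As a reference point for the comparison the comment above asks for, here is what repeat_interleave means for the K heads, written as a standalone sketch in plain C++ (illustrative only, not the ggml-based implementation above): each of the n_kv_heads rows is duplicated `repeat` times in place, so the cache ends up with n_kv_heads * repeat head slots per token.

// Plain C++ sketch of repeat_interleave over KV heads (hypothetical helper, not part of koboldcpp).
#include <vector>

// rows: n_kv_heads entries, each holding head_size floats. Result: n_kv_heads * repeat rows,
// where input row j appears at output rows j*repeat .. j*repeat + repeat - 1.
static std::vector<std::vector<float>> repeat_interleave_heads(const std::vector<std::vector<float>> & rows, int repeat) {
    std::vector<std::vector<float>> out;
    out.reserve(rows.size() * repeat);
    for (const auto & row : rows) {
        for (int i = 0; i < repeat; i++) {
            out.push_back(row); // the i-th duplicate corresponds to the i-th strided copy in the loop above
        }
    }
    return out;
}

int main() {
    std::vector<std::vector<float>> kv_rows = {{1.0f, 2.0f}, {3.0f, 4.0f}}; // 2 kv heads, head_size 2
    return repeat_interleave_heads(kv_rows, 3).size() == 6 ? 0 : 1;         // 6 head slots after repeating
}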

@@ -230,19 +230,19 @@ void orpheus_runner::orpheus_kv_cache_init() {
struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens;
    octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(octx->positions);
    octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(octx->inp_tokens);
    inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens);

    struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch);

    for (int l = 0; l < model->n_layers; l++) {
        struct ggml_tensor * residual = inpL;
        cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm);

@@ -261,8 +261,8 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
            model->head_size, full_sequence_length, model->n_attn_heads,
            ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size,
            ggml_element_size(kv_self->k_l[l]) * model->head_size,
            0));

        struct ggml_tensor * v =
            ggml_view_2d(ctx, kv_self->v_l[l],
                model->hidden_size, full_sequence_length,

@@ -272,7 +272,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
        v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads);

        Qcur = ggml_rope_ext(
            ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
            octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta
            1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

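Both ggml_rope_ext calls above pass 500000.0f as the rope theta (the llama-3 style base frequency). For readers unfamiliar with the parameter, the sketch below shows the underlying formula in plain C++ as a generic illustration of rotary position embedding, not of the ggml implementation or its exact pairing convention: each dimension pair is rotated by an angle pos * theta^(-2i/head_size), so a larger theta stretches the usable context.

// Standalone illustration of rotary position embedding for one attention head (assumed standard formula, not ggml code).
#include <cmath>
#include <cstdio>
#include <vector>

static void apply_rope(std::vector<float> & head, int pos, float theta /* e.g. 500000.0f */) {
    const int head_size = (int) head.size();
    for (int j = 0; j + 1 < head_size; j += 2) {
        const float inv_freq = std::pow(theta, -(float) j / (float) head_size); // theta^(-2i/d) for the i-th dimension pair
        const float angle    = (float) pos * inv_freq;
        const float x0 = head[j];
        const float x1 = head[j + 1];
        head[j]     = x0 * std::cos(angle) - x1 * std::sin(angle);
        head[j + 1] = x0 * std::sin(angle) + x1 * std::cos(angle);
    }
}

int main() {
    std::vector<float> head(8, 1.0f);
    apply_rope(head, /*pos=*/3, /*theta=*/500000.0f);
    printf("%f %f\n", head[0], head[1]);
    return 0;
}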

@@ -286,7 +286,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
        }

        cur = ggml_add(ctx, attn_out, residual);

        struct ggml_tensor * residualffn = cur;

        // mlp

@@ -298,7 +298,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
        cur = ggml_add(ctx, cur, residualffn);
        inpL = cur;
    }

    cur = orpheus_build_layer_norm(ctx, cur, model->output_norm);
    // only about 40k of the output head is actually used for generation purposes. Ideally the head tensor should be shrunk and the sampled token ids incremented accordingly.
    cur = ggml_mul_mat(ctx, model->head, cur);
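The comment above suggests an optimization that is easy to misread, so here is a small sketch of the idea under stated assumptions (the offset, the example base id, and the helper name are hypothetical, not values taken from the model): if only a contiguous slice of the head's vocabulary can ever be produced, the matmul could be run against that slice only, and the sampled index shifted back into the full vocabulary id space.

// Hypothetical illustration of sampling over a shrunken output head; not how koboldcpp currently does it.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// logits_slice holds scores for token ids [base_id, base_id + logits_slice.size()).
static uint32_t argmax_with_offset(const std::vector<float> & logits_slice, uint32_t base_id) {
    const auto it = std::max_element(logits_slice.begin(), logits_slice.end());
    return base_id + (uint32_t) std::distance(logits_slice.begin(), it); // shift back into full-vocab ids
}

int main() {
    std::vector<float> slice = {0.1f, 2.5f, -1.0f};
    // With a hypothetical base_id of 100000, the winning slice index 1 maps to token 100001.
    printf("%u\n", argmax_with_offset(slice, 100000u));
    return 0;
}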

@@ -307,15 +307,15 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
    }
    ggml_build_forward_expand(gf, cur);
    free_build();

    return gf;
}

void orpheus_runner::decode(orpheus_ubatch & batch) {
    ggml_backend_sched_reset(octx->sched);

    octx->output_tokens.reserve(model->max_generation_size);

    const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float);
    octx->prep_output_buffer(new_size);

@@ -324,10 +324,10 @@ void orpheus_runner::decode(orpheus_ubatch & batch) {
    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    ggml_backend_sched_alloc_graph(octx->sched, gf);

    set_inputs(batch);
    ggml_backend_sched_graph_compute_async(octx->sched, gf);

    float * logits_out = octx->logits + octx->n_outputs * model->vocab_size;
    octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float));


@@ -1,7 +1,7 @@
#pragma once

-#include "sampler.h"
+#include "ttssampler.h"
-#include "tokenizer.h"
+#include "ttstokenizer.h"
#include "snac_model.h"

// Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads.
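The kv-head remark above is grouped-query attention: fewer K/V heads than query heads, with each K/V head shared by a group of queries. A tiny sketch of the mapping in plain C++, with assumed example head counts rather than values read from the model:

// Illustrative grouped-query attention head mapping (example counts are assumptions, not model constants).
#include <cstdio>

int main() {
    const int n_attn_heads    = 24; // query heads (assumed example)
    const int n_kv_attn_heads = 8;  // key/value heads (assumed example)
    const int repeat          = n_attn_heads / n_kv_attn_heads; // how often each K/V head is reused

    for (int q = 0; q < n_attn_heads; q++) {
        const int kv = q / repeat; // query head q attends against K/V head kv
        if (q % repeat == 0) {
            printf("kv head %d serves query heads %d..%d\n", kv, q, q + repeat - 1);
        }
    }
    return 0;
}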

@@ -73,7 +73,7 @@ struct orpheus_context : runner_context {
    struct ggml_tensor * positions;
};

struct orpheus_kv_cache {
    ggml_type cache_type = GGML_TYPE_F32;

    std::vector<struct ggml_tensor *> k_l;

@@ -104,11 +104,11 @@ struct orpheus_ubatch {

struct orpheus_runner : tts_runner {
    orpheus_runner(
        orpheus_model * model,
        snac_runner * audio_decoder,
        orpheus_context * octx,
        bpe_tokenizer * bt,
        sampler * samp,
        orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) {
        tts_runner::sampling_rate = 24000.0f;
        generation_sampler->n_output_heads = 1;

@@ -2,8 +2,8 @@
#define parler_model_h

#include "dac_model.h"
-#include "t5_encoder_model.h"
+#include "ttst5_encoder_model.h"
-#include "sampler.h"
+#include "ttssampler.h"

enum parler_tensor {
    PARLER_EMBD,

@@ -38,17 +38,17 @@ struct parler_layer {
    struct ggml_tensor * self_attn_o_proj;
    struct ggml_tensor * self_attn_norm;
    struct ggml_tensor * self_attn_norm_bias;

    struct ggml_tensor * attn_k_proj;
    struct ggml_tensor * attn_q_proj;
    struct ggml_tensor * attn_v_proj;
    struct ggml_tensor * attn_o_proj;
    struct ggml_tensor * attn_norm;
    struct ggml_tensor * attn_norm_bias;

    struct ggml_tensor * cross_k;
    struct ggml_tensor * cross_v;

    struct ggml_tensor * fc1;
    struct ggml_tensor * fc2;
    struct ggml_tensor * final_norm;

@@ -74,18 +74,18 @@ struct parler_tts_model : tts_model {
    uint32_t prompt_vocab_size;

    bool use_cross_attn = true;

    std::vector<struct ggml_tensor*> embds;
    std::vector<parler_layer*> layers;
    std::vector<struct ggml_tensor*> heads;

    struct ggml_tensor * precomputed_input_emb;
    struct ggml_tensor * precomputed_positional_embds;

    struct ggml_tensor * layer_norm;
    struct ggml_tensor * layer_norm_bias;
    struct ggml_tensor * prompt_embd;

    void assign_weight(std::string name, ggml_tensor * tensor);
    void prep_constants(gguf_context * meta);
    void prep_layers(gguf_context * meta);

@@ -107,21 +107,21 @@ struct parler_context : runner_context {
    std::vector<bool> eos_seen;

    bool use_cache = true;

    size_t output_size = 0; // capacity (of token positions) for the output buffers
    int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
    uint32_t current_position = 0; // current position in the active sequence
    uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating)
    int32_t seq_id; // a unique identifier associated with the active sequence.

    std::vector<uint32_t> output_tokens;

    struct ggml_tensor * inp_tokens;
    struct ggml_tensor * audio_inp_tokens;
    struct ggml_tensor * positions;
    struct ggml_tensor * attn_mask;
    struct ggml_tensor * attn_mask_cross;

    void build_schedule() {
        runner_context::build_schedule(model->max_nodes());
    }

@@ -130,17 +130,17 @@ struct parler_context : runner_context {

struct parler_kv_cache {
    int32_t seq_id;

    ggml_type type_k = GGML_TYPE_F32;
    ggml_type type_v = GGML_TYPE_F32;

    std::vector<struct ggml_tensor *> k_l;
    std::vector<struct ggml_tensor *> v_l;

    struct ggml_context * ctx;
    ggml_backend_buffer_type_t buft;
    ggml_backend_buffer_t buf;

    void free() {
        ggml_free(ctx);
        ggml_backend_buffer_free(buf);

@@ -152,8 +152,8 @@ struct parler_kv_cache {
};

struct parler_ubatch {
    parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length,
        uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order,
        int current_step): audio_generation(audio_generation), n_tokens(n_tokens), n_audio_tokens(n_audio_tokens), sequence_length(sequence_length), tokens(tokens), audio_tokens(audio_tokens), positions(positions), true_order(true_order), current_step(current_step) {};
    parler_ubatch() {};
    bool audio_generation; // whether we are receiving codebook decoded tokens or text tokens

@@ -543,7 +543,7 @@ dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string valu
    }
    std::vector<dictionary_response*> possibilities = lookup_map.at(value);
    for (auto possible : possibilities) {
-        if (possible->code == SUCCESS || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
+        if (possible->code == SUCCESS_TOTAL || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
            return possible;
        }
    }

@@ -818,7 +818,7 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor
        output->append(" ");
    }
    flags->update_for_word(word);
-    if (response->code != SUCCESS) {
+    if (response->code != SUCCESS_TOTAL) {
        word += response->after_match;
        output->append(response->value);
        text->size_pop(word.size()+unaccented_size_difference);

@@ -1072,7 +1072,7 @@ dictionary_response * response_from_string(std::string value, std::string key) {
    bool not_at_start = key[0] == '#';
    bool not_at_end = key.back() == '#';
    if (!has_spacing) {
-        dictionary_response * resp = new dictionary_response(SUCCESS, value);
+        dictionary_response * resp = new dictionary_response(SUCCESS_TOTAL, value);
        resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number;
        resp->not_at_clause_end = not_at_end;
        resp->not_at_clause_start = not_at_start;

@@ -4,7 +4,7 @@

// SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC.
// The key differences are that it uses grouping in the residual units of its layers,
// performs a repeat_interleave over the second and third input channels, applies
// a noise convolutional layer after input encoding for each layer, and applies
// an extra convolutional layer before residual layers are applied.
struct snac_model : tts_model {
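The repeat_interleave mentioned for the second and third input channels is the usual way multi-scale codebooks are brought to a common frame rate before decoding. The sketch below is a generic illustration in plain C++ (the upsampling factor is an assumption, not the actual SNAC layout): a coarser codebook level is stretched by repeating each code in place so all levels line up element-for-element.

// Generic repeat_interleave along the time axis (assumed upsampling factor; illustrative only).
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint32_t> repeat_interleave(const std::vector<uint32_t> & codes, int factor) {
    std::vector<uint32_t> out;
    out.reserve(codes.size() * factor);
    for (uint32_t c : codes) {
        out.insert(out.end(), factor, c); // each coarse code covers `factor` fine-grained frames
    }
    return out;
}

int main() {
    std::vector<uint32_t> coarse = {7, 9};                            // coarse level, 2 frames
    std::vector<uint32_t> stretched = repeat_interleave(coarse, 2);   // -> {7, 7, 9, 9}, matching a 4-frame level
    printf("%zu\n", stretched.size());
    return 0;
}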

@@ -19,7 +19,7 @@ struct snac_model : tts_model {
    uint32_t noise_steps[4] = {8, 64, 256, 512};
    uint32_t noise_steps_sum = 840;
    bool use_noise = true;

    struct ggml_tensor * repeat_interleave_buffer;

    struct ggml_tensor * in_conv_kernel;

@@ -46,12 +46,12 @@ struct snac_model : tts_model {
// the context used for running the snac model
struct snac_context : runner_context {
    snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {};

    struct snac_model * model;

    struct ggml_tensor * inp_tokens;
    struct ggml_tensor * noise;

    void build_schedule() {
        runner_context::build_schedule(model->max_nodes());
    }

@@ -74,11 +74,11 @@ struct snac_runner : tts_runner {
    }
    snac_model * model;
    snac_context * sctx;

    void init_build() {
        tts_runner::init_build(&sctx->buf_compute_meta);
    }

    void set_inputs(std::vector<std::vector<uint32_t>> & tokens);
    void prepare_post_load();
    struct ggml_cgraph * build_snac_graph(size_t sequence_length);

@@ -1,4 +1,4 @@
-#include "sampler.h"
+#include "ttssampler.h"

void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
    // assume that we are pointing to the start of the first token output;

@@ -6,7 +6,7 @@ void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
        return max(logits, output_tokens);
    }
    std::vector<uint32_t> max_vals;
    // the max_head_probs variable is used when top-p is applied; it exists to address the case in which top-k and top-p cause the cumulative probability
    // of the nucleus to be less than or equal to top_p;
    std::vector<float> max_head_probs;

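The max_head_probs comment refers to an edge case in how top-k and top-p interact. As general background for readers, the sketch below shows generic top-p (nucleus) filtering in plain C++; it is standard sampling background, not the sampler's exact handling of that edge case.

// Generic top-p (nucleus) filtering for reference (illustrative helper, not the koboldcpp sampler).
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

static std::vector<size_t> top_p_filter(const std::vector<float> & probs, float top_p) {
    // rank candidate indices by descending probability
    std::vector<size_t> order(probs.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { return probs[a] > probs[b]; });

    std::vector<size_t> kept;
    float cumulative = 0.0f;
    for (size_t idx : order) {
        kept.push_back(idx);            // the most probable token is always kept, so the set is never empty
        cumulative += probs[idx];
        if (cumulative >= top_p) break; // stop once the nucleus reaches the requested probability mass
    }
    return kept;
}

int main() {
    std::vector<float> probs = {0.05f, 0.7f, 0.25f};
    std::vector<size_t> nucleus = top_p_filter(probs, 0.9f);
    printf("kept %zu of %zu tokens\n", nucleus.size(), probs.size());
    return 0;
}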

@@ -189,7 +189,7 @@ void sampler::max(float * logits, std::vector<uint32_t> & output_tokens) {
    uint32_t token_id = 0;
    for (uint32_t ii = 0; ii < vocab_size; ii++) {
        float v = *(logits+i*vocab_size+ii);
        // while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of
        // the softmax function in which case it is possible for repetition counts to be set.
        if (has_repetition_penalty && last_token_ids[i] == ii) {
            v /= (pow(repetition_penalty, repetition_counts[i]));
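The division by pow(repetition_penalty, repetition_counts[i]) above is an exponential repetition penalty: a token's score shrinks geometrically with how often it has already been emitted. A standalone sketch of the same expression in plain C++, with the simplifying assumption that scores are positive (handling of negative logits is out of scope here):

// Sketch of the penalty applied above: penalised score = v / penalty^count (assumes positive scores).
#include <cmath>
#include <cstdio>

static float apply_repetition_penalty(float v, float penalty, unsigned count) {
    return v / std::pow(penalty, (float) count);
}

int main() {
    const float penalty = 1.3f;
    for (unsigned count = 0; count <= 3; count++) {
        printf("count=%u -> %f\n", count, apply_repetition_penalty(2.0f, penalty, count));
    }
    return 0;
}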

@@ -21,7 +21,7 @@ struct sampler {
    std::vector<uint32_t> repetition_counts;
    bool do_sample = true;
    bool apply_softmax = true;

    void sample(float * logits, std::vector<uint32_t> & output_tokens);
    void softmax(float * logits, std::vector<std::vector<size_t>> picks, std::vector<uint32_t> max_indices);
    void max(float * logits, std::vector<uint32_t> & output_tokens);

@@ -1,4 +1,4 @@
-#include "t5_encoder_model.h"
+#include "ttst5_encoder_model.h"

static const std::map<std::string, t5_tensor> T5_TENSOR_GGUF_LOOKUP = {
    {"t5encoder.token_embd", T5_EMBD},

@@ -139,7 +139,7 @@ void t5_encoder::prep_constants(gguf_context * meta) {
    int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id");
    if (bos_token_id_key != -1) {
        bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
    }

    int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
    if (eos_token_id_key != -1) {

@@ -219,7 +219,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    //ggml_set_input(t5ctx->positions);

@@ -233,7 +233,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {

    struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch);
    struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias);

    for (int l = 0; l < model->n_layers; l++) {
        struct ggml_tensor * residual = inpL;

@@ -293,7 +293,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
    ggml_build_forward_expand(gf, cur);

    free_build();

    return gf;
}


@@ -312,7 +312,7 @@ void t5_runner::set_inputs(t5_ubatch & batch) {
        for (int ii = 0; ii < batch.n_tokens; ii++) {
            int ab_rpos = abs(i - ii);
            int rpos = i - ii;
            attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f;
            pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact))));
        }
    }

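The pos_bucket expression above is T5-style relative position bucketing pulled into one line. Unpacked into a standalone helper it is easier to read; the sketch below mirrors the structure of the same formula, with the assumptions that max_exact is half the per-direction bucket count and that logarithmic_denominator is log(max_distance / max_exact), which is how T5 usually defines them.

// Unpacked form of the relative position bucket computation shown above.
// n_buckets and max_distance are example values; the constants' exact definitions are assumptions for illustration.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint32_t t5_pos_bucket(int i, int ii, int n_buckets, int max_distance) {
    const int rpos      = i - ii;          // signed relative position
    const int ab_rpos   = std::abs(rpos);
    const int max_exact = n_buckets / 2;   // small distances each get their own bucket
    const double logarithmic_denominator = std::log((double) max_distance / max_exact);

    uint32_t bucket = rpos > 0 ? (uint32_t) n_buckets : 0; // one half of the bucket range per direction
    if (ab_rpos < max_exact) {
        bucket += (uint32_t) ab_rpos;      // exact buckets for nearby positions
    } else {
        // larger distances are binned logarithmically, clamped to the last bucket
        const int log_bucket = max_exact + (int) (std::log((double) ab_rpos / max_exact) / logarithmic_denominator * max_exact);
        bucket += (uint32_t) std::min(n_buckets - 1, log_bucket);
    }
    return bucket;
}

int main() {
    printf("%u %u\n", t5_pos_bucket(0, 5, 32, 128), t5_pos_bucket(5, 0, 32, 128));
    return 0;
}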

@@ -324,10 +324,10 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt
    batch.input_tokens = input_tokens;
    batch.n_tokens = sequence_length;
    ggml_backend_sched_reset(t5ctx->sched);

    const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0;
    const size_t new_size = model->max_context_length * model->output_size * sizeof(float);

    if (!t5ctx->buf_output || prev_size < new_size) {
        if (t5ctx->buf_output) {
            ggml_backend_buffer_free(t5ctx->buf_output);

@@ -337,7 +337,7 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt

        t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size);
    }

    outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output);
    ggml_backend_buffer_clear(t5ctx->buf_output, 0);
    struct ggml_cgraph * gf = NULL;

@@ -2,7 +2,7 @@
#define t5_encoder_model_h

#include "tts_model.h"
-#include "tokenizer.h"
+#include "ttstokenizer.h"


enum t5_tensor {

@@ -75,14 +75,14 @@ void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name,

struct t5_context : runner_context {
    t5_context(t5_encoder * model, int n_threads): runner_context(n_threads), model(model) {};

    struct t5_encoder * model;

    struct ggml_tensor * inp_tokens;
    struct ggml_tensor * positions;
    struct ggml_tensor * attn_mask;
    struct ggml_tensor * inp_pos_bucket;

    void build_schedule() {
        runner_context::build_schedule(model->max_nodes());
    }

@@ -116,7 +116,7 @@ struct t5_runner : tts_runner {
    void init_build() {
        tts_runner::init_build(&t5ctx->buf_compute_meta);
    }

    void prepare_post_load();
    struct t5_ubatch build_worst_case_batch();
    void set_inputs(t5_ubatch & batch);

@@ -1,4 +1,4 @@
-#include "tokenizer.h"
+#include "ttstokenizer.h"

void token_trie::add(const std::string & gram, uint32_t token) {
    _add(gram, token, 0);