standardize tts linting and formatting

This commit is contained in:
Concedo 2025-08-17 14:11:30 +08:00
parent cfc1a0d4ef
commit 9935ac093f
24 changed files with 371 additions and 355 deletions

View file

@ -474,7 +474,7 @@ set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(tts_adapter add_library(tts_adapter
otherarch/tts_adapter.cpp) otherarch/tts_adapter.cpp)
target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./tools ./common) target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/ttscpp/include ./otherarch/ttscpp/src ./tools ./common)
target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)

View file

@ -729,7 +729,7 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ggml/src/ggml-vulkan-shaders.cpp: ggml/src/ggml-vulkan-shaders.cpp:

View file

@ -25,6 +25,22 @@
#define M_PI 3.14159265358979323846 #define M_PI 3.14159265358979323846
#endif #endif
//imports required for tts.cpp to work
#include "tts.cpp"
#include "ttstokenizer.cpp"
#include "ttssampler.cpp"
#include "parler_model.cpp"
#include "dac_model.cpp"
#include "ttsutil.cpp"
#include "ttst5_encoder_model.cpp"
#include "phonemizer.cpp"
#include "tts_model.cpp"
#include "kokoro_model.cpp"
#include "dia_model.cpp"
#include "orpheus_model.cpp"
#include "snac_model.cpp"
#include "general_neural_audio_codec.cpp"
enum TTS_VER enum TTS_VER
{ {
TTS_VER_2, TTS_VER_2,

View file

@ -9,8 +9,8 @@ float energy(float * chunk, int count) {
} }
void apply_energy_voice_inactivity_detection( void apply_energy_voice_inactivity_detection(
tts_response & data, tts_response & data,
float sample_rate, float sample_rate,
int ms_per_frame, int ms_per_frame,
int frame_threshold, int frame_threshold,
float normalized_energy_threshold, float normalized_energy_threshold,

File diff suppressed because it is too large Load diff

View file

@ -12,7 +12,7 @@
#include <unordered_map> #include <unordered_map>
#include <map> #include <map>
#include <unordered_set> #include <unordered_set>
#include "tokenizer.h" #include "ttstokenizer.h"
#include <algorithm> #include <algorithm>
#include <mutex> #include <mutex>
@ -33,16 +33,16 @@ static const std::unordered_set<std::string> ONE_LETTER_WORDS = {
"i", "i",
}; };
/* /*
* The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words * The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words
* via several criteria: * via several criteria:
* 1. All non-EN-US words have been removed * 1. All non-EN-US words have been removed
* 2. All three letter acronyms have been removed (as these lists are used to identify acronyms) * 2. All three letter acronyms have been removed (as these lists are used to identify acronyms)
* 3. All archaic, deprecated, or poetic words have been removed. * 3. All archaic, deprecated, or poetic words have been removed.
* 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the * 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the
* last 10 years). * last 10 years).
* *
* After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US * After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US
* vernacular but was not identified as of American origin was reintroduced into the sets below. * vernacular but was not identified as of American origin was reintroduced into the sets below.
*/ */
static const std::unordered_set<std::string> TWO_LETTER_WORDS = { static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
"ab", "ah", "am", "an", "as", "at", "aw", "ax", "ay", "be", "bo", "br", "ab", "ah", "am", "an", "as", "at", "aw", "ax", "ay", "be", "bo", "br",
@ -50,7 +50,7 @@ static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
"id", "if", "in", "is", "it", "la", "lo", "ma", "me", "mm", "my", "na", "id", "if", "in", "is", "it", "la", "lo", "ma", "me", "mm", "my", "na",
"no", "of", "oh", "oi", "on", "oo", "or", "ow", "ox", "oy", "pa", "qi", "no", "of", "oh", "oi", "on", "oo", "or", "ow", "ox", "oy", "pa", "qi",
"re", "sh", "so", "to", "uh", "um", "un", "up", "us", "we", "wo", "ya", "re", "sh", "so", "to", "uh", "um", "un", "up", "us", "we", "wo", "ya",
"ye", "yo", "ye", "yo",
}; };
static const std::unordered_set<std::string> THREE_LETTER_WORDS = { static const std::unordered_set<std::string> THREE_LETTER_WORDS = {
"aah", "abs", "aby", "ace", "ach", "ack", "act", "add", "ado", "ads", "aft", "age", "aah", "abs", "aby", "ace", "ach", "ack", "act", "add", "ado", "ads", "aft", "age",
@ -292,7 +292,7 @@ static std::string STOPPING_TOKENS = ".,:;!?";
#ifdef ESPEAK_INSTALL #ifdef ESPEAK_INSTALL
/** /**
* espeak-ng uses globals to persist and manage its state so it is not compatible with * espeak-ng uses globals to persist and manage its state so it is not compatible with
* threaded parallelism (https://github.com/espeak-ng/espeak-ng/issues/1527). * threaded parallelism (https://github.com/espeak-ng/espeak-ng/issues/1527).
* This singleton acts as a mutex wrapped provider for all espeak phonemization methods such * This singleton acts as a mutex wrapped provider for all espeak phonemization methods such
* that multiple instances of the kokoro_runner can be initialized and called in parallel. * that multiple instances of the kokoro_runner can be initialized and called in parallel.
@ -323,7 +323,7 @@ public:
#endif #endif
enum lookup_code { enum lookup_code {
SUCCESS = 100, SUCCESS_TOTAL = 100,
SUCCESS_PARTIAL = 101, SUCCESS_PARTIAL = 101,
FAILURE_UNFOUND = 200, FAILURE_UNFOUND = 200,
FAILURE_PHONETIC = 201, FAILURE_PHONETIC = 201,
@ -368,7 +368,7 @@ struct conditions {
void update_for_word(std::string word,bool allow_for_upper_check = true); void update_for_word(std::string word,bool allow_for_upper_check = true);
}; };
/* /*
* The corpus struct is simply a small wrapper class that is used to perform simple look forward and backwards in the text * The corpus struct is simply a small wrapper class that is used to perform simple look forward and backwards in the text
* which is being phonemized. This can be used to discern how to convert chunks of text in a consistent and protective fashion * which is being phonemized. This can be used to discern how to convert chunks of text in a consistent and protective fashion
* in order to accurately phonemize complicated text. * in order to accurately phonemize complicated text.
@ -376,7 +376,7 @@ struct conditions {
struct corpus { struct corpus {
corpus(const char * text, size_t size): size(size), text(text) {}; corpus(const char * text, size_t size): size(size), text(text) {};
size_t location = 0; size_t location = 0;
size_t size; size_t size;
const char * text; const char * text;
/* /*
@ -397,9 +397,9 @@ struct corpus {
std::string after_until(int after, std::string val); std::string after_until(int after, std::string val);
}; };
/* /*
* The TTS phonemizer works by splitting each word into distinct graphemes, and for each grapheme the phonemizer will look at the grapheme that came * The TTS phonemizer works by splitting each word into distinct graphemes, and for each grapheme the phonemizer will look at the grapheme that came
* before, after, and for any word specific exceptions in order to compile a * before, after, and for any word specific exceptions in order to compile a
*/ */
struct phonemizer_rule { struct phonemizer_rule {
~phonemizer_rule() { ~phonemizer_rule() {
@ -436,10 +436,10 @@ private:
struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta); struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta);
/* /*
* The general translation approach that espeak uses is to lookup words in the dictionary and return a list of possible matches per lookup. * The general translation approach that espeak uses is to lookup words in the dictionary and return a list of possible matches per lookup.
* Each match contains flags which describe the match's conditions and limitations and optionally a pronunciation. When a pronunciation is not returned, * Each match contains flags which describe the match's conditions and limitations and optionally a pronunciation. When a pronunciation is not returned,
* it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a * it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a
* token representation of a different word (e.g. with numbers). * token representation of a different word (e.g. with numbers).
* *
* Since it does not make sense to have the core lexer reperform this lookup operation with represented words or via distinct languages, those behaviors * Since it does not make sense to have the core lexer reperform this lookup operation with represented words or via distinct languages, those behaviors
@ -470,7 +470,7 @@ struct phoneme_dictionary {
struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta); struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);
/* /*
* In general, I would like to avoid requiring the installation of otherwise broad and technically complicated libraries, * In general, I would like to avoid requiring the installation of otherwise broad and technically complicated libraries,
* like espeak, especially when they are only being used for a small portion of their overall functionality. While avoiding these * like espeak, especially when they are only being used for a small portion of their overall functionality. While avoiding these
* requirements will keep the default installation cost of TTS.cpp down, it is also unlikely that TTS.cpp will support * requirements will keep the default installation cost of TTS.cpp down, it is also unlikely that TTS.cpp will support
@ -478,8 +478,8 @@ struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);
* espeak. As such, the phonemizer struct described below will support simple text to IPA phoneme functionality out of the box, * espeak. As such, the phonemizer struct described below will support simple text to IPA phoneme functionality out of the box,
* while also optionally acting as an interface for espeak phonemization. * while also optionally acting as an interface for espeak phonemization.
* *
* Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context * Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context
* views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves * views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves
* effectively like a simple router lexer. It will only support UTF-8 encoded text and English IPA conversion. * effectively like a simple router lexer. It will only support UTF-8 encoded text and English IPA conversion.
*/ */
struct phonemizer { struct phonemizer {

View file

@ -28,7 +28,7 @@ const std::map<std::string, tts_arch> SUPPORTED_ARCHITECTURES = {
{ "orpheus", ORPHEUS_ARCH } { "orpheus", ORPHEUS_ARCH }
}; };
/// Given a map from keys to values, creates a new map from values to keys /// Given a map from keys to values, creates a new map from values to keys
template<typename K, typename V> template<typename K, typename V>
static std::map<V, K> reverse_map(const std::map<K, V>& m) { static std::map<V, K> reverse_map(const std::map<K, V>& m) {
std::map<V, K> r; std::map<V, K> r;
@ -43,10 +43,10 @@ const std::map<tts_arch, std::string> ARCHITECTURE_NAMES = reverse_map(SUPPORTED
struct generation_configuration { struct generation_configuration {
generation_configuration( generation_configuration(
std::string voice = "", std::string voice = "",
int top_k = 50, int top_k = 50,
float temperature = 1.0, float temperature = 1.0,
float repetition_penalty = 1.0, float repetition_penalty = 1.0,
bool use_cross_attn = true, bool use_cross_attn = true,
std::string espeak_voice_id = "", std::string espeak_voice_id = "",
int max_tokens = 0, int max_tokens = 0,
float top_p = 1.0, float top_p = 1.0,

View file

@ -22,13 +22,13 @@ struct dac_quantize_layer {
// this struct maintains the static tensors for the dac audio decoder graph. // this struct maintains the static tensors for the dac audio decoder graph.
// As such, this is designed to contain basic configuration and ggml tensor support for DAC. // As such, this is designed to contain basic configuration and ggml tensor support for DAC.
// The dac_runner describes how the graph is built and run. // The dac_runner describes how the graph is built and run.
struct dac_model : tts_model { struct dac_model : tts_model {
// These configs are essentially built for the 44kHz 8kbps standard DAC model audio encoder and decoder // These configs are essentially built for the 44kHz 8kbps standard DAC model audio encoder and decoder
uint32_t n_layers = 4; uint32_t n_layers = 4;
uint32_t n_heads = 9; uint32_t n_heads = 9;
uint32_t up_sampling_factor = 512; uint32_t up_sampling_factor = 512;
uint32_t max_generation_size = 2580; uint32_t max_generation_size = 2580;
struct ggml_tensor * in_conv_kernel; struct ggml_tensor * in_conv_kernel;
struct ggml_tensor * in_conv_bias; struct ggml_tensor * in_conv_bias;
struct ggml_tensor * out_conv_kernel; struct ggml_tensor * out_conv_kernel;
@ -53,11 +53,11 @@ void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor *
// the context used for running the dac model // the context used for running the dac model
struct dac_context : runner_context { struct dac_context : runner_context {
dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {}; dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {};
struct dac_model * model; struct dac_model * model;
struct ggml_tensor * inp_tokens; struct ggml_tensor * inp_tokens;
void build_schedule() { void build_schedule() {
runner_context::build_schedule(model->max_nodes()); runner_context::build_schedule(model->max_nodes());
} }
@ -85,11 +85,11 @@ struct dac_runner : tts_runner {
} }
dac_model * model; dac_model * model;
dac_context * dctx; dac_context * dctx;
void init_build() { void init_build() {
tts_runner::init_build(&dctx->buf_compute_meta); tts_runner::init_build(&dctx->buf_compute_meta);
} }
void prepare_post_load(); void prepare_post_load();
struct ggml_cgraph * build_dac_graph(dac_ubatch & batch); struct ggml_cgraph * build_dac_graph(dac_ubatch & batch);
void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs); void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);

View file

@ -119,7 +119,7 @@ void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * la
set_tensor(layer->self_attn_norm, tensor); set_tensor(layer->self_attn_norm, tensor);
} else if (part == "pre_mlp_norm") { } else if (part == "pre_mlp_norm") {
layer->mlp_norm = ggml_dup_tensor(ctx, tensor); layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->mlp_norm, tensor); set_tensor(layer->mlp_norm, tensor);
} else if (part == "pre_ca_norm") { } else if (part == "pre_ca_norm") {
layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor); layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
set_tensor(layer->cross_attn_norm, tensor); set_tensor(layer->cross_attn_norm, tensor);
@ -151,7 +151,7 @@ void dia_model::prep_layers() {
dia_decoder_layer * l = new dia_decoder_layer; dia_decoder_layer * l = new dia_decoder_layer;
decoder->layers.push_back(l); decoder->layers.push_back(l);
} }
decoder->embds.reserve((size_t) n_output_heads); decoder->embds.reserve((size_t) n_output_heads);
decoder->heads.reserve((size_t) n_output_heads); decoder->heads.reserve((size_t) n_output_heads);
for (int i = 0; i < n_output_heads; i++) { for (int i = 0; i < n_output_heads; i++) {
@ -196,7 +196,7 @@ void dia_model::prep_constants(gguf_context * meta) {
int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads"); int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads");
if (encoder_attn_heads_key != -1) { if (encoder_attn_heads_key != -1) {
encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key); encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key);
} }
int head_size_key = gguf_find_key(meta, "dia.attn_head_size"); int head_size_key = gguf_find_key(meta, "dia.attn_head_size");
if (head_size_key != -1) { if (head_size_key != -1) {
@ -271,7 +271,7 @@ struct dia_context * build_new_dia_context(struct dia_model * model, int n_threa
return dctx; return dctx;
} }
static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) { static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
ggml_backend_buffer_type_t buft = nullptr; ggml_backend_buffer_type_t buft = nullptr;
// this will only really support cpu or metal for the time being; // this will only really support cpu or metal for the time being;
if (dctx->backend != nullptr) { if (dctx->backend != nullptr) {
@ -382,7 +382,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo
struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2); struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
for (auto layer : model->encoder->layers) { for (auto layer : model->encoder->layers) {
struct ggml_tensor * residual = cur; struct ggml_tensor * residual = cur;
cur = dia_layer_norm(ctx, cur, layer->self_attn_norm); cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
// self-attention // self-attention
{ {
@ -402,7 +402,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
// It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension // It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension
// then down project back to the encoder embedding dimension. // then down project back to the encoder embedding dimension.
cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2); cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
cur = ggml_mul_mat(ctx, layer->o, cur); cur = ggml_mul_mat(ctx, layer->o, cur);
} }
@ -443,10 +443,10 @@ static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct gg
static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) { static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
int64_t attn_size = model->head_size * model->decoder_attn_heads; int64_t attn_size = model->head_size * model->decoder_attn_heads;
struct ggml_tensor * k_cache_view = struct ggml_tensor * k_cache_view =
ggml_view_2d( ggml_view_2d(
ctx, kv->k_l[layer_index], attn_size, 2, ctx, kv->k_l[layer_index], attn_size, 2,
attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]), attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index])); attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index]));
k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2); k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
@ -461,8 +461,8 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_
struct ggml_tensor * v_cache_view = nullptr; struct ggml_tensor * v_cache_view = nullptr;
v_cache_view = ggml_view_2d( v_cache_view = ggml_view_2d(
ctx, kv->v_l[layer_index], attn_size, 2, ctx, kv->v_l[layer_index], attn_size, 2,
attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]), attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index])); attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index]));
// Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention. // Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
@ -476,11 +476,11 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_
static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) { static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
dia_decoder_layer * layer = model->decoder->layers[layer_index]; dia_decoder_layer * layer = model->decoder->layers[layer_index];
struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d( struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
ctx, ctx,
encoder_hidden_states, encoder_hidden_states,
model->encoder_hidden_size, model->encoder_hidden_size,
dctx->prompt_size, dctx->prompt_size,
2, 2,
model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0)); model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0));
struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view); struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);
@ -491,8 +491,8 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia
struct ggml_tensor * k_cache_view = struct ggml_tensor * k_cache_view =
ggml_view_4d( ggml_view_4d(
ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size, ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
model->head_size*ggml_element_size(kv->cross_k_l[layer_index]), model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]), model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]), model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
0); 0);
@ -504,10 +504,10 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia
struct ggml_tensor * v_cache_view = struct ggml_tensor * v_cache_view =
ggml_view_4d( ggml_view_4d(
ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2, ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]), model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]), model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]), model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
0); 0);
ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view)); ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
@ -515,11 +515,11 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia
static struct ggml_tensor * build_dia_decoder( static struct ggml_tensor * build_dia_decoder(
ggml_cgraph * gf, ggml_cgraph * gf,
ggml_context * ctx, ggml_context * ctx,
dia_model * model, dia_model * model,
dia_context * dctx, dia_context * dctx,
dia_kv_cache * cache, dia_kv_cache * cache,
dia_ubatch & batch, dia_ubatch & batch,
struct ggml_tensor * encoder_hidden_states) { struct ggml_tensor * encoder_hidden_states) {
dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length); dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
ggml_set_input(dctx->positions); ggml_set_input(dctx->positions);
@ -528,7 +528,7 @@ static struct ggml_tensor * build_dia_decoder(
for (int l = 0; l < model->decoder->layers.size(); l++){ for (int l = 0; l < model->decoder->layers.size(); l++){
dia_decoder_layer * layer = model->decoder->layers[l]; dia_decoder_layer * layer = model->decoder->layers[l];
struct ggml_tensor * residual = cur; struct ggml_tensor * residual = cur;
cur = dia_layer_norm(ctx, cur, layer->self_attn_norm); cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
// self-attention // self-attention
{ {
@ -546,13 +546,13 @@ static struct ggml_tensor * build_dia_decoder(
0); 0);
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));
struct ggml_tensor * v = struct ggml_tensor * v =
ggml_view_3d(ctx, cache->v_l[l], ggml_view_3d(ctx, cache->v_l[l],
model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2, model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size, ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size, ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
0); 0);
v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2); v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);
// As noted in the encoder Dia uses the Neo-X protocol for RoPE. // As noted in the encoder Dia uses the Neo-X protocol for RoPE.
Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2); Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
@ -583,22 +583,22 @@ static struct ggml_tensor * build_dia_decoder(
build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l); build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
} }
struct ggml_tensor * cross_k = struct ggml_tensor * cross_k =
ggml_view_4d( ggml_view_4d(
ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2, ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]), model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]), model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]), model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
0); 0);
// the double permute operation shouldn't be necessary here, but it seems that ggml permute currently only allows for a single // the double permute operation shouldn't be necessary here, but it seems that ggml permute currently only allows for a single
// axis pair to be transposed. // axis pair to be transposed.
cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3)); cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));
struct ggml_tensor * cross_v = struct ggml_tensor * cross_v =
ggml_cont(ctx, ggml_view_4d( ggml_cont(ctx, ggml_view_4d(
ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2, ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]), model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]), model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]), model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
0)); 0));
@ -637,10 +637,10 @@ static struct ggml_tensor * build_dia_decoder(
} }
void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) { void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
// Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as // Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
// a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to // a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
// generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that // generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
// proper adjustments can be performed at each generation step. This means that we need to pad the end of our tokens to the // proper adjustments can be performed at each generation step. This means that we need to pad the end of our tokens to the
// max context size for both the conditional and unconditional sequence. // max context size for both the conditional and unconditional sequence.
// if the sentence isn't prepended by dialogue start tokens, [S1] or [S2], then append one. // if the sentence isn't prepended by dialogue start tokens, [S1] or [S2], then append one.
@ -699,7 +699,7 @@ dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
* 1. Dia cleans its output generation by adding the difference between its text based output (its conditional output) and its unconditional output * 1. Dia cleans its output generation by adding the difference between its text based output (its conditional output) and its unconditional output
* to the conditional output before sampling. This is why the batch is set to two throughout the graph. * to the conditional output before sampling. This is why the batch is set to two throughout the graph.
* *
* 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the * 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
* encoder sequence is always max length. * encoder sequence is always max length.
*/ */
struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) { struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
@ -716,7 +716,7 @@ struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
ggml_set_name(cur, "decoder_output"); ggml_set_name(cur, "decoder_output");
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
free_build(); free_build();
return gf; return gf;
} }
@ -758,11 +758,11 @@ int dia_runner::decode(dia_ubatch & batch) {
dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads); dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
} }
ggml_backend_sched_reset(dctx->sched); ggml_backend_sched_reset(dctx->sched);
const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads; const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0; const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
const size_t new_size = logits_size * sizeof(float); const size_t new_size = logits_size * sizeof(float);
if (!dctx->buf_output || prev_size < new_size) { if (!dctx->buf_output || prev_size < new_size) {
if (dctx->buf_output) { if (dctx->buf_output) {
ggml_backend_buffer_free(dctx->buf_output); ggml_backend_buffer_free(dctx->buf_output);
@ -772,7 +772,7 @@ int dia_runner::decode(dia_ubatch & batch) {
dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size); dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
} }
dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output); dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);
ggml_cgraph * gf = build_dia_graph(batch); ggml_cgraph * gf = build_dia_graph(batch);
@ -817,7 +817,7 @@ bool dia_runner::check_stopping(dia_ubatch & batch) {
if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) { if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
dctx->delay_steps = model->max_delay; dctx->delay_steps = model->max_delay;
} }
if (dctx->delay_steps > 0) { if (dctx->delay_steps > 0) {
int step_after_eos = model->max_delay - dctx->delay_steps; int step_after_eos = model->max_delay - dctx->delay_steps;
for (int i = 0; i < model->delay_pattern.size(); i++) { for (int i = 0; i < model->delay_pattern.size(); i++) {
@ -907,5 +907,5 @@ void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
dac_runner->model->assign_weight(name.substr(14), tensor); dac_runner->model->assign_weight(name.substr(14), tensor);
} else { } else {
model->assign_weight(name, tensor); model->assign_weight(name, tensor);
} }
} }

View file

@ -1,7 +1,7 @@
#pragma once #pragma once
#include "dac_model.h" #include "dac_model.h"
#include "sampler.h" #include "ttssampler.h"
struct dia_encoder_layer { struct dia_encoder_layer {
struct ggml_tensor * k; struct ggml_tensor * k;
@ -22,7 +22,7 @@ struct dia_decoder_layer {
struct ggml_tensor * self_attn_v; struct ggml_tensor * self_attn_v;
struct ggml_tensor * self_attn_o; struct ggml_tensor * self_attn_o;
struct ggml_tensor * self_attn_norm; struct ggml_tensor * self_attn_norm;
struct ggml_tensor * cross_attn_k; struct ggml_tensor * cross_attn_k;
struct ggml_tensor * cross_attn_q; struct ggml_tensor * cross_attn_q;
struct ggml_tensor * cross_attn_v; struct ggml_tensor * cross_attn_v;
@ -76,7 +76,7 @@ struct dia_model : tts_model {
dia_encoder * encoder; dia_encoder * encoder;
dia_decoder * decoder; dia_decoder * decoder;
void assign_weight(std::string name, ggml_tensor * tensor); void assign_weight(std::string name, ggml_tensor * tensor);
void assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name); void assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
void assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name); void assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
@ -103,15 +103,15 @@ struct dia_context : runner_context {
uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model. uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model.
std::vector<uint32_t> output_tokens; std::vector<uint32_t> output_tokens;
struct dia_model * model; struct dia_model * model;
struct ggml_tensor * inp_tokens; struct ggml_tensor * inp_tokens;
struct ggml_tensor * audio_inp_tokens; struct ggml_tensor * audio_inp_tokens;
struct ggml_tensor * positions; struct ggml_tensor * positions;
struct ggml_tensor * encode_positions; struct ggml_tensor * encode_positions;
struct ggml_tensor * encode_attn_mask; struct ggml_tensor * encode_attn_mask;
struct ggml_tensor * cross_attn_mask; struct ggml_tensor * cross_attn_mask;
void build_schedule() { void build_schedule() {
runner_context::build_schedule(model->max_nodes()); runner_context::build_schedule(model->max_nodes());
} }
@ -126,11 +126,11 @@ struct dia_kv_cache {
std::vector<struct ggml_tensor *> k_l; std::vector<struct ggml_tensor *> k_l;
std::vector<struct ggml_tensor *> v_l; std::vector<struct ggml_tensor *> v_l;
struct ggml_context * ctx; struct ggml_context * ctx;
ggml_backend_buffer_type_t buft; ggml_backend_buffer_type_t buft;
ggml_backend_buffer_t buf; ggml_backend_buffer_t buf;
void free() { void free() {
ggml_free(ctx); ggml_free(ctx);
ggml_backend_buffer_free(buf); ggml_backend_buffer_free(buf);

View file

@ -53,7 +53,7 @@ namespace general_neural_audio_codec {
uint32_t padding; uint32_t padding;
uint32_t stride; uint32_t stride;
std::vector<residual_unit> residual_blocks; std::vector<residual_unit> residual_blocks;
}; };

View file

@ -3,11 +3,11 @@
#include <stdlib.h> #include <stdlib.h>
#include "tts_model.h" #include "tts_model.h"
#include "tokenizer.h" #include "ttstokenizer.h"
#include "phonemizer.h" #include "phonemizer.h"
// Rather than using ISO 639-2 language codes, Kokoro voice packs specify their corresponding language via their first letter. // Rather than using ISO 639-2 language codes, Kokoro voice packs specify their corresponding language via their first letter.
// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the // Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
// appropriate phonemization protocol can be inferred from the Kokoro voice. // appropriate phonemization protocol can be inferred from the Kokoro voice.
static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = { static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
{'a', "gmw/en-US"}, {'a', "gmw/en-US"},
@ -22,7 +22,7 @@ static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
}; };
struct lstm_cell { struct lstm_cell {
std::vector<ggml_tensor*> weights; std::vector<ggml_tensor*> weights;
std::vector<ggml_tensor*> biases; std::vector<ggml_tensor*> biases;
std::vector<ggml_tensor*> reverse_weights; std::vector<ggml_tensor*> reverse_weights;
std::vector<ggml_tensor*> reverse_biases; std::vector<ggml_tensor*> reverse_biases;
@ -197,8 +197,8 @@ struct kokoro_model : tts_model {
// standard configuration for duration prediction // standard configuration for duration prediction
uint32_t f0_n_blocks = 3; uint32_t f0_n_blocks = 3;
uint32_t n_duration_prediction_layers = 3; uint32_t n_duration_prediction_layers = 3;
// while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to // while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to
// allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each // allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each
// allocation increases node allocation size by O(N) // allocation increases node allocation size by O(N)
uint32_t max_duration_per_token = 20; uint32_t max_duration_per_token = 20;
uint32_t style_half_size = 128; uint32_t style_half_size = 128;
@ -221,7 +221,7 @@ struct kokoro_model : tts_model {
float noise_std = 0.003f; float noise_std = 0.003f;
float voice_threshold = 10.0f; float voice_threshold = 10.0f;
float sample_rate = 24000.0f; float sample_rate = 24000.0f;
std::string window = "hann"; std::string window = "hann";
// It is really annoying that ggml doesn't allow using non ggml tensors as the operator for simple math ops. // It is really annoying that ggml doesn't allow using non ggml tensors as the operator for simple math ops.
// This is just the constant defined above as a tensor. // This is just the constant defined above as a tensor.
@ -259,7 +259,7 @@ struct kokoro_model : tts_model {
// Decoding and Generation portion of the model // Decoding and Generation portion of the model
struct kokoro_decoder * decoder; struct kokoro_decoder * decoder;
// the default hidden states need to be initialized // the default hidden states need to be initialized
std::vector<lstm*> lstms; std::vector<lstm*> lstms;
size_t duration_node_counter = 0; size_t duration_node_counter = 0;
@ -317,15 +317,15 @@ struct kokoro_duration_context : runner_context {
~kokoro_duration_context() { ~kokoro_duration_context() {
ggml_backend_buffer_free(buf_len_output); ggml_backend_buffer_free(buf_len_output);
} }
std::string voice = "af_alloy"; std::string voice = "af_alloy";
struct kokoro_model * model; struct kokoro_model * model;
ggml_backend_buffer_t buf_len_output = nullptr; ggml_backend_buffer_t buf_len_output = nullptr;
size_t logits_size = 0; // capacity (of floats) for logits size_t logits_size = 0; // capacity (of floats) for logits
float * lens = nullptr; float * lens = nullptr;
struct ggml_tensor * inp_tokens; struct ggml_tensor * inp_tokens;
struct ggml_tensor * positions; struct ggml_tensor * positions;
struct ggml_tensor * attn_mask; struct ggml_tensor * attn_mask;
@ -356,7 +356,7 @@ struct kokoro_duration_response {
}; };
// This struct is intended to manage graph and compute for the duration prediction portion of the kokoro model. // This struct is intended to manage graph and compute for the duration prediction portion of the kokoro model.
// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't // Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
// support the tensor dependent views that would otherwise be necessary. // support the tensor dependent views that would otherwise be necessary.
struct kokoro_duration_runner : tts_runner { struct kokoro_duration_runner : tts_runner {
kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {}; kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
@ -375,7 +375,7 @@ struct kokoro_duration_runner : tts_runner {
void init_build() { void init_build() {
tts_runner::init_build(&kctx->buf_compute_meta); tts_runner::init_build(&kctx->buf_compute_meta);
} }
void prepare_post_load(); void prepare_post_load();
struct kokoro_ubatch build_worst_case_batch(); struct kokoro_ubatch build_worst_case_batch();
void set_inputs(kokoro_ubatch & batch); void set_inputs(kokoro_ubatch & batch);
@ -397,7 +397,7 @@ struct kokoro_context : runner_context {
} }
std::string voice = "af_alloy"; std::string voice = "af_alloy";
struct kokoro_model * model; struct kokoro_model * model;
uint32_t total_duration; uint32_t total_duration;
@ -408,7 +408,7 @@ struct kokoro_context : runner_context {
struct ggml_tensor * duration_mask; struct ggml_tensor * duration_mask;
struct ggml_tensor * window_sq_sum; // needs to be calculated from the generator window. struct ggml_tensor * window_sq_sum; // needs to be calculated from the generator window.
struct ggml_tensor * uv_noise_data; struct ggml_tensor * uv_noise_data;
void build_schedule() { void build_schedule() {
runner_context::build_schedule(model->max_gen_nodes()*30); runner_context::build_schedule(model->max_gen_nodes()*30);
} }

View file

@ -150,7 +150,7 @@ orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads
return octx; return octx;
} }
void orpheus_runner::orpheus_kv_cache_init() { void orpheus_runner::orpheus_kv_cache_init() {
ggml_backend_buffer_type_t buft = nullptr; ggml_backend_buffer_type_t buft = nullptr;
if (octx->backend != nullptr) { if (octx->backend != nullptr) {
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
@ -192,21 +192,21 @@ void orpheus_runner::orpheus_kv_cache_init() {
} }
void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) { void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) {
k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies, k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
model->head_size, 2,0, 500000.0f, model->head_size, 2,0, 500000.0f,
1.0f, 0.0f, 1.0f, 0.0f, 0.0f); 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
// A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave, // A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave,
// and performing the repeat operation upfront before performing a single copy needs to be performed in order to better optimize this function. // and performing the repeat operation upfront before performing a single copy needs to be performed in order to better optimize this function.
// Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us // Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us
// from incrementally larger transpositions with generation. // from incrementally larger transpositions with generation.
for (int i = 0; i < repeat; i++) { for (int i = 0; i < repeat; i++) {
struct ggml_tensor * k_cache_view = ggml_view_3d( struct ggml_tensor * k_cache_view = ggml_view_3d(
ctx, ctx,
kv_self->k_l[index], kv_self->k_l[index],
model->head_size, model->head_size,
model->n_kv_attn_heads, model->n_kv_attn_heads,
n_tokens, n_tokens,
ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat, ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size, ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
@ -230,19 +230,19 @@ void orpheus_runner::orpheus_kv_cache_init() {
struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) { struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) {
init_build(); init_build();
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens; const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens;
octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
ggml_set_input(octx->positions); ggml_set_input(octx->positions);
octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
ggml_set_input(octx->inp_tokens); ggml_set_input(octx->inp_tokens);
inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens); inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens);
struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch); struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch);
for (int l = 0; l < model->n_layers; l++) { for (int l = 0; l < model->n_layers; l++) {
struct ggml_tensor * residual = inpL; struct ggml_tensor * residual = inpL;
cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm); cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm);
@ -261,8 +261,8 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
model->head_size, full_sequence_length, model->n_attn_heads, model->head_size, full_sequence_length, model->n_attn_heads,
ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size, ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size,
ggml_element_size(kv_self->k_l[l]) * model->head_size, ggml_element_size(kv_self->k_l[l]) * model->head_size,
0)); 0));
struct ggml_tensor * v = struct ggml_tensor * v =
ggml_view_2d(ctx, kv_self->v_l[l], ggml_view_2d(ctx, kv_self->v_l[l],
model->hidden_size, full_sequence_length, model->hidden_size, full_sequence_length,
@ -272,7 +272,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads); v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads);
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)), ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta
1.0f, 0.0f, 1.0f, 0.0f, 0.0f); 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
@ -286,7 +286,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
} }
cur = ggml_add(ctx, attn_out, residual); cur = ggml_add(ctx, attn_out, residual);
struct ggml_tensor * residualffn = cur; struct ggml_tensor * residualffn = cur;
// mlp // mlp
@ -298,7 +298,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
cur = ggml_add(ctx, cur, residualffn); cur = ggml_add(ctx, cur, residualffn);
inpL = cur; inpL = cur;
} }
cur = orpheus_build_layer_norm(ctx, cur, model->output_norm); cur = orpheus_build_layer_norm(ctx, cur, model->output_norm);
// only about 40k of the output head is actually used for generation purposes. Ideally the head tensor should be shrunk and sampled tokens should be incremented. // only about 40k of the output head is actually used for generation purposes. Ideally the head tensor should be shrunk and sampled tokens should be incremented.
cur = ggml_mul_mat(ctx, model->head, cur); cur = ggml_mul_mat(ctx, model->head, cur);
@ -307,15 +307,15 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
} }
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
free_build(); free_build();
return gf; return gf;
} }
void orpheus_runner::decode(orpheus_ubatch & batch) { void orpheus_runner::decode(orpheus_ubatch & batch) {
ggml_backend_sched_reset(octx->sched); ggml_backend_sched_reset(octx->sched);
octx->output_tokens.reserve(model->max_generation_size); octx->output_tokens.reserve(model->max_generation_size);
const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float); const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float);
octx->prep_output_buffer(new_size); octx->prep_output_buffer(new_size);
@ -324,10 +324,10 @@ void orpheus_runner::decode(orpheus_ubatch & batch) {
// the output is always the last tensor in the graph // the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
ggml_backend_sched_alloc_graph(octx->sched, gf); ggml_backend_sched_alloc_graph(octx->sched, gf);
set_inputs(batch); set_inputs(batch);
ggml_backend_sched_graph_compute_async(octx->sched, gf); ggml_backend_sched_graph_compute_async(octx->sched, gf);
float * logits_out = octx->logits + octx->n_outputs * model->vocab_size; float * logits_out = octx->logits + octx->n_outputs * model->vocab_size;
octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float)); octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float));

View file

@ -1,7 +1,7 @@
#pragma once #pragma once
#include "sampler.h" #include "ttssampler.h"
#include "tokenizer.h" #include "ttstokenizer.h"
#include "snac_model.h" #include "snac_model.h"
// Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads. // Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads.
@ -73,7 +73,7 @@ struct orpheus_context : runner_context {
struct ggml_tensor * positions; struct ggml_tensor * positions;
}; };
struct orpheus_kv_cache { struct orpheus_kv_cache {
ggml_type cache_type = GGML_TYPE_F32; ggml_type cache_type = GGML_TYPE_F32;
std::vector<struct ggml_tensor *> k_l; std::vector<struct ggml_tensor *> k_l;
@ -104,11 +104,11 @@ struct orpheus_ubatch {
struct orpheus_runner : tts_runner { struct orpheus_runner : tts_runner {
orpheus_runner( orpheus_runner(
orpheus_model * model, orpheus_model * model,
snac_runner * audio_decoder, snac_runner * audio_decoder,
orpheus_context * octx, orpheus_context * octx,
bpe_tokenizer * bt, bpe_tokenizer * bt,
sampler * samp, sampler * samp,
orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) { orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) {
tts_runner::sampling_rate = 24000.0f; tts_runner::sampling_rate = 24000.0f;
generation_sampler->n_output_heads = 1; generation_sampler->n_output_heads = 1;

View file

@ -2,8 +2,8 @@
#define parler_model_h #define parler_model_h
#include "dac_model.h" #include "dac_model.h"
#include "t5_encoder_model.h" #include "ttst5_encoder_model.h"
#include "sampler.h" #include "ttssampler.h"
enum parler_tensor { enum parler_tensor {
PARLER_EMBD, PARLER_EMBD,
@ -38,17 +38,17 @@ struct parler_layer {
struct ggml_tensor * self_attn_o_proj; struct ggml_tensor * self_attn_o_proj;
struct ggml_tensor * self_attn_norm; struct ggml_tensor * self_attn_norm;
struct ggml_tensor * self_attn_norm_bias; struct ggml_tensor * self_attn_norm_bias;
struct ggml_tensor * attn_k_proj; struct ggml_tensor * attn_k_proj;
struct ggml_tensor * attn_q_proj; struct ggml_tensor * attn_q_proj;
struct ggml_tensor * attn_v_proj; struct ggml_tensor * attn_v_proj;
struct ggml_tensor * attn_o_proj; struct ggml_tensor * attn_o_proj;
struct ggml_tensor * attn_norm; struct ggml_tensor * attn_norm;
struct ggml_tensor * attn_norm_bias; struct ggml_tensor * attn_norm_bias;
struct ggml_tensor * cross_k; struct ggml_tensor * cross_k;
struct ggml_tensor * cross_v; struct ggml_tensor * cross_v;
struct ggml_tensor * fc1; struct ggml_tensor * fc1;
struct ggml_tensor * fc2; struct ggml_tensor * fc2;
struct ggml_tensor * final_norm; struct ggml_tensor * final_norm;
@ -74,18 +74,18 @@ struct parler_tts_model : tts_model {
uint32_t prompt_vocab_size; uint32_t prompt_vocab_size;
bool use_cross_attn = true; bool use_cross_attn = true;
std::vector<struct ggml_tensor*> embds; std::vector<struct ggml_tensor*> embds;
std::vector<parler_layer*> layers; std::vector<parler_layer*> layers;
std::vector<struct ggml_tensor*> heads; std::vector<struct ggml_tensor*> heads;
struct ggml_tensor * precomputed_input_emb; struct ggml_tensor * precomputed_input_emb;
struct ggml_tensor * precomputed_positional_embds; struct ggml_tensor * precomputed_positional_embds;
struct ggml_tensor * layer_norm; struct ggml_tensor * layer_norm;
struct ggml_tensor * layer_norm_bias; struct ggml_tensor * layer_norm_bias;
struct ggml_tensor * prompt_embd; struct ggml_tensor * prompt_embd;
void assign_weight(std::string name, ggml_tensor * tensor); void assign_weight(std::string name, ggml_tensor * tensor);
void prep_constants(gguf_context * meta); void prep_constants(gguf_context * meta);
void prep_layers(gguf_context * meta); void prep_layers(gguf_context * meta);
@ -107,21 +107,21 @@ struct parler_context : runner_context {
std::vector<bool> eos_seen; std::vector<bool> eos_seen;
bool use_cache = true; bool use_cache = true;
size_t output_size = 0; // capacity (of tokens positions) for the output buffers size_t output_size = 0; // capacity (of tokens positions) for the output buffers
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
uint32_t current_position = 0; // current position in the active sequence uint32_t current_position = 0; // current position in the active sequence
uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating) uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating)
int32_t seq_id; // a unique identifier associated with the active sequence. int32_t seq_id; // a unique identifier associated with the active sequence.
std::vector<uint32_t> output_tokens; std::vector<uint32_t> output_tokens;
struct ggml_tensor * inp_tokens; struct ggml_tensor * inp_tokens;
struct ggml_tensor * audio_inp_tokens; struct ggml_tensor * audio_inp_tokens;
struct ggml_tensor * positions; struct ggml_tensor * positions;
struct ggml_tensor * attn_mask; struct ggml_tensor * attn_mask;
struct ggml_tensor * attn_mask_cross; struct ggml_tensor * attn_mask_cross;
void build_schedule() { void build_schedule() {
runner_context::build_schedule(model->max_nodes()); runner_context::build_schedule(model->max_nodes());
} }
@ -130,17 +130,17 @@ struct parler_context : runner_context {
struct parler_kv_cache { struct parler_kv_cache {
int32_t seq_id; int32_t seq_id;
ggml_type type_k = GGML_TYPE_F32; ggml_type type_k = GGML_TYPE_F32;
ggml_type type_v = GGML_TYPE_F32; ggml_type type_v = GGML_TYPE_F32;
std::vector<struct ggml_tensor *> k_l; std::vector<struct ggml_tensor *> k_l;
std::vector<struct ggml_tensor *> v_l; std::vector<struct ggml_tensor *> v_l;
struct ggml_context * ctx; struct ggml_context * ctx;
ggml_backend_buffer_type_t buft; ggml_backend_buffer_type_t buft;
ggml_backend_buffer_t buf; ggml_backend_buffer_t buf;
void free() { void free() {
ggml_free(ctx); ggml_free(ctx);
ggml_backend_buffer_free(buf); ggml_backend_buffer_free(buf);
@ -152,8 +152,8 @@ struct parler_kv_cache {
}; };
struct parler_ubatch { struct parler_ubatch {
parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length, parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length,
uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order, uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order,
int current_step): audio_generation(audio_generation), n_tokens(n_tokens), n_audio_tokens(n_audio_tokens), sequence_length(sequence_length), tokens(tokens), audio_tokens(audio_tokens), positions(positions), true_order(true_order), current_step(current_step) {}; int current_step): audio_generation(audio_generation), n_tokens(n_tokens), n_audio_tokens(n_audio_tokens), sequence_length(sequence_length), tokens(tokens), audio_tokens(audio_tokens), positions(positions), true_order(true_order), current_step(current_step) {};
parler_ubatch() {}; parler_ubatch() {};
bool audio_generation; // whether we are receiving codebook decoded tokens or text tokens bool audio_generation; // whether we are receiving codebook decoded tokens or text tokens

View file

@ -543,7 +543,7 @@ dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string valu
} }
std::vector<dictionary_response*> possibilities = lookup_map.at(value); std::vector<dictionary_response*> possibilities = lookup_map.at(value);
for (auto possible : possibilities) { for (auto possible : possibilities) {
if (possible->code == SUCCESS || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) { if (possible->code == SUCCESS_TOTAL || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
return possible; return possible;
} }
} }
@ -818,7 +818,7 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor
output->append(" "); output->append(" ");
} }
flags->update_for_word(word); flags->update_for_word(word);
if (response->code != SUCCESS) { if (response->code != SUCCESS_TOTAL) {
word += response->after_match; word += response->after_match;
output->append(response->value); output->append(response->value);
text->size_pop(word.size()+unaccented_size_difference); text->size_pop(word.size()+unaccented_size_difference);
@ -1072,7 +1072,7 @@ dictionary_response * response_from_string(std::string value, std::string key) {
bool not_at_start = key[0] == '#'; bool not_at_start = key[0] == '#';
bool not_at_end = key.back() == '#'; bool not_at_end = key.back() == '#';
if (!has_spacing) { if (!has_spacing) {
dictionary_response * resp = new dictionary_response(SUCCESS, value); dictionary_response * resp = new dictionary_response(SUCCESS_TOTAL, value);
resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number; resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number;
resp->not_at_clause_end = not_at_end; resp->not_at_clause_end = not_at_end;
resp->not_at_clause_start = not_at_start; resp->not_at_clause_start = not_at_start;

View file

@ -4,7 +4,7 @@
// SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC. // SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC.
// The key differences are that it uses grouping in the residual units of its layers, // The key differences are that it uses grouping in the residual units of its layers,
// performs a repeat_interleave over the second and third input channels, applies // performs a repeat_interleave over the second and third input channels, applies
// a noise convolutional layer after input encoding for each layer, and applies // a noise convolutional layer after input encoding for each layer, and applies
// an extra convolutional layer before residual layers are applied. // an extra convolutional layer before residual layers are applied.
struct snac_model : tts_model { struct snac_model : tts_model {
@ -19,7 +19,7 @@ struct snac_model : tts_model {
uint32_t noise_steps[4] = {8, 64, 256, 512}; uint32_t noise_steps[4] = {8, 64, 256, 512};
uint32_t noise_steps_sum = 840; uint32_t noise_steps_sum = 840;
bool use_noise = true; bool use_noise = true;
struct ggml_tensor * repeat_interleave_buffer; struct ggml_tensor * repeat_interleave_buffer;
struct ggml_tensor * in_conv_kernel; struct ggml_tensor * in_conv_kernel;
@ -46,12 +46,12 @@ struct snac_model : tts_model {
// the context used for running the snac model // the context used for running the snac model
struct snac_context : runner_context { struct snac_context : runner_context {
snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {}; snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {};
struct snac_model * model; struct snac_model * model;
struct ggml_tensor * inp_tokens; struct ggml_tensor * inp_tokens;
struct ggml_tensor * noise; struct ggml_tensor * noise;
void build_schedule() { void build_schedule() {
runner_context::build_schedule(model->max_nodes()); runner_context::build_schedule(model->max_nodes());
} }
@ -74,11 +74,11 @@ struct snac_runner : tts_runner {
} }
snac_model * model; snac_model * model;
snac_context * sctx; snac_context * sctx;
void init_build() { void init_build() {
tts_runner::init_build(&sctx->buf_compute_meta); tts_runner::init_build(&sctx->buf_compute_meta);
} }
void set_inputs(std::vector<std::vector<uint32_t>> & tokens); void set_inputs(std::vector<std::vector<uint32_t>> & tokens);
void prepare_post_load(); void prepare_post_load();
struct ggml_cgraph * build_snac_graph(size_t sequence_length); struct ggml_cgraph * build_snac_graph(size_t sequence_length);

View file

@ -1,4 +1,4 @@
#include "sampler.h" #include "ttssampler.h"
void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) { void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
// assume that we are pointing to the start of the first token output; // assume that we are pointing to the start of the first token output;
@ -6,7 +6,7 @@ void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
return max(logits, output_tokens); return max(logits, output_tokens);
} }
std::vector<uint32_t> max_vals; std::vector<uint32_t> max_vals;
// the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or // the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or
// equal to top_p; // equal to top_p;
std::vector<float> max_head_probs; std::vector<float> max_head_probs;
@ -189,7 +189,7 @@ void sampler::max(float * logits, std::vector<uint32_t> & output_tokens) {
uint32_t token_id = 0; uint32_t token_id = 0;
for (uint32_t ii = 0; ii < vocab_size; ii++) { for (uint32_t ii = 0; ii < vocab_size; ii++) {
float v = *(logits+i*vocab_size+ii); float v = *(logits+i*vocab_size+ii);
// while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of // while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of
// the softmax function in which case it is possible for repetition counts to be set. // the softmax function in which case it is possible for repetition counts to be set.
if (has_repetition_penalty && last_token_ids[i] == ii) { if (has_repetition_penalty && last_token_ids[i] == ii) {
v /= (pow(repetition_penalty, repetition_counts[i])); v /= (pow(repetition_penalty, repetition_counts[i]));

View file

@ -21,7 +21,7 @@ struct sampler {
std::vector<uint32_t> repetition_counts; std::vector<uint32_t> repetition_counts;
bool do_sample = true; bool do_sample = true;
bool apply_softmax = true; bool apply_softmax = true;
void sample(float * logits, std::vector<uint32_t> & output_tokens); void sample(float * logits, std::vector<uint32_t> & output_tokens);
void softmax(float * logits, std::vector<std::vector<size_t>> picks, std::vector<uint32_t> max_indices); void softmax(float * logits, std::vector<std::vector<size_t>> picks, std::vector<uint32_t> max_indices);
void max(float * logits, std::vector<uint32_t> & output_tokens); void max(float * logits, std::vector<uint32_t> & output_tokens);

View file

@ -1,4 +1,4 @@
#include "t5_encoder_model.h" #include "ttst5_encoder_model.h"
static const std::map<std::string, t5_tensor> T5_TENSOR_GGUF_LOOKUP = { static const std::map<std::string, t5_tensor> T5_TENSOR_GGUF_LOOKUP = {
{"t5encoder.token_embd", T5_EMBD}, {"t5encoder.token_embd", T5_EMBD},
@ -139,7 +139,7 @@ void t5_encoder::prep_constants(gguf_context * meta) {
int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id"); int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id");
if (bos_token_id_key != -1) { if (bos_token_id_key != -1) {
bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
} }
int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id"); int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
if (eos_token_id_key != -1) { if (eos_token_id_key != -1) {
@ -219,7 +219,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
//t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
//ggml_set_input(t5ctx->positions); //ggml_set_input(t5ctx->positions);
@ -233,7 +233,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch); struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch);
struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias); struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias);
for (int l = 0; l < model->n_layers; l++) { for (int l = 0; l < model->n_layers; l++) {
struct ggml_tensor * residual = inpL; struct ggml_tensor * residual = inpL;
@ -293,7 +293,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
free_build(); free_build();
return gf; return gf;
} }
@ -312,7 +312,7 @@ void t5_runner::set_inputs(t5_ubatch & batch) {
for (int ii = 0; ii < batch.n_tokens; ii++) { for (int ii = 0; ii < batch.n_tokens; ii++) {
int ab_rpos = abs(i - ii); int ab_rpos = abs(i - ii);
int rpos = i - ii; int rpos = i - ii;
attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f; attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f;
pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact)))); pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact))));
} }
} }
@ -324,10 +324,10 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt
batch.input_tokens = input_tokens; batch.input_tokens = input_tokens;
batch.n_tokens = sequence_length; batch.n_tokens = sequence_length;
ggml_backend_sched_reset(t5ctx->sched); ggml_backend_sched_reset(t5ctx->sched);
const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0; const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0;
const size_t new_size = model->max_context_length * model->output_size * sizeof(float); const size_t new_size = model->max_context_length * model->output_size * sizeof(float);
if (!t5ctx->buf_output || prev_size < new_size) { if (!t5ctx->buf_output || prev_size < new_size) {
if (t5ctx->buf_output) { if (t5ctx->buf_output) {
ggml_backend_buffer_free(t5ctx->buf_output); ggml_backend_buffer_free(t5ctx->buf_output);
@ -337,7 +337,7 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt
t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size); t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size);
} }
outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output); outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output);
ggml_backend_buffer_clear(t5ctx->buf_output, 0); ggml_backend_buffer_clear(t5ctx->buf_output, 0);
struct ggml_cgraph * gf = NULL; struct ggml_cgraph * gf = NULL;

View file

@ -2,7 +2,7 @@
#define t5_encoder_model_h #define t5_encoder_model_h
#include "tts_model.h" #include "tts_model.h"
#include "tokenizer.h" #include "ttstokenizer.h"
enum t5_tensor { enum t5_tensor {
@ -75,14 +75,14 @@ void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name,
struct t5_context : runner_context { struct t5_context : runner_context {
t5_context(t5_encoder * model, int n_threads): runner_context(n_threads), model(model) {}; t5_context(t5_encoder * model, int n_threads): runner_context(n_threads), model(model) {};
struct t5_encoder * model; struct t5_encoder * model;
struct ggml_tensor * inp_tokens; struct ggml_tensor * inp_tokens;
struct ggml_tensor * positions; struct ggml_tensor * positions;
struct ggml_tensor * attn_mask; struct ggml_tensor * attn_mask;
struct ggml_tensor * inp_pos_bucket; struct ggml_tensor * inp_pos_bucket;
void build_schedule() { void build_schedule() {
runner_context::build_schedule(model->max_nodes()); runner_context::build_schedule(model->max_nodes());
} }
@ -116,7 +116,7 @@ struct t5_runner : tts_runner {
void init_build() { void init_build() {
tts_runner::init_build(&t5ctx->buf_compute_meta); tts_runner::init_build(&t5ctx->buf_compute_meta);
} }
void prepare_post_load(); void prepare_post_load();
struct t5_ubatch build_worst_case_batch(); struct t5_ubatch build_worst_case_batch();
void set_inputs(t5_ubatch & batch); void set_inputs(t5_ubatch & batch);

View file

@ -1,4 +1,4 @@
#include "tokenizer.h" #include "ttstokenizer.h"
void token_trie::add(const std::string & gram, uint32_t token) { void token_trie::add(const std::string & gram, uint32_t token) {
_add(gram, token, 0); _add(gram, token, 0);