Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 17:44:38 +00:00)
standardize tts linting and formatting

parent cfc1a0d4ef
commit 9935ac093f

24 changed files with 371 additions and 355 deletions
@@ -474,7 +474,7 @@ set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(tts_adapter
 otherarch/tts_adapter.cpp)
-target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./tools ./common)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/ttscpp/include ./otherarch/ttscpp/src ./tools ./common)
 target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
Makefile
@@ -729,7 +729,7 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ggml/src/ggml-vulkan-shaders.cpp:
@@ -25,6 +25,22 @@
 #define M_PI 3.14159265358979323846
 #endif

+//imports required for tts.cpp to work
+#include "tts.cpp"
+#include "ttstokenizer.cpp"
+#include "ttssampler.cpp"
+#include "parler_model.cpp"
+#include "dac_model.cpp"
+#include "ttsutil.cpp"
+#include "ttst5_encoder_model.cpp"
+#include "phonemizer.cpp"
+#include "tts_model.cpp"
+#include "kokoro_model.cpp"
+#include "dia_model.cpp"
+#include "orpheus_model.cpp"
+#include "snac_model.cpp"
+#include "general_neural_audio_codec.cpp"
+
 enum TTS_VER
 {
 TTS_VER_2,
@@ -9,8 +9,8 @@ float energy(float * chunk, int count) {
 }

 void apply_energy_voice_inactivity_detection(
-tts_response & data,
-float sample_rate,
+tts_response & data,
+float sample_rate,
 int ms_per_frame,
 int frame_threshold,
 float normalized_energy_threshold,
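Aside: the hunk above only reflows the parameters of apply_energy_voice_inactivity_detection. For readers unfamiliar with energy-based VAD, a minimal sketch of the per-frame energy measure this interface implies is below; the mean-squared-amplitude choice is an assumption, not necessarily what the energy() function in this file computes.

    #include <cstddef>

    // Mean squared amplitude of one frame of samples (illustrative sketch).
    float frame_energy(const float * chunk, int count) {
        float acc = 0.0f;
        for (int i = 0; i < count; i++) {
            acc += chunk[i] * chunk[i]; // squared amplitude per sample
        }
        return count > 0 ? acc / count : 0.0f;
    }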
(File diff suppressed because it is too large.)
@@ -12,7 +12,7 @@
 #include <unordered_map>
 #include <map>
 #include <unordered_set>
-#include "tokenizer.h"
+#include "ttstokenizer.h"
 #include <algorithm>
 #include <mutex>

@@ -33,16 +33,16 @@ static const std::unordered_set<std::string> ONE_LETTER_WORDS = {
 "i",
 };
 /*
- * The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words
+ * The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words
 * via several criteria:
 * 1. All non-EN-US words have been removed
 * 2. All three letter acronyms have been removed (as these lists are used to identify acronyms)
-* 3. All archaic, deprecated, or poetic words have been removed.
-* 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the
-* last 10 years).
-*
-* After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US
-* vernacular but was not identified as of American origin was reintroduced into the sets below.
+* 3. All archaic, deprecated, or poetic words have been removed.
+* 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the
+* last 10 years).
+*
+* After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US
+* vernacular but was not identified as of American origin was reintroduced into the sets below.
 */
 static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
 "ab", "ah", "am", "an", "as", "at", "aw", "ax", "ay", "be", "bo", "br",
@@ -50,7 +50,7 @@ static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
 "id", "if", "in", "is", "it", "la", "lo", "ma", "me", "mm", "my", "na",
 "no", "of", "oh", "oi", "on", "oo", "or", "ow", "ox", "oy", "pa", "qi",
 "re", "sh", "so", "to", "uh", "um", "un", "up", "us", "we", "wo", "ya",
-"ye", "yo",
+"ye", "yo",
 };
 static const std::unordered_set<std::string> THREE_LETTER_WORDS = {
 "aah", "abs", "aby", "ace", "ach", "ack", "act", "add", "ado", "ads", "aft", "age",
@@ -292,7 +292,7 @@ static std::string STOPPING_TOKENS = ".,:;!?";

 #ifdef ESPEAK_INSTALL
 /**
-* espeak-ng uses globals to persist and manage its state so it is not compatible with
+* espeak-ng uses globals to persist and manage its state so it is not compatible with
 * threaded parallelism (https://github.com/espeak-ng/espeak-ng/issues/1527).
 * This singleton acts as a mutex wrapped provider for all espeak phonemization methods such
 * that multiple instances of the kokoro_runner can be initialized and called in parallel.
@@ -323,7 +323,7 @@ public:
 #endif

 enum lookup_code {
-SUCCESS = 100,
+SUCCESS_TOTAL = 100,
 SUCCESS_PARTIAL = 101,
 FAILURE_UNFOUND = 200,
 FAILURE_PHONETIC = 201,
@@ -368,7 +368,7 @@ struct conditions {
 void update_for_word(std::string word,bool allow_for_upper_check = true);
 };

-/*
+/*
 * The corpus struct is simply a small wrapper class that is used to perform simple look forward and backwards in the text
 * which is being phonemized. This can be used to discern how to convert chunks of text in a consistent and protective fashion
 * in order to accurately phonemize complicated text.
@@ -376,7 +376,7 @@ struct conditions {
 struct corpus {
 corpus(const char * text, size_t size): size(size), text(text) {};
 size_t location = 0;
-size_t size;
+size_t size;
 const char * text;

 /*
@@ -397,9 +397,9 @@ struct corpus {
 std::string after_until(int after, std::string val);
 };

-/*
+/*
 * The TTS phonemizer works by splitting each word into distinct graphemes, and for each grapheme the phonemizer will look at the grapheme that came
-* before, after, and for any word specific exceptions in order to compile a
+* before, after, and for any word specific exceptions in order to compile a
 */
 struct phonemizer_rule {
 ~phonemizer_rule() {
@@ -436,10 +436,10 @@ private:

 struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta);

-/*
+/*
 * The general translation approach that espeak uses is to lookup words in the dictionary and return a list of possible matches per lookup.
 * Each match contains flags which describe the match's conditions and limitations and optionally a pronunciation. When a pronunciation is not returned,
-* it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a
+* it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a
 * token representation of a different word (e.g. with numbers).
 *
 * Since it does not make sense to have the core lexer reperform this lookup operation with represented words or via distinct languages, those behaviors
@@ -470,7 +470,7 @@ struct phoneme_dictionary {

 struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);

-/*
+/*
 * In general, I would like to avoid requiring the installation of otherwise broad and technically complicated libraries,
 * like espeak, especially when they are only being used for a small portion of their overall functionality. While avoiding these
 * requirements will keep the default installation cost of TTS.cpp down, it is also unlikely that TTS.cpp will support
@@ -478,8 +478,8 @@ struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);
 * espeak. As such, the phonemizer struct described below will support simple text to IPA phoneme functionality out of the box,
 * while also optionally acting as an interface for espeak phonemization.
 *
-* Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context
-* views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves
+* Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context
+* views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves
 * effecively like a simple router lexer. It will only support utf-8 encoded text and english IPA conversion.
 */
 struct phonemizer {
@@ -28,7 +28,7 @@ const std::map<std::string, tts_arch> SUPPORTED_ARCHITECTURES = {
 { "orpheus", ORPHEUS_ARCH }
 };

-/// Given a map from keys to values, creates a new map from values to keys
+/// Given a map from keys to values, creates a new map from values to keys
 template<typename K, typename V>
 static std::map<V, K> reverse_map(const std::map<K, V>& m) {
 std::map<V, K> r;
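The reverse_map helper above is what derives ARCHITECTURE_NAMES from SUPPORTED_ARCHITECTURES. A self-contained sketch of its behavior follows; the map contents in main() are illustrative, not the repo's actual tts_arch values.

    #include <cstdio>
    #include <map>
    #include <string>

    // Mirrors the template in the hunk above: invert a key->value map.
    template<typename K, typename V>
    static std::map<V, K> reverse_map(const std::map<K, V> & m) {
        std::map<V, K> r;
        for (const auto & kv : m) {
            r[kv.second] = kv.first; // if a value repeats, the later key wins
        }
        return r;
    }

    int main() {
        std::map<std::string, int> arch = { {"parler", 0}, {"dia", 1}, {"orpheus", 2} };
        std::map<int, std::string> names = reverse_map(arch);
        printf("%s\n", names.at(1).c_str()); // prints "dia"
    }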
@@ -43,10 +43,10 @@ const std::map<tts_arch, std::string> ARCHITECTURE_NAMES = reverse_map(SUPPORTED
 struct generation_configuration {
 generation_configuration(
 std::string voice = "",
-int top_k = 50,
-float temperature = 1.0,
-float repetition_penalty = 1.0,
-bool use_cross_attn = true,
+int top_k = 50,
+float temperature = 1.0,
+float repetition_penalty = 1.0,
+bool use_cross_attn = true,
 std::string espeak_voice_id = "",
 int max_tokens = 0,
 float top_p = 1.0,
@@ -22,13 +22,13 @@ struct dac_quantize_layer {
 // this struct maintains the static tensors for the dac audio decoder graph.
 // As such, this is designed to contain basic configuration and ggml tensor support for DAC.
 // The dac_runner describes how the graph is built and run.
-struct dac_model : tts_model {
+struct dac_model : tts_model {
 // These configs are essentially built for the 44khZ 8kbps standard DAC model audio encoder and decoder
 uint32_t n_layers = 4;
 uint32_t n_heads = 9;
 uint32_t up_sampling_factor = 512;
 uint32_t max_generation_size = 2580;

 struct ggml_tensor * in_conv_kernel;
 struct ggml_tensor * in_conv_bias;
 struct ggml_tensor * out_conv_kernel;
@@ -53,11 +53,11 @@ void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor *
 // the context used for running the dac model
 struct dac_context : runner_context {
 dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {};

 struct dac_model * model;

 struct ggml_tensor * inp_tokens;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -85,11 +85,11 @@ struct dac_runner : tts_runner {
 }
 dac_model * model;
 dac_context * dctx;

 void init_build() {
 tts_runner::init_build(&dctx->buf_compute_meta);
 }

 void prepare_post_load();
 struct ggml_cgraph * build_dac_graph(dac_ubatch & batch);
 void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);
@@ -119,7 +119,7 @@ void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * la
 set_tensor(layer->self_attn_norm, tensor);
 } else if (part == "pre_mlp_norm") {
 layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
-set_tensor(layer->mlp_norm, tensor);
+set_tensor(layer->mlp_norm, tensor);
 } else if (part == "pre_ca_norm") {
 layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
 set_tensor(layer->cross_attn_norm, tensor);
@@ -151,7 +151,7 @@ void dia_model::prep_layers() {
 dia_decoder_layer * l = new dia_decoder_layer;
 decoder->layers.push_back(l);
 }

 decoder->embds.reserve((size_t) n_output_heads);
 decoder->heads.reserve((size_t) n_output_heads);
 for (int i = 0; i < n_output_heads; i++) {
@@ -196,7 +196,7 @@ void dia_model::prep_constants(gguf_context * meta) {
 int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads");
 if (encoder_attn_heads_key != -1) {
 encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key);
 }
 }

 int head_size_key = gguf_find_key(meta, "dia.attn_head_size");
 if (head_size_key != -1) {
@@ -271,7 +271,7 @@ struct dia_context * build_new_dia_context(struct dia_model * model, int n_threa
 return dctx;
 }

-static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
+static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
 ggml_backend_buffer_type_t buft = nullptr;
 // this will only really support cpu or metal for the time being;
 if (dctx->backend != nullptr) {
@@ -382,7 +382,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo
 struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
 for (auto layer : model->encoder->layers) {
 struct ggml_tensor * residual = cur;

 cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
 // self-attention
 {
@@ -402,7 +402,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo
 struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);

 // It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension
-// then down project back the the encoder embedding dimension.
+// then down project back the the encoder embedding dimension.
 cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
 cur = ggml_mul_mat(ctx, layer->o, cur);
 }
@@ -443,10 +443,10 @@ static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct gg
 static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
 int64_t attn_size = model->head_size * model->decoder_attn_heads;

-struct ggml_tensor * k_cache_view =
+struct ggml_tensor * k_cache_view =
 ggml_view_2d(
-ctx, kv->k_l[layer_index], attn_size, 2,
-attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
+ctx, kv->k_l[layer_index], attn_size, 2,
+attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
 attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index]));

 k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
@@ -461,8 +461,8 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_
 struct ggml_tensor * v_cache_view = nullptr;

 v_cache_view = ggml_view_2d(
-ctx, kv->v_l[layer_index], attn_size, 2,
-attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
+ctx, kv->v_l[layer_index], attn_size, 2,
+attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
 attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index]));

 // Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
@@ -476,11 +476,11 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_
 static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
 dia_decoder_layer * layer = model->decoder->layers[layer_index];
 struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
-ctx,
-encoder_hidden_states,
-model->encoder_hidden_size,
-dctx->prompt_size,
-2,
+ctx,
+encoder_hidden_states,
+model->encoder_hidden_size,
+dctx->prompt_size,
+2,
 model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0));

 struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);
@@ -491,8 +491,8 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

 struct ggml_tensor * k_cache_view =
 ggml_view_4d(
-ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
-model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
+ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
+model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
 model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
 model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
 0);
@@ -504,10 +504,10 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

 struct ggml_tensor * v_cache_view =
 ggml_view_4d(
-ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
-model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
-model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
-model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
+ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
+model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
+model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
+model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
 0);

 ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
@@ -515,11 +515,11 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia

 static struct ggml_tensor * build_dia_decoder(
 ggml_cgraph * gf,
-ggml_context * ctx,
-dia_model * model,
-dia_context * dctx,
-dia_kv_cache * cache,
-dia_ubatch & batch,
+ggml_context * ctx,
+dia_model * model,
+dia_context * dctx,
+dia_kv_cache * cache,
+dia_ubatch & batch,
 struct ggml_tensor * encoder_hidden_states) {
 dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
 ggml_set_input(dctx->positions);
@@ -528,7 +528,7 @@ static struct ggml_tensor * build_dia_decoder(
 for (int l = 0; l < model->decoder->layers.size(); l++){
 dia_decoder_layer * layer = model->decoder->layers[l];
 struct ggml_tensor * residual = cur;

 cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
 // self-attention
 {
@@ -546,13 +546,13 @@ static struct ggml_tensor * build_dia_decoder(
 0);
 k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));

-struct ggml_tensor * v =
+struct ggml_tensor * v =
 ggml_view_3d(ctx, cache->v_l[l],
 model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
 ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
 ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
 0);
-v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);
+v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);

 // As noted in the encoder Dia uses the Neo-X protocol for RoPE.
 Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
@@ -583,22 +583,22 @@ static struct ggml_tensor * build_dia_decoder(
 build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
 }

-struct ggml_tensor * cross_k =
+struct ggml_tensor * cross_k =
 ggml_view_4d(
 ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2,
-model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
-model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
-model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
+model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
+model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
+model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
 0);
 // the double permute operation shouldn't be necessary here, but it seems that currently ggml permute only currently alows for a single
 // axis pair to be transposed.
 cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));

-struct ggml_tensor * cross_v =
+struct ggml_tensor * cross_v =
 ggml_cont(ctx, ggml_view_4d(
 ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
-model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
-model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
+model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
+model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
 model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
 0));

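The cross-attention cache reads above lean heavily on ggml_view_4d, whose nb* arguments are byte strides rather than element counts. A small self-contained illustration follows; the dimensions are made up, only the stride arithmetic matters.

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { /*mem_size =*/ 16u * 1024 * 1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);
        // a flat buffer standing in for one layer's cross-attention key cache
        struct ggml_tensor * cache = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64 * 8 * 2 * 32);
        // reinterpret it as [head_size=64, heads=8, batch=2, positions=32]
        struct ggml_tensor * view = ggml_view_4d(ctx, cache, 64, 8, 2, 32,
            64 * ggml_element_size(cache),         // nb1: bytes between heads
            64 * 8 * ggml_element_size(cache),     // nb2: bytes between batch entries
            64 * 8 * 2 * ggml_element_size(cache), // nb3: bytes between positions
            0);                                    // byte offset into the cache
        (void) view;
        ggml_free(ctx);
        return 0;
    }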
@@ -637,10 +637,10 @@ static struct ggml_tensor * build_dia_decoder(
 }

 void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
-// Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
-// a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
+// Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
+// a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
 // generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
-// proper adjustments can be perfored at each generation step. This means that we need to pad the end of our tokens to the
+// proper adjustments can be perfored at each generation step. This means that we need to pad the end of our tokens to the
 // max context size for both the conditional and unconditional sequence.

 // if the sentence isn't prepended by dialogue start tokens, [S1] or [S2], then append one.
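The comment block above describes Dia's byte-level tokenization and its classifier-free-guidance style correction. A hedged sketch of both ideas follows; the helper names and the cfg_scale parameter are illustrative, not the actual dia_runner API.

    #include <cstdint>
    #include <string>
    #include <vector>

    // Each character's byte value becomes its token id; the sequence is padded
    // to the max context so conditional and unconditional passes line up.
    std::vector<uint32_t> byte_tokenize(const std::string & sentence, size_t max_context) {
        std::vector<uint32_t> tokens;
        for (unsigned char c : sentence) {
            tokens.push_back((uint32_t) c);
        }
        tokens.resize(max_context, 0); // pad with an assumed pad id of 0
        return tokens;
    }

    // Per the comment above: add the difference between the conditional and
    // unconditional outputs to the conditional output before sampling.
    void apply_cfg_adjustment(std::vector<float> & cond, const std::vector<float> & uncond, float cfg_scale) {
        for (size_t i = 0; i < cond.size(); i++) {
            cond[i] += cfg_scale * (cond[i] - uncond[i]);
        }
    }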
@@ -699,7 +699,7 @@ dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
 * 1. Dia cleans its output generation by adding the difference between its text based output (its conditional output) and its unconditional output
 * to the conditional ouput before sampling. This is why the batch is set to two throughout the graph.
 *
-* 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
+* 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
 * encoder sequence is always max length.
 */
 struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
@@ -716,7 +716,7 @@ struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
 ggml_set_name(cur, "decoder_output");
 ggml_build_forward_expand(gf, cur);
 free_build();

 return gf;
 }

@@ -758,11 +758,11 @@ int dia_runner::decode(dia_ubatch & batch) {
 dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
 }
 ggml_backend_sched_reset(dctx->sched);

 const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
 const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
 const size_t new_size = logits_size * sizeof(float);

 if (!dctx->buf_output || prev_size < new_size) {
 if (dctx->buf_output) {
 ggml_backend_buffer_free(dctx->buf_output);
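The buffer handling above follows a grow-only pattern: reuse the existing output buffer when it is already large enough, otherwise free it and allocate a bigger one. The same logic in plain C++ terms (a sketch, not the ggml-backend API):

    #include <cstdlib>

    void * grow_only(void * buf, size_t prev_size, size_t new_size) {
        if (buf != nullptr && prev_size >= new_size) {
            return buf;                // existing buffer is big enough: keep it
        }
        std::free(buf);                // otherwise release it...
        return std::malloc(new_size);  // ...and allocate the larger size
    }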
@@ -772,7 +772,7 @@ int dia_runner::decode(dia_ubatch & batch) {

 dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
 }

 dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);

 ggml_cgraph * gf = build_dia_graph(batch);
@@ -817,7 +817,7 @@ bool dia_runner::check_stopping(dia_ubatch & batch) {
 if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
 dctx->delay_steps = model->max_delay;
 }

 if (dctx->delay_steps > 0) {
 int step_after_eos = model->max_delay - dctx->delay_steps;
 for (int i = 0; i < model->delay_pattern.size(); i++) {
@@ -907,5 +907,5 @@ void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
 dac_runner->model->assign_weight(name.substr(14), tensor);
 } else {
 model->assign_weight(name, tensor);
 }
 }
@@ -1,7 +1,7 @@
 #pragma once

 #include "dac_model.h"
-#include "sampler.h"
+#include "ttssampler.h"

 struct dia_encoder_layer {
 struct ggml_tensor * k;
@@ -22,7 +22,7 @@ struct dia_decoder_layer {
 struct ggml_tensor * self_attn_v;
 struct ggml_tensor * self_attn_o;
 struct ggml_tensor * self_attn_norm;

 struct ggml_tensor * cross_attn_k;
 struct ggml_tensor * cross_attn_q;
 struct ggml_tensor * cross_attn_v;
@@ -76,7 +76,7 @@ struct dia_model : tts_model {

 dia_encoder * encoder;
 dia_decoder * decoder;

 void assign_weight(std::string name, ggml_tensor * tensor);
 void assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
 void assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
@@ -103,15 +103,15 @@ struct dia_context : runner_context {
 uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model.

 std::vector<uint32_t> output_tokens;
-struct dia_model * model;
+struct dia_model * model;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * audio_inp_tokens;
 struct ggml_tensor * positions;
 struct ggml_tensor * encode_positions;
 struct ggml_tensor * encode_attn_mask;
 struct ggml_tensor * cross_attn_mask;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -126,11 +126,11 @@ struct dia_kv_cache {

 std::vector<struct ggml_tensor *> k_l;
 std::vector<struct ggml_tensor *> v_l;

 struct ggml_context * ctx;
 ggml_backend_buffer_type_t buft;
 ggml_backend_buffer_t buf;

 void free() {
 ggml_free(ctx);
 ggml_backend_buffer_free(buf);
@@ -53,7 +53,7 @@ namespace general_neural_audio_codec {

 uint32_t padding;
 uint32_t stride;

 std::vector<residual_unit> residual_blocks;
 };

@@ -3,11 +3,11 @@

 #include <stdlib.h>
 #include "tts_model.h"
-#include "tokenizer.h"
+#include "ttstokenizer.h"
 #include "phonemizer.h"

 // Rather than using ISO 639-2 language codes, Kokoro voice pack specify their corresponding language via their first letter.
-// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
+// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
 // appropriate phonemization protocol can inferred from the Kokoro voice.
 static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
 {'a', "gmw/en-US"},
@@ -22,7 +22,7 @@ static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
 };

 struct lstm_cell {
-std::vector<ggml_tensor*> weights;
+std::vector<ggml_tensor*> weights;
 std::vector<ggml_tensor*> biases;
 std::vector<ggml_tensor*> reverse_weights;
 std::vector<ggml_tensor*> reverse_biases;
@@ -197,8 +197,8 @@ struct kokoro_model : tts_model {
 // standard configuration for duration prediction
 uint32_t f0_n_blocks = 3;
 uint32_t n_duration_prediction_layers = 3;
-// while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to
-// allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each
+// while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to
+// allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each
 // allocation increases node allocation size by O(N)
 uint32_t max_duration_per_token = 20;
 uint32_t style_half_size = 128;
@@ -221,7 +221,7 @@ struct kokoro_model : tts_model {
 float noise_std = 0.003f;
 float voice_threshold = 10.0f;
 float sample_rate = 24000.0f;
-std::string window = "hann";
+std::string window = "hann";

 // It is really annoying that ggml doesn't allow using non ggml tensors as the operator for simple math ops.
 // This is just the constant defined above as a tensor.
@@ -259,7 +259,7 @@ struct kokoro_model : tts_model {
 // Decoding and Generation portion of the model
 struct kokoro_decoder * decoder;

-// the default hidden states need to be initialized
+// the default hidden states need to be initialized
 std::vector<lstm*> lstms;

 size_t duration_node_counter = 0;
@@ -317,15 +317,15 @@ struct kokoro_duration_context : runner_context {
 ~kokoro_duration_context() {
 ggml_backend_buffer_free(buf_len_output);
 }

 std::string voice = "af_alloy";
 struct kokoro_model * model;
 ggml_backend_buffer_t buf_len_output = nullptr;

 size_t logits_size = 0; // capacity (of floats) for logits
 float * lens = nullptr;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * positions;
 struct ggml_tensor * attn_mask;
@@ -356,7 +356,7 @@ struct kokoro_duration_response {
 };

 // This struct is intended to manage graph and compute for the duration prediction portion of the kokoro model.
-// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
+// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
 // support the tensor dependent views that would otherwise be necessary.
 struct kokoro_duration_runner : tts_runner {
 kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
@@ -375,7 +375,7 @@ struct kokoro_duration_runner : tts_runner {
 void init_build() {
 tts_runner::init_build(&kctx->buf_compute_meta);
 }

 void prepare_post_load();
 struct kokoro_ubatch build_worst_case_batch();
 void set_inputs(kokoro_ubatch & batch);
@@ -397,7 +397,7 @@ struct kokoro_context : runner_context {
 }

 std::string voice = "af_alloy";

 struct kokoro_model * model;

 uint32_t total_duration;
@@ -408,7 +408,7 @@ struct kokoro_context : runner_context {
 struct ggml_tensor * duration_mask;
 struct ggml_tensor * window_sq_sum; // needs to be calculatd from the generator window.
 struct ggml_tensor * uv_noise_data;

 void build_schedule() {
 runner_context::build_schedule(model->max_gen_nodes()*30);
 }
@@ -150,7 +150,7 @@ orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads
 return octx;
 }

-void orpheus_runner::orpheus_kv_cache_init() {
+void orpheus_runner::orpheus_kv_cache_init() {
 ggml_backend_buffer_type_t buft = nullptr;
 if (octx->backend != nullptr) {
 #ifdef GGML_USE_METAL
@@ -192,21 +192,21 @@ void orpheus_runner::orpheus_kv_cache_init() {
 }

 void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) {
-k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
+k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
 model->head_size, 2,0, 500000.0f,
 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

 // A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave,
 // and performing the repeat operation upfront before performign a single copy needs to be performed in order to better optimize this function.
-// Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us
+// Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us
 // from incrementally larger transpositions with generation.
 for (int i = 0; i < repeat; i++) {
 struct ggml_tensor * k_cache_view = ggml_view_3d(
-ctx,
-kv_self->k_l[index],
+ctx,
+kv_self->k_l[index],
 model->head_size,
 model->n_kv_attn_heads,
-n_tokens,
+n_tokens,
 ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
 ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
 ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
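For context on the repeat_interleave the comment above debates: grouped-query attention stores fewer kv heads than query heads, so each kv head must be duplicated `repeat` times before a plain attention kernel can consume it. A plain-vector sketch of that duplication (the flat [head, element] layout is an assumption for illustration):

    #include <vector>

    std::vector<float> repeat_interleave_heads(const std::vector<float> & kv, int n_kv_heads, int head_size, int repeat) {
        std::vector<float> out;
        out.reserve(kv.size() * repeat);
        for (int h = 0; h < n_kv_heads; h++) {
            for (int r = 0; r < repeat; r++) {
                for (int e = 0; e < head_size; e++) {
                    out.push_back(kv[h * head_size + e]); // head h emitted repeat times, back to back
                }
            }
        }
        return out;
    }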
@@ -230,19 +230,19 @@ void orpheus_runner::orpheus_kv_cache_init() {
 struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) {
 init_build();
 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;

 const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens;
 octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
 ggml_set_input(octx->positions);
 octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
 ggml_set_input(octx->inp_tokens);
 inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens);

 struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch);

 for (int l = 0; l < model->n_layers; l++) {
 struct ggml_tensor * residual = inpL;
 cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm);
@@ -261,8 +261,8 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 model->head_size, full_sequence_length, model->n_attn_heads,
 ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size,
 ggml_element_size(kv_self->k_l[l]) * model->head_size,
-0));
+0));

 struct ggml_tensor * v =
 ggml_view_2d(ctx, kv_self->v_l[l],
 model->hidden_size, full_sequence_length,
@@ -272,7 +272,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads);

 Qcur = ggml_rope_ext(
-ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
+ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
 octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta
 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

@@ -286,7 +286,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 }

 cur = ggml_add(ctx, attn_out, residual);

 struct ggml_tensor * residualffn = cur;

 // mlp
@@ -298,7 +298,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 cur = ggml_add(ctx, cur, residualffn);
 inpL = cur;
 }

 cur = orpheus_build_layer_norm(ctx, cur, model->output_norm);
 // only about 40k of the output head is actually uses for generation purposes. Ideally the head tensor should be shrunk and sampled tokens should be incremented.
 cur = ggml_mul_mat(ctx, model->head, cur);
@@ -307,15 +307,15 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch)
 }
 ggml_build_forward_expand(gf, cur);
 free_build();

 return gf;
 }

 void orpheus_runner::decode(orpheus_ubatch & batch) {
 ggml_backend_sched_reset(octx->sched);

 octx->output_tokens.reserve(model->max_generation_size);

 const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float);
 octx->prep_output_buffer(new_size);
@@ -324,10 +324,10 @@ void orpheus_runner::decode(orpheus_ubatch & batch) {
 // the output is always the last tensor in the graph
 struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
 ggml_backend_sched_alloc_graph(octx->sched, gf);

 set_inputs(batch);
 ggml_backend_sched_graph_compute_async(octx->sched, gf);

 float * logits_out = octx->logits + octx->n_outputs * model->vocab_size;
 octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float));

@@ -1,7 +1,7 @@
 #pragma once

-#include "sampler.h"
-#include "tokenizer.h"
+#include "ttssampler.h"
+#include "ttstokenizer.h"
 #include "snac_model.h"

 // Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads.
@@ -73,7 +73,7 @@ struct orpheus_context : runner_context {
 struct ggml_tensor * positions;
 };

-struct orpheus_kv_cache {
+struct orpheus_kv_cache {
 ggml_type cache_type = GGML_TYPE_F32;

 std::vector<struct ggml_tensor *> k_l;
@@ -104,11 +104,11 @@ struct orpheus_ubatch {

 struct orpheus_runner : tts_runner {
 orpheus_runner(
-orpheus_model * model,
-snac_runner * audio_decoder,
-orpheus_context * octx,
-bpe_tokenizer * bt,
-sampler * samp,
+orpheus_model * model,
+snac_runner * audio_decoder,
+orpheus_context * octx,
+bpe_tokenizer * bt,
+sampler * samp,
 orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) {
 tts_runner::sampling_rate = 24000.0f;
 generation_sampler->n_output_heads = 1;
@@ -2,8 +2,8 @@
 #define parler_model_h

 #include "dac_model.h"
-#include "t5_encoder_model.h"
-#include "sampler.h"
+#include "ttst5_encoder_model.h"
+#include "ttssampler.h"

 enum parler_tensor {
 PARLER_EMBD,
@@ -38,17 +38,17 @@ struct parler_layer {
 struct ggml_tensor * self_attn_o_proj;
 struct ggml_tensor * self_attn_norm;
 struct ggml_tensor * self_attn_norm_bias;

 struct ggml_tensor * attn_k_proj;
 struct ggml_tensor * attn_q_proj;
 struct ggml_tensor * attn_v_proj;
 struct ggml_tensor * attn_o_proj;
 struct ggml_tensor * attn_norm;
 struct ggml_tensor * attn_norm_bias;

 struct ggml_tensor * cross_k;
 struct ggml_tensor * cross_v;

 struct ggml_tensor * fc1;
 struct ggml_tensor * fc2;
 struct ggml_tensor * final_norm;
@@ -74,18 +74,18 @@ struct parler_tts_model : tts_model {
 uint32_t prompt_vocab_size;

 bool use_cross_attn = true;

 std::vector<struct ggml_tensor*> embds;
 std::vector<parler_layer*> layers;
 std::vector<struct ggml_tensor*> heads;

 struct ggml_tensor * precomputed_input_emb;
 struct ggml_tensor * precomputed_positional_embds;

 struct ggml_tensor * layer_norm;
 struct ggml_tensor * layer_norm_bias;
 struct ggml_tensor * prompt_embd;

 void assign_weight(std::string name, ggml_tensor * tensor);
 void prep_constants(gguf_context * meta);
 void prep_layers(gguf_context * meta);
@@ -107,21 +107,21 @@ struct parler_context : runner_context {
 std::vector<bool> eos_seen;

 bool use_cache = true;

 size_t output_size = 0; // capacity (of tokens positions) for the output buffers
 int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
 uint32_t current_position = 0; // current position in the active sequence
 uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating)
 int32_t seq_id; // a unique identifier associated with the active sequence.

 std::vector<uint32_t> output_tokens;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * audio_inp_tokens;
 struct ggml_tensor * positions;
 struct ggml_tensor * attn_mask;
 struct ggml_tensor * attn_mask_cross;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -130,17 +130,17 @@ struct parler_context : runner_context {

 struct parler_kv_cache {
 int32_t seq_id;

 ggml_type type_k = GGML_TYPE_F32;
 ggml_type type_v = GGML_TYPE_F32;

 std::vector<struct ggml_tensor *> k_l;
 std::vector<struct ggml_tensor *> v_l;

 struct ggml_context * ctx;
 ggml_backend_buffer_type_t buft;
 ggml_backend_buffer_t buf;

 void free() {
 ggml_free(ctx);
 ggml_backend_buffer_free(buf);
@@ -152,8 +152,8 @@ struct parler_kv_cache {
 };

 struct parler_ubatch {
-parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length,
-uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order,
+parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length,
+uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order,
 int current_step): audio_generation(audio_generation), n_tokens(n_tokens), n_audio_tokens(n_audio_tokens), sequence_length(sequence_length), tokens(tokens), audio_tokens(audio_tokens), positions(positions), true_order(true_order), current_step(current_step) {};
 parler_ubatch() {};
 bool audio_generation; // whether we are receiving codebook decoded tokens or text tokens
@@ -543,7 +543,7 @@ dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string valu
 }
 std::vector<dictionary_response*> possibilities = lookup_map.at(value);
 for (auto possible : possibilities) {
-if (possible->code == SUCCESS || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
+if (possible->code == SUCCESS_TOTAL || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
 return possible;
 }
 }
@@ -818,7 +818,7 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor
 output->append(" ");
 }
 flags->update_for_word(word);
-if (response->code != SUCCESS) {
+if (response->code != SUCCESS_TOTAL) {
 word += response->after_match;
 output->append(response->value);
 text->size_pop(word.size()+unaccented_size_difference);
@@ -1072,7 +1072,7 @@ dictionary_response * response_from_string(std::string value, std::string key) {
 bool not_at_start = key[0] == '#';
 bool not_at_end = key.back() == '#';
 if (!has_spacing) {
-dictionary_response * resp = new dictionary_response(SUCCESS, value);
+dictionary_response * resp = new dictionary_response(SUCCESS_TOTAL, value);
 resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number;
 resp->not_at_clause_end = not_at_end;
 resp->not_at_clause_start = not_at_start;
@@ -4,7 +4,7 @@

 // SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC.
 // The key differences are that it uses grouping in the residual units of its layers,
-// performs a repeat_interleave over the second and third input channels, applies
+// performs a repeat_interleave over the second and third input channels, applies
 // a noise convolutional layer after input encoding for each layer, and applies
 // an extra convolutional layer before residual layers are applied.
 struct snac_model : tts_model {
@@ -19,7 +19,7 @@ struct snac_model : tts_model {
 uint32_t noise_steps[4] = {8, 64, 256, 512};
 uint32_t noise_steps_sum = 840;
 bool use_noise = true;

 struct ggml_tensor * repeat_interleave_buffer;

 struct ggml_tensor * in_conv_kernel;
@@ -46,12 +46,12 @@ struct snac_model : tts_model {
 // the context used for running the snac model
 struct snac_context : runner_context {
 snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {};

 struct snac_model * model;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * noise;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -74,11 +74,11 @@ struct snac_runner : tts_runner {
 }
 snac_model * model;
 snac_context * sctx;

 void init_build() {
 tts_runner::init_build(&sctx->buf_compute_meta);
 }

 void set_inputs(std::vector<std::vector<uint32_t>> & tokens);
 void prepare_post_load();
 struct ggml_cgraph * build_snac_graph(size_t sequence_length);
@@ -1,4 +1,4 @@
-#include "sampler.h"
+#include "ttssampler.h"

 void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
 // assume that we are pointing to the start of the first token output;
@@ -6,7 +6,7 @@ void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
 return max(logits, output_tokens);
 }
 std::vector<uint32_t> max_vals;
-// the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or
+// the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or
 // equal to top_p;
 std::vector<float> max_head_probs;

@@ -189,7 +189,7 @@ void sampler::max(float * logits, std::vector<uint32_t> & output_tokens) {
 uint32_t token_id = 0;
 for (uint32_t ii = 0; ii < vocab_size; ii++) {
 float v = *(logits+i*vocab_size+ii);
-// while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of
+// while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of
 // the softmax function in which case it is possible for repetition counts to be set.
 if (has_repetition_penalty && last_token_ids[i] == ii) {
 v /= (pow(repetition_penalty, repetition_counts[i]));
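A quick numeric check of the penalty division above: with repetition_penalty = 1.1 and a positive logit of 4.0, successive repetition counts scale the logit geometrically (values in the sketch below are rounded to three decimals).

    #include <cmath>
    #include <cstdio>

    int main() {
        float logit = 4.0f;
        float repetition_penalty = 1.1f;
        for (int count = 0; count <= 3; count++) {
            // logit / penalty^count, as in the hunk above
            printf("count=%d -> %.3f\n", count, logit / powf(repetition_penalty, (float) count));
        }
        // count=0 -> 4.000, count=1 -> 3.636, count=2 -> 3.306, count=3 -> 3.005
        return 0;
    }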
@@ -21,7 +21,7 @@ struct sampler {
 std::vector<uint32_t> repetition_counts;
 bool do_sample = true;
 bool apply_softmax = true;

 void sample(float * logits, std::vector<uint32_t> & output_tokens);
 void softmax(float * logits, std::vector<std::vector<size_t>> picks, std::vector<uint32_t> max_indices);
 void max(float * logits, std::vector<uint32_t> & output_tokens);
@@ -1,4 +1,4 @@
-#include "t5_encoder_model.h"
+#include "ttst5_encoder_model.h"

 static const std::map<std::string, t5_tensor> T5_TENSOR_GGUF_LOOKUP = {
 {"t5encoder.token_embd", T5_EMBD},
@@ -139,7 +139,7 @@ void t5_encoder::prep_constants(gguf_context * meta) {
 int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id");
 if (bos_token_id_key != -1) {
 bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
 }
 }

 int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
 if (eos_token_id_key != -1) {
@@ -219,7 +219,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {

 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;

 //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
 //ggml_set_input(t5ctx->positions);

@@ -233,7 +233,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {

 struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch);
 struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias);

 for (int l = 0; l < model->n_layers; l++) {
 struct ggml_tensor * residual = inpL;

@@ -293,7 +293,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
 ggml_build_forward_expand(gf, cur);

 free_build();

 return gf;
 }

@@ -312,7 +312,7 @@ void t5_runner::set_inputs(t5_ubatch & batch) {
 for (int ii = 0; ii < batch.n_tokens; ii++) {
 int ab_rpos = abs(i - ii);
 int rpos = i - ii;
-attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f;
+attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f;
 pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact))));
 }
 }
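The pos_bucket expression above is the standard T5 relative-position bucketing: small distances get their own bucket, larger distances share logarithmically spaced buckets, and positive offsets occupy a second bank of buckets. Pulled out into a standalone sketch below; the defaults (n_buckets = 32, max_exact = 16, max_distance = 128) are the usual T5 values and are assumptions here, not read from this model, and floating-point division is used where the original has integer operands.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdlib>

    uint32_t t5_pos_bucket(int i, int ii, int n_buckets = 32, int max_exact = 16, int max_distance = 128) {
        int rpos = i - ii;
        int ab_rpos = std::abs(rpos);
        double log_denom = std::log((double) max_distance / max_exact);
        // logarithmic bucket for distances at or beyond max_exact
        int log_bucket = max_exact + (int) (std::log((double) ab_rpos / max_exact) / log_denom * max_exact);
        return (uint32_t) ((rpos > 0 ? n_buckets : 0) +
                           (ab_rpos < max_exact ? ab_rpos : std::min(n_buckets - 1, log_bucket)));
    }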
@@ -324,10 +324,10 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt
 batch.input_tokens = input_tokens;
 batch.n_tokens = sequence_length;
 ggml_backend_sched_reset(t5ctx->sched);

 const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0;
 const size_t new_size = model->max_context_length * model->output_size * sizeof(float);

 if (!t5ctx->buf_output || prev_size < new_size) {
 if (t5ctx->buf_output) {
 ggml_backend_buffer_free(t5ctx->buf_output);
@@ -337,7 +337,7 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt

 t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size);
 }

 outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output);
 ggml_backend_buffer_clear(t5ctx->buf_output, 0);
 struct ggml_cgraph * gf = NULL;
@@ -2,7 +2,7 @@
 #define t5_encoder_model_h

 #include "tts_model.h"
-#include "tokenizer.h"
+#include "ttstokenizer.h"

 enum t5_tensor {
@@ -75,14 +75,14 @@ void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name,

 struct t5_context : runner_context {
 t5_context(t5_encoder * model, int n_threads): runner_context(n_threads), model(model) {};

 struct t5_encoder * model;

 struct ggml_tensor * inp_tokens;
 struct ggml_tensor * positions;
 struct ggml_tensor * attn_mask;
 struct ggml_tensor * inp_pos_bucket;

 void build_schedule() {
 runner_context::build_schedule(model->max_nodes());
 }
@@ -116,7 +116,7 @@ struct t5_runner : tts_runner {
 void init_build() {
 tts_runner::init_build(&t5ctx->buf_compute_meta);
 }

 void prepare_post_load();
 struct t5_ubatch build_worst_case_batch();
 void set_inputs(t5_ubatch & batch);
@@ -1,4 +1,4 @@
-#include "tokenizer.h"
+#include "ttstokenizer.h"

 void token_trie::add(const std::string & gram, uint32_t token) {
 _add(gram, token, 0);