mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
builds but crashes
This commit is contained in:
parent
2bf128587d
commit
bc04366a65
43 changed files with 12183 additions and 2 deletions
6
Makefile
6
Makefile
|
@ -55,8 +55,8 @@ ifdef KCPP_SANITIZE
|
||||||
CFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
|
CFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
|
||||||
CXXFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
|
CXXFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
|
||||||
endif
|
endif
|
||||||
CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
|
CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
|
||||||
CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
|
CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
|
||||||
ifndef KCPP_DEBUG
|
ifndef KCPP_DEBUG
|
||||||
CFLAGS += -DNDEBUG -s
|
CFLAGS += -DNDEBUG -s
|
||||||
CXXFLAGS += -DNDEBUG -s
|
CXXFLAGS += -DNDEBUG -s
|
||||||
|
@ -729,6 +729,8 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
|
||||||
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
|
embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
ttscppmain: otherarch/ttscpp/examples/cli/cli.cpp otherarch/ttscpp/examples/cli/playback.cpp otherarch/ttscpp/examples/cli/playback.h otherarch/ttscpp/examples/cli/write_file.cpp otherarch/ttscpp/examples/cli/write_file.h otherarch/ttscpp/examples/cli/vad.cpp otherarch/ttscpp/examples/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
ggml/src/ggml-vulkan-shaders.cpp:
|
ggml/src/ggml-vulkan-shaders.cpp:
|
||||||
ifdef VULKAN_BUILD
|
ifdef VULKAN_BUILD
|
||||||
|
|
|
@ -191,6 +191,7 @@ and it will install everything required. Alternatively, you can download the abo
|
||||||
- KoboldCpp code and other files are also under the AGPL v3.0 License unless otherwise stated
|
- KoboldCpp code and other files are also under the AGPL v3.0 License unless otherwise stated
|
||||||
- Llama.cpp source repo is at https://github.com/ggml-org/llama.cpp (MIT)
|
- Llama.cpp source repo is at https://github.com/ggml-org/llama.cpp (MIT)
|
||||||
- Stable-diffusion.cpp source repo is at https://github.com/leejet/stable-diffusion.cpp (MIT)
|
- Stable-diffusion.cpp source repo is at https://github.com/leejet/stable-diffusion.cpp (MIT)
|
||||||
|
- TTS.cpp source repo is at https://github.com/mmwillet/TTS.cpp (MIT)
|
||||||
- KoboldCpp source repo is at https://github.com/LostRuins/koboldcpp (AGPL)
|
- KoboldCpp source repo is at https://github.com/LostRuins/koboldcpp (AGPL)
|
||||||
- KoboldAI Lite source repo is at https://github.com/LostRuins/lite.koboldai.net (AGPL)
|
- KoboldAI Lite source repo is at https://github.com/LostRuins/lite.koboldai.net (AGPL)
|
||||||
- For any further enquiries, contact @concedo on discord, or LostRuins on github.
|
- For any further enquiries, contact @concedo on discord, or LostRuins on github.
|
||||||
|
|
24
otherarch/ttscpp/TTSCPP_LICENSE
Normal file
24
otherarch/ttscpp/TTSCPP_LICENSE
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
The original TTS.cpp is made by mmwillet, repo can be found at https://github.com/mmwillet/TTS.cpp
|
||||||
|
KoboldCpp uses a minimal implementation with some files removed.
|
||||||
|
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2023-2024 The ggml authors
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
96
otherarch/ttscpp/examples/cli/cli.cpp
Normal file
96
otherarch/ttscpp/examples/cli/cli.cpp
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
#include "tts.h"
|
||||||
|
#include "ttsargs.h"
|
||||||
|
#include "ttscommon.h"
|
||||||
|
#include "playback.h"
|
||||||
|
#include "vad.h"
|
||||||
|
#include "write_file.h"
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
class tts_timing_printer {
|
||||||
|
const int64_t start_us{[] {
|
||||||
|
ggml_time_init();
|
||||||
|
return ggml_time_us();
|
||||||
|
}()};
|
||||||
|
public:
|
||||||
|
~tts_timing_printer() {
|
||||||
|
const int64_t end_us{ggml_time_us()};
|
||||||
|
// Just a simple "total time" for now before adding "load" / "prompt eval" / "eval" from llama_print_timings
|
||||||
|
printf("total time = %.2f ms\n", (end_us - start_us) / 1000.0f);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
int main(int argc, const char ** argv) {
|
||||||
|
const tts_timing_printer _{};
|
||||||
|
float default_temperature = 1.0f;
|
||||||
|
int default_n_threads = std::max((int)std::thread::hardware_concurrency(), 1);
|
||||||
|
int default_top_k = 50;
|
||||||
|
int default_max_tokens = 0;
|
||||||
|
float default_repetition_penalty = 1.0f;
|
||||||
|
float default_top_p = 1.0f;
|
||||||
|
arg_list args;
|
||||||
|
args.add_argument(string_arg("--model-path", "(REQUIRED) The local path of the gguf model file for Parler TTS mini or large v1, Dia, or Kokoro.", "-mp", true));
|
||||||
|
args.add_argument(string_arg("--prompt", "(REQUIRED) The text prompt for which to generate audio in quotation markers.", "-p", true));
|
||||||
|
args.add_argument(string_arg("--save-path", "(OPTIONAL) The path to save the audio output to in a .wav format. Defaults to TTS.cpp.wav", "-sp", false, "TTS.cpp.wav"));
|
||||||
|
args.add_argument(float_arg("--temperature", "The temperature to use when generating outputs. Defaults to 1.0.", "-t", false, &default_temperature));
|
||||||
|
args.add_argument(int_arg("--n-threads", "The number of cpu threads to run generation with. Defaults to hardware concurrency. If hardware concurrency cannot be determined then it defaults to 1.", "-nt", false, &default_n_threads));
|
||||||
|
args.add_argument(int_arg("--topk", "(OPTIONAL) When set to an integer value greater than 0 generation uses nucleus sampling over topk nucleaus size. Defaults to 50.", "-tk", false, &default_top_k));
|
||||||
|
args.add_argument(float_arg("--repetition-penalty", "The by channel repetition penalty to be applied the sampled output of the model. defaults to 1.0.", "-r", false, &default_repetition_penalty));
|
||||||
|
args.add_argument(bool_arg("--use-metal", "(OPTIONAL) Whether to use metal acceleration", "-m"));
|
||||||
|
args.add_argument(bool_arg("--no-cross-attn", "(OPTIONAL) Whether to not include cross attention", "-ca"));
|
||||||
|
args.add_argument(string_arg("--conditional-prompt", "(OPTIONAL) A distinct conditional prompt to use for generating. If none is provided the preencoded prompt is used. '--text-encoder-path' must be set to use conditional generation.", "-cp", false));
|
||||||
|
args.add_argument(string_arg("--text-encoder-path", "(OPTIONAL) The local path of the text encoder gguf model for conditional generaiton.", "-tep", false));
|
||||||
|
args.add_argument(string_arg("--voice", "(OPTIONAL) The voice to use to generate the audio. This is only used for models with voice packs.", "-v", false, "af_alloy"));
|
||||||
|
args.add_argument(bool_arg("--vad", "(OPTIONAL) whether to apply voice inactivity detection (VAD) and strip silence form the end of the output (particularly useful for Parler TTS). By default, no VAD is applied.", "-va"));
|
||||||
|
args.add_argument(string_arg("--espeak-voice-id", "(OPTIONAL) The espeak voice id to use for phonemization. This should only be specified when the correct espeak voice cannot be inferred from the kokoro voice ( see MultiLanguage Configuration in the README for more info).", "-eid", false));
|
||||||
|
args.add_argument(int_arg("--max-tokens", "(OPTIONAL) The max audio tokens or token batches to generate where each represents approximates 11 ms of audio. Only applied to Dia generation. If set to zero as is its default then the default max generation size. Warning values under 15 are not supported.", "-mt", false, &default_max_tokens));
|
||||||
|
args.add_argument(float_arg("--top-p", "(OPTIONAL) the sum of probabilities to sample over. Must be a value between 0.0 and 1.0. Defaults to 1.0.", "-tp", false, &default_top_p));
|
||||||
|
register_play_tts_response_args(args);
|
||||||
|
args.parse(argc, argv);
|
||||||
|
if (args.for_help) {
|
||||||
|
args.help();
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
args.validate();
|
||||||
|
|
||||||
|
std::string conditional_prompt = args.get_string_param("--conditional-prompt");
|
||||||
|
std::string text_encoder_path = args.get_string_param("--text-encoder-path");
|
||||||
|
if (conditional_prompt.size() > 0 && text_encoder_path.size() <= 0) {
|
||||||
|
fprintf(stderr, "The '--text-encoder-path' must be specified when '--condtional-prompt' is passed.\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*args.get_float_param("--top-p") > 1.0f || *args.get_float_param("--top-p") <= 0.0f) {
|
||||||
|
fprintf(stderr, "The '--top-p' value must be between 0.0 and 1.0. It was set to '%.6f'.\n", *args.get_float_param("--top-p"));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
generation_configuration * config = new generation_configuration(
|
||||||
|
args.get_string_param("--voice"),
|
||||||
|
*args.get_int_param("--topk"),
|
||||||
|
*args.get_float_param("--temperature"),
|
||||||
|
*args.get_float_param("--repetition-penalty"),
|
||||||
|
!args.get_bool_param("--no-cross-attn"),
|
||||||
|
args.get_string_param("--espeak-voice-id"),
|
||||||
|
*args.get_int_param("--max-tokens"),
|
||||||
|
*args.get_float_param("--top-p"));
|
||||||
|
|
||||||
|
struct tts_runner * runner = runner_from_file(args.get_string_param("--model-path"), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"));
|
||||||
|
|
||||||
|
if (conditional_prompt.size() > 0) {
|
||||||
|
update_conditional_prompt(runner, text_encoder_path, conditional_prompt, true);
|
||||||
|
}
|
||||||
|
tts_response data;
|
||||||
|
|
||||||
|
generate(runner, args.get_string_param("--prompt"), &data, config);
|
||||||
|
if (data.n_outputs == 0) {
|
||||||
|
fprintf(stderr, "Got empty response for prompt, '%s'.\n", args.get_string_param("--prompt").c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (args.get_bool_param("--vad")) {
|
||||||
|
apply_energy_voice_inactivity_detection(data, runner->sampling_rate);
|
||||||
|
}
|
||||||
|
if (!play_tts_response(args, data, runner->sampling_rate)) {
|
||||||
|
write_audio_file(data, args.get_string_param("--save-path"), runner->sampling_rate);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
62
otherarch/ttscpp/examples/cli/playback.cpp
Normal file
62
otherarch/ttscpp/examples/cli/playback.cpp
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
#include <cstdint>
|
||||||
|
#include "playback.h"
|
||||||
|
|
||||||
|
#ifndef SDL2_INSTALL
|
||||||
|
void register_play_tts_response_args(arg_list & args) {
|
||||||
|
// Hide --play
|
||||||
|
}
|
||||||
|
|
||||||
|
bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#include "SDL.h"
|
||||||
|
void register_play_tts_response_args(arg_list & args) {
|
||||||
|
args.add_argument(bool_arg("--play", "(OPTIONAL) Whether to play back the audio immediately instead of saving it to file."));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate) {
|
||||||
|
if (!args.get_bool_param("--play")) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (SDL_Init(SDL_INIT_AUDIO)) {
|
||||||
|
fprintf(stderr, "SDL_INIT failed\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const SDL_AudioSpec desired{
|
||||||
|
.freq = static_cast<int>(sample_rate),
|
||||||
|
.format = AUDIO_F32,
|
||||||
|
.channels = 1,
|
||||||
|
.silence = 0,
|
||||||
|
.padding = 0,
|
||||||
|
.size = static_cast<unsigned>(data.n_outputs),
|
||||||
|
.callback = nullptr,
|
||||||
|
.userdata = nullptr,
|
||||||
|
};
|
||||||
|
const SDL_AudioDeviceID dev = SDL_OpenAudioDevice(nullptr, false, &desired, nullptr, 0);
|
||||||
|
if (!dev) {
|
||||||
|
fprintf(stderr, "SDL_OpenAudioDevice failed\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
SDL_PauseAudioDevice(dev, false);
|
||||||
|
fprintf(stdout, "Playing %ld samples of audio\n", data.n_outputs);
|
||||||
|
if (SDL_QueueAudio(dev, data.data, data.n_outputs * sizeof(data.data[0]))) {
|
||||||
|
fprintf(stderr, "SDL_QueueAudio failed\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
SDL_Event event;
|
||||||
|
while (SDL_GetQueuedAudioSize(dev)) {
|
||||||
|
if (SDL_PollEvent(&event) && event.type == SDL_QUIT) break;
|
||||||
|
SDL_Delay(100);
|
||||||
|
}
|
||||||
|
|
||||||
|
SDL_CloseAudioDevice(dev);
|
||||||
|
SDL_Quit();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#endif
|
7
otherarch/ttscpp/examples/cli/playback.h
Normal file
7
otherarch/ttscpp/examples/cli/playback.h
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ttsargs.h"
|
||||||
|
#include "ttscommon.h"
|
||||||
|
|
||||||
|
void register_play_tts_response_args(arg_list & args);
|
||||||
|
bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate);
|
68
otherarch/ttscpp/examples/cli/vad.cpp
Normal file
68
otherarch/ttscpp/examples/cli/vad.cpp
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
#include "vad.h"
|
||||||
|
|
||||||
|
float energy(float * chunk, int count) {
|
||||||
|
float en = 0.0f;
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
en += powf(chunk[i], 2.0f);
|
||||||
|
}
|
||||||
|
return en;
|
||||||
|
}
|
||||||
|
|
||||||
|
void apply_energy_voice_inactivity_detection(
|
||||||
|
tts_response & data,
|
||||||
|
float sample_rate,
|
||||||
|
int ms_per_frame,
|
||||||
|
int frame_threshold,
|
||||||
|
float normalized_energy_threshold,
|
||||||
|
int trailing_silent_frames,
|
||||||
|
int early_cutoff_seconds_threshold,
|
||||||
|
float early_cutoff_energy_threshold) {
|
||||||
|
int samples_per_frame = (int) (ms_per_frame * sample_rate / 1000.0f);
|
||||||
|
int n_frames = (int) (data.n_outputs / samples_per_frame);
|
||||||
|
int early_cuttoff_frames = (int)((early_cutoff_seconds_threshold * 1000) / ms_per_frame);
|
||||||
|
|
||||||
|
// for min-max normalization
|
||||||
|
float max_energy = 0.0f;
|
||||||
|
float min_energy = 0.0f;
|
||||||
|
float * energies = (float *) malloc(n_frames * sizeof(float));
|
||||||
|
int silent_frames = 0;
|
||||||
|
|
||||||
|
// compute the energies and the necessary elements for min-max normalization
|
||||||
|
for (int i = 0; i < n_frames; i++) {
|
||||||
|
float * chunk = data.data + i * samples_per_frame;
|
||||||
|
energies[i] = energy(chunk, samples_per_frame);
|
||||||
|
if (i == 0) {
|
||||||
|
max_energy = energies[i];
|
||||||
|
min_energy = energies[i];
|
||||||
|
} else if (energies[i] > max_energy) {
|
||||||
|
max_energy = energies[i];
|
||||||
|
} else if (energies[i] < min_energy) {
|
||||||
|
min_energy = energies[i];
|
||||||
|
}
|
||||||
|
if (energies[i] <= early_cutoff_energy_threshold) {
|
||||||
|
silent_frames++;
|
||||||
|
} else {
|
||||||
|
silent_frames = 0;
|
||||||
|
}
|
||||||
|
if (silent_frames >= early_cuttoff_frames) {
|
||||||
|
data.n_outputs = (i + trailing_silent_frames - silent_frames) * samples_per_frame;
|
||||||
|
free(energies);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int concurrent_silent_frames = 0;
|
||||||
|
|
||||||
|
for (int i = n_frames; i > 0; i--) {
|
||||||
|
float frame_energy = (energies[i-1] - min_energy) / (max_energy - min_energy);
|
||||||
|
if (frame_energy < normalized_energy_threshold) {
|
||||||
|
concurrent_silent_frames++;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (concurrent_silent_frames >= frame_threshold) {
|
||||||
|
data.n_outputs -= ((concurrent_silent_frames - trailing_silent_frames) * samples_per_frame);
|
||||||
|
}
|
||||||
|
free(energies);
|
||||||
|
}
|
21
otherarch/ttscpp/examples/cli/vad.h
Normal file
21
otherarch/ttscpp/examples/cli/vad.h
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
|
#include "ttscommon.h"
|
||||||
|
|
||||||
|
float energy(float * chunk, int count);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This function is used to trim trailing silence at the end of audio data within the tts_response struct.
|
||||||
|
* It detects silence by min-max normalizing energy and trimming frames which fall under a relative threshold.
|
||||||
|
*/
|
||||||
|
void apply_energy_voice_inactivity_detection(
|
||||||
|
tts_response & data,
|
||||||
|
float sample_rate = 44100.0f, // the sample rate of the audio
|
||||||
|
int ms_per_frame = 10, // the audio time per frame
|
||||||
|
int frame_threshold = 20, // the number of trailing empty frames upon which silence is clipped.
|
||||||
|
float normalized_energy_threshold = 0.01f, // the normalized threshold to determine a silent frame
|
||||||
|
int trailing_silent_frames = 5, // the number of frames of silence to allow
|
||||||
|
int early_cutoff_seconds_threshold = 3, // the number of seconds of complete silence before terminating and cutting audio early
|
||||||
|
float early_cutoff_energy_threshold = 0.1 // the energy threshold for treating a frame as silent for early cutoff
|
||||||
|
);
|
12
otherarch/ttscpp/examples/cli/write_file.cpp
Normal file
12
otherarch/ttscpp/examples/cli/write_file.cpp
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
#include <cstdint>
|
||||||
|
#include "write_file.h"
|
||||||
|
#include "audio_file.h"
|
||||||
|
|
||||||
|
void write_audio_file(const tts_response & data, std::string path, float sample_rate) {
|
||||||
|
fprintf(stdout, "Writing audio file: %s\n", path.c_str());
|
||||||
|
AudioFile<float> file;
|
||||||
|
file.setSampleRate(sample_rate);
|
||||||
|
file.samples[0] = std::vector(data.data, data.data + data.n_outputs);
|
||||||
|
file.save(path, AudioFileFormat::Wave);
|
||||||
|
file.printSummary();
|
||||||
|
}
|
5
otherarch/ttscpp/examples/cli/write_file.h
Normal file
5
otherarch/ttscpp/examples/cli/write_file.h
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ttscommon.h"
|
||||||
|
|
||||||
|
void write_audio_file(const tts_response & data, std::string path = "TTS.cpp.wav", float sample_rate = 44100.0f);
|
31
otherarch/ttscpp/examples/phonemize/phonemize.cpp
Normal file
31
otherarch/ttscpp/examples/phonemize/phonemize.cpp
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
#include "phonemizer.h"
|
||||||
|
#include "ttsargs.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
int main(int argc, const char ** argv) {
|
||||||
|
arg_list args;
|
||||||
|
args.add_argument(string_arg("--phonemizer-path", "(OPTIONAL) The local path of the gguf phonemiser file for TTS.cpp phonemizer. This is required if not using espeak.", "-mp"));
|
||||||
|
args.add_argument(string_arg("--prompt", "(REQUIRED) The text prompt to phonemize.", "-p", true));
|
||||||
|
args.add_argument(bool_arg("--use-espeak", "(OPTIONAL) Whether to use espeak to generate phonems.", "-ue"));
|
||||||
|
args.add_argument(string_arg("--espeak-voice-id", "(OPTIONAL) The voice id to use for espeak phonemization. Defaults to 'gmw/en-US'.", "-eid", false, "gmw/en-US"));
|
||||||
|
args.parse(argc, argv);
|
||||||
|
if (args.for_help) {
|
||||||
|
args.help();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
args.validate();
|
||||||
|
if (!args.get_bool_param("--use-espeak") && args.get_string_param("--phonemizer-path") == "") {
|
||||||
|
fprintf(stderr, "The '--phonemizer-path' must be specified when '--use-espeak' is not true.\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
phonemizer * ph;
|
||||||
|
if (args.get_bool_param("--use-espeak")) {
|
||||||
|
ph = espeak_phonemizer(false, args.get_string_param("--espeak-voice-id"));
|
||||||
|
} else {
|
||||||
|
ph = phonemizer_from_file(args.get_string_param("--phonemizer-path"));
|
||||||
|
}
|
||||||
|
std::string response = ph->text_to_phonemes(args.get_string_param("--prompt"));
|
||||||
|
fprintf(stdout, "%s\n", response.c_str());
|
||||||
|
return 0;
|
||||||
|
}
|
1815
otherarch/ttscpp/include/audio_file.h
Normal file
1815
otherarch/ttscpp/include/audio_file.h
Normal file
File diff suppressed because it is too large
Load diff
533
otherarch/ttscpp/include/phonemizer.h
Normal file
533
otherarch/ttscpp/include/phonemizer.h
Normal file
|
@ -0,0 +1,533 @@
|
||||||
|
#ifndef phonemizer_h
|
||||||
|
#define phonemizer_h
|
||||||
|
|
||||||
|
#ifdef ESPEAK_INSTALL
|
||||||
|
# ifdef ESPEAK_INSTALL_LOCAL
|
||||||
|
# include "speak_lib.h"
|
||||||
|
# else
|
||||||
|
# include <espeak-ng/speak_lib.h>
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <map>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include "tokenizer.h"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
static const std::string ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
||||||
|
static const std::string ACCENTED_A = "àãâäáåÀÃÂÄÁÅ";
|
||||||
|
static const std::string ACCENTED_C = "çÇ";
|
||||||
|
static const std::string ACCENTED_E = "èêëéÈÊËÉ";
|
||||||
|
static const std::string ACCENTED_I = "ìîïíÌÎÏÍ";
|
||||||
|
static const std::string ACCENTED_N = "ñÑ";
|
||||||
|
static const std::string ACCENTED_O = "òõôöóøÒÕÔÖÓØ";
|
||||||
|
static const std::string ACCENTED_U = "ùûüúÙÛÜÚ";
|
||||||
|
static const std::string COMMON_ACCENTED_CHARACTERS = ACCENTED_A + ACCENTED_C + ACCENTED_E + ACCENTED_I + ACCENTED_N + ACCENTED_O + ACCENTED_U;
|
||||||
|
static const std::string WORD_CHARACTERS = ALPHABET + "." + COMMON_ACCENTED_CHARACTERS;
|
||||||
|
static const std::string NON_CLAUSE_WORD_CHARACTERS = ALPHABET + COMMON_ACCENTED_CHARACTERS + "'";
|
||||||
|
static const std::string VOWELS = "aeiouy";
|
||||||
|
static const std::unordered_set<std::string> ONE_LETTER_WORDS = {
|
||||||
|
"a",
|
||||||
|
"i",
|
||||||
|
};
|
||||||
|
/*
|
||||||
|
* The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words
|
||||||
|
* via several criteria:
|
||||||
|
* 1. All non-EN-US words have been removed
|
||||||
|
* 2. All three letter acronyms have been removed (as these lists are used to identify acronyms)
|
||||||
|
* 3. All archaic, deprecated, or poetic words have been removed.
|
||||||
|
* 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the
|
||||||
|
* last 10 years).
|
||||||
|
*
|
||||||
|
* After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US
|
||||||
|
* vernacular but was not identified as of American origin was reintroduced into the sets below.
|
||||||
|
*/
|
||||||
|
static const std::unordered_set<std::string> TWO_LETTER_WORDS = {
|
||||||
|
"ab", "ah", "am", "an", "as", "at", "aw", "ax", "ay", "be", "bo", "br",
|
||||||
|
"by", "do", "eh", "er", "ew", "ex", "go", "ha", "he", "hi", "hm", "ho",
|
||||||
|
"id", "if", "in", "is", "it", "la", "lo", "ma", "me", "mm", "my", "na",
|
||||||
|
"no", "of", "oh", "oi", "on", "oo", "or", "ow", "ox", "oy", "pa", "qi",
|
||||||
|
"re", "sh", "so", "to", "uh", "um", "un", "up", "us", "we", "wo", "ya",
|
||||||
|
"ye", "yo",
|
||||||
|
};
|
||||||
|
static const std::unordered_set<std::string> THREE_LETTER_WORDS = {
|
||||||
|
"aah", "abs", "aby", "ace", "ach", "ack", "act", "add", "ado", "ads", "aft", "age",
|
||||||
|
"ago", "aha", "ahi", "aid", "ail", "aim", "air", "alb", "ale", "all", "alp", "alt",
|
||||||
|
"ama", "amp", "and", "ant", "any", "ape", "app", "apt", "arc", "are", "arf", "ark",
|
||||||
|
"arm", "art", "ash", "ask", "asp", "ass", "ate", "awe", "axe", "aye", "baa", "bad",
|
||||||
|
"bae", "bag", "bah", "bam", "ban", "bao", "bap", "bar", "bat", "bay", "bed", "bee",
|
||||||
|
"beg", "bet", "bez", "bib", "bid", "big", "bin", "bio", "bis", "bit", "biz", "boa",
|
||||||
|
"bod", "bog", "boi", "boo", "bop", "bot", "bow", "box", "boy", "bra", "bro", "brr",
|
||||||
|
"bub", "bud", "bug", "bum", "bun", "bur", "bus", "but", "buy", "bye", "cab", "caf",
|
||||||
|
"cam", "can", "cap", "car", "cat", "caw", "chi", "cig", "cis", "cly", "cob", "cod",
|
||||||
|
"cog", "col", "con", "coo", "cop", "cos", "cot", "cow", "cox", "coy", "cry", "cub",
|
||||||
|
"cue", "cum", "cup", "cur", "cut", "cuz", "dab", "dad", "dag", "dal", "dam", "dap",
|
||||||
|
"das", "daw", "day", "deb", "def", "del", "den", "dep", "dew", "dib", "did", "die",
|
||||||
|
"dif", "dig", "dim", "din", "dip", "dis", "div", "doc", "doe", "dog", "doh", "dom",
|
||||||
|
"don", "dos", "dot", "dox", "dry", "dub", "dud", "due", "dug", "duh", "dum", "dun",
|
||||||
|
"duo", "dup", "dur", "dye", "ear", "eat", "ebb", "eco", "eek", "eel", "egg", "ego",
|
||||||
|
"elf", "elk", "elm", "emo", "emu", "end", "eon", "era", "err", "est", "eve", "eww",
|
||||||
|
"eye", "fab", "fad", "fae", "fag", "fah", "fam", "fan", "fap", "far", "fat", "fav",
|
||||||
|
"fax", "fay", "fed", "fee", "feh", "fem", "fen", "few", "fey", "fez", "fib", "fid",
|
||||||
|
"fig", "fin", "fir", "fit", "fix", "flu", "fly", "fob", "foe", "fog", "foo", "fop",
|
||||||
|
"for", "fox", "fro", "fry", "fub", "fun", "fur", "gab", "gad", "gag", "gal", "gam",
|
||||||
|
"gap", "gas", "gay", "gee", "gel", "gem", "gen", "geo", "get", "gib", "gid", "gif",
|
||||||
|
"gig", "gin", "gip", "git", "goa", "gob", "god", "goo", "gor", "got", "gov", "grr",
|
||||||
|
"gum", "gun", "gup", "gut", "guy", "gym", "gyp", "had", "hag", "hah", "haj", "ham",
|
||||||
|
"hap", "has", "hat", "haw", "hay", "heh", "hem", "hen", "her", "hes", "hew", "hex",
|
||||||
|
"hey", "hic", "hid", "him", "hip", "his", "hit", "hmm", "hod", "hoe", "hog", "hop",
|
||||||
|
"hot", "how", "hoy", "hub", "hue", "hug", "huh", "hum", "hun", "hup", "hut", "ice",
|
||||||
|
"ich", "ick", "icy", "ids", "ifs", "ill", "imp", "ink", "inn", "int", "ion", "ire",
|
||||||
|
"irk", "ism", "its", "ivy", "jab", "jam", "jap", "jar", "jaw", "jay", "jet", "jib",
|
||||||
|
"jig", "jin", "job", "joe", "jog", "jot", "joy", "jug", "jut", "kat", "kaw", "kay",
|
||||||
|
"ked", "keg", "key", "kid", "kin", "kit", "kob", "koi", "lab", "lac", "lad", "lag",
|
||||||
|
"lam", "lap", "law", "lax", "lay", "led", "leg", "lei", "lek", "let", "lev", "lex",
|
||||||
|
"lib", "lid", "lie", "lip", "lit", "lob", "log", "loo", "lop", "lot", "low", "lug",
|
||||||
|
"luv", "lye", "mac", "mad", "mag", "mam", "man", "map", "mar", "mat", "maw", "max",
|
||||||
|
"may", "med", "meg", "meh", "mel", "men", "met", "mew", "mib", "mid", "mig", "mil",
|
||||||
|
"mix", "mmm", "mob", "mod", "mog", "mol", "mom", "mon", "moo", "mop", "mow", "mud",
|
||||||
|
"mug", "mum", "mut", "nab", "nag", "nah", "nan", "nap", "nat", "naw", "nay", "nef",
|
||||||
|
"neg", "net", "new", "nib", "nil", "nip", "nit", "nob", "nod", "nog", "noh", "nom",
|
||||||
|
"non", "noo", "nor", "not", "now", "noy", "nth", "nub", "nun", "nut", "nyx", "oaf",
|
||||||
|
"oak", "oar", "oat", "oba", "obs", "oca", "odd", "ode", "off", "oft", "ohm", "oil",
|
||||||
|
"oke", "old", "one", "oof", "ooh", "oom", "oop", "ops", "opt", "orb", "orc", "ore",
|
||||||
|
"org", "ort", "oud", "our", "out", "ova", "owe", "owl", "own", "oxy", "pad", "pah",
|
||||||
|
"pal", "pan", "par", "pas", "pat", "paw", "pax", "pay", "pea", "pec", "pee", "peg",
|
||||||
|
"pen", "pep", "per", "pes", "pet", "pew", "phi", "pho", "pht", "pic", "pie", "pig",
|
||||||
|
"pin", "pip", "pit", "pix", "ply", "pod", "poi", "pol", "poo", "pop", "pos", "pot",
|
||||||
|
"pow", "pox", "pre", "pro", "pry", "psi", "pst", "pub", "pug", "puh", "pul", "pun",
|
||||||
|
"pup", "pur", "pus", "put", "pwn", "pya", "pyx", "qat", "rad", "rag", "rai", "raj",
|
||||||
|
"ram", "ran", "rap", "rat", "raw", "ray", "reb", "rec", "red", "ref", "reg", "rem",
|
||||||
|
"res", "ret", "rex", "rez", "rho", "ria", "rib", "rid", "rig", "rim", "rin", "rip",
|
||||||
|
"rob", "roc", "rod", "roe", "rom", "rot", "row", "rub", "rue", "rug", "rum", "run",
|
||||||
|
"rut", "rya", "rye", "sac", "sad", "sag", "sal", "sap", "sat", "saw", "sax", "say",
|
||||||
|
"sea", "sec", "see", "seg", "sen", "set", "sew", "sex", "she", "shh", "shy", "sib",
|
||||||
|
"sic", "sig", "sim", "sin", "sip", "sir", "sis", "sit", "six", "ska", "ski", "sky",
|
||||||
|
"sly", "sob", "sod", "sol", "som", "son", "sop", "sot", "sou", "sow", "sox", "soy",
|
||||||
|
"spa", "spy", "sty", "sub", "sue", "sum", "sun", "sup", "sus", "tab", "tad", "tag",
|
||||||
|
"tai", "taj", "tan", "tao", "tap", "tar", "tat", "tau", "tav", "taw", "tax", "tea",
|
||||||
|
"tec", "tee", "teg", "tel", "ten", "tet", "tex", "the", "tho", "thy", "tic", "tie",
|
||||||
|
"til", "tin", "tip", "tis", "tit", "tod", "toe", "ton", "too", "top", "tor", "tot",
|
||||||
|
"tow", "toy", "try", "tsk", "tub", "tug", "tui", "tum", "tun", "tup", "tut", "tux",
|
||||||
|
"two", "ugh", "umm", "ump", "uni", "ups", "urd", "urn", "use", "uta", "ute", "utu",
|
||||||
|
"uwu", "vac", "van", "var", "vas", "vat", "vav", "vax", "vee", "veg", "vet", "vex",
|
||||||
|
"via", "vid", "vie", "vig", "vim", "vol", "vow", "vox", "vug", "wad", "wag", "wan",
|
||||||
|
"wap", "war", "was", "wat", "wax", "way", "web", "wed", "wee", "wen", "wet", "wey",
|
||||||
|
"who", "why", "wig", "win", "wit", "wiz", "woe", "wok", "won", "woo", "wop", "wow",
|
||||||
|
"wry", "wud", "wus", "yag", "yah", "yak", "yam", "yap", "yar", "yaw", "yay", "yea",
|
||||||
|
"yeh", "yen", "yep", "yes", "yet", "yew", "yin", "yip", "yok", "you", "yow", "yum",
|
||||||
|
"yup", "zag", "zap", "zax", "zed", "zee", "zen", "zig", "zip", "zit", "zoo", "zzz"
|
||||||
|
};
|
||||||
|
|
||||||
|
// Maps each lowercase English letter to the IPA phonemes of its spoken name
// (e.g. 'b' -> "bee"); used when spelling out acronyms letter by letter.
static const std::map<const char, std::string> LETTER_PHONEMES = {
    {'a', "ˈeɪ"},
    {'b', "bˈiː"},
    {'c', "sˈiː"},
    {'d', "dˈiː"},
    {'e', "ˈiː"},
    {'f', "ˈɛf"},
    {'g', "dʒˈiː"}, // fixed: was a duplicate {'j', ...} entry, leaving 'g' missing
    {'h', "ˈeɪtʃ"},
    {'i', "ˈaɪ"},
    {'j', "dʒˈeɪ"},
    {'k', "kˈeɪ"},
    {'l', "ˈɛl"},
    {'m', "ˈɛm"},
    {'n', "ˈɛn"},
    {'o', "ˈoʊ"},
    {'p', "pˈiː"},
    {'q', "kjˈuː"},
    {'r', "ˈɑːɹ"},
    {'s', "ˈɛs"},
    {'t', "tˈiː"},
    {'u', "jˈuː"},
    {'v', "vˈiː"},
    {'w', "dˈʌbəljˌuː"},
    {'x', "ˈɛks"},
    {'y', "wˈaɪ"},
    {'z', "zˈiː"}
};
|
||||||
|
|
||||||
|
// Whitespace characters that delimit tokens.
static const std::string SPACE_CHARACTERS = " \t\f\n";
// Punctuation that breaks tokens without producing any spoken output.
static const std::string NOOP_BREAKS = "{}[]():;,\"";
// Punctuation that terminates a clause.
static const std::string CLAUSE_BREAKS = ".!?";

// Phonemes and magnitudes used when verbalizing numbers (e.g. 2000000 -> "two million").
static const std::string TRILLION_PHONEME = "tɹˈɪliən";
static const long long int TRILLION = 1000000000000;
static const std::string BILLION_PHONEME = "bˈɪliən";
static const int BILLION = 1000000000;
static const std::string MILLION_PHONEME = "mˈɪliən";
static const int MILLION = 1000000;
// spoken decimal point
static const std::string POINT_PHONEME = "pˈɔɪnt";
static const std::string THOUSAND_PHONEME = "θˈaʊzənd";
static const std::string HUNDRED_PHONEME = "hˈʌndɹɪd";
static const std::string NUMBER_CHARACTERS = "0123456789";
// Characters that may appear inside a numeric run (digit grouping and decimals).
static const std::string COMPATIBLE_NUMERICS = NUMBER_CHARACTERS + "., ";
// 999,999,999,999,999 — the largest value the trillion-scale verbalizer covers.
static const long long int LARGEST_PRONOUNCABLE_NUMBER = 999999999999999;
|
||||||
|
|
||||||
|
// IPA phonemes for the numbers 0 through 19, indexed by value.
static const std::vector<std::string> NUMBER_PHONEMES = {
    "zˈiəɹoʊ",
    "wˈʌn",
    "tˈuː",
    "θɹˈiː",
    "fˈɔːɹ",
    "fˈaɪv",
    "sˈɪks",
    "sˈɛvən",
    "ˈeɪt",
    "nˈaɪn",
    "tˈɛn",
    "ɪlˈɛvən",
    "twˈɛlv",
    "θˈɜːtiːn",
    "fˈɔːɹtiːn",
    "fˈɪftiːn",
    "sˈɪkstiːn",
    "sˈɛvəntˌiːn",
    "ˈeɪtiːn",
    "nˈaɪntiːn"
};
|
||||||
|
|
||||||
|
// IPA phonemes for the tens twenty through ninety; index i maps to (i + 2) * 10.
static const std::vector<std::string> SUB_HUNDRED_NUMBERS = {
    "twˈɛnti",
    "θˈɜːɾi",
    "fˈɔːɹɾi",
    "fˈɪfti",
    "sˈɪksti",
    "sˈɛvənti",
    "ˈeɪɾi",
    "nˈaɪnti"
};
|
||||||
|
|
||||||
|
// Symbols that are replaced wholesale with a spoken IPA phrase.
// Keys are utf-8 strings because many of the symbols are multi-byte characters.
static const std::map<std::string, std::string> REPLACEABLE = {
    {"*", "ˈæstɚɹˌɪsk"},
    {"+", "plˈʌs"},
    {"&", "ˈænd"},
    {"%", "pɚsˈɛnt"},
    {"@", "ˈæt"},
    {"#", "hˈæʃ"},
    {"$", "dˈɑːlɚ"},
    {"~", "tˈɪldə"},
    {"¢", "sˈɛnts"},
    {"£", "pˈaʊnd"},
    {"¥", "jˈɛn"},
    {"₨", "ɹˈuːpiː"},
    {"€", "jˈʊɹɹoʊz"},
    {"₹", "ɹˈuːpiː"},
    {"♯", "ʃˈɑːɹp"},
    {"♭", "flˈæt"},
    {"≈", "ɐpɹˈɑːksɪmətli"},
    {"≠", "nˈɑːt ˈiːkwəl tʊ"},
    {"≤", "lˈɛs ɔːɹ ˈiːkwəl tʊ"},
    {"≥", "ɡɹˈeɪɾɚɹ ɔːɹ ˈiːkwəl tʊ"},
    {">", "ɡɹˈeɪɾɚ ðɐn"},
    {"<", "lˈɛs ðɐn"},
    {"=", "ˈiːkwəlz"},
    {"±", "plˈʌs ɔːɹ mˈaɪnəs"},
    {"×", "tˈaɪmz"},
    {"÷", "dᵻvˈaɪdᵻd bˈaɪ"},
    {"℞", "pɹɪskɹˈɪpʃən"},
    {"№", "nˈuːməˌoʊ"},
    {"°", "dᵻɡɹˈiːz"},
    {"∴", "ðˈɛɹfɔːɹ"},
    {"∵", "bɪkˈʌz"},
    {"√", "skwˈɛɹ ɹˈuːt"},
    {"∛", "kjˈuːb ɹˈuːt"},
    {"∑", "sˈʌm sˈaɪn"},
    {"∂", "dˈɛltə"},
    {"←", "lˈɛft ˈæɹoʊ"},
    {"↑", "ˈʌp ˈæɹoʊ"},
    {"→", "ɹˈaɪt ˈæɹoʊ"},
    {"↓", "dˈaʊn ˈæɹoʊ"},
    {"−", "mˈaɪnəs"},
    {"¶", "pˈæɹəɡɹˌæf"},
    {"§", "sˈɛkʃən"},
};
|
||||||
|
|
||||||
|
// Characters that may appear in a roman numeral (upper or lower case).
static const std::string ROMAN_NUMERAL_CHARACTERS = "MDCLXVImdclxvi";
// Lookup from a lower-cased roman numeral chunk to its value. Chunks are listed
// per decimal digit position (thousands, hundreds, tens, ones) so a numeral can
// be decomposed and summed position by position.
static const std::map<std::string, int> ROMAN_NUMERALS = {
    {"m", 1000},
    {"mm", 2000},
    {"mmm", 3000},
    {"c", 100},
    {"cc", 200},
    {"ccc", 300},
    {"cd", 400},
    {"d", 500},   // fixed: "d" (500) was missing while all other chunks were present
    {"cm", 900},
    {"dc", 600},
    {"dcc", 700},
    {"dccc", 800},
    {"x", 10},
    {"xx", 20},
    {"xxx", 30},
    {"xl", 40},
    {"l", 50},
    {"lx", 60},
    {"lxx", 70},
    {"lxxx", 80},
    {"xc", 90},
    {"i", 1},
    {"ii", 2},
    {"iii", 3},
    {"iv", 4},
    {"v", 5},
    {"vi", 6},
    {"vii", 7},
    {"viii", 8},
    {"ix", 9},
};
|
||||||
|
|
||||||
|
// Phoneme fragments appended for English contraction suffixes
// (e.g. the "'ve" of "could've" -> "əv").
static const std::map<std::string, std::string> CONTRACTION_PHONEMES = {
    {"re", "r"},
    {"ve", "əv"},
    {"ll", "l"},
    {"d", "d"},
    {"t", "t"},
};
|
||||||
|
|
||||||
|
// characters that Espeak-ng treats as stopping tokens.
// NOTE(review): non-const global — confirm nothing mutates it at runtime.
static std::string STOPPING_TOKENS = ".,:;!?";
|
||||||
|
|
||||||
|
#ifdef ESPEAK_INSTALL
|
||||||
|
/**
 * espeak-ng uses globals to persist and manage its state so it is not compatible with
 * threaded parallelism (https://github.com/espeak-ng/espeak-ng/issues/1527).
 * This singleton acts as a mutex wrapped provider for all espeak phonemization methods such
 * that multiple instances of the kokoro_runner can be initialized and called in parallel.
 */
class espeak_wrapper {
private:
    // lazily created singleton instance and the lock serializing all espeak calls
    // (definitions live in the corresponding .cpp, not in view here)
    static espeak_wrapper * instance;
    static std::mutex mutex;

protected:
    espeak_wrapper() {};
    ~espeak_wrapper() {};
    // presumably set once initialize() has run so espeak is only initialized once — confirm
    bool espeak_initialized = false;

public:
    // singletons aren't copyable
    espeak_wrapper(espeak_wrapper &other) = delete;

    // singletons aren't assignable
    void operator=(const espeak_wrapper &) = delete;

    static espeak_wrapper * get_instance();
    // thin wrappers over the matching espeak-ng C API entry points
    const espeak_VOICE ** list_voices();
    espeak_ERROR set_voice(const char * voice_code);
    const char * text_to_phonemes(const void ** textptr, int textmode, int phonememode);
    void initialize(espeak_AUDIO_OUTPUT output, int buflength, const char * path, int options);
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Result codes for phoneme dictionary lookups.
enum lookup_code {
    SUCCESS = 100,
    SUCCESS_PARTIAL = 101,   // presumably: only part of the queried text matched — confirm
    FAILURE_UNFOUND = 200,   // word not present in the dictionary
    FAILURE_PHONETIC = 201,  // caller should fall back to phonetic (grapheme) conversion
};
|
||||||
|
|
||||||
|
// Output alphabet produced by the phonemizer.
enum phoneme_type {
    IPA = 1,
    ESPEAK_PHONEMES = 2,  // espeak's own phoneme notation (requires the espeak backend)
};
|
||||||
|
|
||||||
|
// Which backend performs phonemization.
enum phonemizer_type {
    TTS_PHONEMIZER = 0,  // the built-in dictionary + rule based phonemizer
    ESPEAK = 1,          // delegate to espeak-ng (only available with ESPEAK_INSTALL)
};
|
||||||
|
|
||||||
|
std::string parse_voice_code(std::string voice_code);
|
||||||
|
void update_voice(std::string voice_code);
|
||||||
|
const std::unordered_set<std::string> inline_combine_sets(const std::vector<std::unordered_set<std::string>> sets);
|
||||||
|
int upper_count(std::string word);
|
||||||
|
bool is_all_upper(std::string word);
|
||||||
|
bool is_roman_numeral(char letter);
|
||||||
|
bool can_be_roman_numeral(std::string word);
|
||||||
|
bool is_alphabetic(char letter);
|
||||||
|
bool is_numeric(char letter);
|
||||||
|
|
||||||
|
|
||||||
|
std::string replace_accents(std::string word);
|
||||||
|
std::string build_subthousand_phoneme(int value);
|
||||||
|
std::string build_number_phoneme(long long int remainder);
|
||||||
|
|
||||||
|
// The conditions struct is used to track and describe stateful criteria while converting text to phonemes.
struct conditions {
    // Flags describing the most recently processed token. Exact update rules live
    // in the method definitions, which are not part of this header.
    bool hyphenated = false;
    bool was_all_capitalized = false;
    bool was_word = false;
    bool was_punctuated_acronym = false;
    bool was_number = false;
    bool beginning_of_clause = true;

    void reset_for_clause_end();
    void reset_for_space();
    // allow_for_upper_check presumably gates the all-caps bookkeeping — confirm in the definition
    void update_for_word(std::string word,bool allow_for_upper_check = true);
};
|
||||||
|
|
||||||
|
/*
 * The corpus struct is simply a small wrapper class that is used to perform simple look forward and backwards in the text
 * which is being phonemized. This can be used to discern how to convert chunks of text in a consistent and protective fashion
 * in order to accurately phonemize complicated text.
 */
struct corpus {
    corpus(const char * text, size_t size): size(size), text(text) {};
    size_t location = 0;  // byte offset of the read cursor into text
    size_t size;          // total byte length of text
    const char * text;    // non-owning pointer to the raw utf-8 input

    /*
     * These all return strings because we are parsing in utf-8. As such the count variables passed to all the functions do not represent
     * the byte offset to pull to but rather the number of full utf-8 characters to pull (this can include 2, 3, and 4 byte characters).
     *
     * NOTE(review): per-method semantics below are inferred from the names; the
     * definitions are not in this header — confirm against the implementation.
     */
    std::string next(int count = 1);                  // peek forward from the cursor
    std::string last(int count = 1);                  // peek backward from the cursor
    std::string pop(int count = 1);                   // consume characters, advancing the cursor
    std::string after(int after = 1, int count = 1);  // peek past an offset

    // this is used for popping byte count rather than unique character count.
    std::string size_pop(size_t pop_size);

    std::string next_in(std::string val, bool* has_accent = nullptr);
    std::string pop_in(std::string val);

    std::string after_until(int after, std::string val);
};
|
||||||
|
|
||||||
|
/*
 * The TTS phonemizer works by splitting each word into distinct graphemes, and for each grapheme the phonemizer will look at the grapheme that came
 * before, after, and for any word specific exceptions in order to compile a
 * (NOTE(review): comment truncated in the original source.)
 */
struct phonemizer_rule {
    // Owns its child rules: recursively frees the subtree below this node.
    ~phonemizer_rule() {
        for (auto it : rules) {
            delete it.second;
        }
    }

    // nested rules keyed by context string, walked recursively by lookup_rule
    std::unordered_map<std::string, phonemizer_rule*> rules;
    std::string value = "";  // phoneme emitted when this node is the final match
    std::string lookup_rule(std::vector<std::string> & keys, int index);
};
|
||||||
|
|
||||||
|
typedef std::unordered_map<std::string, phonemizer_rule*> rules_lookup;
|
||||||
|
|
||||||
|
// Grapheme-to-phoneme fallback used when a word cannot be resolved by dictionary lookup.
struct word_phonemizer {
    word_phonemizer(struct single_pass_tokenizer * tokenizer): tokenizer(tokenizer) {};
    // Owns both the rule tries and the tokenizer.
    ~word_phonemizer() {
        for (auto it : rules) {
            delete it.second;
        }
        delete tokenizer;
    }

    struct single_pass_tokenizer * tokenizer;
    rules_lookup rules;  // top-level phonemizer_rule tries (owned; freed above)

    std::string phonemize(std::string word);
    void add_rule(std::vector<std::string> keys, std::string phoneme);

private:
    // Resolves the phoneme for current_token given its neighboring tokens within word.
    std::string lookup_rule(std::string word, std::string current_token, std::string last_token, std::string next_token);
};
|
||||||
|
|
||||||
|
struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta);
|
||||||
|
|
||||||
|
/*
 * The general translation approach that espeak uses is to lookup words in the dictionary and return a list of possible matches per lookup.
 * Each match contains flags which describe the match's conditions and limitations and optionally a pronunciation. When a pronunciation is not returned,
 * it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a
 * token representation of a different word (e.g. with numbers).
 *
 * Since it does not make sense to have the core lexer reperform this lookup operation with represented words or via distinct languages, those behaviors
 * are managed by the lookup operation itself and thus the lookup operation will only fail when phonetic or acronym content should be produced.
 */
struct dictionary_response {
    dictionary_response(lookup_code code, std::string value = ""): code(code), value(value) {}
    std::string value;  // the pronunciation for a successful lookup ("" otherwise)
    lookup_code code;   // see lookup_code
    // contextual constraints on when this candidate counts as a match:
    bool expects_to_be_proceeded_by_number = false;
    bool not_at_clause_end = false;
    bool not_at_clause_start = false;

    // text that must follow the matched word for this candidate to apply
    std::string after_match = "";

    bool is_successful();
    bool is_match(corpus* text, conditions* flags);
};
|
||||||
|
|
||||||
|
dictionary_response * response_from_string(std::string value, std::string key);
|
||||||
|
|
||||||
|
// Word lookup table; each key maps to several candidate responses whose flags
// determine which (if any) matches in the current context.
struct phoneme_dictionary {
    std::unordered_map<std::string, std::vector<dictionary_response*>> lookup_map;
    dictionary_response* lookup(corpus* text,std::string value, conditions* flags);
    // Shared sentinel responses returned when lookup fails.
    // NOTE(review): these two and the lookup_map entries are heap allocated and
    // there is no destructor freeing them — confirm the intended lifetime
    // (likely leaked by design for process-lifetime dictionaries).
    dictionary_response* not_found_response = new dictionary_response(FAILURE_UNFOUND);
    dictionary_response* phonetic_fallback_response = new dictionary_response(FAILURE_PHONETIC);
};
|
||||||
|
|
||||||
|
struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta);
|
||||||
|
|
||||||
|
/*
 * In general, I would like to avoid requiring the installation of otherwise broad and technically complicated libraries,
 * like espeak, especially when they are only being used for a small portion of their overall functionality. While avoiding these
 * requirements will keep the default installation cost of TTS.cpp down, it is also unlikely that TTS.cpp will support
 * the level of variability in phonemization that espeak currently does. In this regard, I have chosen to optionally support usage of
 * espeak. As such, the phonemizer struct described below will support simple text to IPA phoneme functionality out of the box,
 * while also optionally acting as an interface for espeak phonemization.
 *
 * Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context
 * views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves
 * effectively like a simple router lexer. It will only support utf-8 encoded text and english IPA conversion.
 */
struct phonemizer {
    phonemizer(struct phoneme_dictionary * dict, struct word_phonemizer * phonetic_phonemizer, bool preserve_punctuation = true): dict(dict), phonetic_phonemizer(phonetic_phonemizer), preserve_punctuation(preserve_punctuation) {};
    // Takes ownership of both dict and phonetic_phonemizer.
    ~phonemizer() {
        delete dict;
        delete phonetic_phonemizer;
    }
    // union of the one/two/three letter english word sets, for membership checks
    const std::unordered_set<std::string> small_english_words = inline_combine_sets({THREE_LETTER_WORDS, TWO_LETTER_WORDS, ONE_LETTER_WORDS});
    std::string separator = " ";             // inserted between emitted phoneme chunks
    phoneme_type phoneme_mode = IPA;
    phonemizer_type mode = TTS_PHONEMIZER;   // built-in router lexer or espeak delegation
    bool preserve_punctuation = true;

    struct phoneme_dictionary * dict;  // word level lookup (owned)

    struct word_phonemizer * phonetic_phonemizer;  // grapheme level fallback (owned)

    // Entry points; all overloads funnel utf-8 text into phoneme output.
    void text_to_phonemes(std::string text, std::string* output);
    void text_to_phonemes(const char * text, size_t size, std::string* output);
    std::string text_to_phonemes(std::string text);
    std::string text_to_phonemes(const char * text, size_t size);

#ifdef ESPEAK_INSTALL
    std::string espeak_text_to_phonemes(const char * text);
#endif

    bool process_word(corpus* text, std::string* output, std::string word, conditions * flags, bool has_accent = false);
    void append_numeric_series(std::string series, std::string* output, conditions * flags);
    bool is_acronym_like(corpus* text, std::string word, conditions* flags);

    // route() dispatches to the handle_* method for the next token kind; the bool
    // return semantics are presumably "input was consumed" — confirm in the definitions.
    bool route(corpus* text, std::string* output, conditions* flags);
    bool handle_space(corpus* text, std::string* output, conditions* flags);
    bool handle_contraction(corpus* text, std::string* output, conditions* flags);
    bool handle_possession_plural(corpus* text, std::string* output, conditions* flags);
    bool handle_replacement(corpus* text, std::string next, std::string* output, conditions * flags);
    bool handle_phonetic(corpus* text, std::string word, std::string* output, conditions* flags, size_t unaccented_size_difference);
    bool handle_acronym(corpus* text, std::string word, std::string* output, conditions * flags);
    bool handle_roman_numeral(corpus* text, std::string* output, conditions * flags);
    bool handle_word(corpus* text, std::string* output, conditions* flags);
    bool handle_numeric_series(corpus* text, std::string* output, conditions* flags);
    bool handle_numeric(corpus* text, std::string* output, conditions* flags);
    bool handle_punctuation(corpus* text, std::string next, std::string* output, conditions* flags);
    bool handle_unknown(corpus* text);
};
|
||||||
|
|
||||||
|
struct phonemizer * phonemizer_from_gguf(gguf_context * meta, const std::string espeak_voice_code = "gmw/en-US");
|
||||||
|
struct phonemizer * phonemizer_from_file(const std::string fname, const std::string espeak_voice_code = "gmw/en-US");
|
||||||
|
struct phonemizer * espeak_phonemizer(bool use_espeak_phonemes = false, std::string espeak_voice_code = "gmw/en-US");
|
||||||
|
|
||||||
|
#endif
|
34
otherarch/ttscpp/include/tts.h
Normal file
34
otherarch/ttscpp/include/tts.h
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
#ifndef tts_h
|
||||||
|
#define tts_h
|
||||||
|
|
||||||
|
#include "parler_model.h"
|
||||||
|
#include "kokoro_model.h"
|
||||||
|
#include "dia_model.h"
|
||||||
|
#include "orpheus_model.h"
|
||||||
|
#include <thread>
|
||||||
|
#include <fstream>
|
||||||
|
#include <array>
|
||||||
|
|
||||||
|
struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only);
|
||||||
|
struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only);
|
||||||
|
struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only);
|
||||||
|
struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only);
|
||||||
|
struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only = true);
|
||||||
|
int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config);
|
||||||
|
void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only = true);
|
||||||
|
std::vector<std::string> list_voices(tts_runner * runner);
|
||||||
|
|
||||||
|
// Options controlling gguf model quantization (consumed by quantize_gguf).
struct quantization_params {
    quantization_params(uint32_t n_threads, enum ggml_type quantize_type): n_threads(n_threads), quantize_type(quantize_type) {};
    uint32_t n_threads;
    enum ggml_type quantize_type; // quantization type
    // tensor classes excluded from quantization unless explicitly enabled:
    bool quantize_output_heads = false;
    bool quantize_text_embeddings = false;
    bool quantize_cross_attn_kv = false;
    bool convert_dac_to_f16 = false;
    bool convert_non_quantizable_to_f16 = false;
};
|
||||||
|
|
||||||
|
void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params);
|
||||||
|
|
||||||
|
#endif
|
115
otherarch/ttscpp/include/ttsargs.h
Normal file
115
otherarch/ttscpp/include/ttsargs.h
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
#ifndef args_h
|
||||||
|
#define args_h
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Base description of a single command line argument.
struct arg {
    std::string full_name;          // e.g. "--temperature"
    std::string abbreviation = "";  // optional short form, e.g. "-t"
    std::string description = "";
    bool required = false;          // cleared by the parse functions once the argument is seen
    bool has_param = false;         // true when the argument consumes a following value token

    // Renders the help entry for this argument (see args.cpp).
    std::string help_text();
};
|
||||||
|
|
||||||
|
// A boolean flag: its presence on the command line toggles `value`
// (see arg_list::find_and_parse) and it consumes no following token.
struct bool_arg : public arg {
    bool_arg(std::string fn, std::string desc = "", std::string abbr = "", bool req = false, bool val = false) {
        full_name = fn;
        description = desc;
        abbreviation = abbr;
        required = req;
        value = val;
    };

    bool value = false;
};
|
||||||
|
|
||||||
|
struct string_arg : public arg {
|
||||||
|
string_arg(std::string fn, std::string desc = "", std::string abbr = "", bool req = false, std::string val = "") {
|
||||||
|
full_name = fn;
|
||||||
|
description = desc;
|
||||||
|
abbreviation = abbr;
|
||||||
|
required = req;
|
||||||
|
value = val;
|
||||||
|
};
|
||||||
|
bool has_param = true;
|
||||||
|
std::string value;
|
||||||
|
|
||||||
|
int parse(int argc, const char ** argv);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct int_arg : public arg {
|
||||||
|
int_arg(std::string fn, std::string desc = "", std::string abbr = "", bool req = false, int * val = nullptr) {
|
||||||
|
full_name = fn;
|
||||||
|
description = desc;
|
||||||
|
abbreviation = abbr;
|
||||||
|
required = req;
|
||||||
|
value = val;
|
||||||
|
};
|
||||||
|
|
||||||
|
int * value;
|
||||||
|
|
||||||
|
int parse(int argc, const char ** argv);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
struct float_arg : public arg {
|
||||||
|
float_arg(std::string fn, std::string desc = "", std::string abbr = "", bool req = false, float * val = nullptr) {
|
||||||
|
full_name = fn;
|
||||||
|
description = desc;
|
||||||
|
abbreviation = abbr;
|
||||||
|
required = req;
|
||||||
|
value = val;
|
||||||
|
};
|
||||||
|
|
||||||
|
bool has_param = true;
|
||||||
|
float * value;
|
||||||
|
|
||||||
|
int parse(int argc, const char ** argv);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Registry of all command line arguments, grouped by value type.
struct arg_list {
    std::vector<float_arg> fargs;
    std::vector<int_arg> iargs;
    std::vector<bool_arg> bargs;
    std::vector<string_arg> sargs;
    bool for_help = false;  // set by parse() when "--help" is encountered

    void add_argument(float_arg arg) {
        fargs.push_back(arg);
    }

    void add_argument(int_arg arg) {
        iargs.push_back(arg);
    }

    void add_argument(bool_arg arg) {
        bargs.push_back(arg);
    }

    void add_argument(string_arg arg) {
        sargs.push_back(arg);
    }

    // Prints the help text of every registered argument to stdout.
    void help();

    // Exits the process if any required argument was never supplied.
    void validate();

    // Walks argv and dispatches each recognized name to its argument's parser.
    void parse(int argc, const char ** argv);

    // Returns the number of extra tokens consumed; exits on an unknown name.
    int find_and_parse(std::string name, int argc, const char ** argv);

    // Value accessors; lookup is by full_name only (abbreviations not accepted).
    std::string get_string_param(std::string full_name);

    int * get_int_param(std::string full_name);

    float * get_float_param(std::string full_name);

    bool get_bool_param(std::string full_name);
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
80
otherarch/ttscpp/include/ttscommon.h
Normal file
80
otherarch/ttscpp/include/ttscommon.h
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
#ifndef common_h
|
||||||
|
#define common_h
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Using this simple struct as opposed to a common std::vector allows us to return the cpu buffer
// pointer directly rather than copying the contents of the buffer to a predefined std::vector.
struct tts_response {
    // All members are default-initialized so a freshly declared response is
    // well-defined (data and hidden_size were previously left uninitialized).
    float * data = nullptr;
    size_t n_outputs = 0;
    uint32_t hidden_size = 0; // this parameter is only currently used by the t5_encoder for which n_outputs corresponds to sequence length;
};
|
||||||
|
|
||||||
|
// Identifiers for the model architectures supported by the TTS runner
// (string names are mapped in SUPPORTED_ARCHITECTURES).
enum tts_arch {
    PARLER_TTS_ARCH = 0,
    KOKORO_ARCH = 1,
    DIA_ARCH = 2,
    ORPHEUS_ARCH = 3,
};
|
||||||
|
|
||||||
|
// Maps the architecture name found in model metadata to its tts_arch id.
const std::map<std::string, tts_arch> SUPPORTED_ARCHITECTURES = {
    { "parler-tts", PARLER_TTS_ARCH },
    { "kokoro", KOKORO_ARCH },
    { "dia", DIA_ARCH },
    { "orpheus", ORPHEUS_ARCH }
};
|
||||||
|
|
||||||
|
/// Given a map from keys to values, creates a new map from values to keys.
/// When several keys share a value, the key that is greatest in the source
/// map's iteration order wins (operator[] overwrites earlier entries).
template<typename K, typename V>
static std::map<V, K> reverse_map(const std::map<K, V>& m) {
    std::map<V, K> inverted;
    for (const auto & [key, value] : m) {
        inverted[value] = key;
    }
    return inverted;
}
|
||||||
|
|
||||||
|
const std::map<tts_arch, std::string> ARCHITECTURE_NAMES = reverse_map(SUPPORTED_ARCHITECTURES);
|
||||||
|
|
||||||
|
// Sampling and decoding settings shared by all architectures. Note the
// constructor's parameter order differs from the member declaration order;
// members are assigned via the initializer list, so this is harmless.
struct generation_configuration {
    generation_configuration(
        std::string voice = "",
        int top_k = 50,
        float temperature = 1.0,
        float repetition_penalty = 1.0,
        bool use_cross_attn = true,
        std::string espeak_voice_id = "",
        int max_tokens = 0,
        float top_p = 1.0,
        bool sample = true): top_k(top_k), temperature(temperature), repetition_penalty(repetition_penalty), use_cross_attn(use_cross_attn), sample(sample), voice(voice), espeak_voice_id(espeak_voice_id), max_tokens(max_tokens), top_p(top_p) {};

    bool use_cross_attn;
    float temperature;
    float repetition_penalty;
    float top_p;
    int top_k;
    int max_tokens;           // presumably 0 means "no explicit limit" — confirm against the samplers
    std::string voice = "";
    bool sample = true;       // false presumably selects greedy decoding — confirm
    std::string espeak_voice_id = "";
};
|
||||||
|
|
||||||
|
// Base type for all architecture-specific runners.
struct tts_runner {
    tts_arch arch;
    struct ggml_context * ctx = nullptr;
    float sampling_rate = 44100.0f;  // output audio sample rate in Hz
    bool supports_voices = false;    // presumably: model exposes selectable voices — confirm

    // Human readable architecture name, e.g. "kokoro".
    std::string arch_name() {
        return ARCHITECTURE_NAMES.at(arch);
    }

    void init_build(std::vector<uint8_t>* buf_compute_meta);
    void free_build();
};
|
||||||
|
|
||||||
|
#endif
|
164
otherarch/ttscpp/src/args.cpp
Normal file
164
otherarch/ttscpp/src/args.cpp
Normal file
|
@ -0,0 +1,164 @@
|
||||||
|
#include "ttsargs.h"
|
||||||
|
|
||||||
|
std::string arg::help_text() {
|
||||||
|
std::string htxt = full_name;
|
||||||
|
if (abbreviation != "") {
|
||||||
|
htxt += " (" + abbreviation + ")";
|
||||||
|
}
|
||||||
|
htxt += ":\n ";
|
||||||
|
if (description != "") {
|
||||||
|
htxt += description + "\n";
|
||||||
|
} else {
|
||||||
|
htxt += "is a " + (std::string)(required ? "required " : "optional ") + "parameter.\n";
|
||||||
|
}
|
||||||
|
return htxt;
|
||||||
|
}
|
||||||
|
|
||||||
|
int string_arg::parse(int argc, const char ** argv) {
|
||||||
|
required = false;
|
||||||
|
value.assign(argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int int_arg::parse(int argc, const char ** argv) {
|
||||||
|
if (required) {
|
||||||
|
required = false;
|
||||||
|
}
|
||||||
|
int val = atoi(argv[0]);
|
||||||
|
*value = val;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int float_arg::parse(int argc, const char ** argv) {
|
||||||
|
if (required) {
|
||||||
|
required = false;
|
||||||
|
}
|
||||||
|
float val = strtof(argv[0], nullptr);
|
||||||
|
*value = val;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void arg_list::help() {
|
||||||
|
std::string help_text = "";
|
||||||
|
for (auto arg : fargs) {
|
||||||
|
help_text += arg.help_text();
|
||||||
|
}
|
||||||
|
for (auto arg : iargs) {
|
||||||
|
help_text += arg.help_text();
|
||||||
|
|
||||||
|
}
|
||||||
|
for (auto arg : bargs) {
|
||||||
|
help_text += arg.help_text();
|
||||||
|
|
||||||
|
}
|
||||||
|
for (auto arg : sargs) {
|
||||||
|
help_text += arg.help_text();
|
||||||
|
|
||||||
|
}
|
||||||
|
fprintf(stdout, "%s", help_text.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
void arg_list::validate() {
|
||||||
|
for (auto arg : fargs) {
|
||||||
|
if (arg.required) {
|
||||||
|
fprintf(stderr, "argument '%s' is required.\n", arg.full_name.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto arg : iargs) {
|
||||||
|
if (arg.required) {
|
||||||
|
fprintf(stderr, "argument '%s' is required.\n", arg.full_name.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto arg : bargs) {
|
||||||
|
if (arg.required) {
|
||||||
|
fprintf(stderr, "argument '%s' is required.\n", arg.full_name.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto arg : sargs) {
|
||||||
|
if (arg.required) {
|
||||||
|
fprintf(stderr, "argument '%s' is required.\n", arg.full_name.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void arg_list::parse(int argc, const char ** argv) {
    // Walks argv (skipping the program name at index 0). "--help" short-circuits
    // parsing and only sets for_help. For every other token, the cursor is
    // advanced past the name first, then past however many value tokens the
    // matched argument reports having consumed.
    int current_arg = 1;
    while (current_arg < argc) {
        std::string name(argv[current_arg]);
        if (name == "--help") {
            for_help = true;
            return;
        }
        current_arg += 1;
        current_arg += find_and_parse(name, argc - current_arg, argv + current_arg);
    }
}
|
||||||
|
|
||||||
|
int arg_list::find_and_parse(std::string name, int argc, const char ** argv) {
    // Locates the registered argument matching name (full or abbreviated form)
    // and delegates parsing to it, returning the number of value tokens it
    // consumed. Unknown names abort the process. Rewritten with reference-based
    // range-for loops: the originals compared a signed index against size()
    // (signed/unsigned mismatch) for no benefit.
    for (auto & fa : fargs) {
        if (fa.full_name == name || fa.abbreviation == name) {
            return fa.parse(argc, argv);
        }
    }
    for (auto & ia : iargs) {
        if (ia.full_name == name || ia.abbreviation == name) {
            return ia.parse(argc, argv);
        }
    }
    for (auto & ba : bargs) {
        if (ba.full_name == name || ba.abbreviation == name) {
            // bool flags consume no value token; their presence toggles them.
            ba.value = !ba.value;
            ba.required = false;
            return 0;
        }
    }
    for (auto & sa : sargs) {
        if (sa.full_name == name || sa.abbreviation == name) {
            return sa.parse(argc, argv);
        }
    }
    fprintf(stderr, "argument '%s' is not a valid argument. Call '--help' for information on all valid arguments.\n", name.c_str());
    exit(1);
}
|
||||||
|
|
||||||
|
std::string arg_list::get_string_param(std::string full_name) {
|
||||||
|
for (auto arg : sargs) {
|
||||||
|
if (arg.full_name == full_name) {
|
||||||
|
return arg.value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
int * arg_list::get_int_param(std::string full_name) {
|
||||||
|
for (auto arg : iargs) {
|
||||||
|
if (arg.full_name == full_name) {
|
||||||
|
return arg.value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
float * arg_list::get_float_param(std::string full_name) {
|
||||||
|
for (auto arg : fargs) {
|
||||||
|
if (arg.full_name == full_name) {
|
||||||
|
return arg.value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool arg_list::get_bool_param(std::string full_name) {
|
||||||
|
for (auto arg : bargs) {
|
||||||
|
if (arg.full_name == full_name) {
|
||||||
|
return arg.value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
212
otherarch/ttscpp/src/dac_model.cpp
Normal file
212
otherarch/ttscpp/src/dac_model.cpp
Normal file
|
@ -0,0 +1,212 @@
|
||||||
|
#include "dac_model.h"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
// For loading DAC model from gguf file.
// Maps the GGUF tensor names of DAC's top-level (non-layer) tensors to the
// dac_tensor enum used by assign_to_audio_encoder's switch. Layer tensors are
// handled separately via parse_layer_count.
static const std::map<std::string, dac_tensor> DAC_TENSOR_GGUF_LOOKUP = {
    {"initial.bias", DAC_ENCODER_IN_BIAS},
    {"initial.weight", DAC_ENCODER_IN_KERNEL},
    {"final.bias", DAC_ENCODER_OUT_BIAS},
    {"final.weight", DAC_ENCODER_OUT_KERNEL},
    {"final.alpha", DAC_ENCODER_SNAKE_ALPHA},
};
|
||||||
|
|
||||||
|
void dac_model::prep_constants(gguf_context * meta) {
    // Pull optional decoder hyperparameters out of the GGUF metadata.
    // Each key is searched under several historical aliases; a key that is
    // absent leaves the member's compiled-in default untouched.
    const int heads_key = search_for_gguf_keys(meta, {"parler-tts.decoder.output_heads", "output_heads", "dia.decoder.output_heads"});
    if (heads_key >= 0) {
        n_heads = gguf_get_val_u32(meta, heads_key);
    }

    const int upsample_key = search_for_gguf_keys(meta, {"dac.up_sampling_factor", "up_sampling_factor"});
    if (upsample_key >= 0) {
        up_sampling_factor = gguf_get_val_u32(meta, upsample_key);
    }

    const int max_gen_key = search_for_gguf_keys(meta, {"parler-tts.decoder.max_generation", "max_generation", "dia.decoder.max_generation"});
    if (max_gen_key >= 0) {
        max_generation_size = gguf_get_val_u32(meta, max_gen_key);
    }
}
|
||||||
|
|
||||||
|
void dac_model::prep_layers(gguf_context * meta) {
    // Builds the (empty) layer containers that assign_weight later fills in.
    // One residual vector-quantizer per output head.
    // Fixed: loop counters were signed `int` compared against the uint32_t
    // members n_heads / n_layers; also fixed "inorder" typo in abort messages.
    for (uint32_t i = 0; i < n_heads; i++) {
        quantizer_layers.push_back(general_neural_audio_codec::residual_vector_quantize_layer{});
    }

    // Each decoder layer's stride and padding must be present in the GGUF
    // metadata (with or without the "dac." prefix); there are no defaults.
    for (uint32_t i = 0; i < n_layers; i++) {
        std::string stride_key = "dac_layer_stride_" + std::to_string(i);
        std::string padding_key = "dac_layer_padding_" + std::to_string(i);
        int layer_stride_key = search_for_gguf_keys(meta, {"dac." + stride_key, stride_key});
        if (layer_stride_key == -1) {
            TTS_ABORT("key %s must be specified in gguf file in order to initialize the DAC audio decoder.", stride_key.c_str());
        }
        int layer_padding_key = search_for_gguf_keys(meta, {"dac." + padding_key, padding_key});
        if (layer_padding_key == -1) {
            TTS_ABORT("key %s must be specified in gguf file in order to initialize the DAC audio decoder.", padding_key.c_str());
        }
        layers.push_back(
            general_neural_audio_codec::layer{
                gguf_get_val_u32(meta, layer_padding_key),
                gguf_get_val_u32(meta, layer_stride_key),
            }
        );
    }
}
|
||||||
|
|
||||||
|
// Entry point used by the generic GGUF loader: forwards every named tensor to
// the DAC-specific assignment routine below.
void dac_model::assign_weight(std::string name, ggml_tensor * tensor) {
    assign_to_audio_encoder(this, name, tensor);
}
|
||||||
|
|
||||||
|
// Copies one named checkpoint tensor into the DAC model's ggml context.
// Top-level tensors are resolved via DAC_TENSOR_GGUF_LOOKUP; any other name
// containing a digit is treated as a per-layer tensor ("quantizers" names go
// to the residual vector-quantize layers, everything else to decoder layers).
// Names that match neither category are silently ignored.
void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * tensor) {
    if (DAC_TENSOR_GGUF_LOOKUP.find(name) != DAC_TENSOR_GGUF_LOOKUP.end()) {
        switch(DAC_TENSOR_GGUF_LOOKUP.at(name)) {
            case DAC_ENCODER_IN_BIAS:
                // Biases are stored transposed relative to how the graph
                // consumes them, hence the ggml_transpose before duplication.
                model->in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
                model->set_tensor(model->in_conv_bias, tensor);
                break;
            case DAC_ENCODER_IN_KERNEL:
                model->in_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(model->in_conv_kernel, tensor);
                break;
            case DAC_ENCODER_OUT_BIAS:
                model->out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
                model->set_tensor(model->out_conv_bias, tensor);
                break;
            case DAC_ENCODER_OUT_KERNEL:
                model->out_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(model->out_conv_kernel, tensor);
                break;
            case DAC_ENCODER_SNAKE_ALPHA:
                model->snake_alpha = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(model->snake_alpha, tensor);
                break;
            default:
                fprintf(stdout, "unassigned tensor %s\n", name.c_str());
                break;
        }
    } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) {
        auto pair = parse_layer_count(name);
        int l = pair.first;
        std::string lt_name = pair.second;
        if (name.find("quantizers") != std::string::npos) {
            general_neural_audio_codec::assign_to_quantize_layer((tts_model *) model, model->quantizer_layers[l], lt_name, tensor);
        } else {
            // NOTE(review): assumes non-quantizer layer indices in tensor
            // names start at 1 (hence `l - 1`) — an index of 0 would wrap.
            // TODO confirm against the GGUF exporter.
            general_neural_audio_codec::assign_to_layer((tts_model *) model, model->layers[l - 1], lt_name, tensor);
        }
    }
}
|
||||||
|
|
||||||
|
static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector<general_neural_audio_codec::residual_vector_quantize_layer> layers) {
|
||||||
|
struct ggml_tensor * embd;
|
||||||
|
|
||||||
|
dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length*dctx->model->n_heads);
|
||||||
|
ggml_set_input(dctx->inp_tokens);
|
||||||
|
|
||||||
|
if (dctx->backend) {
|
||||||
|
ggml_backend_sched_set_tensor_backend(dctx->sched, dctx->inp_tokens, dctx->backend);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i = 0; i < dctx->model->n_heads; i++) {
|
||||||
|
auto quantize_layer = dctx->model->quantizer_layers[i];
|
||||||
|
struct ggml_tensor * code = ggml_cont(ctx, ggml_view_2d(ctx, dctx->inp_tokens, 1, batch.sequence_length, dctx->model->n_heads*ggml_type_size(GGML_TYPE_I32), i*ggml_type_size(GGML_TYPE_I32)));
|
||||||
|
code = ggml_reshape_1d(ctx, code, batch.sequence_length);
|
||||||
|
code = general_neural_audio_codec::build_quantize_layer(ctx, code, quantize_layer);
|
||||||
|
|
||||||
|
if (i == 0) {
|
||||||
|
embd = code;
|
||||||
|
} else {
|
||||||
|
embd = ggml_add(ctx, embd, code);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return embd;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct dac_context * build_new_dac_context(struct dac_model * model, int n_threads, bool use_cpu) {
|
||||||
|
dac_context * dctx = new dac_context(model, n_threads);
|
||||||
|
if (!use_cpu) {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
dctx->backend = ggml_backend_metal_init();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
dctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
|
dctx->set_threads();
|
||||||
|
dctx->build_schedule();
|
||||||
|
dctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
|
||||||
|
return dctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Called once after weights are loaded: builds a worst-case (maximum
// sequence length) graph so the scheduler can pre-plan its allocations.
void dac_runner::prepare_post_load() {
    dac_ubatch batch;
    batch.sequence_length = model->max_generation_size;
    ggml_cgraph * gf = build_dac_graph(batch);
    dctx->prep_schedule(gf);
}
|
||||||
|
|
||||||
|
// Builds the DAC decoder forward graph for one batch: quantized token inputs,
// input conv, the up-sampling codec layers, snake activation, output conv,
// and a final tanh producing the waveform.
struct ggml_cgraph * dac_runner::build_dac_graph(dac_ubatch & batch) {
    init_build();
    // splitting this out from the primary graph so that we can better manage streaming (i.e. sentence chunks are better performed this way)
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * cur;
    struct ggml_tensor * inputs;

    inputs = dac_build_audio_inputs(ctx, dctx, batch, model->quantizer_layers);
    // NOTE(review): "quanitzed" typo is preserved — the tensor name may be
    // looked up elsewhere by this exact string.
    ggml_set_name(inputs, "quanitzed_inputs");

    // everything besides the inputs is just a forward pass
    cur = ggml_conv_1d_tts(ctx, model->in_conv_kernel, inputs, 1, 3, 1);
    cur = ggml_add(ctx, cur, model->in_conv_bias);
    // Fixed: `for (auto l : ...)` copied each layer struct per iteration;
    // iterate by const reference.
    for (const auto & l : model->layers) {
        cur = general_neural_audio_codec::build_layer(ctx, cur, l);
    }
    cur = snake_1d(ctx, model->snake_alpha, cur);
    cur = ggml_conv_1d_tts(ctx, model->out_conv_kernel, cur, 1, 3, 1);
    cur = ggml_add(ctx, cur, model->out_conv_bias);
    cur = ggml_tanh(ctx, cur);
    ggml_build_forward_expand(gf, cur);
    free_build();
    return gf;
}
|
||||||
|
|
||||||
|
// Decodes `sequence_length` frames of DAC codes (n_heads tokens per frame,
// head-major interleaved in `input_tokens`) into float audio samples written
// into `outputs`. The output buffer is owned by dctx and reused across calls.
void dac_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs) {
    dac_ubatch batch;
    batch.input_tokens = input_tokens;
    batch.sequence_length = sequence_length;
    ggml_backend_sched_reset(dctx->sched);

    // Grow (never shrink) the CPU-side output buffer to the worst case:
    // max_generation_size frames, each up-sampled to up_sampling_factor samples.
    const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
    const size_t new_size = model->max_generation_size * model->up_sampling_factor * sizeof(float);

    if (!dctx->buf_output || prev_size < new_size) {
        if (dctx->buf_output) {
            // logits pointed into the old buffer; clear it along with the buffer.
            ggml_backend_buffer_free(dctx->buf_output);
            dctx->buf_output = nullptr;
            dctx->logits = nullptr;
        }

        dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
    }

    // Callers read samples directly from the context-owned buffer.
    outputs->data = (float *) ggml_backend_buffer_get_base(dctx->buf_output);
    ggml_backend_buffer_clear(dctx->buf_output, 0);

    struct ggml_cgraph * gf = NULL;
    gf = build_dac_graph(batch);

    // the output is always the last tensor in the graph
    struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
    ggml_backend_sched_alloc_graph(dctx->sched, gf);

    // Upload this batch's tokens into the graph's input tensor.
    ggml_backend_tensor_set(dctx->inp_tokens, batch.input_tokens, 0, batch.sequence_length*model->n_heads*ggml_element_size(dctx->inp_tokens));

    ggml_backend_sched_graph_compute_async(dctx->sched, gf);

    // NOTE(review): get_ggml_node_data presumably synchronizes with the async
    // compute before copying — confirm, otherwise this reads incomplete data.
    dctx->get_ggml_node_data(result, outputs->data, batch.sequence_length*sizeof(float)*model->up_sampling_factor);

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(dctx->sched);
    outputs->n_outputs = sequence_length * model->up_sampling_factor;
    return;
}
|
||||||
|
|
98
otherarch/ttscpp/src/dac_model.h
Normal file
98
otherarch/ttscpp/src/dac_model.h
Normal file
|
@ -0,0 +1,98 @@
|
||||||
|
#ifndef dac_model_h
|
||||||
|
#define dac_model_h
|
||||||
|
|
||||||
|
#include "general_neural_audio_codec.h"
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
// Identifiers for DAC's top-level (non-layer) tensors; used as the value type
// of DAC_TENSOR_GGUF_LOOKUP when routing checkpoint tensors during load.
enum dac_tensor {
    DAC_ENCODER_IN_KERNEL,
    DAC_ENCODER_IN_BIAS,
    DAC_ENCODER_OUT_KERNEL,
    DAC_ENCODER_OUT_BIAS,
    DAC_ENCODER_SNAKE_ALPHA,
};
|
||||||
|
|
||||||
|
// Weights for one residual vector-quantize head.
// NOTE(review): the visible code uses
// general_neural_audio_codec::residual_vector_quantize_layer instead of this
// struct — this may be dead code; confirm before relying on it.
struct dac_quantize_layer {
    struct ggml_tensor * out_proj_kernel;
    struct ggml_tensor * out_proj_bias;
    struct ggml_tensor * codebook;
};
|
||||||
|
|
||||||
|
// DAC, Descript Audio Codec, is a channel token to audio autoencoder model (though we only use its decoder functionality).
// this struct maintains the static tensors for the dac audio decoder graph.
// As such, this is designed to contain basic configuration and ggml tensor support for DAC.
// The dac_runner describes how the graph is built and run.
struct dac_model : tts_model {
    // These configs are essentially built for the 44khZ 8kbps standard DAC model audio encoder and decoder
    uint32_t n_layers = 4;              // number of up-sampling codec layers (see prep_layers)
    uint32_t n_heads = 9;               // residual vector-quantizer heads (codebooks per frame)
    uint32_t up_sampling_factor = 512;  // output samples produced per input frame
    uint32_t max_generation_size = 2580; // maximum frames decoded in one graph

    // Top-level tensors; filled in by assign_weight during GGUF load.
    struct ggml_tensor * in_conv_kernel;
    struct ggml_tensor * in_conv_bias;
    struct ggml_tensor * out_conv_kernel;
    struct ggml_tensor * out_conv_bias;
    struct ggml_tensor * snake_alpha;   // learned alpha for the snake activation
    std::vector<general_neural_audio_codec::layer> layers;
    std::vector<general_neural_audio_codec::residual_vector_quantize_layer> quantizer_layers;

    void assign_weight(std::string name, ggml_tensor * weight);
    void prep_constants(gguf_context * meta);
    void prep_layers(gguf_context * meta);
    // Reads configuration from the GGUF metadata, then delegates tensor
    // loading to the tts_model base under the "audio_encoder" arch name.
    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) {
        prep_layers(meta_ctx);
        prep_constants(meta_ctx);
        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "audio_encoder");
    }
};
|
||||||
|
|
||||||
|
// for loading DAC model from gguf file
|
||||||
|
void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * tensor);
|
||||||
|
|
||||||
|
// the context used for running the dac model
// Holds the per-run mutable state (backends, scheduler, input tensor) for a
// dac_model; the model pointer is non-owning here (dac_runner owns both).
struct dac_context : runner_context {
    dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {};

    struct dac_model * model;

    // Graph input: interleaved per-head token codes, recreated on each graph build.
    struct ggml_tensor * inp_tokens;

    void build_schedule() {
        runner_context::build_schedule(model->max_nodes());
    }
};
|
||||||
|
|
||||||
|
struct dac_context * build_new_dac_context(struct dac_model * model, int n_threads, bool use_cpu = true);
|
||||||
|
|
||||||
|
// One decode request: `sequence_length` frames of head-major interleaved
// token codes (n_heads tokens per frame). The pointer is non-owning.
struct dac_ubatch {
    uint32_t * input_tokens;
    uint32_t sequence_length;
};
|
||||||
|
|
||||||
|
// NOTE(review): a `static` function declaration in a header gives every
// including translation unit its own internal declaration (undefined in all
// but dac_model.cpp, triggering unused/undefined warnings) — consider moving
// this declaration into dac_model.cpp.
static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector<general_neural_audio_codec::residual_vector_quantize_layer> layers);
|
||||||
|
|
||||||
|
// This struct is intended to manage the dac model's graph compilation and compute function.
// Owns both the model and the context passed to the constructor: the
// destructor frees the model's buffers and deletes both objects, so callers
// must hand over ownership.
struct dac_runner : tts_runner {
    dac_runner(dac_model * model, dac_context * context): model(model), dctx(context) {};
    ~dac_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        model->free();
        delete model;
        delete dctx;
    }
    dac_model * model;
    dac_context * dctx;

    void init_build() {
        tts_runner::init_build(&dctx->buf_compute_meta);
    }

    void prepare_post_load();
    struct ggml_cgraph * build_dac_graph(dac_ubatch & batch);
    void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);
};
|
||||||
|
|
||||||
|
#endif
|
911
otherarch/ttscpp/src/dia_model.cpp
Normal file
911
otherarch/ttscpp/src/dia_model.cpp
Normal file
|
@ -0,0 +1,911 @@
|
||||||
|
#include "dia_model.h"
|
||||||
|
|
||||||
|
void dia_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
|
||||||
|
std::vector<std::string> parts = split(name, ".");
|
||||||
|
TTS_ASSERT(parts.size() >= 3);
|
||||||
|
|
||||||
|
if (parts[1] == "encoder") {
|
||||||
|
assign_to_encoder(parts, tensor, name);
|
||||||
|
} else if (parts[1] == "decoder"){
|
||||||
|
assign_to_decoder(parts, tensor, name);
|
||||||
|
} else {
|
||||||
|
TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies one encoder tensor ("dia.encoder.<tensor>" or
// "dia.encoder.layers.<idx>.<tensor>") into the model context.
// Fixed two loader bugs: the layer bounds check compared against the
// *decoder* layer count, and the size assert allowed parts[4] to be read
// out of bounds (size >= 4 only guarantees parts[3] exists).
void dia_model::assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name) {
    if (parts[2] == "embedding") {
        encoder->embedding = ggml_dup_tensor(ctx, tensor);
        set_tensor(encoder->embedding, tensor);
    } else if (parts[2] == "norm") {
        encoder->norm = ggml_dup_tensor(ctx, tensor);
        set_tensor(encoder->norm, tensor);
    } else if (parts[2] == "layers") {
        // Need both the layer index (parts[3]) and the tensor name (parts[4]).
        TTS_ASSERT(parts.size() >= 5);
        int index = std::stoi(parts[3]);
        TTS_ASSERT(index >= 0 && (size_t) index < encoder->layers.size());
        assign_to_encoder_layer(parts[4], encoder->layers[index], tensor);
    } else {
        TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
    }
}
|
||||||
|
|
||||||
|
void dia_model::assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name) {
|
||||||
|
if (parts[2] == "embeddings") {
|
||||||
|
TTS_ASSERT(parts.size() > 2);
|
||||||
|
int index = std::stoi(parts[3]);
|
||||||
|
TTS_ASSERT(index < decoder->embds.size());
|
||||||
|
decoder->embds[index] = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(decoder->embds[index], tensor);
|
||||||
|
} else if (parts[2] == "norm") {
|
||||||
|
decoder->norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(decoder->norm, tensor);
|
||||||
|
} else if (parts[2] == "heads") {
|
||||||
|
TTS_ASSERT(parts.size() > 2);
|
||||||
|
int index = std::stoi(parts[3]);
|
||||||
|
TTS_ASSERT(index < decoder->heads.size());
|
||||||
|
decoder->heads[index] = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(decoder->heads[index], tensor);
|
||||||
|
} else if (parts[2] == "layers") {
|
||||||
|
TTS_ASSERT(parts.size() >= 4);
|
||||||
|
int index = std::stoi(parts[3]);
|
||||||
|
TTS_ASSERT(index < decoder->layers.size());
|
||||||
|
assign_to_decoder_layer(parts[4], decoder->layers[index], tensor);
|
||||||
|
} else {
|
||||||
|
TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void dia_model::assign_to_encoder_layer(std::string part, dia_encoder_layer * layer, struct ggml_tensor * tensor) {
|
||||||
|
if (part == "q_proj") {
|
||||||
|
layer->q = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->q, tensor);
|
||||||
|
} else if (part == "k_proj") {
|
||||||
|
layer->k = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->k, tensor);
|
||||||
|
} else if (part == "v_proj") {
|
||||||
|
layer->v = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->v, tensor);
|
||||||
|
} else if (part == "o_proj") {
|
||||||
|
layer->o = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->o, tensor);
|
||||||
|
} else if (part == "pre_sa_norm") {
|
||||||
|
layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->self_attn_norm, tensor);
|
||||||
|
} else if (part == "post_sa_norm") {
|
||||||
|
layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->mlp_norm, tensor);
|
||||||
|
} else if (part == "gate") {
|
||||||
|
layer->gate = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->gate, tensor);
|
||||||
|
} else if (part == "up") {
|
||||||
|
layer->up = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->up, tensor);
|
||||||
|
} else if (part == "wo") {
|
||||||
|
layer->out = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->out, tensor);
|
||||||
|
} else {
|
||||||
|
TTS_ABORT("Unrecognized tensor '%s' for encoder layer when loading Dia from GGUF file.", part.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * layer, struct ggml_tensor * tensor) {
|
||||||
|
if (part == "self_q_proj") {
|
||||||
|
layer->self_attn_q = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->self_attn_q, tensor);
|
||||||
|
} else if (part == "self_k_proj") {
|
||||||
|
layer->self_attn_k = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->self_attn_k, tensor);
|
||||||
|
} else if (part == "self_v_proj") {
|
||||||
|
layer->self_attn_v = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->self_attn_v, tensor);
|
||||||
|
} else if (part == "self_o_proj") {
|
||||||
|
layer->self_attn_o = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->self_attn_o, tensor);
|
||||||
|
} else if (part == "cross_q_proj") {
|
||||||
|
layer->cross_attn_q = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->cross_attn_q, tensor);
|
||||||
|
} else if (part == "cross_k_proj") {
|
||||||
|
layer->cross_attn_k = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->cross_attn_k, tensor);
|
||||||
|
} else if (part == "cross_v_proj") {
|
||||||
|
layer->cross_attn_v = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->cross_attn_v, tensor);
|
||||||
|
} else if (part == "cross_o_proj") {
|
||||||
|
layer->cross_attn_o = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->cross_attn_o, tensor);
|
||||||
|
} else if (part == "pre_sa_norm") {
|
||||||
|
layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->self_attn_norm, tensor);
|
||||||
|
} else if (part == "pre_mlp_norm") {
|
||||||
|
layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->mlp_norm, tensor);
|
||||||
|
} else if (part == "pre_ca_norm") {
|
||||||
|
layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->cross_attn_norm, tensor);
|
||||||
|
} else if (part == "gate") {
|
||||||
|
layer->gate = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->gate, tensor);
|
||||||
|
} else if (part == "up") {
|
||||||
|
layer->up = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->up, tensor);
|
||||||
|
} else if (part == "wo") {
|
||||||
|
layer->out = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer->out, tensor);
|
||||||
|
} else {
|
||||||
|
TTS_ABORT("Unrecognized tensor '%s' for encoder layer when loading Dia from GGUF file.", part.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates the encoder/decoder layer containers before weight loading.
// Layer objects are heap-allocated here and later populated by assign_weight;
// the per-head embedding and head tensor slots start as nullptr.
// NOTE(review): no matching delete for encoder/decoder/layers is visible in
// this file — confirm ownership is released elsewhere.
void dia_model::prep_layers() {
    encoder = new dia_encoder;
    decoder = new dia_decoder;
    encoder->layers.reserve((size_t) n_encoder_layers);
    for (int i = 0; i < (int) n_encoder_layers; i++) {
        dia_encoder_layer * l = new dia_encoder_layer;
        encoder->layers.push_back(l);
    }

    decoder->layers.reserve((size_t) n_decoder_layers);
    for (int i = 0; i < (int) n_decoder_layers; i++) {
        dia_decoder_layer * l = new dia_decoder_layer;
        decoder->layers.push_back(l);
    }

    // One embedding table and one output head per audio channel.
    decoder->embds.reserve((size_t) n_output_heads);
    decoder->heads.reserve((size_t) n_output_heads);
    for (int i = 0; i < n_output_heads; i++) {
        struct ggml_tensor * h = nullptr;
        struct ggml_tensor * embd = nullptr;
        decoder->embds.push_back(embd);
        decoder->heads.push_back(h);
    }
}
|
||||||
|
|
||||||
|
// Reads Dia's hyperparameters from the GGUF metadata. Every key is optional:
// a missing key leaves the corresponding member's compiled-in default
// untouched. The repetitive find-then-read pattern is factored into a local
// generic helper so each key is a single line.
void dia_model::prep_constants(gguf_context * meta) {
    auto load_u32 = [meta](const char * key, auto & field) {
        const int idx = gguf_find_key(meta, key);
        if (idx != -1) {
            field = gguf_get_val_u32(meta, idx);
        }
    };

    load_u32("dia.decoder.output_heads", n_output_heads);
    load_u32("dia.decoder.layers", n_decoder_layers);
    load_u32("dia.encoder.layers", n_encoder_layers);
    load_u32("dia.decoder.hidden_size", decoder_hidden_size);
    load_u32("dia.decoder.attn_heads", decoder_attn_heads);
    load_u32("dia.decoder.query_heads", decoder_query_heads);
    load_u32("dia.encoder.attn_heads", encoder_attn_heads);
    load_u32("dia.attn_head_size", head_size);
    load_u32("dia.eos_token_id", eos_token_id);
    load_u32("dia.bos_token_id", bos_token_id);
    load_u32("dia.pad_token_id", pad_token_id);
    load_u32("dia.encoder.max_context_length", max_encoder_context_length);
    load_u32("dia.decoder.output_vocab_size", output_vocab_size);
    load_u32("dia.decoder.audio_vocab_size", audio_vocab_size);
    load_u32("dia.decoder.max_generation_size", max_generation_size);
    load_u32("dia.max_delay", max_delay);

    // please note that this value is not currently set in the gguf encoder as it effectively only exists as a default
    // python parameter (rather than an attribute in the model config) for the python Dia model.
    const int cfg_scale_key = gguf_find_key(meta, "dia.cfg_scale");
    if (cfg_scale_key != -1) {
        cfg_scale_data[0] = gguf_get_val_f32(meta, cfg_scale_key);
    }
}
|
||||||
|
|
||||||
|
// Clears all per-generation state so the context can be reused for a new
// utterance. delay_steps == -1 means "EOS not yet seen" (it is set to a
// countdown once generation decides to stop).
void dia_context::reset() {
    current_position = 0;
    prompt_size = 0;
    output_tokens.clear();
    delay_steps = -1;
}
|
||||||
|
|
||||||
|
struct dia_context * build_new_dia_context(struct dia_model * model, int n_threads, bool use_cpu) {
|
||||||
|
dia_context * dctx = new dia_context(model, n_threads);
|
||||||
|
if (!use_cpu) {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
dctx->backend = ggml_backend_metal_init();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
dctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
|
dctx->set_threads();
|
||||||
|
dctx->build_schedule();
|
||||||
|
dctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
|
||||||
|
return dctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates the decoder's KV cache: per-layer self-attention K/V sized for
// the maximum generation length and cross-attention K/V sized for the
// maximum encoder context, each doubled for the conditional + unconditional
// CFG batch. Returns false on allocation failure.
static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
    ggml_backend_buffer_type_t buft = nullptr;
    // this will only really support cpu or metal for the time being;
    // NOTE(review): if GGML_USE_METAL is not defined but dctx->backend is set,
    // buft stays nullptr and allocation below will fail — confirm intended.
    if (dctx->backend != nullptr) {
#ifdef GGML_USE_METAL
        buft = ggml_backend_metal_buffer_type();
#endif
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }

    // Room for 4 tensors per decoder layer (k, v, cross_k, cross_v) plus one
    // spare; tensors themselves are allocated from `buft`, not this context.
    struct ggml_init_params params = {
        /*.mem_size =*/ (4u * model->n_decoder_layers + 1) * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc =*/ true,
    };
    ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return false;
    }
    cache->ctx = ctx;

    cache->k_l.reserve(model->n_decoder_layers);
    cache->v_l.reserve(model->n_decoder_layers);
    cache->cross_k_l.reserve(model->n_decoder_layers);
    cache->cross_v_l.reserve(model->n_decoder_layers);

    for (int i = 0; i < (int) model->n_decoder_layers; i++) {
        // Trailing "* 2" in each size is the CFG (cond/uncond) batch pair.
        struct ggml_tensor * k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
        struct ggml_tensor * v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
        struct ggml_tensor * cross_k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
        struct ggml_tensor * cross_v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        ggml_format_name(cross_k, "cache_cross_k_l%d", i);
        ggml_format_name(cross_v, "cache_cross_v_l%d", i);
        cache->k_l.push_back(k);
        cache->v_l.push_back(v);
        cache->cross_k_l.push_back(cross_k);
        cache->cross_v_l.push_back(cross_v);
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(cache->ctx, buft);
    if (!buf) {
        // NOTE(review): cache->ctx is left set on this failure path; the
        // cache's owner is presumably responsible for freeing it — confirm.
        return false;
    }
    ggml_backend_buffer_clear(buf, 0);
    cache->buf = buf;

    return true;
}
|
||||||
|
|
||||||
|
static struct ggml_tensor * build_dia_decoder_inp_embd(struct ggml_context * ctx, dia_context *dctx, dia_decoder * decoder, dia_ubatch & batch, uint32_t n_output_heads) {
|
||||||
|
struct ggml_tensor * input_embs;
|
||||||
|
|
||||||
|
dctx->audio_inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_output_heads * 2);
|
||||||
|
ggml_set_input(dctx->audio_inp_tokens);
|
||||||
|
for (int i = 0; i < n_output_heads; i++) {
|
||||||
|
struct ggml_tensor * view = ggml_view_1d(ctx, dctx->audio_inp_tokens, 2, i * ggml_element_size(dctx->audio_inp_tokens));
|
||||||
|
view->nb[0] = n_output_heads * ggml_element_size(dctx->audio_inp_tokens);
|
||||||
|
if (i == 0) {
|
||||||
|
input_embs = ggml_get_rows(ctx, decoder->embds[i], view);
|
||||||
|
} else {
|
||||||
|
input_embs = ggml_add(ctx, ggml_get_rows(ctx, decoder->embds[i], view), input_embs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return input_embs;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ggml_tensor * dia_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight) {
|
||||||
|
// dia always uses 1e-5 as the default eps
|
||||||
|
float eps = 0.00001;
|
||||||
|
inputs = ggml_rms_norm(ctx, inputs, eps);
|
||||||
|
return ggml_mul(ctx, inputs, weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates the square [max_ctx, max_ctx] F32 encoder attention mask as a graph
// input; its contents are filled in later (see set_inputs).
static struct ggml_tensor * build_dia_encoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_model * model) {
    const int64_t n_ctx = (int64_t) model->max_encoder_context_length;
    dctx->encode_attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ctx, n_ctx);
    ggml_set_input(dctx->encode_attn_mask);
    return dctx->encode_attn_mask;
}
|
||||||
|
|
||||||
|
// Projects the decoder output through each head's output matrix, concatenating the
// per-head logits along dim 2, then combines the conditional (batch row 0) and
// unconditional (batch row 1) logits with the cfg_scale custom op before sampling.
static struct ggml_tensor * build_dia_head_outputs(struct ggml_context * ctx, dia_model * model, struct ggml_tensor * cur) {
    // going to cat the heads together and then reshape them
    struct ggml_tensor * out;
    for (int i = 0; i < model->n_output_heads; i++) {
        if (i == 0) {
            out = ggml_mul_mat(ctx, model->decoder->heads[i], cur);
        } else {
            out = ggml_concat(ctx, out, ggml_mul_mat(ctx, model->decoder->heads[i], cur), 2);
        }
    }
    // Split the cfg batch apart: offset 0 is the conditional half, offset nb[1]
    // (one row in) is the unconditional half.
    struct ggml_tensor * cond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], 0));
    struct ggml_tensor * uncond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], out->nb[1]));
    return ggml_map_custom2(ctx, cond, uncond, &cfg_scale, out->ne[0], &model->cfg_scale_data);
}
|
||||||
|
|
||||||
|
// Runs Dia's text encoder over the token buffer. The batch dimension is always 2:
// one sequence is the conditioned prompt and one the unconditioned prompt, both
// padded out to max_encoder_context_length (needed for cfg scaling downstream).
static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * model, dia_context * dctx, dia_ubatch & batch) {
    dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length*2);
    ggml_set_input(dctx->inp_tokens);

    dctx->encode_positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length);
    ggml_set_input(dctx->encode_positions);

    struct ggml_tensor * attn_mask = build_dia_encoder_attn_mask(ctx, dctx, model);

    // Embed both sequences at once, then split the batch back out via reshape.
    struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
    for (auto layer : model->encoder->layers) {
        struct ggml_tensor * residual = cur;

        cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->v, cur);

            // Strangely Dia follows the neoX Rotary Positional Embeddings Protocol
            Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
            Kcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Kcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * k = ggml_cont(ctx, ggml_permute(ctx, Kcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
            // Scale is 1.0f (no 1/sqrt(d)); the mask handles pad separation.
            kq = ggml_soft_max_ext(ctx, kq, attn_mask, 1.0f, 0.0f);
            struct ggml_tensor * v = ggml_cont_4d(ctx, ggml_transpose(ctx, Vcur), model->max_encoder_context_length, model->head_size, model->encoder_attn_heads, 2);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);

            // It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension
            // then down project back to the encoder embedding dimension.
            cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
            cur = ggml_mul_mat(ctx, layer->o, cur);
        }

        cur = ggml_add(ctx, cur, residual);
        struct ggml_tensor * residual_mlp = cur;

        cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
        // mlp (SiLU-gated feed-forward)
        {
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
            cur = ggml_mul_mat(ctx, layer->out, cur);
        }

        cur = ggml_add(ctx, cur, residual_mlp);
    }

    cur = dia_layer_norm(ctx, cur, model->encoder->norm);
    return cur;
}
|
||||||
|
|
||||||
|
// Emulates torch.repeat_interleave along dim 1: each slice of `a` along dim 1 is
// repeated `repeat` times consecutively before moving to the next slice. GGML has
// no native repeat_interleave op, so this is built from strided views + concat.
// Returns nullptr if a->ne[1] == 0.
static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct ggml_tensor * a, int repeat) {
    struct ggml_tensor * running = nullptr;
    for (int64_t i = 0; i < a->ne[1]; i++) {
        // nb[] holds byte strides as size_t; keep the offset in size_t — the previous
        // `int` offset could narrow/overflow for large tensors.
        size_t offset = (size_t) i * a->nb[1];
        struct ggml_tensor * t = ggml_cont(ctx, ggml_view_4d(ctx, a, a->ne[0], 1, a->ne[2], a->ne[3], a->nb[1], a->nb[2], a->nb[3], offset));
        t = ggml_repeat(ctx, t, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], repeat, a->ne[2], a->ne[3]));
        running = (running == nullptr) ? t : ggml_concat(ctx, running, t, 1);
    }
    return running;
}
|
||||||
|
|
||||||
|
// Writes the current step's self-attention K and V (after RoPE and GQA head
// expansion) into the per-layer KV cache at dctx->current_position.
// The batch dimension of 2 is the conditional/unconditional cfg pair.
static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
    int64_t attn_size = model->head_size * model->decoder_attn_heads;

    // One row per cfg sequence, offset to the current position within each
    // sequence's max_generation_size-long stretch of the cache.
    struct ggml_tensor * k_cache_view =
        ggml_view_2d(
            ctx, kv->k_l[layer_index], attn_size, 2,
            attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
            attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index]));

    k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
    // Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
    // If GGML supported a repeat_interleave op then it would be more optimal to store just the groups in the cache and interleave the attention heads after recalling
    // from the cache
    k = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);
    k = ggml_cont(ctx, ggml_reshape_2d(ctx, k, attn_size, 2));

    ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));

    struct ggml_tensor * v_cache_view = nullptr;

    v_cache_view = ggml_view_2d(
        ctx, kv->v_l[layer_index], attn_size, 2,
        attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
        attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index]));

    // Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
    // If GGML supported a repeat_interleave op then it would be more optimal to store just the groups in the cache and interleave the attention heads after recalling
    // from the cache
    v = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, v, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);

    ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
}
|
||||||
|
|
||||||
|
// Projects the encoder hidden states into a layer's cross-attention K and V and
// writes them into the cross KV cache. Only called during the encoder step; the
// cached values are reused verbatim for every subsequent decode step.
static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
    dia_decoder_layer * layer = model->decoder->layers[layer_index];
    // View trims the encoder output to the actual prompt length (dctx->prompt_size)
    // for the K projection.
    struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
        ctx,
        encoder_hidden_states,
        model->encoder_hidden_size,
        dctx->prompt_size,
        2,
        model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0));

    struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);
    struct ggml_tensor * positions_view = ggml_view_1d(ctx, dctx->encode_positions, dctx->prompt_size, 0);

    // Neo-X style RoPE on K, matching the encoder.
    k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads, dctx->prompt_size, 2)), positions_view, model->head_size, 2);
    k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 1, 3, 2));

    struct ggml_tensor * k_cache_view =
        ggml_view_4d(
            ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
            model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
            model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
            model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
            0);

    ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));

    // V is computed over the full (padded) encoder context, not just prompt_size.
    struct ggml_tensor * v = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, layer->cross_attn_v, encoder_hidden_states)));
    v = ggml_cont_4d(ctx, v, model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2);

    struct ggml_tensor * v_cache_view =
        ggml_view_4d(
            ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
            model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
            model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
            model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
            0);

    ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
}
|
||||||
|
|
||||||
|
// Builds one decoder step: summed head embeddings -> N layers of
// (self-attention over the KV cache, cross-attention over the cached encoder
// states, SiLU-gated MLP) -> final norm -> per-head logits with cfg scaling.
// The batch dimension of 2 throughout is the conditional/unconditional cfg pair.
static struct ggml_tensor * build_dia_decoder(
    ggml_cgraph * gf,
    ggml_context * ctx,
    dia_model * model,
    dia_context * dctx,
    dia_kv_cache * cache,
    dia_ubatch & batch,
    struct ggml_tensor * encoder_hidden_states) {
    dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
    ggml_set_input(dctx->positions);
    struct ggml_tensor * cur = build_dia_decoder_inp_embd(ctx, dctx, model->decoder, batch, model->n_output_heads);

    for (int l = 0; l < model->decoder->layers.size(); l++){
        dia_decoder_layer * layer = model->decoder->layers[l];
        struct ggml_tensor * residual = cur;

        cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->self_attn_q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->self_attn_k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->self_attn_v, cur);

            // Write this step's K/V into the cache, then attend over everything
            // cached so far (positions 0..current_position inclusive).
            build_dia_self_kv_store(ctx, dctx, model, cache, gf, Kcur, Vcur, batch, l);
            struct ggml_tensor * k =
                ggml_view_4d(ctx, cache->k_l[l],
                    model->head_size, model->decoder_attn_heads, dctx->current_position + 1, 2,
                    ggml_element_size(cache->k_l[l]) * model->head_size,
                    ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size,
                    ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
                    0);
            k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));

            struct ggml_tensor * v =
                ggml_view_3d(ctx, cache->v_l[l],
                    model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
                    ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
                    ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
                    0);
            v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);

            // As noted in the encoder Dia uses the Neo-X protocol for RoPE.
            Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, ggml_cont(ctx, k), q);

            // given that attention bias, scaling and masking are not used for decoding, it might be faster to prefer the #ggml_soft_max op here,
            kq = ggml_soft_max_ext(ctx, kq, nullptr, 1.0f, 0.0f);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_cont(ctx, ggml_permute(ctx, kqv, 2, 0, 1, 3));
            cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
            cur = ggml_mul_mat(ctx, layer->self_attn_o, cur);
        }


        // if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
        cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
        cur = ggml_add(ctx, cur, residual);
        struct ggml_tensor * residual_cross = cur;

        cur = dia_layer_norm(ctx, cur, layer->cross_attn_norm);
        // cross-attention
        {
            struct ggml_tensor * cross_Qcur = ggml_mul_mat(ctx, layer->cross_attn_q, cur);

            // only load the cross attention kv store when performing the encoding step
            if (batch.encoder_step) {
                build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
            }

            struct ggml_tensor * cross_k =
                ggml_view_4d(
                    ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2,
                    model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
                    model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
                    model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
                    0);
            // the double permute operation shouldn't be necessary here, but it seems that currently ggml permute only currently allows for a single
            // axis pair to be transposed.
            cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));

            struct ggml_tensor * cross_v =
                ggml_cont(ctx, ggml_view_4d(
                    ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
                    model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
                    model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
                    model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
                    0));

            // As noted in the encoder Dia uses the Neo-X protocol for RoPE.
            cross_Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, cross_Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
            struct ggml_tensor * cross_q = ggml_cont(ctx, ggml_permute(ctx, cross_Qcur, 0, 2, 1, 3));
            struct ggml_tensor * cross_kq = ggml_mul_mat(ctx, cross_k, cross_q);

            // given that attention bias, scaling and masking are not used for decoding, it might be faster to prefer the #ggml_soft_max op here,
            cross_kq = ggml_soft_max_ext(ctx, cross_kq, nullptr, 1.0f, 0.0f);
            struct ggml_tensor * cross_kqv = ggml_mul_mat(ctx, cross_kq, cross_v);
            struct ggml_tensor * cross_kqv_merged = ggml_cont(ctx, ggml_permute(ctx, cross_kqv, 2, 0, 1, 3));
            cur = ggml_cont_3d(ctx, cross_kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
            cur = ggml_mul_mat(ctx, layer->cross_attn_o, cur);
        }


        // if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
        cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
        cur = ggml_add(ctx, cur, residual_cross);
        struct ggml_tensor * residual_mlp = cur;

        cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
        // mlp (SiLU-gated feed-forward)
        {
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
            cur = ggml_mul_mat(ctx, layer->out, cur);
        }

        cur = ggml_add(ctx, cur, residual_mlp);
    }

    cur = dia_layer_norm(ctx, cur, model->decoder->norm);
    cur = build_dia_head_outputs(ctx, model, cur);
    return cur;
}
|
||||||
|
|
||||||
|
void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
|
||||||
|
// Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
|
||||||
|
// a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
|
||||||
|
// generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
|
||||||
|
// proper adjustments can be perfored at each generation step. This means that we need to pad the end of our tokens to the
|
||||||
|
// max context size for both the conditional and unconditional sequence.
|
||||||
|
|
||||||
|
// if the sentence isn't prepended by dialogue start tokens, [S1] or [S2], then append one.
|
||||||
|
sentence = strip(sentence);
|
||||||
|
std::string start = sentence.substr(0, 4);
|
||||||
|
if (start != "[S1]" && start != "[S2]") {
|
||||||
|
sentence = "[S1] " + sentence;
|
||||||
|
}
|
||||||
|
if (sentence[sentence.size() - 1] != '.') {
|
||||||
|
sentence += ".";
|
||||||
|
}
|
||||||
|
|
||||||
|
// [S1] and [S2] are special character sequences that are replaced with the special tokens 0x01 and 0x02 respectively.
|
||||||
|
std::string r1(1, 1);
|
||||||
|
std::string r2(1, 2);
|
||||||
|
while (sentence.find("[S1]") != std::string::npos) {
|
||||||
|
size_t pos = sentence.find("[S1]");
|
||||||
|
sentence.replace(pos, 4, r1);
|
||||||
|
}
|
||||||
|
while (sentence.find("[S2]") != std::string::npos) {
|
||||||
|
size_t pos = sentence.find("[S2]");
|
||||||
|
sentence.replace(pos, 4, r2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sentence.size() > model->max_encoder_context_length) {
|
||||||
|
TTS_ABORT("Dia currently only supports a max of %d characters and received an input of %d characters.", model->max_encoder_context_length, sentence.size());
|
||||||
|
}
|
||||||
|
batch.tokens.reserve(model->max_encoder_context_length * 2);
|
||||||
|
for (auto character : sentence) {
|
||||||
|
batch.tokens.push_back((uint32_t) character);
|
||||||
|
}
|
||||||
|
batch.sentence_length = batch.tokens.size();
|
||||||
|
// this 100 token warning is arbitrarily chosen based on spot checking small prompt performance
|
||||||
|
if (batch.sentence_length <= 100) {
|
||||||
|
fprintf(stdout, "Your prompt has fewer than 100 tokens. Please note that Dia's generation with prompts that are fewer than 100 tokens is highly inconsistent.\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = (int) batch.tokens.size(); i < model->max_encoder_context_length * 2; i++) {
|
||||||
|
batch.tokens.push_back(0u);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Builds a single-step batch from raw text; a text-derived batch always triggers
// the encoder pass.
dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
    struct dia_ubatch batch{ 1, true};
    tokenize_sentence(sentence, batch);
    // Seed every output head with the BOS token.
    batch.audio_tokens.assign(model->n_output_heads, model->bos_token_id);
    return batch;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* There are two unique features of Dia's model architecture:
|
||||||
|
* 1. Dia cleans its output generation by adding the difference between its text based output (its conditional output) and its unconditional output
|
||||||
|
 * to the conditional output before sampling. This is why the batch is set to two throughout the graph.
|
||||||
|
*
|
||||||
|
* 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
|
||||||
|
* encoder sequence is always max length.
|
||||||
|
*/
|
||||||
|
// Builds the full compute graph for one generation step: the encoder runs only on
// the first (encoder) step; the decoder runs every step against the KV caches.
struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
    struct ggml_tensor * encoded_states = nullptr;

    if (batch.encoder_step) {
        encoded_states = build_dia_encoder(ctx, model, dctx, batch);
        ggml_build_forward_expand(gf, encoded_states);
    }

    // On non-encoder steps encoded_states is nullptr; the decoder only reads it
    // when batch.encoder_step is set (to populate the cross KV cache).
    struct ggml_tensor * cur = build_dia_decoder(gf, ctx, model, dctx, kv_cross_self, batch, encoded_states);
    ggml_set_name(cur, "decoder_output");
    ggml_build_forward_expand(gf, cur);
    free_build();

    return gf;
}
|
||||||
|
|
||||||
|
// Copies sampling settings from `config` into the sampler and sizes the maximum
// generation length. max_tokens of 0 means "use the model default"; otherwise it
// must exceed the model's max delay so the delay pattern can flush at EOS.
void dia_runner::configure_generation(generation_configuration * config) {
    GGML_ASSERT(config->max_tokens == 0 || config->max_tokens > model->max_delay);
    decode_sampler->temperature = config->temperature;
    decode_sampler->repetition_penalty = config->repetition_penalty;
    decode_sampler->do_sample = config->sample;
    decode_sampler->top_k = config->top_k;
    decode_sampler->top_p = config->top_p;
    dctx->max_generation_size = config->max_tokens > model->max_delay ? config->max_tokens : model->max_generation_size;
}
|
||||||
|
|
||||||
|
// Copies the batch's data into the graph's input tensors after allocation.
void dia_runner::set_inputs(dia_ubatch & batch) {
    if (batch.encoder_step) {
        ggml_backend_tensor_set(dctx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(dctx->inp_tokens));
        // NOTE(review): positions and the attention mask are written through the raw
        // ->data pointer rather than ggml_backend_tensor_set — assumes these tensors
        // live in host-addressable memory; confirm for non-CPU backends.
        int32_t * ep = (int32_t*) dctx->encode_positions->data;
        float * mask = (float*) dctx->encode_attn_mask->data;
        for (int i = 0; i < model->max_encoder_context_length; i++) {
            ep[i] = (int32_t) i;
            // Block-diagonal mask: real tokens attend only to real tokens, pad
            // positions attend only to pad positions.
            for (int ii = 0; ii < model->max_encoder_context_length; ii++) {
                if (i < batch.sentence_length) {
                    mask[i*model->max_encoder_context_length + ii] = ii < batch.sentence_length ? 0.0 : -INFINITY;
                } else {
                    mask[i*model->max_encoder_context_length + ii] = ii >= batch.sentence_length ? 0.0 : -INFINITY;
                }
            }
        }
    }
    // The audio tokens need to be repeated in the input in order to support cfg-scaling. I.E we need duplicate inputs for conditional and unconditional logits.
    ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), 0, batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
    ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
    // Single-step decode: only one position entry is needed.
    ((int32_t*) dctx->positions->data)[0] = dctx->current_position;
}
|
||||||
|
|
||||||
|
// Runs one generation step: (re)allocates the logits buffer if needed, builds and
// schedules the graph, sets inputs, computes, and copies this step's logits into
// dctx->logits at the current position. Returns 0 on success.
int dia_runner::decode(dia_ubatch & batch) {
    if (batch.encoder_step) {
        // First step of a generation: record the prompt size and reserve output space.
        dctx->prompt_size = batch.sentence_length;
        dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
    }
    ggml_backend_sched_reset(dctx->sched);

    // Logits buffer covers every step of the generation (one vocab-sized slice per
    // head per position); grown lazily if max_generation_size increased.
    const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
    const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
    const size_t new_size = logits_size * sizeof(float);

    if (!dctx->buf_output || prev_size < new_size) {
        if (dctx->buf_output) {
            ggml_backend_buffer_free(dctx->buf_output);
            dctx->buf_output = nullptr;
            dctx->logits = nullptr;
        }

        dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
    }

    dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);

    ggml_cgraph * gf = build_dia_graph(batch);

    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    std::string resname = ggml_get_name(res);
    ggml_backend_sched_alloc_graph(dctx->sched, gf);

    set_inputs(batch);

    ggml_backend_sched_graph_compute_async(dctx->sched, gf);

    // Copy this step's logits (all heads) into the slice for current_position.
    float * logits_out = dctx->logits + dctx->current_position * model->output_vocab_size * model->n_output_heads;
    dctx->get_ggml_node_data(res, logits_out, model->output_vocab_size * model->n_output_heads * sizeof(float));

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(dctx->sched);

    return 0;
}
|
||||||
|
|
||||||
|
// Builds the largest batch the runner can ever see: a full conditional +
// unconditional text context and one audio token per output head. Used to size
// the scheduler's worst-case graph.
dia_ubatch dia_runner::build_worst_case_batch() {
    dia_ubatch batch{ 1, true };
    const size_t max_text_tokens = model->max_encoder_context_length * 2;
    batch.tokens.resize(max_text_tokens);
    batch.audio_tokens.resize(model->n_output_heads);
    return batch;
}
|
||||||
|
|
||||||
|
// Post-weight-load setup: prepares the DAC audio decoder, initializes the KV
// cache, and schedules a worst-case graph so the backend scheduler can reserve
// maximum buffer sizes up front.
void dia_runner::prepare_post_load() {
    dac_runner->prepare_post_load();
    // NOTE(review): generate() allocates kv_cross_self only if it is null, but here
    // it is passed to dia_kv_cache_init without a null check — confirm kv_cross_self
    // is constructed before prepare_post_load() is called.
    dia_kv_cache_init(kv_cross_self, model, dctx);
    auto batch = build_worst_case_batch();
    batch.sentence_length = model->max_encoder_context_length;
    dctx->prompt_size = model->max_encoder_context_length;
    auto gf = build_dia_graph(batch);
    dctx->prep_schedule(gf);
}
|
||||||
|
|
||||||
|
// Implements Dia's delayed-stop protocol. When the first head emits EOS (or the
// position limit is reached) a countdown of max_delay steps starts; during the
// countdown each head is forced to EOS on its own delay step and to PAD afterward,
// so every codebook flushes its delay pattern. Returns true once the countdown ends.
bool dia_runner::check_stopping(dia_ubatch & batch) {
    if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
        dctx->delay_steps = model->max_delay;
    }

    if (dctx->delay_steps > 0) {
        int step_after_eos = model->max_delay - dctx->delay_steps;
        for (int i = 0; i < model->delay_pattern.size(); i++) {
            if (step_after_eos == model->delay_pattern[i]) {
                batch.audio_tokens[i] = model->eos_token_id;
            } else if (step_after_eos > model->delay_pattern[i]) {
                batch.audio_tokens[i] = model->pad_token_id;
            }
        }
        dctx->delay_steps -= 1;
    }
    return dctx->delay_steps == 0;
}
|
||||||
|
|
||||||
|
// Applies the delay-pattern sliding window over the heads and filters out steps
// containing invalid tokens (anything >= audio_vocab_size, i.e. control tokens).
// If the DAC model's quantizer layers are converted to row + column embeddings the
// heads and sequence will need transposing here; for now a strided view is more performant.
void dia_runner::adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered) {
    size_t size = output_tokens.size();
    filtered.reserve(size);
    // Signed arithmetic for the bound: with the original unsigned expression a
    // short generation (steps < max_delay) wrapped around to a huge loop count.
    const int64_t n_heads = (int64_t) model->n_output_heads;
    const int64_t steps = (int64_t) (size / (size_t) n_heads) - (int64_t) model->max_delay;
    for (int64_t i = 0; i < steps; i++) {
        bool skip_step = false;
        for (int64_t ii = 0; ii < n_heads; ii++) {
            int64_t next_index = i * n_heads + (int64_t) model->delay_pattern[ii] * n_heads + ii;
            // >= : an index equal to size is already one past the end; the previous
            // `>` comparison permitted a one-element out-of-bounds read.
            if (next_index >= (int64_t) size || output_tokens[next_index] >= model->audio_vocab_size) {
                skip_step = true;
                break;
            }
        }
        if (!skip_step) {
            for (int64_t ii = 0; ii < n_heads; ii++) {
                int64_t next_index = i * n_heads + (int64_t) model->delay_pattern[ii] * n_heads + ii;
                filtered.push_back(output_tokens[next_index]);
            }
        }
    }
}
|
||||||
|
|
||||||
|
// Autoregressive generation loop: decode a step, sample one token per head, feed
// the sampled tokens back as the next batch, repeat until check_stopping fires;
// then post-process the token stream and run the DAC vocoder to produce audio.
// Returns 0 on success, or the non-zero status from a failed decode.
int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * output) {
    while (!check_stopping(batch)) {
        int state = decode(batch);
        if (state != 0) {
            return state;
        }
        // Sample from this position's logits; sampled tokens are appended to
        // dctx->output_tokens (one per head).
        decode_sampler->sample(dctx->logits + dctx->current_position * model->n_output_heads * model->output_vocab_size, dctx->output_tokens);
        dctx->current_position += batch.sequence_length;
        batch = dia_ubatch{ 1 };
        uint32_t * last_outputs = (dctx->output_tokens.data() + (int) dctx->output_tokens.size() - model->n_output_heads);
        batch.audio_tokens.reserve(model->n_output_heads);
        for (int i = 0; i < model->n_output_heads; i++) {
            // Heads whose delay hasn't elapsed yet keep emitting BOS.
            batch.audio_tokens.push_back(dctx->current_position > i ? last_outputs[i] : model->bos_token_id);
        }
    }

    std::vector<uint32_t> filtered_output_tokens;
    adjust_output_tokens(dctx->output_tokens, filtered_output_tokens);

    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output);
    return 0;
}
|
||||||
|
|
||||||
|
// Top-level entry: tokenize the sentence, reset per-generation state, lazily
// create the KV cache, and run the generation loop.
// Returns 0 on success, 1 if KV-cache initialization fails.
int dia_runner::generate(std::string sentence, struct tts_response * output) {
    dia_ubatch batch = batch_from_sentence(sentence);
    dctx->reset();
    decode_sampler->reset();
    dctx->current_position = 0;
    if (!kv_cross_self) {
        kv_cross_self = new dia_kv_cache;
        if (!dia_kv_cache_init(kv_cross_self, model, dctx)) {
            return 1;
        }
    }
    return generate_from_batch(batch, output);
}
|
||||||
|
|
||||||
|
void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
|
||||||
|
if (tensor->data == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name.size() == 0) {
|
||||||
|
// handles the top level meta tensor
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name.size() > 14 && name.substr(0, 14) == "audio_encoder.") {
|
||||||
|
dac_runner->model->assign_weight(name.substr(14), tensor);
|
||||||
|
} else {
|
||||||
|
model->assign_weight(name, tensor);
|
||||||
|
}
|
||||||
|
}
|
206
otherarch/ttscpp/src/dia_model.h
Normal file
206
otherarch/ttscpp/src/dia_model.h
Normal file
|
@ -0,0 +1,206 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "dac_model.h"
|
||||||
|
#include "sampler.h"
|
||||||
|
|
||||||
|
struct dia_encoder_layer {
|
||||||
|
struct ggml_tensor * k;
|
||||||
|
struct ggml_tensor * q;
|
||||||
|
struct ggml_tensor * v;
|
||||||
|
struct ggml_tensor * o;
|
||||||
|
struct ggml_tensor * self_attn_norm;
|
||||||
|
|
||||||
|
struct ggml_tensor * gate;
|
||||||
|
struct ggml_tensor * up;
|
||||||
|
struct ggml_tensor * out;
|
||||||
|
struct ggml_tensor * mlp_norm;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct dia_decoder_layer {
|
||||||
|
struct ggml_tensor * self_attn_k;
|
||||||
|
struct ggml_tensor * self_attn_q;
|
||||||
|
struct ggml_tensor * self_attn_v;
|
||||||
|
struct ggml_tensor * self_attn_o;
|
||||||
|
struct ggml_tensor * self_attn_norm;
|
||||||
|
|
||||||
|
struct ggml_tensor * cross_attn_k;
|
||||||
|
struct ggml_tensor * cross_attn_q;
|
||||||
|
struct ggml_tensor * cross_attn_v;
|
||||||
|
struct ggml_tensor * cross_attn_o;
|
||||||
|
struct ggml_tensor * cross_attn_norm;
|
||||||
|
|
||||||
|
struct ggml_tensor * gate;
|
||||||
|
struct ggml_tensor * up;
|
||||||
|
struct ggml_tensor * out;
|
||||||
|
struct ggml_tensor * mlp_norm;
|
||||||
|
|
||||||
|
struct ggml_tensor * pad_attn_values;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct dia_encoder {
|
||||||
|
struct ggml_tensor * norm;
|
||||||
|
struct ggml_tensor * embedding;
|
||||||
|
std::vector<dia_encoder_layer*> layers;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct dia_decoder {
|
||||||
|
struct ggml_tensor * norm;
|
||||||
|
std::vector<struct ggml_tensor*> embds;
|
||||||
|
std::vector<struct ggml_tensor*> heads;
|
||||||
|
std::vector<dia_decoder_layer*> layers;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct dia_model : tts_model {
|
||||||
|
// These default configurations are based on the default configuration for the Dia 1.68b param model.
|
||||||
|
uint32_t n_output_heads = 9;
|
||||||
|
uint32_t n_encoder_layers = 12;
|
||||||
|
uint32_t n_decoder_layers = 18;
|
||||||
|
uint32_t encoder_hidden_size = 1024;
|
||||||
|
uint32_t decoder_hidden_size = 2048;
|
||||||
|
uint32_t encoder_attn_heads = 16;
|
||||||
|
uint32_t decoder_attn_heads = 16;
|
||||||
|
uint32_t decoder_query_heads = 4;
|
||||||
|
uint32_t head_size = 128;
|
||||||
|
uint32_t eos_token_id = 1024;
|
||||||
|
uint32_t pad_token_id = 1025;
|
||||||
|
uint32_t bos_token_id = 1026;
|
||||||
|
uint32_t output_vocab_size = 1028;
|
||||||
|
uint32_t audio_vocab_size = 1024;
|
||||||
|
uint32_t max_generation_size = 3072;
|
||||||
|
uint32_t max_encoder_context_length = 1024;
|
||||||
|
|
||||||
|
|
||||||
|
float cfg_scale_data[2] = {3.0, 1024.0};
|
||||||
|
uint32_t max_delay = 15;
|
||||||
|
std::vector<uint32_t> delay_pattern = {0, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||||
|
|
||||||
|
dia_encoder * encoder;
|
||||||
|
dia_decoder * decoder;
|
||||||
|
|
||||||
|
void assign_weight(std::string name, ggml_tensor * tensor);
|
||||||
|
void assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
|
||||||
|
void assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
|
||||||
|
void assign_to_encoder_layer(std::string part, dia_encoder_layer * layer, struct ggml_tensor * tensor);
|
||||||
|
void assign_to_decoder_layer(std::string part, dia_decoder_layer * layer, struct ggml_tensor * tensor);
|
||||||
|
void prep_constants(gguf_context * meta);
|
||||||
|
void prep_layers();
|
||||||
|
void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) {
|
||||||
|
prep_constants(meta_ctx);
|
||||||
|
prep_layers();
|
||||||
|
tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "dia", 1.30);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct dia_context : runner_context {
|
||||||
|
dia_context(dia_model * model, int n_threads): runner_context(n_threads), model(model) {
|
||||||
|
max_generation_size = model->max_generation_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
uint32_t current_position = 0; // current position in the active sequence
|
||||||
|
int delay_steps = -1; // the max remaining steps to take before terminating; is set after an eos token is seen on the first output channel
|
||||||
|
size_t prompt_size = 0;
|
||||||
|
|
||||||
|
uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model.
|
||||||
|
|
||||||
|
std::vector<uint32_t> output_tokens;
|
||||||
|
struct dia_model * model;
|
||||||
|
|
||||||
|
struct ggml_tensor * inp_tokens;
|
||||||
|
struct ggml_tensor * audio_inp_tokens;
|
||||||
|
struct ggml_tensor * positions;
|
||||||
|
struct ggml_tensor * encode_positions;
|
||||||
|
struct ggml_tensor * encode_attn_mask;
|
||||||
|
struct ggml_tensor * cross_attn_mask;
|
||||||
|
|
||||||
|
void build_schedule() {
|
||||||
|
runner_context::build_schedule(model->max_nodes());
|
||||||
|
}
|
||||||
|
void reset();
|
||||||
|
};
|
||||||
|
|
||||||
|
struct dia_kv_cache {
|
||||||
|
ggml_type tensor_type = GGML_TYPE_F32;
|
||||||
|
|
||||||
|
std::vector<struct ggml_tensor *> cross_k_l;
|
||||||
|
std::vector<struct ggml_tensor *> cross_v_l;
|
||||||
|
|
||||||
|
std::vector<struct ggml_tensor *> k_l;
|
||||||
|
std::vector<struct ggml_tensor *> v_l;
|
||||||
|
|
||||||
|
struct ggml_context * ctx;
|
||||||
|
ggml_backend_buffer_type_t buft;
|
||||||
|
ggml_backend_buffer_t buf;
|
||||||
|
|
||||||
|
void free() {
|
||||||
|
ggml_free(ctx);
|
||||||
|
ggml_backend_buffer_free(buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
~dia_kv_cache() {
|
||||||
|
free();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct dia_ubatch {
|
||||||
|
dia_ubatch(size_t sequence_length, bool encoder_step = false): sequence_length(sequence_length), encoder_step(encoder_step) {};
|
||||||
|
bool encoder_step; // whether we are performing the prompt encoding in this step.
|
||||||
|
size_t sequence_length; // for just audio tokens the sequence length should be the total_tokens / num_heads; for normal generation this should always be 1.
|
||||||
|
size_t sentence_length; // the number of non padded tokens in the conditional context
|
||||||
|
std::vector<uint32_t> tokens; // character tokens for the encoder
|
||||||
|
std::vector<uint32_t> audio_tokens; // audio tokens from the last generation
|
||||||
|
};
|
||||||
|
|
||||||
|
struct dia_context * build_new_dia_context(struct dia_model * model, int n_threads, bool use_cpu = true);
|
||||||
|
static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) ;
|
||||||
|
static struct ggml_tensor * build_dia_decoder_inp_embd(struct ggml_context * ctx, dia_context *dctx, dia_decoder * decoder, dia_ubatch & batch, uint32_t n_output_heads);
|
||||||
|
static struct ggml_tensor * dia_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight);
|
||||||
|
static struct ggml_tensor * build_dia_encoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_model * model);
|
||||||
|
static struct ggml_tensor * build_dia_decoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_ubatch & batch);
|
||||||
|
static struct ggml_tensor * build_dia_decoder_cross_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_ubatch & batch);
|
||||||
|
static struct ggml_tensor * build_dia_head_outputs(struct ggml_context * ctx, dia_model * model, struct ggml_tensor * cur);
|
||||||
|
static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * model, dia_context * dctx, dia_ubatch & batch);
|
||||||
|
static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index);
|
||||||
|
static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index);
|
||||||
|
static struct ggml_tensor * build_dia_decoder( ggml_cgraph * gf, ggml_context * ctx, dia_model * model, dia_context * dctx, dia_kv_cache * cache, dia_ubatch & batch, struct ggml_tensor * encoder_hidden_states);
|
||||||
|
|
||||||
|
// This struct is intended to support end-to-end TTS generation for the Dia model. As such, it manages Dia's model compilation, compute, generation,
|
||||||
|
// tokenizationm and sampling process, and uses the dac_runner struct to encode audio outputs.
|
||||||
|
struct dia_runner : tts_runner {
|
||||||
|
dia_runner(dia_model * model, dac_runner * audio_decoder, dia_context * dctx, sampler * samp, dia_kv_cache * cache): model(model), dac_runner(audio_decoder), dctx(dctx), decode_sampler(samp), kv_cross_self(cache) {
|
||||||
|
decode_sampler->vocab_size = model->output_vocab_size;
|
||||||
|
};
|
||||||
|
~dia_runner() {
|
||||||
|
if (ctx) {
|
||||||
|
ggml_free(ctx);
|
||||||
|
}
|
||||||
|
model->free();
|
||||||
|
delete model;
|
||||||
|
delete kv_cross_self;
|
||||||
|
delete dac_runner;
|
||||||
|
delete dctx;
|
||||||
|
delete decode_sampler;
|
||||||
|
}
|
||||||
|
struct dia_model * model;
|
||||||
|
struct dac_runner * dac_runner;
|
||||||
|
struct dia_context * dctx;
|
||||||
|
struct dia_kv_cache * kv_cross_self = nullptr;
|
||||||
|
struct sampler * decode_sampler;
|
||||||
|
|
||||||
|
void init_build() {
|
||||||
|
tts_runner::init_build(&dctx->buf_compute_meta);
|
||||||
|
}
|
||||||
|
|
||||||
|
void tokenize_sentence(std::string sentence, dia_ubatch & tokens);
|
||||||
|
dia_ubatch batch_from_sentence(std::string sentence);
|
||||||
|
void configure_generation(generation_configuration * config);
|
||||||
|
void assign_weight(std::string name, ggml_tensor * tensor);
|
||||||
|
dia_ubatch build_worst_case_batch();
|
||||||
|
struct ggml_cgraph * build_dia_graph(dia_ubatch & batch);
|
||||||
|
void set_inputs(dia_ubatch & batch);
|
||||||
|
int decode(dia_ubatch & batch);
|
||||||
|
void prepare_post_load();
|
||||||
|
int generate(std::string sentence, struct tts_response * response);
|
||||||
|
bool check_stopping(dia_ubatch & batch);
|
||||||
|
void adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered);
|
||||||
|
int generate_from_batch(dia_ubatch & batch, struct tts_response * output);
|
||||||
|
};
|
172
otherarch/ttscpp/src/general_neural_audio_codec.cpp
Normal file
172
otherarch/ttscpp/src/general_neural_audio_codec.cpp
Normal file
|
@ -0,0 +1,172 @@
|
||||||
|
#include "general_neural_audio_codec.h"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
namespace general_neural_audio_codec {
|
||||||
|
// This contains a mapping between string names and gguf_tensor enum values for the purposes of assigning the weights from a gguf file
|
||||||
|
// to the general_neural_audio_codec::layer.
|
||||||
|
// Please note that some gguf_tensor values have multiple keys; this is to support backwards compatibility with original DAC settings.
|
||||||
|
static const std::map<std::string, gguf_tensor> GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP = {
|
||||||
|
{".final.alpha", LAYER_ALPHA},
|
||||||
|
{".final.bias", LAYER_INPUT_BIAS},
|
||||||
|
{".final.weight", LAYER_INPUT_KERNEL},
|
||||||
|
{".alpha", LAYER_ALPHA},
|
||||||
|
{".bias", LAYER_INPUT_BIAS},
|
||||||
|
{".weight", LAYER_INPUT_KERNEL},
|
||||||
|
{".noise_weight", LAYER_NOISE_KERNEL},
|
||||||
|
{".res.initial.alpha", RESIDUAL_UNIT_INPUT_ALPHA},
|
||||||
|
{".res.initial.bias", RESIDUAL_UNIT_INPUT_BIAS},
|
||||||
|
{".res.initial.weight", RESIDUAL_UNIT_INPUT_KERNEL},
|
||||||
|
{".res.final.alpha", RESIDUAL_UNIT_OUTPUT_ALPHA},
|
||||||
|
{".res.final.bias", RESIDUAL_UNIT_OUTPUT_BIAS},
|
||||||
|
{".res.final.weight", RESIDUAL_UNIT_OUTPUT_KERNEL},
|
||||||
|
{".in_alpha", RESIDUAL_UNIT_INPUT_ALPHA},
|
||||||
|
{".in_bias", RESIDUAL_UNIT_INPUT_BIAS},
|
||||||
|
{".in_weight", RESIDUAL_UNIT_INPUT_KERNEL},
|
||||||
|
{".out_alpha", RESIDUAL_UNIT_OUTPUT_ALPHA},
|
||||||
|
{".out_bias", RESIDUAL_UNIT_OUTPUT_BIAS},
|
||||||
|
{".out_weight", RESIDUAL_UNIT_OUTPUT_KERNEL},
|
||||||
|
{".out_proj.bias", QUANTIZER_LAYER_OUT_BIAS},
|
||||||
|
{".out_proj.weight", QUANTIZER_LAYER_OUT_KERNEL},
|
||||||
|
{".codebook.weight", QUANTIZER_LAYER_CODEBOOK},
|
||||||
|
};
|
||||||
|
|
||||||
|
void assign_to_residual_unit(tts_model * model, residual_unit & unit, std::string name, struct ggml_tensor * tensor) {
|
||||||
|
try {
|
||||||
|
gguf_tensor tensor_type = GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name);
|
||||||
|
switch (tensor_type) {
|
||||||
|
case RESIDUAL_UNIT_INPUT_ALPHA:
|
||||||
|
unit.in_alpha = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(unit.in_alpha, tensor);
|
||||||
|
break;
|
||||||
|
case RESIDUAL_UNIT_OUTPUT_ALPHA:
|
||||||
|
unit.out_alpha = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(unit.out_alpha, tensor);
|
||||||
|
break;
|
||||||
|
case RESIDUAL_UNIT_INPUT_KERNEL:
|
||||||
|
unit.in_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(unit.in_conv_kernel, tensor);
|
||||||
|
break;
|
||||||
|
case RESIDUAL_UNIT_OUTPUT_KERNEL:
|
||||||
|
unit.out_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(unit.out_conv_kernel, tensor);
|
||||||
|
break;
|
||||||
|
case RESIDUAL_UNIT_INPUT_BIAS:
|
||||||
|
unit.in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
|
||||||
|
model->set_tensor(unit.in_conv_bias, tensor);
|
||||||
|
break;
|
||||||
|
case RESIDUAL_UNIT_OUTPUT_BIAS:
|
||||||
|
unit.out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
|
||||||
|
model->set_tensor(unit.out_conv_bias, tensor);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fprintf(stdout, "residual unit unassigned tensor %s\n", name.c_str());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch (const std::out_of_range& e) {
|
||||||
|
TTS_ABORT("Tensor, '%s', is not a valid tensor general_neural_audio_codec::residual_unit tensor.", name.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void assign_to_layer(tts_model * model, layer & l, std::string name, struct ggml_tensor * tensor) {
|
||||||
|
if (GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.find(name) != GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.end()) {
|
||||||
|
switch(GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name)) {
|
||||||
|
case LAYER_ALPHA:
|
||||||
|
l.in_alpha = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(l.in_alpha, tensor);
|
||||||
|
break;
|
||||||
|
case LAYER_INPUT_KERNEL:
|
||||||
|
l.in_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(l.in_conv_kernel, tensor);
|
||||||
|
break;
|
||||||
|
case LAYER_INPUT_BIAS:
|
||||||
|
l.in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
|
||||||
|
model->set_tensor(l.in_conv_bias, tensor);
|
||||||
|
break;
|
||||||
|
case LAYER_NOISE_KERNEL:
|
||||||
|
l.noise_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(l.noise_conv_kernel, tensor);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fprintf(stdout, "layer unassigned tensor %s\n", name.c_str());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) {
|
||||||
|
auto pair = parse_layer_count(name);
|
||||||
|
int i = pair.first;
|
||||||
|
std::string lt_name = pair.second;
|
||||||
|
assign_to_residual_unit(model, l.residual_blocks[i], lt_name, tensor);
|
||||||
|
} else {
|
||||||
|
TTS_ABORT("Tensor, '%s', is not a valid tensor general_neural_audio_codec::layer tensor.", name.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void assign_to_quantize_layer(tts_model * model, residual_vector_quantize_layer & l, std::string name, struct ggml_tensor * tensor) {
|
||||||
|
try {
|
||||||
|
switch(GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name)) {
|
||||||
|
case QUANTIZER_LAYER_OUT_KERNEL:
|
||||||
|
l.out_proj_kernel = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(l.out_proj_kernel, tensor);
|
||||||
|
break;
|
||||||
|
case QUANTIZER_LAYER_OUT_BIAS:
|
||||||
|
l.out_proj_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
|
||||||
|
model->set_tensor(l.out_proj_bias, tensor);
|
||||||
|
break;
|
||||||
|
case QUANTIZER_LAYER_CODEBOOK:
|
||||||
|
l.codebook = ggml_dup_tensor(model->ctx, tensor);
|
||||||
|
model->set_tensor(l.codebook, tensor);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fprintf(stdout, "quantized layer unassigned tensor %s\n", name.c_str());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch (const std::out_of_range& e) {
|
||||||
|
// older GGUF files still have the unused in_proj convolutional layer, so ignore it if we find it.
|
||||||
|
if (!has_prefix(name, ".in_proj")) {
|
||||||
|
TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, residual_unit & unit) {
|
||||||
|
struct ggml_tensor * residual = cur;
|
||||||
|
cur = snake_1d(ctx, unit.in_alpha, cur);
|
||||||
|
if (unit.groups > 1) {
|
||||||
|
// depthwise 1d convolution is equivalent to convolution in which grouping is equal to filter size.
|
||||||
|
// If there is a divergence between filter size and grouping then the kernel's output filters will not be zero.
|
||||||
|
TTS_ASSERT(unit.in_conv_kernel->ne[1] == 1);
|
||||||
|
cur = ggml_conv_1d_dw_tts(ctx, unit.in_conv_kernel, cur, 1, unit.padding, unit.dilation);
|
||||||
|
} else {
|
||||||
|
cur = ggml_conv_1d_tts(ctx, unit.in_conv_kernel, cur, 1, unit.padding, unit.dilation);
|
||||||
|
}
|
||||||
|
cur = ggml_add(ctx, cur, unit.in_conv_bias);
|
||||||
|
cur = snake_1d(ctx, unit.out_alpha, cur);
|
||||||
|
cur = ggml_conv_1d_tts(ctx, unit.out_conv_kernel, cur, 1, 0, 1);
|
||||||
|
cur = ggml_add(ctx, cur, unit.out_conv_bias);
|
||||||
|
return ggml_add(ctx, cur, residual);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * build_layer(ggml_context * ctx, struct ggml_tensor * cur, layer & l, struct ggml_tensor * noise) {
|
||||||
|
cur = snake_1d(ctx, l.in_alpha, cur);
|
||||||
|
cur = ggml_conv_transpose_1d_tts(ctx, l.in_conv_kernel, cur, l.stride, l.padding, 1, 0, 1);
|
||||||
|
cur = ggml_add(ctx, cur, l.in_conv_bias);
|
||||||
|
if (l.noise_conv_kernel && noise) {
|
||||||
|
struct ggml_tensor * x = ggml_conv_1d_tts(ctx, l.noise_conv_kernel, cur, 1, 0, 1);
|
||||||
|
x = ggml_mul(ctx, x, noise);
|
||||||
|
cur = ggml_add(ctx, cur, x);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < l.residual_blocks.size(); i++) {
|
||||||
|
cur = build_residual_unit(ctx, cur, l.residual_blocks[i]);
|
||||||
|
}
|
||||||
|
return cur;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * build_quantize_layer(ggml_context * ctx, struct ggml_tensor * cur, residual_vector_quantize_layer & l) {
|
||||||
|
cur = ggml_get_rows(ctx, l.codebook, cur);
|
||||||
|
cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
|
||||||
|
cur = ggml_conv_1d_tts(ctx, l.out_proj_kernel, cur, 1, 0, 1);
|
||||||
|
cur = ggml_add(ctx, cur, l.out_proj_bias);
|
||||||
|
return cur;
|
||||||
|
}
|
||||||
|
}
|
67
otherarch/ttscpp/src/general_neural_audio_codec.h
Normal file
67
otherarch/ttscpp/src/general_neural_audio_codec.h
Normal file
|
@ -0,0 +1,67 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "tts_model.h"
|
||||||
|
|
||||||
|
// This namespace implements a general abstraction of the core functionality used in common neural audio codecs like DAC and SNAC.
|
||||||
|
namespace general_neural_audio_codec {
|
||||||
|
enum gguf_tensor {
|
||||||
|
LAYER_ALPHA,
|
||||||
|
LAYER_INPUT_KERNEL,
|
||||||
|
LAYER_INPUT_BIAS,
|
||||||
|
LAYER_NOISE_KERNEL,
|
||||||
|
RESIDUAL_UNIT_INPUT_ALPHA,
|
||||||
|
RESIDUAL_UNIT_OUTPUT_ALPHA,
|
||||||
|
RESIDUAL_UNIT_INPUT_KERNEL,
|
||||||
|
RESIDUAL_UNIT_OUTPUT_KERNEL,
|
||||||
|
RESIDUAL_UNIT_INPUT_BIAS,
|
||||||
|
RESIDUAL_UNIT_OUTPUT_BIAS,
|
||||||
|
QUANTIZER_LAYER_OUT_KERNEL,
|
||||||
|
QUANTIZER_LAYER_OUT_BIAS,
|
||||||
|
QUANTIZER_LAYER_CODEBOOK
|
||||||
|
};
|
||||||
|
|
||||||
|
struct residual_vector_quantize_layer {
|
||||||
|
struct ggml_tensor * out_proj_kernel;
|
||||||
|
struct ggml_tensor * out_proj_bias;
|
||||||
|
struct ggml_tensor * codebook;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct residual_unit {
|
||||||
|
residual_unit(uint32_t padding, uint32_t dilation, uint32_t groups = 1): padding(padding), dilation(dilation), groups(groups) {}
|
||||||
|
struct ggml_tensor * in_alpha;
|
||||||
|
struct ggml_tensor * in_conv_kernel;
|
||||||
|
struct ggml_tensor * in_conv_bias;
|
||||||
|
struct ggml_tensor * out_alpha;
|
||||||
|
struct ggml_tensor * out_conv_kernel;
|
||||||
|
struct ggml_tensor * out_conv_bias;
|
||||||
|
|
||||||
|
uint32_t padding;
|
||||||
|
uint32_t dilation;
|
||||||
|
uint32_t groups;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct layer {
|
||||||
|
layer(uint32_t padding, uint32_t stride, uint32_t groups = 1): padding(padding), stride(stride) {
|
||||||
|
for (int i = 0; i < 3; i++) {
|
||||||
|
residual_blocks.push_back(residual_unit{(uint32_t) pow(3, (i + 1)), (uint32_t) pow(3, i), groups});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
struct ggml_tensor * in_alpha;
|
||||||
|
struct ggml_tensor * in_conv_kernel;
|
||||||
|
struct ggml_tensor * in_conv_bias;
|
||||||
|
struct ggml_tensor * noise_conv_kernel = nullptr;
|
||||||
|
|
||||||
|
uint32_t padding;
|
||||||
|
uint32_t stride;
|
||||||
|
|
||||||
|
std::vector<residual_unit> residual_blocks;
|
||||||
|
};
|
||||||
|
|
||||||
|
void assign_to_residual_unit(tts_model * model, residual_unit & unit, std::string name, struct ggml_tensor * tensor);
|
||||||
|
void assign_to_layer(tts_model * model, layer & l, std::string name, struct ggml_tensor * tensor);
|
||||||
|
void assign_to_quantize_layer(tts_model * model, residual_vector_quantize_layer & l, std::string name, struct ggml_tensor * tensor);
|
||||||
|
|
||||||
|
struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, residual_unit & unit);
|
||||||
|
struct ggml_tensor * build_layer(ggml_context * ctx, struct ggml_tensor * cur, layer & l, struct ggml_tensor * noise = nullptr);
|
||||||
|
struct ggml_tensor * build_quantize_layer(ggml_context * ctx, struct ggml_tensor * cur, residual_vector_quantize_layer & l);
|
||||||
|
}
|
1484
otherarch/ttscpp/src/kokoro_model.cpp
Normal file
1484
otherarch/ttscpp/src/kokoro_model.cpp
Normal file
File diff suppressed because it is too large
Load diff
462
otherarch/ttscpp/src/kokoro_model.h
Normal file
462
otherarch/ttscpp/src/kokoro_model.h
Normal file
|
@ -0,0 +1,462 @@
|
||||||
|
#ifndef kokoro_model_h
|
||||||
|
#define kokoro_model_h
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "tts_model.h"
|
||||||
|
#include "tokenizer.h"
|
||||||
|
#include "phonemizer.h"
|
||||||
|
|
||||||
|
// Rather than using ISO 639-2 language codes, Kokoro voice pack specify their corresponding language via their first letter.
|
||||||
|
// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
|
||||||
|
// appropriate phonemization protocol can inferred from the Kokoro voice.
|
||||||
|
static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
|
||||||
|
{'a', "gmw/en-US"},
|
||||||
|
{'b', "gmw/en"},
|
||||||
|
{'e', "roa/es"},
|
||||||
|
{'f', "roa/fr"},
|
||||||
|
{'h', "inc/hi"},
|
||||||
|
{'i', "roa/it"},
|
||||||
|
{'j', "jpx/ja"},
|
||||||
|
{'p', "roa/pt-BR"},
|
||||||
|
{'z', "sit/cmn"}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct lstm_cell {
|
||||||
|
std::vector<ggml_tensor*> weights;
|
||||||
|
std::vector<ggml_tensor*> biases;
|
||||||
|
std::vector<ggml_tensor*> reverse_weights;
|
||||||
|
std::vector<ggml_tensor*> reverse_biases;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct lstm {
|
||||||
|
std::vector<ggml_tensor*> hidden;
|
||||||
|
std::vector<ggml_tensor*> states;
|
||||||
|
|
||||||
|
bool bidirectional = false;
|
||||||
|
std::vector<lstm_cell*> cells;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct duration_predictor_layer {
|
||||||
|
lstm * rnn;
|
||||||
|
struct ggml_tensor * ada_norm_gamma_weight;
|
||||||
|
struct ggml_tensor * ada_norm_gamma_bias;
|
||||||
|
struct ggml_tensor * ada_norm_beta_weight;
|
||||||
|
struct ggml_tensor * ada_norm_beta_bias;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ada_residual_conv_block {
|
||||||
|
struct ggml_tensor * conv1;
|
||||||
|
struct ggml_tensor * conv1_bias;
|
||||||
|
struct ggml_tensor * conv2;
|
||||||
|
struct ggml_tensor * conv2_bias;
|
||||||
|
struct ggml_tensor * norm1_gamma;
|
||||||
|
struct ggml_tensor * norm1_gamma_bias;
|
||||||
|
struct ggml_tensor * norm1_beta;
|
||||||
|
struct ggml_tensor * norm1_beta_bias;
|
||||||
|
struct ggml_tensor * norm2_gamma;
|
||||||
|
struct ggml_tensor * norm2_gamma_bias;
|
||||||
|
struct ggml_tensor * norm2_beta;
|
||||||
|
struct ggml_tensor * norm2_beta_bias;
|
||||||
|
struct ggml_tensor * pool = nullptr;
|
||||||
|
struct ggml_tensor * pool_bias = nullptr;
|
||||||
|
struct ggml_tensor * upsample = nullptr;
|
||||||
|
struct ggml_tensor * upsample_bias = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct duration_predictor {
|
||||||
|
struct ggml_tensor * albert_encode;
|
||||||
|
struct ggml_tensor * albert_encode_bias;
|
||||||
|
std::vector<duration_predictor_layer*> layers;
|
||||||
|
lstm * duration_proj_lstm;
|
||||||
|
struct ggml_tensor * duration_proj;
|
||||||
|
struct ggml_tensor * duration_proj_bias;
|
||||||
|
struct ggml_tensor * n_proj_kernel;
|
||||||
|
struct ggml_tensor * n_proj_bias;
|
||||||
|
struct ggml_tensor * f0_proj_kernel;
|
||||||
|
struct ggml_tensor * f0_proj_bias;
|
||||||
|
lstm * shared_lstm;
|
||||||
|
std::vector<ada_residual_conv_block*> f0_blocks;
|
||||||
|
std::vector<ada_residual_conv_block*> n_blocks;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct kokoro_text_encoder_conv_layer {
|
||||||
|
struct ggml_tensor * norm_gamma;
|
||||||
|
struct ggml_tensor * norm_beta;
|
||||||
|
struct ggml_tensor * conv_weight;
|
||||||
|
struct ggml_tensor * conv_bias;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct kokoro_text_encoder {
|
||||||
|
struct ggml_tensor * embd;
|
||||||
|
std::vector<kokoro_text_encoder_conv_layer*> conv_layers;
|
||||||
|
lstm * out_lstm;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct kokoro_generator_residual_block {
|
||||||
|
std::vector<uint32_t> conv1_dilations;
|
||||||
|
std::vector<uint32_t> conv1_paddings;
|
||||||
|
|
||||||
|
std::vector<ggml_tensor*> adain1d_1_gamma_weights;
|
||||||
|
std::vector<ggml_tensor*> adain1d_2_gamma_weights;
|
||||||
|
std::vector<ggml_tensor*> adain1d_1_gamma_biases;
|
||||||
|
std::vector<ggml_tensor*> adain1d_2_gamma_biases;
|
||||||
|
std::vector<ggml_tensor*> adain1d_1_beta_weights;
|
||||||
|
std::vector<ggml_tensor*> adain1d_2_beta_weights;
|
||||||
|
std::vector<ggml_tensor*> adain1d_1_beta_biases;
|
||||||
|
std::vector<ggml_tensor*> adain1d_2_beta_biases;
|
||||||
|
std::vector<ggml_tensor*> input_alphas;
|
||||||
|
std::vector<ggml_tensor*> output_alphas;
|
||||||
|
std::vector<ggml_tensor*> convs1_weights;
|
||||||
|
std::vector<ggml_tensor*> convs1_biases;
|
||||||
|
std::vector<ggml_tensor*> convs2_weights;
|
||||||
|
std::vector<ggml_tensor*> convs2_biases;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct kokoro_noise_residual_block {
|
||||||
|
uint32_t input_conv_stride;
|
||||||
|
uint32_t input_conv_padding;
|
||||||
|
|
||||||
|
struct ggml_tensor * input_conv;
|
||||||
|
struct ggml_tensor * input_conv_bias;
|
||||||
|
struct kokoro_generator_residual_block * res_block;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct kokoro_generator_upsample_block {
|
||||||
|
uint32_t padding;
|
||||||
|
uint32_t stride;
|
||||||
|
|
||||||
|
// these are just conv transpose layers
|
||||||
|
struct ggml_tensor * upsample_weight;
|
||||||
|
struct ggml_tensor * upsample_bias;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct kokoro_generator {
|
||||||
|
// unfortunately the squared sum of the windows needs to be computed dynamically per run because it is dependent
|
||||||
|
// on the sequence size of the generation and the hop is typically less than half the size of our window.
|
||||||
|
struct ggml_tensor * window;
|
||||||
|
|
||||||
|
struct ggml_tensor * m_source_weight;
|
||||||
|
struct ggml_tensor * m_source_bias;
|
||||||
|
struct ggml_tensor * out_conv_weight;
|
||||||
|
struct ggml_tensor * out_conv_bias;
|
||||||
|
std::vector<kokoro_noise_residual_block*> noise_blocks;
|
||||||
|
std::vector<kokoro_generator_residual_block*> res_blocks;
|
||||||
|
std::vector<kokoro_generator_upsample_block*> ups;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct kokoro_decoder {
|
||||||
|
struct ggml_tensor * f0_conv;
|
||||||
|
struct ggml_tensor * f0_conv_bias;
|
||||||
|
struct ggml_tensor * n_conv;
|
||||||
|
struct ggml_tensor * n_conv_bias;
|
||||||
|
struct ggml_tensor * asr_conv;
|
||||||
|
struct ggml_tensor * asr_conv_bias;
|
||||||
|
std::vector<ada_residual_conv_block*> decoder_blocks;
|
||||||
|
ada_residual_conv_block* encoder_block;
|
||||||
|
kokoro_generator * generator;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct albert_layer {
|
||||||
|
struct ggml_tensor * ffn;
|
||||||
|
struct ggml_tensor * ffn_out;
|
||||||
|
struct ggml_tensor * ffn_bias;
|
||||||
|
struct ggml_tensor * ffn_out_bias;
|
||||||
|
struct ggml_tensor * layer_output_norm_weight;
|
||||||
|
struct ggml_tensor * layer_output_norm_bias;
|
||||||
|
struct ggml_tensor * q;
|
||||||
|
struct ggml_tensor * k;
|
||||||
|
struct ggml_tensor * v;
|
||||||
|
struct ggml_tensor * o;
|
||||||
|
struct ggml_tensor * q_bias;
|
||||||
|
struct ggml_tensor * k_bias;
|
||||||
|
struct ggml_tensor * v_bias;
|
||||||
|
struct ggml_tensor * o_bias;
|
||||||
|
struct ggml_tensor * attn_norm_weight;
|
||||||
|
struct ggml_tensor * attn_norm_bias;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Full weight/configuration container for the Kokoro TTS pipeline: the Albert
// text encoder, the prosody (duration) predictor, the text encoder, the
// decoder/generator, and the per-speaker style vectors.
struct kokoro_model : tts_model {
    // standard configuration for Kokoro's Albert model
    // tokenization
    uint32_t bos_token_id = 0;
    uint32_t eos_token_id = 0;
    uint32_t space_token_id = 16;
    // duration prediction
    uint32_t max_context_length = 512;
    uint32_t vocab_size = 178;
    uint32_t hidden_size = 768;
    uint32_t n_attn_heads = 12;
    uint32_t n_layers = 1;
    uint32_t n_recurrence = 12;
    uint32_t head_size = 64;
    uint32_t duration_hidden_size = 512;
    uint32_t up_sampling_factor; // NOTE(review): no default — presumably read from the GGUF by prep_constants; confirm
    float upsample_scale = 300.0f;
    float scale = 0.125f; // presumably the attention scale (1/sqrt(head_size) for head_size 64) — confirm

    // standard configuration for duration prediction
    uint32_t f0_n_blocks = 3;
    uint32_t n_duration_prediction_layers = 3;
    // while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to
    // allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each
    // allocation increases node allocation size by O(N)
    uint32_t max_duration_per_token = 20;
    uint32_t style_half_size = 128;

    // standard text encoding configuration
    uint32_t n_conv_layers = 3;

    // standard decoder configuration
    uint32_t n_kernels = 3;
    uint32_t n_upsamples = 2;
    uint32_t n_decoder_blocks = 4;
    uint32_t n_res_blocks = 6;
    uint32_t n_noise_blocks = 2;
    uint32_t out_conv_padding = 3;
    uint32_t post_n_fft = 11;
    uint32_t true_n_fft = 20;
    uint32_t stft_hop = 5;
    uint32_t harmonic_num = 8;
    float sin_amp = 0.1f;
    float noise_std = 0.003f;
    float voice_threshold = 10.0f;
    float sample_rate = 24000.0f;
    std::string window = "hann";

    // It is really annoying that ggml doesn't allow using non ggml tensors as the operator for simple math ops.
    // This is just the constant defined above as a tensor.
    struct ggml_tensor * n_kernels_tensor;

    // Kokoro loads albert with use_pooling = true but doesn't use the pooling outputs.
    bool uses_pooling = false;
    bool static_token_types = true;

    // Per-speaker style tensors keyed by voice name (e.g. "af_alloy").
    std::map<std::string, struct ggml_tensor *> voices;

    // Albert portion of the model
    struct ggml_tensor * embd_hidden;
    struct ggml_tensor * embd_hidden_bias;
    struct ggml_tensor * token_type_embd = nullptr;
    struct ggml_tensor * token_embd;
    struct ggml_tensor * position_embd;
    struct ggml_tensor * input_norm_weight;
    struct ggml_tensor * input_norm_bias;
    struct ggml_tensor * static_token_type_values = nullptr;
    struct ggml_tensor * pool = nullptr;
    struct ggml_tensor * pool_bias = nullptr;
    std::vector<albert_layer*> layers;

    struct ggml_tensor * harmonic_sampling_norm = nullptr; // a static 1x9 harmonic multiplier
    struct ggml_tensor * sampling_factor_scalar = nullptr; // a static scalar
    struct ggml_tensor * sqrt_tensor = nullptr; // static tensor for constant division

    // Prosody Predictor portion of the model
    struct duration_predictor * prosody_pred;

    // Text encoding portion of the model
    struct kokoro_text_encoder * text_encoder;

    // Decoding and Generation portion of the model
    struct kokoro_decoder * decoder;

    // the default hidden states need to be initialized
    std::vector<lstm*> lstms;

    // Node counts accumulated while loading (see setup_from_file); they size
    // the duration and generation schedules respectively.
    size_t duration_node_counter = 0;
    size_t generation_node_counter = 0;
    // setting this is likely unnecessary as it is precomputed by the post load function.
    uint32_t post_load_tensor_bytes = 13000;

    size_t max_gen_nodes();
    size_t max_duration_nodes();

    lstm * prep_lstm();
    // helper functions for assigning tensors to substructs
    void assign_lstm(lstm * rnn, std::string name, ggml_tensor * tensor);
    void assign_generator_weight(kokoro_generator * generator, std::string name, ggml_tensor * tensor);
    void assign_gen_resblock(kokoro_generator_residual_block * block, std::string name, ggml_tensor * tensor);
    void assign_ada_res_block(ada_residual_conv_block * block, std::string name, ggml_tensor * tensor);
    void assign_decoder_weight(std::string name, ggml_tensor * tensor);
    void assign_duration_weight(std::string name, ggml_tensor * tensor);
    void assign_text_encoder_weight(std::string name, ggml_tensor * tensor);
    void assign_albert_weight(std::string name, ggml_tensor * tensor);

    void post_load_assign();
    void assign_weight(std::string name, ggml_tensor * tensor);
    void prep_layers(gguf_context * meta);
    void prep_constants(gguf_context * meta);
    // Counts graph nodes per tensor while loading so the schedules can be
    // sized before any graph is built, then defers to the base-class loader.
    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only = true) {
        std::function<void (ggml_tensor *)> fn = ([&](ggml_tensor* cur) {
            std::string name = ggml_get_name(cur);
            size_t increment = 1;
            // LSTM weights are applied once per sequence step, so they count
            // max_context_length nodes rather than one.
            if (name.find("lstm") != std::string::npos) {
                increment = max_context_length;
            }
            if (name.find("duration_predictor") != std::string::npos) {
                duration_node_counter += increment;
            } else {
                generation_node_counter += increment;
            }
        });
        // NOTE(review): this stores the address of a stack-local std::function;
        // it is only valid while tts_model::setup_from_file below runs — confirm
        // the callback is not invoked after this method returns.
        compute_tensor_meta_cb = &fn;
        prep_constants(meta_ctx);
        prep_layers(meta_ctx);
        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "kokoro", 1.6, post_load_tensor_bytes);
    }
};
|
||||||
|
|
||||||
|
// A single micro-batch of encoded input for the Kokoro pipeline.
struct kokoro_ubatch {
    size_t n_tokens; // the number of tokens in our encoded sequence
    uint32_t * input_tokens; // [n_tokens]
    // Filled in by the duration pass and consumed by the generation pass.
    struct kokoro_duration_response * resp = nullptr;
};
|
||||||
|
|
||||||
|
// Execution state for the duration-prediction graph. Generation has its own
// context (kokoro_context) because the two run as separate graphs.
struct kokoro_duration_context : runner_context {
    kokoro_duration_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
    ~kokoro_duration_context() {
        ggml_backend_buffer_free(buf_len_output);
    }

    std::string voice = "af_alloy"; // default speaker style
    struct kokoro_model * model;    // not owned here; freed by the runner
    ggml_backend_buffer_t buf_len_output = nullptr; // backend buffer behind `lens`

    size_t logits_size = 0; // capacity (of floats) for logits
    float * lens = nullptr; // presumably per-token predicted durations — confirm against run()

    // Graph input tensors, rebuilt/re-set on each call.
    struct ggml_tensor * inp_tokens;
    struct ggml_tensor * positions;
    struct ggml_tensor * attn_mask;
    struct ggml_tensor * token_types = nullptr;

    // The 5x factor pads the node budget returned by max_duration_nodes().
    void build_schedule() {
        runner_context::build_schedule(model->max_duration_nodes()*5);
    }
};
|
||||||
|
|
||||||
|
// Graph-building helpers for the Albert encoder, the duration predictor, and
// the decoder/generator. Definitions live later in the implementation file.
static struct ggml_tensor * build_albert_attn_mask(ggml_context * ctx, struct kokoro_duration_context *kctx, const kokoro_ubatch & batch);
static struct ggml_tensor * build_albert_inputs(ggml_context * ctx, kokoro_model * model, ggml_tensor * input_tokens, ggml_tensor * positions, ggml_tensor * token_types);
static struct ggml_tensor * build_albert_norm(ggml_context * ctx, ggml_tensor * cur, ggml_tensor * weight, ggml_tensor * bias);
static struct ggml_tensor * build_ada_residual_conv(ggml_context * ctx, struct ggml_tensor * x, ada_residual_conv_block * block, struct ggml_tensor * style, struct ggml_tensor * sqrt_tensor);
static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * style, kokoro_generator_residual_block * block);
static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style);
static kokoro_generator_residual_block * build_res_block_from_file(gguf_context * meta, std::string base_config_key);
static kokoro_noise_residual_block * build_noise_block_from_file(gguf_context * meta, int index);
// NOTE(review): this function shares its name with its return type — probably
// intended to be named like the build_*_from_file helpers above; confirm.
static kokoro_generator_upsample_block* kokoro_generator_upsample_block(gguf_context * meta, int index);

// Maps a Kokoro voice name (e.g. "af_alloy") to the espeak voice id used by the phonemizer.
std::string get_espeak_id_from_kokoro_voice(std::string voice);
struct kokoro_duration_context * build_new_duration_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true);
|
||||||
|
|
||||||
|
// Output of the duration pass: per-token lengths plus the hidden states the
// generation graph consumes.
struct kokoro_duration_response {
    size_t n_outputs;
    float * lengths;
    float * hidden_states;
};
|
||||||
|
|
||||||
|
// This struct is intended to manage graph and compute for the duration prediction portion of the kokoro model.
// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
// support the tensor dependent views that would otherwise be necessary.
struct kokoro_duration_runner : tts_runner {
    kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
    ~kokoro_duration_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        // NOTE(review): this destructor frees and deletes the model; if the
        // same model pointer is shared with kokoro_runner (whose destructor
        // does the same) this is a double free — confirm ownership.
        model->free();
        delete model;
        delete kctx;
    }
    struct single_pass_tokenizer * tokenizer; // not freed here — presumably owned elsewhere; confirm
    kokoro_model * model;
    kokoro_duration_context * kctx;

    // Point graph construction at this runner's scratch metadata buffer.
    void init_build() {
        tts_runner::init_build(&kctx->buf_compute_meta);
    }

    void prepare_post_load();
    struct kokoro_ubatch build_worst_case_batch();
    void set_inputs(kokoro_ubatch & batch);
    struct ggml_cgraph * build_kokoro_duration_graph(kokoro_ubatch & batch);
    void run(kokoro_ubatch & ubatch);
};
|
||||||
|
|
||||||
|
// Execution state for the speech-generation graph (the duration pass has its
// own kokoro_duration_context above).
struct kokoro_context : runner_context {
    kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
    ~kokoro_context() {
        ggml_backend_sched_free(sched);
        ggml_backend_free(backend_cpu);
        if (backend) {
            ggml_backend_free(backend);
        }
        if (buf_output) {
            ggml_backend_buffer_free(buf_output);
        }
    }

    std::string voice = "af_alloy"; // default speaker style

    struct kokoro_model * model; // not owned here; freed by the runner

    uint32_t total_duration; // presumably the summed predicted durations — confirm against set_inputs
    uint32_t sequence_length;

    // Graph input tensors, rebuilt/re-set on each call.
    struct ggml_tensor * inp_tokens;
    struct ggml_tensor * duration_pred;
    struct ggml_tensor * duration_mask;
    struct ggml_tensor * window_sq_sum; // needs to be calculated from the generator window.
    struct ggml_tensor * uv_noise_data;

    // The 30x factor pads the node budget returned by max_gen_nodes().
    void build_schedule() {
        runner_context::build_schedule(model->max_gen_nodes()*30);
    }
};
|
||||||
|
|
||||||
|
// TODO: now that we are passing the context down to these methods we should clean up their parameters
// Generator graph builders; definitions live in the implementation file.
static struct ggml_tensor * build_generator(ggml_context * ctx, kokoro_model * model, kokoro_context * kctx, struct ggml_tensor * x, struct ggml_tensor * style, struct ggml_tensor * f0_curve, kokoro_generator* generator, int sequence_length, struct ggml_tensor * window_sq_sum, ggml_cgraph * gf);
static struct ggml_tensor * build_sin_gen(ggml_context * ctx, kokoro_model * model, kokoro_context * kctx, struct ggml_tensor * x, int harmonic_num, int sequence_length, float voice_threshold, float sin_amp, float noise_std);

struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true);
|
||||||
|
|
||||||
|
// This manages the graph compilation of computation for the Kokoro model.
// Owns the tokenizer-to-audio pipeline: phonemization, tokenization, the
// duration sub-runner, and the generation graph itself.
struct kokoro_runner : tts_runner {
    kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
        tts_runner::sampling_rate = 24000.0f;
        tts_runner::supports_voices = true;
    };
    ~kokoro_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        // NOTE(review): ~kokoro_duration_runner also calls model->free() and
        // deletes its model pointer; if drunner shares this runner's model the
        // two lines after `delete drunner` double-free it — confirm ownership.
        delete drunner;
        model->free();
        delete model;
        delete kctx;
        delete phmzr;
    }
    struct single_pass_tokenizer * tokenizer;
    kokoro_model * model;
    kokoro_context * kctx;
    kokoro_duration_runner * drunner; // runs the separate duration-prediction graph
    phonemizer * phmzr;               // text -> phonemes, ahead of tokenization

    std::string default_voice = "af_alloy";

    // Point graph construction at this runner's scratch metadata buffer.
    void init_build() {
        tts_runner::init_build(&kctx->buf_compute_meta);
    }

    std::vector<std::string> list_voices();
    std::vector<std::vector<uint32_t>> tokenize_chunks(std::vector<std::string> clauses);
    void assign_weight(std::string name, ggml_tensor * tensor);
    void prepare_post_load();
    kokoro_ubatch build_worst_case_batch();
    void set_inputs(kokoro_ubatch & batch, uint32_t total_size);
    struct ggml_cgraph * build_kokoro_graph(kokoro_ubatch & batch);
    void run(kokoro_ubatch & batch, struct tts_response * outputs);
    int generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code = "");
};
|
||||||
|
|
||||||
|
#endif
|
475
otherarch/ttscpp/src/orpheus_model.cpp
Normal file
475
otherarch/ttscpp/src/orpheus_model.cpp
Normal file
|
@ -0,0 +1,475 @@
|
||||||
|
#include "orpheus_model.h"

#include <array>
#include <cmath>
#include <vector>
|
||||||
|
|
||||||
|
// These tokens and variables aren't defined in the Orpheus' model configuration but instead are defined inline in various python functions.
// As such, they are not discoverable so defining them as unconfigurable constants should be fine.
static constexpr std::array<const char *, 7> orpheus_voices{"zoe", "zac","jess", "leo", "mia", "julia", "leah"};
// Control tokens wrapped around every prompt; consumed in batch_from_sentence.
static constexpr std::array<uint32_t, 2> orpheus_prepended_tokens = { 128259, 128000 };
static constexpr std::array<uint32_t, 4> orpheus_appended_tokens = { 128009, 128260, 128261, 128257 };
|
||||||
|
|
||||||
|
void orpheus_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
|
||||||
|
if (name == "norm") {
|
||||||
|
output_norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(output_norm, tensor);
|
||||||
|
} else if (name == "lm_head") {
|
||||||
|
head = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(head, tensor);
|
||||||
|
} else if (name == "embed_tokens") {
|
||||||
|
embd = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(embd, tensor);
|
||||||
|
} else if (name == "rope_frequencies") {
|
||||||
|
rope_frequencies = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(rope_frequencies, tensor);
|
||||||
|
} else if (has_prefix(name, "layers")) {
|
||||||
|
auto lpair = parse_layer_count(name);
|
||||||
|
int l = lpair.first;
|
||||||
|
std::string lt_name = lpair.second;
|
||||||
|
assign_to_layer(lt_name, layers[l], tensor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void orpheus_model::assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor) {
|
||||||
|
if (part == ".self_attn.k_proj") {
|
||||||
|
layer.k = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.k, tensor);
|
||||||
|
} else if (part == ".self_attn.q_proj") {
|
||||||
|
layer.q = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.q, tensor);
|
||||||
|
} else if (part == ".self_attn.v_proj") {
|
||||||
|
layer.v = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.v, tensor);
|
||||||
|
} else if (part == ".self_attn.o_proj") {
|
||||||
|
layer.o = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.o, tensor);
|
||||||
|
} else if (part == ".mlp.gate_proj") {
|
||||||
|
layer.gate = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.gate, tensor);
|
||||||
|
} else if (part == ".mlp.up_proj") {
|
||||||
|
layer.up = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.up, tensor);
|
||||||
|
} else if (part == ".mlp.down_proj") {
|
||||||
|
layer.down = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.down, tensor);
|
||||||
|
} else if (part == ".input_layernorm") {
|
||||||
|
layer.input_norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.input_norm, tensor);
|
||||||
|
} else if (part == ".post_attention_layernorm") {
|
||||||
|
layer.post_attention_norm = ggml_dup_tensor(ctx, tensor);
|
||||||
|
set_tensor(layer.post_attention_norm, tensor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pull Orpheus hyperparameters out of the GGUF metadata. Every key is
// optional: when a key is absent the field keeps its in-class default.
void orpheus_model::prep_constants(gguf_context * meta) {
    // Read one optional u32 value; leaves `field` untouched when missing.
    auto read_u32 = [&](const char * key, uint32_t & field) {
        const int key_index = gguf_find_key(meta, key);
        if (key_index != -1) {
            field = gguf_get_val_u32(meta, key_index);
        }
    };

    read_u32("orpheus.vocab_size",        vocab_size);
    read_u32("orpheus.attn_heads",        n_attn_heads);
    read_u32("orpheus.kv_attn_heads",     n_kv_attn_heads);
    read_u32("orpheus.head_dim",          head_size);
    read_u32("orpheus.stopping_token_id", stopping_token_id);
    read_u32("tokenizer.ggml.eos_token_id", eos_token_id);
    read_u32("tokenizer.ggml.bos_token_id", bos_token_id);
    read_u32("orpheus.hidden_size",       hidden_size);
    read_u32("orpheus.kv_hidden_size",    kv_hidden_size);
}
|
||||||
|
|
||||||
|
// Size the layer vector from GGUF metadata. Unlike the other hyperparameters
// the layer count is mandatory, so a missing key aborts.
void orpheus_model::prep_layers(gguf_context * meta) {
    const int layer_count_key = gguf_find_key(meta, "orpheus.layers");
    if (layer_count_key == -1) {
        TTS_ABORT("the 'orpheus.layers' must be specified in the GGUF file.");
    }
    n_layers = (int) gguf_get_val_u32(meta, layer_count_key);
    // Append n_layers value-initialized layers in one shot.
    layers.insert(layers.end(), n_layers, orpheus_layer{});
}
|
||||||
|
|
||||||
|
// RMS-normalize `x` and scale elementwise by the learned weight
// (Llama-style layer norm: no bias, no mean subtraction).
struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight) {
    constexpr float rms_eps = 0.00001f;
    struct ggml_tensor * normed = ggml_rms_norm(ctx, x, rms_eps);
    return ggml_mul(ctx, normed, weight);
}
|
||||||
|
|
||||||
|
// Allocate the (square) attention-mask input tensor covering the whole
// sequence seen so far: the cached positions plus the incoming batch.
// The actual mask values are filled in later by set_inputs.
struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch) {
    const int64_t seq_len = (int64_t) octx->current_position + batch.n_tokens;
    octx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, seq_len, seq_len);
    ggml_set_input(octx->attn_mask);
    return octx->attn_mask;
}
|
||||||
|
|
||||||
|
// Drop all generation state so the context can serve a fresh prompt.
void orpheus_context::reset() {
    current_position = 0;
    n_outputs = 0;
    output_tokens.clear();
}
|
||||||
|
|
||||||
|
// Allocate and wire up an orpheus_context: choose the backend (Metal only,
// and only when compiled in and !use_cpu), build the scheduler, and size the
// graph-metadata scratch buffer from the model's node budget.
orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads, bool use_cpu) {
    orpheus_context * octx = new orpheus_context(model, n_threads);
    if (!use_cpu) {
#ifdef GGML_USE_METAL
        octx->backend = ggml_backend_metal_init();
#endif
    }
    // The CPU backend is always initialized as a fallback / co-scheduled device.
    octx->backend_cpu = ggml_backend_cpu_init();
    octx->set_threads();
    octx->build_schedule();
    // Enough room for every tensor's metadata plus the custom graph header.
    octx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
    return octx;
}
|
||||||
|
|
||||||
|
void orpheus_runner::orpheus_kv_cache_init() {
|
||||||
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
if (octx->backend != nullptr) {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
buft = ggml_backend_metal_buffer_type();
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
buft = ggml_backend_cpu_buffer_type();
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
/*.mem_size =*/ (2u * model->layers.size() + 1)*ggml_tensor_overhead(),
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ggml_context * ctx = ggml_init(params);
|
||||||
|
if (!ctx) {
|
||||||
|
TTS_ABORT("%s: failed to initialze ggml context for key value cache.\n", __func__);
|
||||||
|
}
|
||||||
|
if (!kv_self) {
|
||||||
|
kv_self = new orpheus_kv_cache;
|
||||||
|
}
|
||||||
|
kv_self->ctx = ctx;
|
||||||
|
kv_self->k_l.reserve(model->layers.size());
|
||||||
|
kv_self->v_l.reserve(model->layers.size());
|
||||||
|
|
||||||
|
for (int i = 0; i < (int) model->layers.size(); i++) {
|
||||||
|
ggml_tensor * k = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size));
|
||||||
|
ggml_tensor * v = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size));
|
||||||
|
ggml_format_name(k, "cache_k_l%d", i);
|
||||||
|
ggml_format_name(v, "cache_v_l%d", i);
|
||||||
|
kv_self->k_l.push_back(k);
|
||||||
|
kv_self->v_l.push_back(v);
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate tensors and initialize the buffers to avoid NaNs in the padding
|
||||||
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(kv_self->ctx, buft);
|
||||||
|
ggml_backend_buffer_clear(buf, 0);
|
||||||
|
kv_self->buf = buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply RoPE to the incoming keys and append the new K/V entries to layer
// `index`'s cache. `repeat` implements repeat_interleave for grouped-query
// attention: each of the n_kv_attn_heads head slices is written `repeat`
// times (assumes n_attn_heads == n_kv_attn_heads * repeat — confirm).
void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) {
    // Llama-style rotary embedding on the keys (rope theta 500000, matching
    // the query rope in build_orpheus_graph).
    k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
        model->head_size, 2,0, 500000.0f,
        1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

    // A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave,
    // and performing the repeat operation upfront before performing a single copy needs to be performed in order to better optimize this function.
    // Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us
    // from incrementally larger transpositions with generation.
    for (int i = 0; i < repeat; i++) {
        // View into the K cache at the current write position; each repeat
        // iteration shifts the destination by one head slot so the same KV
        // head lands in `repeat` adjacent slots.
        struct ggml_tensor * k_cache_view = ggml_view_3d(
            ctx,
            kv_self->k_l[index],
            model->head_size,
            model->n_kv_attn_heads,
            n_tokens,
            ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
        );
        ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));

        // NOTE(review): the V view computes its strides from k_l rather than
        // v_l. That is only equivalent because both caches share cache_type
        // (same element size) — confirm this is intentional.
        struct ggml_tensor * v_cache_view = ggml_view_3d(
            ctx,
            kv_self->v_l[index],
            model->head_size,
            model->n_kv_attn_heads,
            n_tokens,
            ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
        );
        // Values are stored un-rotated and un-transposed; the transpose happens
        // at read time in build_orpheus_graph.
        ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
    }
}
|
||||||
|
|
||||||
|
// Build the full decoder-only transformer forward graph for one batch:
// token embedding -> n_layers x (RMS-norm, RoPE attention over the KV cache,
// SwiGLU MLP, residuals) -> final norm -> lm_head logits. For a multi-token
// (prompt) batch only the last position's logits are kept.
struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    // Attention spans everything in the cache plus the new tokens.
    const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens;
    octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(octx->positions);
    octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(octx->inp_tokens);
    inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens);

    struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch);

    for (int l = 0; l < model->n_layers; l++) {
        struct ggml_tensor * residual = inpL;
        cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm);

        struct ggml_tensor * attn_out;

        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l].q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l].k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l].v, cur);

            // Append this batch's K/V to the cache (repeat=3 interleaves each
            // KV head into 3 query-head slots; see orpheus_build_kv_store).
            orpheus_build_kv_store(ctx, gf, Kcur, Vcur, l, batch.n_tokens, 3);
            // Read the whole cached K back as [head_size, seq, heads].
            struct ggml_tensor * k =
                ggml_cont(ctx, ggml_view_3d(ctx, kv_self->k_l[l],
                    model->head_size, full_sequence_length, model->n_attn_heads,
                    ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size,
                    ggml_element_size(kv_self->k_l[l]) * model->head_size,
                    0));

            // Values come back flat [hidden, seq] and are transposed here —
            // the per-step transposition cost noted in orpheus_build_kv_store.
            struct ggml_tensor * v =
                ggml_view_2d(ctx, kv_self->v_l[l],
                    model->hidden_size, full_sequence_length,
                    ggml_element_size(kv_self->k_l[l]) * model->hidden_size,
                    0);

            v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads);

            Qcur = ggml_rope_ext(
                ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
                octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta
                1.0f, 0.0f, 1.0f, 0.0f, 0.0f);

            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
            // Scaled, masked softmax over the full sequence.
            kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f/sqrtf(model->head_size), 0.0f);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
            attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens);
            attn_out = ggml_mul_mat(ctx, model->layers[l].o, attn_out);
        }

        cur = ggml_add(ctx, attn_out, residual);

        struct ggml_tensor * residualffn = cur;

        // mlp (SwiGLU: silu(gate) * up, then down-projection)
        {
            cur = orpheus_build_layer_norm(ctx, cur, model->layers[l].post_attention_norm);
            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, model->layers[l].gate, cur)), ggml_mul_mat(ctx, model->layers[l].up, cur));
            cur = ggml_mul_mat(ctx, model->layers[l].down, cur);
        }
        cur = ggml_add(ctx, cur, residualffn);
        inpL = cur;
    }

    cur = orpheus_build_layer_norm(ctx, cur, model->output_norm);
    // only about 40k of the output head is actually used for generation purposes. Ideally the head tensor should be shrunk and sampled tokens should be incremented.
    cur = ggml_mul_mat(ctx, model->head, cur);
    // For the prompt pass, keep only the last token's logit row.
    if (batch.n_tokens > 1) {
        cur = ggml_cont(ctx, ggml_view_1d(ctx, cur, model->vocab_size, ggml_element_size(cur) * (cur->ne[1] - 1) * model->vocab_size));
    }
    ggml_build_forward_expand(gf, cur);
    free_build();

    return gf;
}
|
||||||
|
|
||||||
|
// Run one forward pass for `batch`: build the graph, upload inputs, compute,
// and copy the resulting logits row into the context's logits buffer at the
// slot for the current output (octx->n_outputs is advanced by the caller
// after sampling).
void orpheus_runner::decode(orpheus_ubatch & batch) {
    ggml_backend_sched_reset(octx->sched);

    octx->output_tokens.reserve(model->max_generation_size);

    // Worst-case logits storage: one vocab-sized row per generated token.
    const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float);
    octx->prep_output_buffer(new_size);

    ggml_cgraph * gf = build_orpheus_graph(batch);

    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    ggml_backend_sched_alloc_graph(octx->sched, gf);

    // Inputs can only be set once the scheduler has allocated the tensors.
    set_inputs(batch);
    ggml_backend_sched_graph_compute_async(octx->sched, gf);

    // NOTE(review): compute is launched async; presumably get_ggml_node_data
    // synchronizes before reading — confirm.
    float * logits_out = octx->logits + octx->n_outputs * model->vocab_size;
    octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float));

    // update the total number of outputs retrieved and the current position
    octx->current_position += batch.n_tokens;

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(octx->sched);
}
|
||||||
|
|
||||||
|
// Upload the graph inputs for this batch: the token ids, the absolute
// positions, and the causal attention mask rows for the new tokens.
// Fix vs. original: positions and the mask were written through tensor->data
// directly, which is only valid for CPU-resident buffers and crashes when the
// scheduler places those tensors on an accelerator backend. All three inputs
// are now staged in host memory and uploaded with ggml_backend_tensor_set
// (the same mechanism the original already used for inp_tokens). The signed/
// unsigned loop-counter mix is also cleaned up.
void orpheus_runner::set_inputs(orpheus_ubatch & batch) {
    ggml_backend_tensor_set(octx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(octx->inp_tokens));

    const uint32_t max_pos = octx->current_position + (uint32_t) batch.n_tokens;
    std::vector<int32_t> positions(batch.n_tokens);
    std::vector<float> mask((size_t) batch.n_tokens * max_pos);
    for (uint32_t i = 0; i < (uint32_t) batch.n_tokens; i++) {
        const int32_t p = (int32_t) (octx->current_position + i);
        positions[i] = p;
        // Causal mask: a token may attend to every position up to and
        // including its own; later positions get -inf before the softmax.
        for (uint32_t ii = 0; ii < max_pos; ii++) {
            mask[(size_t) i * max_pos + ii] = ((int32_t) ii > p) ? -INFINITY : 0.0f;
        }
    }
    ggml_backend_tensor_set(octx->positions, positions.data(), 0, positions.size()*ggml_element_size(octx->positions));
    // Only the rows for this batch's tokens are written; the rest of the
    // square mask tensor is left untouched, matching the original behavior
    // (NOTE(review): confirm the consumer only reads these rows).
    ggml_backend_tensor_set(octx->attn_mask, mask.data(), 0, mask.size()*sizeof(float));
}
|
||||||
|
|
||||||
|
// Build the prompt batch for one sentence: fixed control tokens bracket the
// tokenized text (see orpheus_prepended_tokens / orpheus_appended_tokens),
// and the active voice name, when set, is prefixed as "<voice>: <sentence>".
orpheus_ubatch orpheus_runner::batch_from_sentence(std::string sentence) {
    struct orpheus_ubatch batch;
    batch.tokens.insert(batch.tokens.end(), orpheus_prepended_tokens.begin(), orpheus_prepended_tokens.end());
    if (!octx->voice.empty()) {
        sentence = octx->voice + ": " + sentence;
    }
    tokenizer->tokenize(sentence, batch.tokens);
    batch.tokens.insert(batch.tokens.end(), orpheus_appended_tokens.begin(), orpheus_appended_tokens.end());
    batch.n_tokens = batch.tokens.size();
    return batch;
}
|
||||||
|
|
||||||
|
std::vector<std::vector<uint32_t>> orpheus_runner::prepare_output_tokens() {
|
||||||
|
size_t chunks = octx->output_tokens.size() / 7;
|
||||||
|
std::vector<std::vector<uint32_t>> output_tokens;
|
||||||
|
for (int i = 0; i < model->audio_heads; i++) {
|
||||||
|
output_tokens.push_back(std::vector<uint32_t>{});
|
||||||
|
}
|
||||||
|
for (int i = 0; i < chunks; i++) {
|
||||||
|
for (int ii = 0; ii < 7; ii++) {
|
||||||
|
uint32_t thead = model->heads[ii];
|
||||||
|
// the manipulations below are not configured because they are performed inline via undocumented constants in the Orpheus codebase.
|
||||||
|
// Essentially this is how Orpheus converts discrete samples from the output shape to the audio input shape.
|
||||||
|
uint32_t t = octx->output_tokens[i*7 + ii] - 128266 - ((ii % 7) * 4096);
|
||||||
|
output_tokens[thead].push_back(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return output_tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Autoregressive generation: decode/sample until the model emits its stopping
// token or the generation budget is exhausted, then hand the de-interleaved
// token streams to the audio sub-runner to synthesize into `output`.
void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_response * output) {
    while ((octx->output_tokens.size() == 0 || octx->output_tokens.back() != model->stopping_token_id) && octx->output_tokens.size() < model->max_generation_size) {
        decode(batch);
        // Sample from the logits row decode() just wrote for this step.
        generation_sampler->sample(octx->logits + octx->n_outputs * model->vocab_size, octx->output_tokens);
        // only increment the output count after sampling
        octx->n_outputs++;
        // After the prompt pass, every subsequent step feeds back just the
        // last sampled token (the caller's batch is intentionally replaced).
        batch = orpheus_ubatch{
            1, {octx->output_tokens.back()}
        };
    }
    // this case could be better addressed by adding splitting to the generation process.
    if (octx->output_tokens.size() >= model->max_generation_size) {
        fprintf(stdout, "Warning: generation hit its max default length. The generated audio may not contain the entire prompt.\n");
    }
    std::vector<std::vector<uint32_t>> processed_output_tokens = prepare_output_tokens();
    srunner->run(processed_output_tokens, output);
}
|
||||||
|
|
||||||
|
int orpheus_runner::generate(std::string sentence, struct tts_response * response) {
|
||||||
|
orpheus_ubatch batch = batch_from_sentence(sentence);
|
||||||
|
// it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
|
||||||
|
// surpass the default size.
|
||||||
|
if (batch.tokens.size() > model->max_context_length) {
|
||||||
|
TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");
|
||||||
|
}
|
||||||
|
octx->reset();
|
||||||
|
generation_sampler->reset();
|
||||||
|
if (!kv_self) {
|
||||||
|
orpheus_kv_cache_init();
|
||||||
|
}
|
||||||
|
generate_from_batch(batch, response);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies sampling settings from `config` onto the sampler and validates the
// requested voice before storing it on the context.
void orpheus_runner::configure_generation(generation_configuration * config) {
    generation_sampler->temperature        = config->temperature;
    generation_sampler->repetition_penalty = config->repetition_penalty;
    generation_sampler->do_sample          = config->sample;
    generation_sampler->top_k              = config->top_k;
    generation_sampler->top_p              = config->top_p;
    // An empty voice means "no speaker prefix"; anything else must be a known
    // Orpheus voice.
    if (!config->voice.empty()) {
        bool known = std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) != orpheus_voices.end();
        if (!known) {
            TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());
        }
    }
    octx->voice = config->voice;
}
|
||||||
|
|
||||||
|
// Builds a batch representing the largest graph we will ever schedule. Only
// the token count matters for sizing; no actual token ids are required.
orpheus_ubatch orpheus_runner::build_worst_case_batch() {
    orpheus_ubatch batch{model->max_context_length, {}};
    return batch;
}
|
||||||
|
|
||||||
|
void orpheus_runner::assign_weight(std::string name, ggml_tensor * tensor) {
|
||||||
|
if (tensor->data == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name.size() == 0) {
|
||||||
|
// handles the top level meta tensor
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name.size() > 5 && name.substr(0, 5) == "snac.") {
|
||||||
|
srunner->model->assign_weight(name.substr(5), tensor);
|
||||||
|
} else if (name.size() > 8 && name.substr(0, 8) == "orpheus.") {
|
||||||
|
model->assign_weight(name.substr(8), tensor);
|
||||||
|
} else {
|
||||||
|
fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finalizes runner state once all weights are loaded: prepares the SNAC
// decoder, allocates the KV cache, and reserves the compute schedule against
// a worst-case (full context window) graph.
void orpheus_runner::prepare_post_load() {
    srunner->prepare_post_load();
    orpheus_kv_cache_init();
    auto batch = build_worst_case_batch();
    auto gf = build_orpheus_graph(batch);
    octx->prep_schedule(gf);
}
|
||||||
|
|
||||||
|
// Returns a copy of the supported Orpheus speaker names.
std::vector<std::string> list_voices() {
    return std::vector<std::string>(orpheus_voices.begin(), orpheus_voices.end());
}
|
146
otherarch/ttscpp/src/orpheus_model.h
Normal file
146
otherarch/ttscpp/src/orpheus_model.h
Normal file
|
@ -0,0 +1,146 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "sampler.h"
|
||||||
|
#include "tokenizer.h"
|
||||||
|
#include "snac_model.h"
|
||||||
|
|
||||||
|
// Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads.
|
||||||
|
|
||||||
|
// Weights for one Orpheus transformer layer (llama-3 style: attention block
// plus a gated MLP, each preceded by a norm).
struct orpheus_layer {
    struct ggml_tensor * input_norm;          // norm weight applied before attention
    struct ggml_tensor * post_attention_norm; // norm weight applied before the MLP
    struct ggml_tensor * q;                   // attention query projection
    struct ggml_tensor * k;                   // attention key projection
    struct ggml_tensor * v;                   // attention value projection
    struct ggml_tensor * o;                   // attention output projection
    struct ggml_tensor * gate;                // MLP gate projection
    struct ggml_tensor * up;                  // MLP up projection
    struct ggml_tensor * down;                // MLP down projection
};
|
||||||
|
|
||||||
|
// Hyperparameters and weights for the Orpheus decoder. Defaults match the
// published Orpheus checkpoint; prep_constants / prep_layers may override
// them from gguf metadata before setup.
struct orpheus_model : tts_model {
    uint32_t vocab_size = 156940;
    uint32_t n_attn_heads = 24;
    uint32_t n_kv_attn_heads = 8;   // grouped kv heads (fewer than query heads)
    uint32_t head_size = 128;
    uint32_t max_context_length = 1024;
    // the generation size is technically arbitrary as the model can handle a large context. This size comes out to being 25.6 seconds.
    uint32_t max_generation_size = 2100;
    uint32_t stopping_token_id = 128258; // token that terminates audio generation
    uint32_t eos_token_id = 128001;
    uint32_t bos_token_id = 128000;
    uint32_t hidden_size = 3072;
    uint32_t kv_hidden_size = 1024;
    uint32_t audio_heads = 3;            // number of SNAC codebook streams
    // Maps each of the 7 tokens in a generated frame to its SNAC head
    // (see orpheus_runner::prepare_output_tokens).
    uint32_t heads[7] = {0, 1, 2, 2, 1, 2, 2};

    int n_layers = 28;

    struct std::vector<orpheus_layer> layers;
    struct ggml_tensor * head;             // output projection over the vocabulary
    struct ggml_tensor * embd;             // token embedding table
    struct ggml_tensor * output_norm;      // final norm before the head
    struct ggml_tensor * rope_frequencies; // rope frequency tensor loaded from the gguf

    void assign_weight(std::string name, ggml_tensor * tensor);
    void assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor);
    void prep_constants(gguf_context * meta);
    void prep_layers(gguf_context * meta);
    // Reads hyperparameters, builds the layer table, then performs the shared
    // tts_model buffer setup (1.30 = buffer headroom multiplier).
    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) {
        prep_constants(meta_ctx);
        prep_layers(meta_ctx);
        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "orpheus", 1.30);
    }
};
|
||||||
|
|
||||||
|
// Per-request runtime state for Orpheus generation.
struct orpheus_context : runner_context {
    orpheus_context(orpheus_model * model, int n_threads): runner_context(n_threads), model(model) {};
    struct orpheus_model * model;

    uint32_t current_position = 0; // current position in the active sequence
    uint32_t n_outputs = 0; // outputs sampled so far; indexes the logit buffer during generation
    std::string voice; // optional speaker name, prepended to the prompt as "voice: "

    std::vector<uint32_t> output_tokens; // tokens generated so far this request

    void reset();
    void build_schedule() {
        runner_context::build_schedule(model->max_nodes());
    }

    // Graph input tensors, rebuilt per decode step.
    struct ggml_tensor * inp_tokens; // token ids
    struct ggml_tensor * attn_mask;  // attention mask
    struct ggml_tensor * positions;  // sequence positions
};
|
||||||
|
|
||||||
|
struct orpheus_kv_cache {
|
||||||
|
ggml_type cache_type = GGML_TYPE_F32;
|
||||||
|
|
||||||
|
std::vector<struct ggml_tensor *> k_l;
|
||||||
|
std::vector<struct ggml_tensor *> v_l;
|
||||||
|
|
||||||
|
struct ggml_context * ctx;
|
||||||
|
ggml_backend_buffer_type_t buft;
|
||||||
|
ggml_backend_buffer_t buf;
|
||||||
|
|
||||||
|
void free() {
|
||||||
|
ggml_free(ctx);
|
||||||
|
ggml_backend_buffer_free(buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
~orpheus_kv_cache() {
|
||||||
|
free();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct orpheus_context * build_new_orpheus_context(struct orpheus_model * model, int n_threads, bool use_cpu = true);
|
||||||
|
|
||||||
|
// A micro-batch of prompt/generated tokens fed to the decoder.
struct orpheus_ubatch {
    orpheus_ubatch() = default;
    orpheus_ubatch(size_t n_tokens, std::vector<uint32_t> tokens): n_tokens(n_tokens), tokens(tokens) {};
    // total sentence tokens; zero-initialized so a default-constructed batch
    // is a well-defined empty batch rather than carrying an indeterminate count
    size_t n_tokens = 0;
    std::vector<uint32_t> tokens; // [n_tokens]
};
|
||||||
|
|
||||||
|
struct orpheus_runner : tts_runner {
|
||||||
|
orpheus_runner(
|
||||||
|
orpheus_model * model,
|
||||||
|
snac_runner * audio_decoder,
|
||||||
|
orpheus_context * octx,
|
||||||
|
bpe_tokenizer * bt,
|
||||||
|
sampler * samp,
|
||||||
|
orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) {
|
||||||
|
tts_runner::sampling_rate = 24000.0f;
|
||||||
|
generation_sampler->n_output_heads = 1;
|
||||||
|
generation_sampler->vocab_size = model->vocab_size;
|
||||||
|
generation_sampler->eos_token_id = model->eos_token_id;
|
||||||
|
}
|
||||||
|
orpheus_model * model;
|
||||||
|
snac_runner * srunner;
|
||||||
|
orpheus_context * octx;
|
||||||
|
bpe_tokenizer * tokenizer;
|
||||||
|
orpheus_kv_cache * kv_self;
|
||||||
|
sampler * generation_sampler;
|
||||||
|
|
||||||
|
void init_build() {
|
||||||
|
tts_runner::init_build(&octx->buf_compute_meta);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> list_voices();
|
||||||
|
struct ggml_cgraph * build_orpheus_graph(orpheus_ubatch & batch);
|
||||||
|
void orpheus_kv_cache_init();
|
||||||
|
void orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat);
|
||||||
|
void configure_generation(generation_configuration * config);
|
||||||
|
void assign_weight(std::string name, ggml_tensor * tensor);
|
||||||
|
std::vector<std::vector<uint32_t>> prepare_output_tokens();
|
||||||
|
orpheus_ubatch build_worst_case_batch();
|
||||||
|
orpheus_ubatch batch_from_sentence(std::string sentence);
|
||||||
|
void set_inputs(orpheus_ubatch & batch);
|
||||||
|
void decode(orpheus_ubatch & batch);
|
||||||
|
void prepare_post_load();
|
||||||
|
int generate(std::string sentence, struct tts_response * response);
|
||||||
|
void generate_from_batch(orpheus_ubatch & batch, struct tts_response * output);
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight);
|
||||||
|
static struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch);
|
874
otherarch/ttscpp/src/parler_model.cpp
Normal file
874
otherarch/ttscpp/src/parler_model.cpp
Normal file
|
@ -0,0 +1,874 @@
|
||||||
|
#include "parler_model.h"
|
||||||
|
|
||||||
|
// For loading parler model from gguf file.
|
||||||
|
// Keys are either whole tensor names or per-layer name suffixes; see
// assign_to_decoder / assign_parler_layer for how each kind is matched.
static const std::map<std::string, parler_tensor> PARLER_TENSOR_GGUF_LOOKUP = {
    {"layer_norm.weight", PARLER_NORM},
    {"layer_norm.bias", PARLER_NORM_BIAS},
    {"embed_prompts", PARLER_EMBD_PROMPTS},
    {"text_encoding", PARLER_TEXT_ENCODING},
    {"positional_embed", PARLER_POSITIONAL_EMBD},
    {".self_attn.q_proj.weight", PARLER_LAYER_SELF_ATTN_Q},
    {".self_attn.k_proj.weight", PARLER_LAYER_SELF_ATTN_K},
    {".self_attn.v_proj.weight", PARLER_LAYER_SELF_ATTN_V},
    {".self_attn.out_proj.weight", PARLER_LAYER_SELF_ATTN_O},
    {".self_attn_layer_norm.weight", PARLER_LAYER_SELF_ATTN_NORM},
    {".self_attn_layer_norm.bias", PARLER_LAYER_SELF_ATTN_NORM_BIAS},
    {".encoder_attn.q_proj.weight", PARLER_LAYER_ATTN_Q},
    {".encoder_attn.k_proj.weight", PARLER_LAYER_ATTN_K},
    {".encoder_attn.v_proj.weight", PARLER_LAYER_ATTN_V},
    {".encoder_attn.out_proj.weight", PARLER_LAYER_ATTN_O},
    {".encoder_attn_layer_norm.weight", PARLER_LAYER_ATTN_NORM},
    {".encoder_attn_layer_norm.bias", PARLER_LAYER_ATTN_NORM_BIAS},
    {".fc1.weight", PARLER_LAYER_FC1},
    {".fc2.weight", PARLER_LAYER_FC2},
    {".final_layer_norm.weight", PARLER_LAYER_OUT_NORM},
    {".final_layer_norm.bias", PARLER_LAYER_OUT_NORM_BIAS},
    {".weight", PARLER_EMBD},
    {".weight.head", PARLER_HEAD}
};
|
||||||
|
|
||||||
|
// Routes a named gguf tensor into the decoder's weight structures.
void parler_tts_model::assign_weight(std::string name, ggml_tensor * tensor) {
    assign_to_decoder(this, name, tensor);
}
|
||||||
|
|
||||||
|
// Allocates the per-layer weight holders and the per-head embedding/head
// slots. The slots start as null and are filled in by assign_to_decoder as
// tensors arrive from the gguf file.
void parler_tts_model::prep_layers(gguf_context * meta_ctx) {
    layers.reserve((size_t) n_layers);
    for (int i = 0; i < (int) n_layers; i++) {
        layers.push_back(new parler_layer{});
    }

    embds.assign((size_t) n_output_heads, nullptr);
    heads.assign((size_t) n_output_heads, nullptr);
}
|
||||||
|
|
||||||
|
// Reads parler decoder hyperparameters from gguf metadata. Every key is
// optional — falling back to the defaults declared on the model — except the
// encode length, which has no sensible default and aborts when missing.
void parler_tts_model::prep_constants(gguf_context * meta) {
    // Reads an optional u32 value into `dst`, preferring the fully qualified
    // key and falling back to the legacy short key. Leaves `dst` untouched
    // when neither key is present.
    auto read_u32 = [&meta](const std::vector<std::string> & keys, auto & dst) {
        int key = search_for_gguf_keys(meta, keys);
        if (key != -1) {
            dst = gguf_get_val_u32(meta, key);
        }
    };

    int encode_length_key = search_for_gguf_keys(meta, {"parler-tts.decoder.encode_length", "encode_length"});
    if (encode_length_key == -1) {
        TTS_ABORT("key 'parler-tts.decoder.encode_length' must be specified in gguf file.");
    }
    n_encode_length = gguf_get_val_u32(meta, encode_length_key);

    read_u32({"parler-tts.decoder.hidden_size", "hidden_size"}, hidden_size);
    read_u32({"parler-tts.decoder.output_heads", "output_heads"}, n_output_heads);
    read_u32({"parler-tts.decoder.context_length", "ctx_length"}, max_ctx_length);
    read_u32({"parler-tts.decoder.attention.head_count", "attn_heads"}, n_attn_heads);
    // Derived quantities; must be computed after hidden_size and the head
    // count have been read.
    head_size = hidden_size / n_attn_heads;
    max_cross_nodes = n_attn_heads * 2;

    read_u32({"parler-tts.decoder.out_vocab_size", "out_vocab_size"}, output_vocab_size);
    read_u32({"parler-tts.decoder.audio_vocab_size", "audio_vocab_size"}, audio_vocab_size);
    read_u32({"parler-tts.decoder.max_generation", "max_generation"}, max_generation_size);
    read_u32({"parler-tts.decoder.num_hidden_layers", "num_hidden_layers"}, n_layers);
    read_u32({"audio.bos_token_id", "bos_token_id"}, bos_token_id);
    read_u32({"audio.eos_token_id", "eos_token_id"}, eos_token_id);
}
|
||||||
|
|
||||||
|
// Precomputes and caches the cross-attention key/value tensors for every
// decoder layer from the text-encoder output. Builds a disposable CPU graph,
// runs it once, then copies the results into the model's own buffer.
// `conditional_prompt`, when provided, replaces the precomputed text encoding
// that shipped in the gguf.
void parler_tts_model::prep_cross_key_values(int n_threads, struct tts_response * conditional_prompt) {
    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
    ggml_backend_buffer_type_t backend_cpu_buffer = ggml_backend_cpu_buffer_type();
    // Let it create a disposable threadpool just this once
    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
    std::vector<ggml_backend_buffer_type_t> bufs = {backend_cpu_buffer};
    std::vector<ggml_backend_t> backs = {backend_cpu};
    ggml_backend_sched_t sched = ggml_backend_sched_new(backs.data(), bufs.data(), 1, max_cross_nodes*n_layers, false, false);

    // Metadata-only buffer backing the temporary graph context below.
    std::vector<uint8_t> buf_compute_meta;
    buf_compute_meta.resize(max_cross_nodes*n_layers*ggml_tensor_overhead() + ggml_graph_overhead_custom(max_cross_nodes*n_layers, false));

    struct ggml_init_params params = {
        /*.mem_size =*/ buf_compute_meta.size(),
        /*.mem_buffer =*/ buf_compute_meta.data(),
        /*.no_alloc =*/ true,
    };
    struct ggml_context * cctx = ggml_init(params);
    struct ggml_cgraph * gf = ggml_new_graph_custom(cctx, 4096, false);
    if (conditional_prompt) {
        // If we are updating the conditional prompt then we have to reset the tensor offsets into the ggml_context otherwise we could overflow the assigned buffer and lose our prompt.
        // These offsets are assigned by #set_tensor below.
        offset -= n_encode_length*hidden_size*sizeof(float)*n_layers*2;
        precomputed_input_emb = ggml_new_tensor_2d(cctx, GGML_TYPE_F32, conditional_prompt->hidden_size, conditional_prompt->n_outputs);
        ggml_set_input(precomputed_input_emb);
        n_encode_length = conditional_prompt->n_outputs;
    }

    // Build per-layer K and V projections of the encoder output, named so
    // they can be fetched back out of the computed graph below.
    for (int i = 0; i < layers.size(); i++) {
        struct ggml_tensor * Kcur = ggml_mul_mat(cctx, layers[i]->attn_k_proj, precomputed_input_emb);
        struct ggml_tensor * Vcur = ggml_mul_mat(cctx, layers[i]->attn_v_proj, precomputed_input_emb);

        Kcur = ggml_reshape_3d(cctx, Kcur, head_size, n_attn_heads, n_encode_length);
        Vcur = ggml_transpose(cctx, Vcur);

        struct ggml_tensor * k = ggml_cont(cctx, ggml_permute(cctx, Kcur, 0, 2, 1, 3));
        ggml_set_name(k, ("cross_key_" + std::to_string(i)).c_str());
        ggml_build_forward_expand(gf, k);

        struct ggml_tensor * v = ggml_cont_3d(cctx, Vcur, n_encode_length, head_size, n_attn_heads);
        ggml_set_name(v, ("cross_value_" + std::to_string(i)).c_str());
        ggml_build_forward_expand(gf, v);
    }

    // cctx only holds graph metadata backed by buf_compute_meta (no_alloc),
    // so it can be released before the graph is scheduled and computed.
    ggml_free(cctx);
    ggml_backend_sched_reserve(sched, gf);
    ggml_backend_sched_alloc_graph(sched, gf);
    if (conditional_prompt) {
        ggml_backend_tensor_set(precomputed_input_emb, conditional_prompt->data, 0, conditional_prompt->n_outputs*conditional_prompt->hidden_size*ggml_element_size(precomputed_input_emb));
    }

    ggml_backend_sched_graph_compute_async(sched, gf);

    // Copy the computed K/V tensors into the model's buffer so they survive
    // the scheduler teardown below.
    for (int i = 0; i < layers.size(); i++) {
        struct ggml_tensor * k = ggml_graph_get_tensor(gf, ("cross_key_" + std::to_string(i)).c_str());
        layers[i]->cross_k = ggml_dup_tensor(ctx, k);
        set_tensor(layers[i]->cross_k, k);
        struct ggml_tensor * v = ggml_graph_get_tensor(gf, ("cross_value_" + std::to_string(i)).c_str());
        layers[i]->cross_v = ggml_dup_tensor(ctx, v);
        set_tensor(layers[i]->cross_v, v);
    }
    ggml_backend_sched_free(sched);
    ggml_backend_free(backend_cpu);
}
|
||||||
|
|
||||||
|
// Routes a per-layer tensor (identified by its name suffix) into `layer`.
// Cross-attention tensors are only materialized when cross attention is
// enabled on the model; unknown suffixes abort via the lookup's out_of_range.
void assign_parler_layer(parler_tts_model * model, parler_layer * layer, std::string name, ggml_tensor * tensor) {
    // Duplicates `tensor` into the model's context and uploads its data into
    // the model buffer (the pattern every case below shared verbatim).
    auto dup_set = [&](ggml_tensor *& dst) {
        dst = ggml_dup_tensor(model->ctx, tensor);
        model->set_tensor(dst, tensor);
    };
    // Same as dup_set, but skipped entirely when cross attention is disabled.
    auto dup_set_cross = [&](ggml_tensor *& dst) {
        if (model->use_cross_attn) {
            dup_set(dst);
        }
    };
    try {
        switch(PARLER_TENSOR_GGUF_LOOKUP.at(name)) {
            case PARLER_LAYER_SELF_ATTN_Q:         dup_set(layer->self_attn_q_proj);      break;
            case PARLER_LAYER_SELF_ATTN_K:         dup_set(layer->self_attn_k_proj);      break;
            case PARLER_LAYER_SELF_ATTN_V:         dup_set(layer->self_attn_v_proj);      break;
            case PARLER_LAYER_SELF_ATTN_O:         dup_set(layer->self_attn_o_proj);      break;
            case PARLER_LAYER_SELF_ATTN_NORM:      dup_set(layer->self_attn_norm);        break;
            case PARLER_LAYER_SELF_ATTN_NORM_BIAS: dup_set(layer->self_attn_norm_bias);   break;
            case PARLER_LAYER_ATTN_Q:              dup_set_cross(layer->attn_q_proj);     break;
            case PARLER_LAYER_ATTN_K:              dup_set_cross(layer->attn_k_proj);     break;
            case PARLER_LAYER_ATTN_V:              dup_set_cross(layer->attn_v_proj);     break;
            case PARLER_LAYER_ATTN_O:              dup_set_cross(layer->attn_o_proj);     break;
            case PARLER_LAYER_ATTN_NORM:           dup_set_cross(layer->attn_norm);       break;
            case PARLER_LAYER_ATTN_NORM_BIAS:      dup_set_cross(layer->attn_norm_bias);  break;
            case PARLER_LAYER_FC1:                 dup_set(layer->fc1);                   break;
            case PARLER_LAYER_FC2:                 dup_set(layer->fc2);                   break;
            case PARLER_LAYER_OUT_NORM:            dup_set(layer->final_norm);            break;
            case PARLER_LAYER_OUT_NORM_BIAS:       dup_set(layer->final_norm_bias);       break;
            default:
                fprintf(stdout, "unassigned tensor %s\n", name.c_str());
                break;
        }
    } catch (const std::out_of_range& e) {
        TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str());
    }
}
|
||||||
|
|
||||||
|
// Routes a gguf tensor into the decoder. Whole-name matches hit the lookup
// table directly; names containing a digit are layer-scoped and are parsed
// into (layer index, suffix) before being dispatched.
void assign_to_decoder(parler_tts_model * model, const std::string name, ggml_tensor * tensor) {
    // Duplicates `tensor` into the model's context and uploads its data into
    // the model buffer (the pattern every branch below shared verbatim).
    auto dup_set = [&](ggml_tensor *& dst) {
        dst = ggml_dup_tensor(model->ctx, tensor);
        model->set_tensor(dst, tensor);
    };
    if (PARLER_TENSOR_GGUF_LOOKUP.find(name) != PARLER_TENSOR_GGUF_LOOKUP.end()) {
        try {
            switch (PARLER_TENSOR_GGUF_LOOKUP.at(name)) {
                case PARLER_NORM:
                    dup_set(model->layer_norm);
                    break;
                case PARLER_NORM_BIAS:
                    dup_set(model->layer_norm_bias);
                    break;
                case PARLER_EMBD_PROMPTS:
                    dup_set(model->prompt_embd);
                    break;
                case PARLER_TEXT_ENCODING:
                    // The precomputed text encoding is only needed when cross
                    // attention is enabled.
                    if (model->use_cross_attn) {
                        dup_set(model->precomputed_input_emb);
                    }
                    break;
                case PARLER_POSITIONAL_EMBD:
                    dup_set(model->precomputed_positional_embds);
                    break;
                default:
                    fprintf(stdout, "unassigned tensor %s\n", name.c_str());
                    break;
            }
        } catch (const std::out_of_range& e) {
            TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str());
        }
    } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) {
        // Layer-scoped tensor: the digit in the name is the layer index.
        auto pair = parse_layer_count(name);
        int layer = pair.first;
        std::string lt_name = pair.second;
        if (name.find("embed_tokens") != std::string::npos) {
            dup_set(model->embds[layer]);
        } else if (name.find("lm_heads") != std::string::npos) {
            dup_set(model->heads[layer]);
        } else {
            assign_parler_layer(model, model->layers[layer], lt_name, tensor);
        }
    }
}
|
||||||
|
|
||||||
|
// Clears all per-request generation state so the context can be reused,
// resizing the per-head EOS tracking to `n_output_heads` entries.
void parler_context::reset(int32_t n_output_heads) {
    n_outputs = 0;
    prompt_end_position = 0;
    current_position = 0;
    output_size = 0;
    output_tokens.clear();
    eos_seen.assign((size_t) n_output_heads, false);
}
|
||||||
|
|
||||||
|
// Allocates and wires up a parler generation context: selects a backend
// (Metal only when compiled in and use_cpu is false), initializes the CPU
// backend, builds the scheduler, and sizes the compute-graph metadata buffer.
// Caller owns the returned pointer.
struct parler_context * build_new_parler_context(struct parler_tts_model * model, int n_threads, bool use_cpu) {
    parler_context * pctx = new parler_context(model, n_threads);
    if (!use_cpu) {
#ifdef GGML_USE_METAL
        pctx->backend = ggml_backend_metal_init();
#endif
    }
    pctx->eos_seen.reserve(model->n_output_heads);
    // The CPU backend is always initialized (used for fallback / host work).
    pctx->backend_cpu = ggml_backend_cpu_init();
    pctx->set_threads();
    pctx->build_schedule();
    // Metadata buffer sized for the largest graph the model can produce.
    pctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
    return pctx;
}
|
||||||
|
|
||||||
|
// Allocates the per-layer key/value cache tensors for the parler decoder on
// the active backend (Metal when available, otherwise CPU). Returns false on
// context or buffer allocation failure.
static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx, int32_t seq_id) {
    const int64_t n_layer = (int64_t) model->layers.size();
    cache->seq_id = seq_id;

    ggml_backend_buffer_type_t buft = nullptr;
    // this will only really support cpu or metal for the time being;
    if (pctx->backend != nullptr) {
#ifdef GGML_USE_METAL
        buft = ggml_backend_metal_buffer_type();
#endif
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }

    // Metadata-only context: one k and one v tensor per layer plus one
    // tensor's worth of slack.
    struct ggml_init_params params = {
        /*.mem_size =*/ (2u*model->n_layers+1)*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc =*/ true,
    };
    ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return false;
    }
    cache->ctx = ctx;


    cache->k_l.reserve(n_layer);
    cache->v_l.reserve(n_layer);

    for (int i = 0; i < (int) n_layer; i++) {
        // Flat per-layer buffers sized for the full maximum context window.
        ggml_tensor * k = ggml_new_tensor_1d(cache->ctx, cache->type_k, model->hidden_size*model->max_ctx_length);
        ggml_tensor * v = ggml_new_tensor_1d(cache->ctx, cache->type_v, model->hidden_size*model->max_ctx_length);
        ggml_format_name(k, "cache_k_l%d", i);
        ggml_format_name(v, "cache_v_l%d", i);
        cache->k_l.push_back(k);
        cache->v_l.push_back(v);
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(cache->ctx, buft);
    if (!buf) {
        return false;
    }
    ggml_backend_buffer_clear(buf, 0);
    cache->buf = buf;

    return true;
}
|
||||||
|
|
||||||
|
// Builds the input embedding tensor for one decode step.
struct ggml_tensor * parler_build_inp_embd(struct ggml_context * ctx, struct parler_context * pctx, parler_tts_model * model, parler_ubatch & batch) {
    // Parler has two embedding schemas one for the text input and one for generative audio tokens. These two schemas have effectively distinct shapes (i.e. [batch_size, sequence_length] and [batch_size, sequence_length, num_codebooks] respectively).
    // This means that depending on where we are in generation we need to follow a distinct pattern
    struct ggml_tensor * input_embs;
    pctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
    ggml_set_input(pctx->positions);
    if (batch.audio_generation) {
        // Audio path: view the flat token stream as
        // [tokens_per_head, n_output_heads] and sum the per-codebook
        // embeddings for each position.
        pctx->audio_inp_tokens = ggml_reshape_2d(ctx, ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_audio_tokens), batch.n_audio_tokens / model->n_output_heads, model->n_output_heads);
        ggml_set_input(pctx->audio_inp_tokens);
        struct ggml_tensor * audio_tokens = ggml_reshape_2d(ctx, pctx->audio_inp_tokens, batch.n_audio_tokens / model->n_output_heads, model->n_output_heads);
        for (int i = 0; i < model->n_output_heads; i++) {
            // Each view selects column i (one head's tokens) of audio_tokens.
            if (i == 0) {
                input_embs = ggml_get_rows(ctx, model->embds[i], ggml_view_2d(ctx, audio_tokens, 1, batch.n_audio_tokens / model->n_output_heads, audio_tokens->nb[1], i*sizeof(int32_t)));
            } else {
                input_embs = ggml_add(ctx, ggml_get_rows(ctx, model->embds[i], ggml_view_2d(ctx, audio_tokens, 1, batch.n_audio_tokens / model->n_output_heads, audio_tokens->nb[1], i*sizeof(int32_t))), input_embs);
            }
        }
    } else {
        // Text path: plain token ids looked up in the prompt embedding table.
        pctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
        ggml_set_input(pctx->inp_tokens);
        input_embs = ggml_get_rows(ctx, model->prompt_embd, pctx->inp_tokens);
    }
    // Positional embeddings are added in both paths.
    return ggml_add(ctx, input_embs, ggml_get_rows(ctx, model->precomputed_positional_embds, pctx->positions));
}
|
||||||
|
|
||||||
|
struct ggml_tensor * parler_build_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight, struct ggml_tensor * bias) {
|
||||||
|
// parler always uses default eps
|
||||||
|
float eps = 0.00001;
|
||||||
|
inputs = ggml_norm(ctx, inputs, eps);
|
||||||
|
inputs = ggml_mul(ctx, inputs, weight);
|
||||||
|
return ggml_add(ctx, inputs, bias);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Appends the current step's key/value tensors into layer `index` of the
// cache at offset `kv_head`. Keys are stored flat; values are stored
// transposed so attention can read them with a context-length row stride.
void parler_build_kv_store(struct ggml_context * ctx, parler_kv_cache * kv, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int32_t n_tokens, int32_t kv_head, int32_t index, int32_t n_embd_gqa) {
    // this is the max context size;
    // NOTE(review): hard-coded here rather than derived from the model's
    // max_ctx_length — confirm the two always agree.
    const int64_t n_ctx = 4096;

    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv->k_l[index], n_tokens*n_embd_gqa, ggml_row_size(kv->k_l[index]->type, n_embd_gqa)*kv_head);

    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));

    assert(v_cur->ne[0] == n_embd_gqa && v_cur->ne[1] == n_tokens);

    struct ggml_tensor * v_cache_view = nullptr;

    // Transposed v layout: each embedding dimension's row spans the full
    // context, with this step's tokens starting at column kv_head.
    v_cache_view = ggml_view_2d(ctx, kv->v_l[index], n_tokens, n_embd_gqa,
            ( n_ctx)*ggml_element_size(kv->v_l[index]),
            (kv_head)*ggml_element_size(kv->v_l[index]));

    v_cur = ggml_cont(ctx, ggml_transpose(ctx, v_cur));

    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
}
|
||||||
|
|
||||||
|
struct ggml_tensor * parler_build_head_outputs(struct ggml_context * ctx, parler_tts_model * model, struct ggml_tensor * cur) {
|
||||||
|
// going to cat the heads together and then reshape them;
|
||||||
|
// honestly ggml doesn't provide good support for stacking and discrete tensor access
|
||||||
|
struct ggml_tensor * out;
|
||||||
|
for (int i = 0; i < model->n_output_heads; i++) {
|
||||||
|
if (i == 0) {
|
||||||
|
out = ggml_mul_mat(ctx, model->heads[i], cur);
|
||||||
|
} else {
|
||||||
|
out = ggml_concat(ctx, out, ggml_mul_mat(ctx, model->heads[i], cur), 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ggml_set_name(out, "final_out");
|
||||||
|
//out = ggml_cont(ctx, ggml_transpose(ctx, out));
|
||||||
|
|
||||||
|
int32_t sql_len = (int32_t) (ggml_nelements(out) / (model->output_vocab_size * model->n_output_heads));
|
||||||
|
return ggml_cont_3d(ctx, out, model->output_vocab_size, sql_len, model->n_output_heads);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates the square self-attention mask covering all positions seen so far
// (cached positions + the new tokens). Contents are filled in by set_inputs.
struct ggml_tensor * build_attn_mask(ggml_context * ctx, parler_context * pctx, parler_ubatch & batch) {
    const int64_t mask_dim = (int64_t) pctx->current_position + batch.sequence_length;
    struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mask_dim, mask_dim);
    ggml_set_input(mask);
    pctx->attn_mask = mask;
    return mask;
}
|
||||||
|
|
||||||
|
// Allocates the cross-attention mask (encoder length x decoder sequence
// length). Contents are filled in by set_inputs.
struct ggml_tensor * build_attn_mask_cross(ggml_context * ctx, parler_context * pctx, parler_tts_model * model, parler_ubatch & batch) {
    struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) model->n_encode_length, (int64_t) batch.sequence_length);
    ggml_set_input(mask);
    pctx->attn_mask_cross = mask;
    return mask;
}
|
||||||
|
|
||||||
|
static struct parler_ubatch batch_from_sentence(std::string sentence, parler_tts_model * model, unigram_tokenizer * tokenizer) {
|
||||||
|
struct parler_ubatch batch;
|
||||||
|
batch.audio_generation = false;
|
||||||
|
std::vector<uint32_t>* token_ids = new std::vector<uint32_t>;
|
||||||
|
tokenizer->tokenize(sentence, *token_ids);
|
||||||
|
token_ids->push_back(tokenizer->eos_token);
|
||||||
|
batch.current_step = 0;
|
||||||
|
batch.n_tokens = token_ids->size();
|
||||||
|
batch.n_audio_tokens = 0;
|
||||||
|
batch.sequence_length = batch.n_tokens; // sequence_length is equal to the number of tokens for non-audio generation
|
||||||
|
std::vector<uint32_t>* position = new std::vector<uint32_t>;
|
||||||
|
for (uint32_t i = 0; i < batch.sequence_length; i++) {
|
||||||
|
position->push_back(i);
|
||||||
|
}
|
||||||
|
std::vector<uint32_t>* order = new std::vector<uint32_t>;
|
||||||
|
for (int i = 0; i < batch.sequence_length; i++) {
|
||||||
|
if (i >= batch.sequence_length - 1) {
|
||||||
|
order->push_back(0);
|
||||||
|
} else {
|
||||||
|
order->push_back(i+1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
batch.positions = position->data();
|
||||||
|
batch.tokens = token_ids->data();
|
||||||
|
return batch;
|
||||||
|
}
|
||||||
|
|
||||||
|
void parler_tts_runner::assign_weight(std::string name, ggml_tensor * tensor) {
|
||||||
|
std::string::size_type pos = name.find(".", 0);
|
||||||
|
std::string top_level(name.substr(0, pos));
|
||||||
|
std::string value(name.substr(pos + 1));
|
||||||
|
if (tensor->data == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (top_level == "audio_encoder") {
|
||||||
|
dac_runner->model->assign_weight(value, tensor);
|
||||||
|
} else if (top_level == "decoder") {
|
||||||
|
model->assign_weight(value, tensor);
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-encodes a new conditional prompt with a T5 encoder loaded from
// `file_path` and refreshes the model's cross-attention key/value tensors.
void parler_tts_runner::update_conditional_prompt(const std::string file_path, const std::string prompt, int n_threads, bool cpu_only) {
    t5_runner * text_encoder = text_encoder_from_file(file_path, n_threads, tokenizer, cpu_only);
    // BUG FIX: `response` was previously an uninitialized pointer handed to
    // generate(), which wrote through it (undefined behavior / crash). Use a
    // stack-allocated response object instead.
    tts_response response;
    text_encoder->generate(prompt, &response);
    model->prep_cross_key_values(n_threads, &response);
    delete text_encoder;
}
|
||||||
|
|
||||||
|
|
||||||
|
// Builds the full decoder compute graph for one ubatch: input embeddings,
// n_layers of (self-attention + optional cross-attention + FFN) blocks, a
// final layer norm, and the per-head output projections. The KV cache is
// written as a side effect of graph execution.
struct ggml_cgraph * parler_tts_runner::build_parler_graph(parler_ubatch & batch) {
    init_build();
    // 8192 nodes is the fixed graph capacity; no gradients are needed.
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    // Total attended length = already-cached positions + this batch's tokens.
    const int32_t full_sequence_length = pctx->current_position + (uint32_t) batch.sequence_length;

    inpL = parler_build_inp_embd(ctx, pctx, model, batch);

    struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, pctx, batch);
    struct ggml_tensor * KQ_mask_cross = build_attn_mask_cross(ctx, pctx, model, batch);

    for (int l = 0; l < model->n_layers; l++) {
        struct ggml_tensor * residual = inpL;
        ggml_set_name(inpL, ("layer_" + std::to_string(l) + "_input").c_str());

        // Pre-norm before self-attention.
        cur = parler_build_layer_norm(ctx, inpL, model->layers[l]->self_attn_norm, model->layers[l]->self_attn_norm_bias);

        struct ggml_tensor * attn_out;

        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l]->self_attn_q_proj, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l]->self_attn_k_proj, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l]->self_attn_v_proj, cur);

            // Write this step's K/V into the cache at the current position.
            parler_build_kv_store(ctx, kv_self, gf, Kcur, Vcur, (int32_t) batch.sequence_length, pctx->current_position, l, model->hidden_size);
            // View of the whole cached K so far: (head_size, seq, heads).
            struct ggml_tensor * k =
                ggml_view_3d(ctx, kv_self->k_l[l],
                        model->head_size, full_sequence_length, model->n_attn_heads,
                        ggml_row_size(kv_self->k_l[l]->type, model->hidden_size),
                        ggml_row_size(kv_self->k_l[l]->type, model->head_size),
                        0);


            // View of the cached V (stored transposed, strided by max_ctx_length).
            struct ggml_tensor * v =
                ggml_view_3d(ctx, kv_self->v_l[l],
                        full_sequence_length, model->head_size, model->n_attn_heads,
                        ggml_element_size(kv_self->v_l[l])*model->max_ctx_length,
                        ggml_element_size(kv_self->v_l[l])*model->max_ctx_length*model->head_size,
                        0);

            // Split Q into heads and permute to (head_size, seq, heads).
            Qcur = ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.sequence_length);
            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
            struct ggml_tensor * kq = ggml_mul_mat(ctx, ggml_cont(ctx, k), q);
            // Scaled masked softmax (causal mask filled in set_inputs).
            kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f/sqrtf(model->head_size), 0.0f);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            // Merge heads back to a (hidden_size, seq) matrix.
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
            attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.sequence_length);
            attn_out = ggml_mul_mat(ctx, model->layers[l]->self_attn_o_proj, attn_out);
        }

        cur = ggml_add(ctx, attn_out, residual);

        if (model->use_cross_attn) {
            struct ggml_tensor * residuala = cur;

            // norm
            cur = parler_build_layer_norm(ctx, cur, model->layers[l]->attn_norm, model->layers[l]->attn_norm_bias);

            // cross-attention: K and V are precomputed from the conditional
            // prompt (cross_k / cross_v), so only Q is projected here.
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l]->attn_q_proj, cur);
            Qcur = ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.sequence_length);

            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));

            struct ggml_tensor * kq = ggml_mul_mat(ctx, model->layers[l]->cross_k, q);
            kq = ggml_soft_max_ext(ctx, kq, KQ_mask_cross, 1.0f/sqrtf(model->head_size), 0.0f);

            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, model->layers[l]->cross_v);
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
            cur = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.sequence_length);
            cur = ggml_mul_mat(ctx, model->layers[l]->attn_o_proj, cur);
            cur = ggml_add(ctx, cur, residuala);
        }

        // Feed-forward block (pre-norm, GELU) with residual connection.
        struct ggml_tensor * residualffn = cur;

        cur = parler_build_layer_norm(ctx, cur, model->layers[l]->final_norm, model->layers[l]->final_norm_bias);
        cur = ggml_mul_mat(ctx, model->layers[l]->fc1, cur);
        cur = ggml_gelu(ctx, cur);
        cur = ggml_mul_mat(ctx, model->layers[l]->fc2, cur);
        cur = ggml_add(ctx, cur, residualffn);
        inpL = cur;
    }

    cur = parler_build_layer_norm(ctx, cur, model->layer_norm, model->layer_norm_bias);
    cur = parler_build_head_outputs(ctx, model, cur);
    ggml_build_forward_expand(gf, cur);
    free_build();

    return gf;
}
|
||||||
|
|
||||||
|
// Copies the user-facing generation settings into the sampler, and toggles
// cross-attention on the model.
void parler_tts_runner::configure_generation(generation_configuration * config) {
    sampler->temperature        = config->temperature;
    sampler->repetition_penalty = config->repetition_penalty;
    sampler->do_sample          = config->sample;
    sampler->top_k              = config->top_k;
    sampler->top_p              = config->top_p;
    model->use_cross_attn       = config->use_cross_attn;
}
|
||||||
|
|
||||||
|
// Copies the batch's tokens and positions into the graph's input tensors and
// fills the attention masks for the upcoming compute.
void parler_tts_runner::set_inputs(parler_ubatch & batch) {
    if (batch.audio_generation) {
        ggml_backend_tensor_set(pctx->audio_inp_tokens, batch.audio_tokens, 0, batch.n_audio_tokens*ggml_element_size(pctx->audio_inp_tokens));
    } else {
        ggml_backend_tensor_set(pctx->inp_tokens, batch.tokens, 0, batch.n_tokens*ggml_element_size(pctx->inp_tokens));
    }
    ggml_backend_tensor_set(pctx->positions, batch.positions, 0, batch.sequence_length*ggml_element_size(pctx->positions));
    // Causal self-attention mask: row i may attend to every absolute position
    // <= batch.positions[i]; later positions get -inf.
    // NOTE(review): writes directly into tensor->data — assumes the mask lives
    // in host-visible (CPU) memory.
    float * d = nullptr;
    d = (float *) pctx->attn_mask->data;
    uint32_t max_pos = pctx->current_position + batch.sequence_length;
    for (int i = 0; i < batch.sequence_length; i++) {
        uint32_t pos = batch.positions[i];
        for (int ii = 0; ii < max_pos; ii++) {
            d[i*max_pos + ii] = ii > pos ? -INFINITY : 0.0f;
        }
    }

    if (model->use_cross_attn) {
        // Cross-attention mask is all zeros: every decoder position may attend
        // to every encoder position.
        float * d2 = nullptr;
        d2 = (float *) pctx->attn_mask_cross->data;
        for (int i = 0; i < model->n_encode_length; i++) {
            for (int ii = 0; ii < batch.sequence_length; ii++) {
                d2[i*batch.sequence_length + ii] = 0.0f;
            }
        }
    }

}
|
||||||
|
// Dispatches the graph to the backend scheduler (asynchronous with respect to
// the backend; callers read results only after the scheduler syncs).
void parler_tts_runner::parler_graph_compute(ggml_cgraph * gf) {
    ggml_backend_sched_graph_compute_async(pctx->sched, gf);
}
|
||||||
|
|
||||||
|
// Runs one forward pass for the given ubatch: (re)allocates the logits buffer
// if needed, builds and computes the graph, and copies the new logits into
// pctx->logits at the current output offset. Returns 0 on success.
int parler_tts_runner::decode(parler_ubatch & batch) {
    ggml_backend_sched_reset(pctx->sched);

    pctx->output_tokens.reserve(model->max_generation_size);

    // Worst-case logits capacity for a full generation run.
    const size_t logits_size = model->output_vocab_size*model->max_generation_size*model->n_output_heads;
    const size_t prev_size = pctx->buf_output ? ggml_backend_buffer_get_size(pctx->buf_output) : 0;
    const size_t new_size = logits_size * sizeof(float);

    // Grow (never shrink) the CPU-side output buffer.
    if (!pctx->buf_output || prev_size < new_size) {
        if (pctx->buf_output) {
            ggml_backend_buffer_free(pctx->buf_output);
            pctx->buf_output = nullptr;
            pctx->logits = nullptr;
        }

        pctx->buf_output = ggml_backend_buft_alloc_buffer(pctx->backend_cpu_buffer, new_size);
    }

    pctx->logits = (float *) ggml_backend_buffer_get_base(pctx->buf_output);
    //ggml_backend_buffer_clear(pctx->buf_output, 0);

    ggml_cgraph * gf = build_parler_graph(batch);

    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    ggml_backend_sched_alloc_graph(pctx->sched, gf);

    // use the sequence_length variable here so that audio input tokens are handled correctly.
    size_t n_outputs_new = batch.sequence_length;

    set_inputs(batch);
    parler_graph_compute(gf);

    // Append this pass's logits after the ones already produced.
    float * logits_out = pctx->logits + pctx->n_outputs * model->output_vocab_size * model->n_output_heads;
    pctx->get_ggml_node_data(res, logits_out, n_outputs_new*model->output_vocab_size*model->n_output_heads*sizeof(float));

    // set to total number of outputs in the batch
    pctx->n_outputs += n_outputs_new;

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(pctx->sched);

    return 0;
}
|
||||||
|
|
||||||
|
// Builds a maximal dummy batch used only to size the compute graph and
// scheduler buffers before real generation (see prepare_post_load).
parler_ubatch parler_tts_runner::build_worst_case_batch() {
    struct parler_ubatch batch;
    batch.audio_generation = false;
    batch.n_tokens = model->max_ctx_length;
    batch.n_audio_tokens = 0;
    batch.sequence_length = model->max_ctx_length;
    // FIX: the default constructor leaves the remaining members uninitialized;
    // zero them so any accidental use reads nulls instead of garbage.
    batch.tokens = nullptr;
    batch.audio_tokens = nullptr;
    batch.positions = nullptr;
    batch.true_order = nullptr;
    batch.current_step = 0;
    return batch;
}
|
||||||
|
|
||||||
|
void parler_tts_runner::prepare_post_load() {
|
||||||
|
dac_runner->prepare_post_load();
|
||||||
|
parler_kv_cache_init(kv_self, model, pctx, std::mt19937(std::random_device{}())());
|
||||||
|
auto batch = build_worst_case_batch();
|
||||||
|
auto gf = build_parler_graph(batch);
|
||||||
|
pctx->prep_schedule(gf);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Placeholder: resuming generation against an existing cached sequence is not
// implemented yet, so continuation requests always fail.
bool parler_tts_runner::adjust_for_sequence_continuation(struct parler_ubatch & batch) {
    (void) batch; // unused until continuation support is implemented
    return false;
}
|
||||||
|
|
||||||
|
bool parler_tts_runner::check_stopping() {
|
||||||
|
int32_t token_position = (int32_t) pctx->output_tokens.size() - (int32_t) model->n_output_heads;
|
||||||
|
if (token_position < 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (pctx->current_position >= model->max_generation_size) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool channels_complete = true;
|
||||||
|
for (int i = 0; i < model->n_output_heads; i++) {
|
||||||
|
pctx->eos_seen[i] = pctx->eos_seen[i] || pctx->output_tokens[token_position+i] == model->eos_token_id;
|
||||||
|
if (channels_complete) {
|
||||||
|
channels_complete = pctx->eos_seen[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return channels_complete;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Applies a sliding window over the heads and filters out token groups that
// contain non-audio (>= audio_vocab_size) tokens.
// If we convert the DAC model's quantizer layers to support by row + column
// embeddings then we will need to transpose the heads and the sequence here,
// but right now simply using a strided view is more performant.
void parler_tts_runner::adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered) {
    const size_t size = output_tokens.size();
    filtered.reserve(size);
    for (size_t i = 0; i < size / model->n_output_heads; i++) {
        bool remove = false;
        for (size_t ii = 0; ii < model->n_output_heads; ii++) {
            // Strided/diagonal index of head ii within window i.
            size_t next_index = i*model->n_output_heads + ii*model->n_output_heads + ii;
            // BUG FIX: the condition was `next_index > size`, which let
            // next_index == size through and read one past the end of
            // output_tokens.
            if (next_index >= size || output_tokens[next_index] >= model->audio_vocab_size) {
                remove = true;
                break;
            }
        }
        if (!remove) {
            for (size_t ii = 0; ii < model->n_output_heads; ii++) {
                size_t next_index = i*model->n_output_heads + ii*model->n_output_heads + ii;
                if (next_index >= size) {
                    // Defensive padding; unreachable now that out-of-range
                    // groups are removed above.
                    filtered.push_back(model->eos_token_id);
                } else {
                    filtered.push_back(output_tokens[next_index]);
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
// Core generation loop: repeatedly decodes, samples one token per head, and
// feeds the sampled tokens back in as the next single-position audio batch.
// When stopping criteria are met, filters the accumulated tokens and decodes
// them to audio via the DAC runner. Returns 0 on success, or decode's error.
int parler_tts_runner::generate_from_batch(parler_ubatch & batch, struct tts_response * output) {
    std::vector<uint32_t> next_decoder_token_ids;
    next_decoder_token_ids.reserve(model->n_output_heads);

    while (!check_stopping()) {
        int state = decode(batch);
        if (state != 0) {
            return state;
        }
        // Text (prompt) batches advance the prompt end marker instead of sampling.
        if (!batch.audio_generation) {
            pctx->prompt_end_position += batch.sequence_length;
        }
        if (batch.audio_generation) {
            // Sample one token per head from the logits at the current position.
            sampler->sample(pctx->logits + pctx->current_position * model->n_output_heads * model->output_vocab_size, pctx->output_tokens);
        }
        pctx->current_position += batch.sequence_length;
        next_decoder_token_ids.clear();
        // Last n_output_heads sampled tokens (one per channel).
        uint32_t * last_outputs = (pctx->output_tokens.data() + (int) pctx->output_tokens.size() - model->n_output_heads);
        for (int i = 0; i < model->n_output_heads; i++) {
            // Delay pattern: head i starts emitting at step i (BOS before that);
            // channels that already saw EOS keep emitting EOS.
            next_decoder_token_ids.push_back(batch.current_step > i ? pctx->eos_seen[i] ? model->eos_token_id : last_outputs[i] : model->bos_token_id);
        }
        // Next iteration: an audio batch of 9 tokens (one per head), length 1.
        batch = parler_ubatch{
            true, 0, 9, 1, nullptr, next_decoder_token_ids.data(), &pctx->current_position, nullptr, batch.current_step+1
        };
    }

    std::vector<uint32_t> filtered_output_tokens;
    adjust_output_tokens(pctx->output_tokens, filtered_output_tokens);
    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output);
    return 0;
}
|
||||||
|
|
||||||
|
// Runs the same generation loop as generate_from_batch but stops after token
// generation, without decoding the tokens to audio (useful for inspecting or
// reusing raw audio tokens). Returns 0 on success, 1 on kv-cache init failure.
// NOTE(review): this duplicates the loop body of generate_from_batch — keep
// the two in sync if either changes.
int parler_tts_runner::generate_audio_tokens(std::string sentence) {
    parler_ubatch batch = batch_from_sentence(sentence, model, tokenizer);
    pctx->reset(model->n_output_heads);
    sampler->reset();
    // Random sequence id for the fresh KV cache.
    int32_t seq_id = std::mt19937(std::random_device{}())();
    if (!kv_self) {
        kv_self = new parler_kv_cache;
        if (!parler_kv_cache_init(kv_self, model, pctx, seq_id)) {
            return 1;
        }
    }

    std::vector<uint32_t> next_decoder_token_ids;
    next_decoder_token_ids.reserve(model->n_output_heads);

    while (!check_stopping()) {
        int state = decode(batch);
        if (state != 0) {
            return state;
        }
        // Text (prompt) batches advance the prompt end marker instead of sampling.
        if (!batch.audio_generation) {
            pctx->prompt_end_position += batch.sequence_length;
        }
        if (batch.audio_generation) {
            sampler->sample(pctx->logits + pctx->current_position * model->n_output_heads * model->output_vocab_size, pctx->output_tokens);
        }
        pctx->current_position += batch.sequence_length;
        next_decoder_token_ids.clear();
        uint32_t * last_outputs = (pctx->output_tokens.data() + (int) pctx->output_tokens.size() - model->n_output_heads);
        for (int i = 0; i < model->n_output_heads; i++) {
            // Delay pattern: head i starts emitting at step i (BOS before that);
            // channels that already saw EOS keep emitting EOS.
            next_decoder_token_ids.push_back(batch.current_step > i ? pctx->eos_seen[i] ? model->eos_token_id : last_outputs[i] : model->bos_token_id);
        }
        batch = parler_ubatch{
            true, 0, 9, 1, nullptr, next_decoder_token_ids.data(), &pctx->current_position, nullptr, batch.current_step+1
        };
    }

    return 0;
}
|
||||||
|
|
||||||
|
// Bypasses text generation entirely and decodes pre-sampled audio tokens
// straight to a waveform via the DAC runner.
void parler_tts_runner::just_audio_token_decode(uint32_t * tokens, int32_t sq_len, struct tts_response * outputs) {
    dac_runner->run(tokens, sq_len, outputs);
}
|
||||||
|
|
||||||
|
int parler_tts_runner::generate(std::string sentence, struct tts_response * output, int32_t seq_id) {
|
||||||
|
parler_ubatch batch = batch_from_sentence(sentence, model, tokenizer);
|
||||||
|
pctx->reset(model->n_output_heads);
|
||||||
|
sampler->reset();
|
||||||
|
if (pctx->seq_id != seq_id || seq_id == -1) {
|
||||||
|
seq_id = std::mt19937(std::random_device{}())();
|
||||||
|
pctx->current_position = 0;
|
||||||
|
if (!kv_self) {
|
||||||
|
kv_self = new parler_kv_cache;
|
||||||
|
if (!parler_kv_cache_init(kv_self, model, pctx, seq_id)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!adjust_for_sequence_continuation(batch)) {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return generate_from_batch(batch, output);
|
||||||
|
}
|
225
otherarch/ttscpp/src/parler_model.h
Normal file
225
otherarch/ttscpp/src/parler_model.h
Normal file
|
@ -0,0 +1,225 @@
|
||||||
|
#ifndef parler_model_h
|
||||||
|
#define parler_model_h
|
||||||
|
|
||||||
|
#include "dac_model.h"
|
||||||
|
#include "t5_encoder_model.h"
|
||||||
|
#include "sampler.h"
|
||||||
|
|
||||||
|
// Identifiers for the tensor kinds found in a parler-tts gguf checkpoint,
// used when mapping loaded tensors onto the model. Do not reorder: the
// enumerator values are positional.
enum parler_tensor {
    // embedding / global tensors
    PARLER_EMBD,
    PARLER_EMBD_PROMPTS,
    PARLER_TEXT_ENCODING,
    PARLER_POSITIONAL_EMBD,
    PARLER_HEAD,
    PARLER_NORM,
    PARLER_NORM_BIAS,
    // per-layer self-attention tensors
    PARLER_LAYER_SELF_ATTN_Q,
    PARLER_LAYER_SELF_ATTN_K,
    PARLER_LAYER_SELF_ATTN_V,
    PARLER_LAYER_SELF_ATTN_O,
    PARLER_LAYER_SELF_ATTN_NORM,
    PARLER_LAYER_SELF_ATTN_NORM_BIAS,
    // per-layer cross-attention tensors
    PARLER_LAYER_ATTN_Q,
    PARLER_LAYER_ATTN_K,
    PARLER_LAYER_ATTN_V,
    PARLER_LAYER_ATTN_O,
    PARLER_LAYER_ATTN_NORM,
    PARLER_LAYER_ATTN_NORM_BIAS,
    // per-layer feed-forward tensors
    PARLER_LAYER_FC1,
    PARLER_LAYER_FC2,
    PARLER_LAYER_OUT_NORM,
    PARLER_LAYER_OUT_NORM_BIAS,
};
|
||||||
|
|
||||||
|
// Weights for one parler decoder layer. All tensors are owned by the model's
// ggml context; this struct only holds non-owning pointers.
struct parler_layer {
    // self-attention projections and pre-norm
    struct ggml_tensor * self_attn_k_proj;
    struct ggml_tensor * self_attn_q_proj;
    struct ggml_tensor * self_attn_v_proj;
    struct ggml_tensor * self_attn_o_proj;
    struct ggml_tensor * self_attn_norm;
    struct ggml_tensor * self_attn_norm_bias;

    // cross-attention projections and pre-norm
    struct ggml_tensor * attn_k_proj;
    struct ggml_tensor * attn_q_proj;
    struct ggml_tensor * attn_v_proj;
    struct ggml_tensor * attn_o_proj;
    struct ggml_tensor * attn_norm;
    struct ggml_tensor * attn_norm_bias;

    // precomputed cross-attention K/V over the conditional prompt
    // (see parler_tts_model::prep_cross_key_values)
    struct ggml_tensor * cross_k;
    struct ggml_tensor * cross_v;

    // feed-forward block and its pre-norm
    struct ggml_tensor * fc1;
    struct ggml_tensor * fc2;
    struct ggml_tensor * final_norm;
    struct ggml_tensor * final_norm_bias;
};
|
||||||
|
|
||||||
|
// Decoder model for parler-tts: hyperparameters, weight tensors, and loading
// helpers. Fields default to the Parler TTS Mini (v1.0) configuration and are
// overwritten from gguf metadata in prep_constants.
struct parler_tts_model : tts_model {
    // These default configurations are based on the configuration of Parler TTS Mini (version 1.0)
    uint32_t n_output_heads = 9;          // number of audio codebook channels emitted per step
    uint32_t n_encode_length;             // actual token length of the encoded conditional prompt (set at runtime)
    uint32_t max_encode_length = 512;     // This corresponds with the max token length of the conditional prompt
    uint32_t hidden_size = 1024;
    uint32_t max_ctx_length = 4096;       // KV-cache capacity in positions
    uint32_t n_attn_heads = 16;
    uint32_t head_size = 64;
    uint32_t output_vocab_size = 1088;    // logits per head (audio vocab + special tokens)
    uint32_t eos_token_id = 1024;
    uint32_t audio_vocab_size = 1024;     // tokens below this are valid audio codes
    uint32_t max_generation_size = 2580;  // hard cap on generated positions
    uint32_t n_layers = 24;
    uint32_t bos_token_id = 1025;
    uint32_t max_cross_nodes = 32;
    uint32_t prompt_vocab_size;           // set from gguf metadata

    bool use_cross_attn = true;           // toggled via generation configuration

    std::vector<struct ggml_tensor*> embds;   // one input embedding table per output head
    std::vector<parler_layer*> layers;        // n_layers decoder layers
    std::vector<struct ggml_tensor*> heads;   // one output projection per head

    struct ggml_tensor * precomputed_input_emb;
    struct ggml_tensor * precomputed_positional_embds;

    struct ggml_tensor * layer_norm;          // final (post-stack) layer norm
    struct ggml_tensor * layer_norm_bias;
    struct ggml_tensor * prompt_embd;         // embedding table for text prompt tokens

    void assign_weight(std::string name, ggml_tensor * tensor);
    void prep_constants(gguf_context * meta);
    void prep_layers(gguf_context * meta);
    // Encodes a conditional prompt into per-layer cross-attention K/V tensors.
    void prep_cross_key_values(int n_threads, struct tts_response * conditional_prompt = nullptr);
    // Reads hyperparameters and layer structure from gguf metadata, then
    // delegates tensor loading to the base class under the "decoder" prefix.
    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) {
        prep_constants(meta_ctx);
        prep_layers(meta_ctx);
        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "decoder", 1.30, max_encode_length*hidden_size*sizeof(float)*n_layers*2);
    }
};
|
||||||
|
|
||||||
|
// For assigning weights to the parler model from a gguf file.
// Assigns a single tensor into the named slot of one decoder layer.
void assign_parler_layer(parler_tts_model * model, parler_layer & layer, std::string name, ggml_tensor * tensor);
// Assigns a non-layer ("decoder"-level) tensor into the model.
void assign_to_decoder(parler_tts_model * model, const std::string name, ggml_tensor * tensor);
|
||||||
|
|
||||||
|
// Mutable generation state for one parler runner: output buffers, position
// tracking, and the graph input tensors rebuilt each decode.
struct parler_context : runner_context {
    parler_context(parler_tts_model * model, int n_threads): runner_context(n_threads), model(model) {};
    struct parler_tts_model * model;   // non-owning back-pointer to the model
    std::vector<bool> eos_seen;        // per-output-head EOS flags (sized by reset)

    bool use_cache = true;

    size_t output_size = 0; // capacity (of tokens positions) for the output buffers
    int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
    uint32_t current_position = 0; // current position in the active sequence
    uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating)
    int32_t seq_id; // a unique identifier associated with the active sequence.

    std::vector<uint32_t> output_tokens; // sampled tokens, n_output_heads per step

    // graph input tensors, recreated on every graph build
    struct ggml_tensor * inp_tokens;
    struct ggml_tensor * audio_inp_tokens;
    struct ggml_tensor * positions;
    struct ggml_tensor * attn_mask;
    struct ggml_tensor * attn_mask_cross;

    void build_schedule() {
        runner_context::build_schedule(model->max_nodes());
    }
    // Clears per-generation state and resizes eos_seen to n_output_heads.
    void reset(int32_t n_output_heads);
};
|
||||||
|
|
||||||
|
struct parler_kv_cache {
|
||||||
|
int32_t seq_id;
|
||||||
|
|
||||||
|
ggml_type type_k = GGML_TYPE_F32;
|
||||||
|
ggml_type type_v = GGML_TYPE_F32;
|
||||||
|
|
||||||
|
std::vector<struct ggml_tensor *> k_l;
|
||||||
|
std::vector<struct ggml_tensor *> v_l;
|
||||||
|
|
||||||
|
struct ggml_context * ctx;
|
||||||
|
ggml_backend_buffer_type_t buft;
|
||||||
|
ggml_backend_buffer_t buf;
|
||||||
|
|
||||||
|
void free() {
|
||||||
|
ggml_free(ctx);
|
||||||
|
ggml_backend_buffer_free(buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
~parler_kv_cache() {
|
||||||
|
free();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// One micro-batch of decoder input: either text prompt tokens or per-head
// audio tokens. Pointer members are non-owning views into caller-managed
// storage that must outlive the batch.
// FIX: added default member initializers — the default constructor previously
// left every field (including raw pointers) uninitialized, and callers such
// as build_worst_case_batch relied on it.
struct parler_ubatch {
    parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length,
        uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order,
        int current_step): audio_generation(audio_generation), n_tokens(n_tokens), n_audio_tokens(n_audio_tokens), sequence_length(sequence_length), tokens(tokens), audio_tokens(audio_tokens), positions(positions), true_order(true_order), current_step(current_step) {};
    parler_ubatch() {};
    bool audio_generation = false; // whether we are receiving codebook decoded tokens or text tokens
    size_t n_tokens = 0; // total sentence tokens
    size_t n_audio_tokens = 0; // total audio tokens
    size_t sequence_length = 0; // for just audio tokens the sequence length should be the total_tokens / num_heads; in general this should be n_tokens + n_audio_tokens / num_heads
    uint32_t * tokens = nullptr; // [n_tokens]
    uint32_t * audio_tokens = nullptr; // [n_audio_tokens]
    uint32_t * positions = nullptr; // [sequence_length]
    uint32_t * true_order = nullptr;
    int current_step = 0; // total_generations
};
|
||||||
|
|
||||||
|
// Factory for a runner context bound to a model.
struct parler_context * build_new_parler_context(struct parler_tts_model * model, int n_threads, bool use_cpu = true);
// NOTE(review): declared `static` in a header — every including TU gets an
// internal-linkage declaration; verify the definition is visible where used.
static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx, int32_t seq_id);

// Graph-building helpers (definitions in parler_gen.cpp).
struct ggml_tensor * parler_build_inp_embd(struct ggml_context * ctx, struct parler_context * pctx, parler_tts_model * model, const parler_ubatch & batch);
struct ggml_tensor * parler_build_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight, struct ggml_tensor * bias);
// NOTE(review): this declaration takes `const parler_kv_cache *` while the
// definition takes a non-const pointer — these are distinct overloads; confirm
// which one callers outside the defining TU expect.
void parler_build_kv_store(struct ggml_context * ctx, const parler_kv_cache * kv, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int32_t n_tokens, int32_t kv_head, int32_t index, int32_t n_embd_gqa);
struct ggml_tensor * parler_build_head_outputs(struct ggml_context * ctx, parler_tts_model * model, struct ggml_tensor * cur);
struct ggml_tensor * build_attn_mask(ggml_context * ctx, parler_context * pctx, parler_ubatch & batch);
struct ggml_tensor * build_attn_mask_cross(ggml_context * ctx, parler_context * pctx, parler_tts_model * model, parler_ubatch & batch);
static struct parler_ubatch batch_from_sentence(std::string sentence, parler_tts_model * model, unigram_tokenizer * tokenizer);
|
||||||
|
|
||||||
|
// This struct is intended to support end-to-end TTS generation. As such, it manages the parler tts model compilation, compute and generation process,
|
||||||
|
// the tokenization and sampling process, and uses the dac_runner struct to encode audio outputs.
|
||||||
|
// This struct is intended to support end-to-end TTS generation. As such, it manages the parler tts model compilation, compute and generation process,
// the tokenization and sampling process, and uses the dac_runner struct to encode audio outputs.
struct parler_tts_runner : tts_runner {
    // Takes ownership of every pointer passed in; all are deleted in the destructor.
    parler_tts_runner(parler_tts_model * model, dac_runner * audio_decoder, parler_context * pctx, unigram_tokenizer * ut, sampler * samp, parler_kv_cache * cache): model(model), dac_runner(audio_decoder), pctx(pctx), tokenizer(ut), sampler(samp), kv_self(cache) {};
    ~parler_tts_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        model->free();
        delete model;
        delete kv_self;
        delete dac_runner;
        delete pctx;
        delete sampler;
    }
    struct parler_tts_model * model;       // decoder weights (owned)
    struct dac_runner * dac_runner;        // audio-token -> waveform decoder (owned)
    struct parler_context * pctx;          // generation state (owned)
    struct unigram_tokenizer * tokenizer;  // text tokenizer
    struct parler_kv_cache * kv_self = nullptr; // lazily allocated kv cache (owned)
    struct sampler * sampler;              // token sampler (owned)

    void init_build() {
        tts_runner::init_build(&pctx->buf_compute_meta);
    }

    void configure_generation(generation_configuration * config);
    void assign_weight(std::string name, ggml_tensor * tensor);
    parler_ubatch build_worst_case_batch();
    struct ggml_cgraph * build_parler_graph(parler_ubatch & batch);
    void set_inputs(parler_ubatch & batch);
    int decode(parler_ubatch & batch);
    void prepare_post_load();
    bool adjust_for_sequence_continuation(struct parler_ubatch & batch);
    // Main entry: tokenize, generate audio tokens, decode to waveform.
    int generate(std::string sentence, struct tts_response * response, int32_t seq_id = -1);
    bool check_stopping();
    void adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered);
    int generate_from_batch(parler_ubatch & batch, struct tts_response * output);
    void parler_graph_compute(ggml_cgraph * gf);
    void just_audio_token_decode(uint32_t * tokens, int32_t sq_len, struct tts_response * output);
    int generate_audio_tokens(std::string sentence);
    void update_conditional_prompt(const std::string file_path, const std::string prompt, int n_threads, bool cpu_only = true);
};
|
||||||
|
|
||||||
|
#endif
|
1180
otherarch/ttscpp/src/phonemizer.cpp
Normal file
1180
otherarch/ttscpp/src/phonemizer.cpp
Normal file
File diff suppressed because it is too large
Load diff
204
otherarch/ttscpp/src/sampler.cpp
Normal file
204
otherarch/ttscpp/src/sampler.cpp
Normal file
|
@ -0,0 +1,204 @@
|
||||||
|
#include "sampler.h"
|
||||||
|
|
||||||
|
void sampler::sample(float * logits, std::vector<uint32_t> & output_tokens) {
|
||||||
|
// assume that we are pointing to the start of the first token output;
|
||||||
|
if (!do_sample) {
|
||||||
|
return max(logits, output_tokens);
|
||||||
|
}
|
||||||
|
std::vector<uint32_t> max_vals;
|
||||||
|
// the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or
|
||||||
|
// equal to top_p;
|
||||||
|
std::vector<float> max_head_probs;
|
||||||
|
|
||||||
|
// This allows us to perform an effective softmax without logarithms or big number calculations.
|
||||||
|
// Additionally by avoiding large number division we drastically improve the stability of
|
||||||
|
// our softmax implementation;
|
||||||
|
max(logits, max_vals);
|
||||||
|
|
||||||
|
std::vector<std::vector<size_t>> picks;
|
||||||
|
bool use_nucleus_sampling = false;
|
||||||
|
bool performed_softmax = false;
|
||||||
|
|
||||||
|
if (top_p < 1.0) {
|
||||||
|
// if we are nucleus sampling via top-p then we need to perform softmax over the samples before getting top_k samples, so that we don't trim beyond top_p.
|
||||||
|
// Otherwise, if we are not performing top-p sampling then it is more efficient to perform softmax after getting the top_k nucleus.
|
||||||
|
softmax(logits, picks, max_vals);
|
||||||
|
performed_softmax = true;
|
||||||
|
}
|
||||||
|
if (top_k > 0 && top_k < vocab_size) {
|
||||||
|
picks = topk(logits, performed_softmax);
|
||||||
|
use_nucleus_sampling = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (top_p >= 1.0) {
|
||||||
|
softmax(logits, picks, max_vals);
|
||||||
|
performed_softmax = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (top_p < 1.0) {
|
||||||
|
topp(logits, picks, max_head_probs);
|
||||||
|
use_nucleus_sampling = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool has_repetition_penalty = repetition_penalty != 1.0;
|
||||||
|
if (has_repetition_penalty && (last_token_ids.size() == 0 || repetition_counts.size() == 0)) {
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
std::minstd_rand gen(std::random_device{}());
|
||||||
|
std::uniform_real_distribution<float> dist(0.0f, 1.0f);
|
||||||
|
for (int i = 0; i < n_output_heads; i++) {
|
||||||
|
float assignment = top_p < 1.0 ? dist(gen) * max_head_probs[i] : dist(gen);
|
||||||
|
float cumulative = 0.0f;
|
||||||
|
for (uint32_t j = 0; j < (use_nucleus_sampling ? picks[i].size() : vocab_size); j++) {
|
||||||
|
int ii = use_nucleus_sampling ? (int) picks[i][j] : j;
|
||||||
|
cumulative += *(logits+(i*vocab_size+ii));
|
||||||
|
// with top_k and top_p it is possible for the assignment to be greater than the cumulative value
|
||||||
|
if (assignment <= cumulative || ii >= vocab_size + 1 || j >= picks[i].size() - 1) {
|
||||||
|
if (has_repetition_penalty) {
|
||||||
|
if (last_token_ids[i] != ii) {
|
||||||
|
repetition_counts[i] = 0;
|
||||||
|
}
|
||||||
|
last_token_ids[i] = ii;
|
||||||
|
repetition_counts[i] += 1;
|
||||||
|
}
|
||||||
|
output_tokens.push_back(ii);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void sampler::reset() {
|
||||||
|
if (repetition_penalty != 1.0) {
|
||||||
|
last_token_ids.clear();
|
||||||
|
repetition_counts.clear();
|
||||||
|
for (int i = 0; i < n_output_heads; i++) {
|
||||||
|
last_token_ids.push_back(-1);
|
||||||
|
repetition_counts.push_back(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// In-place softmax over each head's slice of `logits`, with repetition
// penalty and temperature folded in. When `picks` is non-empty only those
// candidate indices are normalized (nucleus mode); otherwise the full vocab.
// `max_indices` holds the per-head argmax used to stabilize exp().
// NOTE(review): `picks` and `max_indices` are taken by value (the header
// declares them that way too) — each call copies both vectors; consider
// const& in a follow-up that touches both declaration and definition.
void sampler::softmax(float * logits, std::vector<std::vector<size_t>> picks, std::vector<uint32_t> max_indices) {
    bool use_nucleus_sampling = picks.size() > 0;
    bool has_repetition_penalty = repetition_penalty != 1.0f;
    bool has_temperature = temperature != 1.0f;
    for (int i = 0; i < n_output_heads; i++) {
        float cumsum = 0.0;
        // The stabilization constant must undergo the same penalty/temperature
        // transform as the values it is subtracted from.
        float max_val = logits[i*vocab_size + max_indices[i]];
        if (has_repetition_penalty && last_token_ids[i] == max_indices[i]) {
            max_val /= (pow(repetition_penalty, repetition_counts[i]));
        }
        if (has_temperature) {
            max_val /= temperature;
        }
        // First pass: exponentiate (shifted by max_val) and accumulate the sum.
        for (int j = 0; j < (use_nucleus_sampling ? picks[i].size() : vocab_size); j++) {
            int ii = use_nucleus_sampling ? (int) picks[i][j] : j;
            int index = i * vocab_size + ii;
            float v = *(logits + index);
            if (has_repetition_penalty && last_token_ids[i] == ii) {
                v /= (pow(repetition_penalty, repetition_counts[i]));
            }
            if (has_temperature) {
                v /= temperature;
            }
            v = expf(v - max_val);
            cumsum += v;
            logits[index] = v;
        }
        // Second pass: normalize so the selected candidates sum to 1.
        for (int j = 0; j < (use_nucleus_sampling ? picks[i].size() : vocab_size); j++) {
            int ii = use_nucleus_sampling ? picks[i][j] : j;
            int index = i * vocab_size + ii;
            float v = *(logits + index);
            logits[index] = v / cumsum;
        }
    }
}
|
||||||
|
|
||||||
|
// Trim each head's candidate list (`picks`) to the smallest prefix whose
// softmaxed probability reaches top_p, and record the retained probability
// mass per head in `max_head_probs` (used to rescale the random draw).
// Expects `logits` to already hold softmaxed probabilities.
void sampler::topp(float * logits, std::vector<std::vector<size_t>> & picks, std::vector<float> & max_head_probs) {
    if (picks.empty()) {
        // No top-k ran before us: build a probability-sorted index list per head.
        for (int i = 0; i < n_output_heads; i++) {
            std::vector<size_t> head_picks(vocab_size);
            iota(head_picks.begin(), head_picks.end(), 0);
            // have to sort with repetition penalty applied so as not to inadvertently trim our nucleus size
            // (the penalty was already folded in by the preceding softmax call).
            std::sort(head_picks.begin(), head_picks.end(), [&logits, &i, this](size_t s1, size_t s2) {
                float v1 = logits[i*vocab_size+s1];
                float v2 = logits[i*vocab_size+s2];
                return v1 > v2;
            });

            picks.push_back(head_picks);
        }
    }
    // if we didn't already perform topk or if the probable sum of topk logits is greater than top_p then we need to trim.
    for (int i = 0; i < n_output_heads; i++) {
        float prob_sum = 0.0f;
        int trim_to = -1;   // -1 means the whole list stays (mass never reached top_p)
        for (int ii = 0; ii < picks[i].size(); ii++) {
            prob_sum += logits[i*vocab_size+picks[i][ii]];
            if (prob_sum >= top_p) {
                trim_to = ii+1;
                break;
            }
        }
        // Retained mass is capped at top_p so the caller's draw rescaling is exact.
        max_head_probs.push_back(std::min(prob_sum, top_p));
        if (trim_to > 0) {
            picks[i] = std::vector<size_t>(picks[i].begin(), picks[i].begin()+trim_to);
        }
    }
}
|
||||||
|
|
||||||
|
// Return, for each head, the indices of the top_k highest-scoring tokens
// (sorted descending by score). `performed_softmax` indicates whether the
// repetition penalty was already folded into `logits` by a prior softmax;
// if not, it is applied on the fly during comparison.
std::vector<std::vector<size_t>> sampler::topk(float * logits, bool performed_softmax) {
    bool has_repetition_penalty = repetition_penalty != 1.0f;
    std::vector<std::vector<size_t>> head_picks;
    if (vocab_size < top_k) {
        // technically we should never get here (the caller checks
        // top_k < vocab_size), but lets be protective: return all indices.
        for (int i = 0; i < n_output_heads; i++) {
            std::vector<size_t> picks(vocab_size);
            iota(picks.begin(), picks.end(), 0);
            head_picks.push_back(picks);
        }
        return head_picks;
    }
    for (int i = 0; i < n_output_heads; i++) {
        std::vector<size_t> picks(vocab_size);
        iota(picks.begin(), picks.end(), 0);
        // have to sort with repetition penalty applied so as not to inadvertently trim our nucleus size.
        // NOTE: a full sort of the vocab per head; a partial_sort to top_k
        // would suffice — left as-is pending measurement.
        std::sort(picks.begin(), picks.end(), [&logits, &i, &has_repetition_penalty, &performed_softmax, this](size_t s1, size_t s2) {
            float v1 = logits[i*vocab_size+s1];
            float v2 = logits[i*vocab_size+s2];
            if (!performed_softmax) {
                // At most one of s1/s2 can equal the last token, so else-if is safe.
                if (has_repetition_penalty && last_token_ids[i] == s1) {
                    v1 /= (pow(repetition_penalty, repetition_counts[i]));
                } else if (has_repetition_penalty && last_token_ids[i] == s2) {
                    v2 /= (pow(repetition_penalty, repetition_counts[i]));
                }
            }
            return v1 > v2;
        });
        head_picks.push_back(std::vector<size_t>(picks.begin(), picks.begin() + top_k));
    }
    return head_picks;
}
|
||||||
|
|
||||||
|
void sampler::max(float * logits, std::vector<uint32_t> & output_tokens) {
|
||||||
|
bool has_repetition_penalty = repetition_penalty != 1.0f;
|
||||||
|
for (int i = 0; i < n_output_heads; i++) {
|
||||||
|
float max = -INFINITY;
|
||||||
|
uint32_t token_id = 0;
|
||||||
|
for (uint32_t ii = 0; ii < vocab_size; ii++) {
|
||||||
|
float v = *(logits+i*vocab_size+ii);
|
||||||
|
// while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of
|
||||||
|
// the softmax function in which case it is possible for repetition counts to be set.
|
||||||
|
if (has_repetition_penalty && last_token_ids[i] == ii) {
|
||||||
|
v /= (pow(repetition_penalty, repetition_counts[i]));
|
||||||
|
}
|
||||||
|
if (v > max) {
|
||||||
|
max = v;
|
||||||
|
token_id = ii;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
output_tokens.push_back(token_id);
|
||||||
|
}
|
||||||
|
}
|
33
otherarch/ttscpp/src/sampler.h
Normal file
33
otherarch/ttscpp/src/sampler.h
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
#ifndef sampler_h
#define sampler_h

#include <stdint.h>
#include <vector>
#include <random>
#include <numeric>
#include <algorithm>

// currently this is only built to support single sequence output sampling without beam search.
struct sampler {
    // These default configurations are based on the generation configuration for Parler TTS Mini (version 1.0)
    uint32_t n_output_heads = 9;     // audio codebooks sampled per step
    uint32_t eos_token_id = 1024;
    uint32_t vocab_size = 1088;      // tokens per head (row stride into logits)
    float temperature = 1.0f;        // 1.0 disables temperature scaling
    uint32_t top_k = 0;              // 0 disables top-k trimming
    float top_p = 1.0f;              // >= 1.0 disables nucleus sampling
    float repetition_penalty = 1.0f; // 1.0 disables the penalty
    std::vector<int32_t> last_token_ids;     // per-head last sampled token (-1 = none)
    std::vector<uint32_t> repetition_counts; // per-head consecutive repeat count
    bool do_sample = true;           // false = greedy argmax via max()
    bool apply_softmax = true;

    // Sample one token per head from `logits` (n_output_heads * vocab_size, head-major).
    void sample(float * logits, std::vector<uint32_t> & output_tokens);
    // In-place stabilized softmax; picks restricts to a nucleus, max_indices are per-head argmaxes.
    void softmax(float * logits, std::vector<std::vector<size_t>> picks, std::vector<uint32_t> max_indices);
    // Greedy per-head argmax.
    void max(float * logits, std::vector<uint32_t> & output_tokens);
    // Per-head indices of the top_k highest-scoring tokens, sorted descending.
    std::vector<std::vector<size_t>> topk(float * logits, bool performed_softmax);
    // Trim picks to the top_p probability mass; records retained mass per head.
    void topp(float * logits, std::vector<std::vector<size_t>> & picks, std::vector<float> & max_head_probs);
    // Reinitialize repetition-penalty state.
    void reset();
};

#endif
|
209
otherarch/ttscpp/src/snac_model.cpp
Normal file
209
otherarch/ttscpp/src/snac_model.cpp
Normal file
|
@ -0,0 +1,209 @@
|
||||||
|
#include "snac_model.h"
|
||||||
|
|
||||||
|
// Read optional SNAC hyperparameters from the gguf metadata. Each field keeps
// its compiled-in default when the corresponding key is absent.
void snac_model::prep_constants(gguf_context * meta) {
    if (int key = gguf_find_key(meta, "snac.audio_token_channels"); key != -1) {
        n_heads = gguf_get_val_u32(meta, key);
    }
    if (int key = gguf_find_key(meta, "snac.up_sampling_factor"); key != -1) {
        up_sampling_factor = gguf_get_val_u32(meta, key);
    }
    if (int key = gguf_find_key(meta, "snac.max_generation_size"); key != -1) {
        max_generation_size = gguf_get_val_u32(meta, key);
    }
}
|
||||||
|
|
||||||
|
void snac_model::prep_layers(gguf_context * meta) {
|
||||||
|
for (int i = 0; i < n_heads; i++) {
|
||||||
|
quantizer_layers.push_back(general_neural_audio_codec::residual_vector_quantize_layer{});
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layers; i++) {
|
||||||
|
std::string stride_key = "snac.snac_layer_stride_" + std::to_string(i);
|
||||||
|
std::string padding_key = "snac.snac_layer_padding_" + std::to_string(i);
|
||||||
|
std::string grouping_key = "snac.snac_layer_grouping_" + std::to_string(i);
|
||||||
|
int layer_stride_key = gguf_find_key(meta, stride_key.c_str());
|
||||||
|
if (layer_stride_key == -1) {
|
||||||
|
TTS_ABORT("key %s must be specified in gguf file inorder to initialize the SNAC audio decoder.", stride_key.c_str());
|
||||||
|
}
|
||||||
|
int layer_padding_key = gguf_find_key(meta, padding_key.c_str());
|
||||||
|
if (layer_padding_key == -1) {
|
||||||
|
TTS_ABORT("key %s must be specified in gguf file inorder to initialize the SNAC audio decoder.", padding_key.c_str());
|
||||||
|
}
|
||||||
|
int layer_grouping_key = gguf_find_key(meta, grouping_key.c_str());
|
||||||
|
if (layer_grouping_key == -1) {
|
||||||
|
TTS_ABORT("key %s must be specified in gguf file inorder to initialize the SNAC audio decoder.", grouping_key.c_str());
|
||||||
|
}
|
||||||
|
layers.push_back(
|
||||||
|
general_neural_audio_codec::layer{
|
||||||
|
gguf_get_val_u32(meta, layer_padding_key),
|
||||||
|
gguf_get_val_u32(meta, layer_stride_key),
|
||||||
|
gguf_get_val_u32(meta, layer_grouping_key)
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy a loaded tensor into this model's compute context under the slot named
// by `name`. Unrecognized names are silently ignored.
void snac_model::assign_weight(std::string name, ggml_tensor * tensor) {
    if (name == "alpha_out") {
        snake_alpha = ggml_dup_tensor(ctx, tensor);
        set_tensor(snake_alpha, tensor);
    } else if (name == "in.weight") {
        in_conv_kernel = ggml_dup_tensor(ctx, tensor);
        set_tensor(in_conv_kernel, tensor);
    } else if (name == "in.bias") {
        // NOTE(review): the destination is shaped from a transposed view but
        // set_tensor is called with the untransposed source — confirm
        // set_tensor copies by element count / handles the layout as intended.
        in_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
        set_tensor(in_conv_bias, tensor);
    } else if (name == "up.weight") {
        up_conv_kernel = ggml_dup_tensor(ctx, tensor);
        set_tensor(up_conv_kernel, tensor);
    } else if (name == "up.bias") {
        up_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
        set_tensor(up_conv_bias, tensor);
    } else if (name == "final.weight") {
        out_conv_kernel = ggml_dup_tensor(ctx, tensor);
        set_tensor(out_conv_kernel, tensor);
    } else if (name == "final.bias") {
        out_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
        set_tensor(out_conv_bias, tensor);
    } else if (has_prefix(name, "layers")) {
        // Per-layer decoder tensors: "layers.<N>.<tensor>" dispatched to the Nth layer.
        auto pair = parse_layer_count(name);
        int l = pair.first;
        std::string lt_name = pair.second;
        general_neural_audio_codec::assign_to_layer((tts_model *) this, layers[l], lt_name, tensor);
    } else if (has_prefix(name, "quantizers")) {
        // Per-head quantizer tensors: "quantizers.<N>.<tensor>".
        auto pair = parse_layer_count(name);
        int l = pair.first;
        std::string lt_name = pair.second;
        general_neural_audio_codec::assign_to_quantize_layer((tts_model *) this, quantizer_layers[l], lt_name, tensor);
    }
}
|
||||||
|
|
||||||
|
// Build the embedding input for the SNAC decoder graph: the three token heads
// are quantized separately, repeat-interleaved up to a common sequence length,
// and summed into a single embedding tensor.
// Assumes sequence_length is divisible by 4 (head lengths are L/4, L/2, L).
static struct ggml_tensor * snac_build_audio_inputs(struct ggml_context * ctx, struct snac_context * sctx, size_t sequence_length, std::vector<general_neural_audio_codec::residual_vector_quantize_layer> layers) {
    struct ggml_tensor * embd;
    // these divisors represent the discrete repeats performed against each of the three input heads.
    sctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sequence_length / 4 + sequence_length / 2 + sequence_length);
    ggml_set_input(sctx->inp_tokens);
    size_t last_stride = 0;  // byte offset of the current head inside the flat token tensor
    for(int i = 0; i < sctx->model->n_heads; i++) {
        auto quantize_layer = sctx->model->quantizer_layers[i];
        struct ggml_tensor * inp_head = ggml_cont(ctx, ggml_view_1d(ctx, sctx->inp_tokens, sequence_length / sctx->model->repeats[i], last_stride));
        last_stride += (sequence_length / sctx->model->repeats[i]) * ggml_element_size(sctx->inp_tokens);
        struct ggml_tensor * code = general_neural_audio_codec::build_quantize_layer(ctx, inp_head, quantize_layer);
        if (sctx->model->repeats[i] > 1) {
            // this manipulation is equivalent to repeat_interleave against the first dimension of the tensor
            code = ggml_repeat(ctx, ggml_cont_3d(ctx, code, 1, code->ne[0], code->ne[1]), ggml_new_tensor_3d(ctx, GGML_TYPE_F32, sctx->model->repeats[i], code->ne[0], sctx->model->embd));
            code = ggml_cont_2d(ctx, code, sequence_length, code->ne[2]);
        }
        // NOTE(review): embd is only initialized on the first iteration; if
        // n_heads were ever 0 this would return an uninitialized pointer.
        if (i == 0) {
            embd = code;
        } else {
            embd = ggml_add(ctx, embd, code);
        }
    }
    return embd;
}
|
||||||
|
|
||||||
|
// Allocate and fully initialize a snac_context: backends, thread counts, the
// backend scheduler, and the meta buffer sized for the worst-case graph.
// Caller owns the returned pointer.
snac_context * build_new_snac_context(struct snac_model * model, int n_threads, bool use_cpu) {
    snac_context * sctx = new snac_context(model, n_threads);
#ifdef GGML_USE_METAL
    // Metal is the only non-CPU backend wired up here.
    if (!use_cpu) {
        sctx->backend = ggml_backend_metal_init();
    }
#endif
    sctx->backend_cpu = ggml_backend_cpu_init();
    sctx->set_threads();
    sctx->build_schedule();
    const size_t meta_bytes = ggml_tensor_overhead() * model->max_nodes()
                            + ggml_graph_overhead_custom(model->max_nodes(), false);
    sctx->buf_compute_meta.resize(meta_bytes);
    return sctx;
}
|
||||||
|
|
||||||
|
// After weights are loaded, build the worst-case graph once so the scheduler
// can pre-allocate buffers for the largest generation the model supports.
void snac_runner::prepare_post_load() {
    sctx->prep_schedule(build_snac_graph(model->max_generation_size));
}
|
||||||
|
|
||||||
|
// Construct the SNAC decoder compute graph for a given token sequence length:
// quantized token embedding -> input conv -> upsampling conv -> decoder
// layers (each mixing in a slice of the noise tensor) -> snake activation ->
// output conv -> tanh waveform.
struct ggml_cgraph * snac_runner::build_snac_graph(size_t sequence_length) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * cur;
    struct ggml_tensor * inputs;

    // Flat noise buffer; each decoder layer consumes noise_steps[l] * L floats.
    sctx->noise = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model->noise_steps_sum * sequence_length);
    ggml_set_input(sctx->noise);

    inputs = snac_build_audio_inputs(ctx, sctx, sequence_length, model->quantizer_layers);
    cur = ggml_conv_1d_dw_tts(ctx, model->in_conv_kernel, inputs, 1, 3, 1);
    cur = ggml_add(ctx, cur, model->in_conv_bias);
    cur = ggml_conv_1d_tts(ctx, model->up_conv_kernel, cur, 1, 0, 1);
    cur = ggml_add(ctx, cur, model->up_conv_bias);
    size_t noise_offset = 0;  // byte offset into the flat noise tensor
    for (int l = 0; l < model->layers.size(); l++) {
        auto layer = model->layers[l];
        struct ggml_tensor * noise = ggml_cont(ctx, ggml_view_1d(ctx, sctx->noise, model->noise_steps[l] * sequence_length, noise_offset));
        noise_offset += model->noise_steps[l] * sequence_length * sizeof(float);
        cur = general_neural_audio_codec::build_layer(ctx, cur, layer, noise);
    }
    cur = snake_1d(ctx, model->snake_alpha, cur);
    cur = ggml_conv_1d_tts(ctx, model->out_conv_kernel, cur, 1, 3, 1);
    cur = ggml_add(ctx, cur, model->out_conv_bias);
    // tanh clamps the waveform to [-1, 1].
    cur = ggml_tanh(ctx, cur);
    ggml_build_forward_expand(gf, cur);
    free_build();
    return gf;
}
|
||||||
|
|
||||||
|
void snac_runner::set_inputs(std::vector<std::vector<uint32_t>> & tokens) {
|
||||||
|
ggml_backend_tensor_set(
|
||||||
|
sctx->inp_tokens, tokens[0].data(), 0,
|
||||||
|
tokens[0].size()*ggml_element_size(sctx->inp_tokens)
|
||||||
|
);
|
||||||
|
|
||||||
|
ggml_backend_tensor_set(
|
||||||
|
sctx->inp_tokens, tokens[1].data(), tokens[0].size() * ggml_element_size(sctx->inp_tokens),
|
||||||
|
tokens[1].size() * ggml_element_size(sctx->inp_tokens)
|
||||||
|
);
|
||||||
|
|
||||||
|
ggml_backend_tensor_set(
|
||||||
|
sctx->inp_tokens, tokens[2].data(),
|
||||||
|
tokens[1].size()*ggml_element_size(sctx->inp_tokens)+tokens[0].size()*ggml_element_size(sctx->inp_tokens),
|
||||||
|
tokens[2].size()*ggml_element_size(sctx->inp_tokens)
|
||||||
|
);
|
||||||
|
size_t sequence_length = tokens[2].size();
|
||||||
|
random_normal_gen(model->noise_steps_sum * sequence_length, (float*) sctx->noise->data);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode a full set of audio tokens into a waveform: (re)build the graph for
// this sequence length, allocate it on the scheduler, upload inputs, compute,
// and copy the final node's data into `outputs`.
void snac_runner::run(std::vector<std::vector<uint32_t>> & tokens, struct tts_response * outputs) {
    // tokens[2] is the finest (unrepeated) head, so it defines the sequence length.
    size_t sequence_length = tokens[2].size();
    ggml_backend_sched_reset(sctx->sched);

    // Output buffer is sized for the worst case so it can be reused across calls.
    sctx->prep_output_buffer(model->max_generation_size * model->up_sampling_factor * sizeof(float));

    // outputs->data aliases the context's logits buffer — not owned by the caller.
    outputs->data = sctx->logits;
    ggml_backend_buffer_clear(sctx->buf_output, 0);

    struct ggml_cgraph * gf = NULL;
    gf = build_snac_graph(sequence_length);

    // the output is always the last tensor in the graph
    struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
    ggml_backend_sched_alloc_graph(sctx->sched, gf);

    // Inputs must be set after alloc_graph so the tensors have backing buffers.
    set_inputs(tokens);

    // NOTE(review): compute is async; presumably get_ggml_node_data below
    // synchronizes before reading — confirm.
    ggml_backend_sched_graph_compute_async(sctx->sched, gf);

    sctx->get_ggml_node_data(result, outputs->data, sequence_length*sizeof(float)*model->up_sampling_factor);

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(sctx->sched);
    outputs->n_outputs = sequence_length * model->up_sampling_factor;
    return;
}
|
||||||
|
|
86
otherarch/ttscpp/src/snac_model.h
Normal file
86
otherarch/ttscpp/src/snac_model.h
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "general_neural_audio_codec.h"
|
||||||
|
|
||||||
|
// SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC.
// The key differences are that it uses grouping in the residual units of its layers,
// performs a repeat_interleave over the second and third input channels, applies
// a noise convolutional layer after input encoding for each layer, and applies
// an extra convolutional layer before residual layers are applied.
struct snac_model : tts_model {
    // general configuration from SNAC as used by Orpheus
    uint32_t n_layers = 4;               // decoder layers
    uint32_t n_heads = 3;                // audio token input heads / codebooks
    uint32_t up_sampling_factor = 512;   // output samples per input token
    uint32_t embd = 768;                 // embedding width
    size_t max_generation_size = 2580;   // largest supported token sequence
    // Per-head repeat_interleave factors to reach a common sequence length.
    uint32_t repeats[3] = {4, 2, 1};
    // configuration for adding noise
    uint32_t noise_steps[4] = {8, 64, 256, 512};  // noise floats per token, per layer
    uint32_t noise_steps_sum = 840;               // sum of noise_steps
    bool use_noise = true;

    struct ggml_tensor * repeat_interleave_buffer;

    struct ggml_tensor * in_conv_kernel;
    struct ggml_tensor * in_conv_bias;
    struct ggml_tensor * up_conv_kernel;
    struct ggml_tensor * up_conv_bias;
    struct ggml_tensor * out_conv_kernel;
    struct ggml_tensor * out_conv_bias;
    struct ggml_tensor * snake_alpha;    // snake activation parameter
    std::vector<general_neural_audio_codec::layer> layers;
    std::vector<general_neural_audio_codec::residual_vector_quantize_layer> quantizer_layers;

    void assign_weight(std::string name, ggml_tensor * weight);
    void prep_constants(gguf_context * meta);
    void prep_layers(gguf_context * meta);
    void post_load_assign();
    // Layer descriptors must exist before constants/tensors are loaded.
    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) {
        prep_layers(meta_ctx);
        prep_constants(meta_ctx);
        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "snac");
    }
};
|
||||||
|
|
||||||
|
// the context used for running the snac model
struct snac_context : runner_context {
    // Non-owning pointer to the model; lifetime managed by snac_runner.
    snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {};

    struct snac_model * model;

    struct ggml_tensor * inp_tokens;  // flat I32 tensor holding all three token heads
    struct ggml_tensor * noise;       // flat F32 Gaussian noise, refilled per run

    void build_schedule() {
        runner_context::build_schedule(model->max_nodes());
    }
};
|
||||||
|
|
||||||
|
snac_context * build_new_snac_context(struct snac_model * model, int n_threads, bool use_cpu = true);

// NOTE(review): a `static` function declaration in a header gives every
// including translation unit its own internal symbol — confirm intentional.
static struct ggml_tensor * snac_build_audio_inputs(struct ggml_context * ctx, struct snac_context * sctx, size_t sequence_length, std::vector<general_neural_audio_codec::residual_vector_quantize_layer> layers);

// This struct is intended to manage the snac model's graph compilation and compute function.
struct snac_runner : tts_runner {
    // Takes ownership of both pointers; deleted in the destructor.
    snac_runner(snac_model * model, snac_context * context): model(model), sctx(context) {};
    ~snac_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        model->free();
        delete model;
        delete sctx;
    }
    snac_model * model;
    snac_context * sctx;

    // Initialize the graph-building ggml context backed by sctx's meta buffer.
    void init_build() {
        tts_runner::init_build(&sctx->buf_compute_meta);
    }

    void set_inputs(std::vector<std::vector<uint32_t>> & tokens);
    void prepare_post_load();
    struct ggml_cgraph * build_snac_graph(size_t sequence_length);
    void run(std::vector<std::vector<uint32_t>> & tokens, struct tts_response * outputs);
};
|
402
otherarch/ttscpp/src/t5_encoder_model.cpp
Normal file
402
otherarch/ttscpp/src/t5_encoder_model.cpp
Normal file
|
@ -0,0 +1,402 @@
|
||||||
|
#include "t5_encoder_model.h"
|
||||||
|
|
||||||
|
// Maps gguf tensor-name fragments to t5_tensor slots. Full names identify
// model-level tensors; ".<suffix>" entries identify per-layer tensors after
// the "t5encoder.blk.<N>" prefix is stripped by parse_layer_count.
static const std::map<std::string, t5_tensor> T5_TENSOR_GGUF_LOOKUP = {
    {"t5encoder.token_embd", T5_EMBD},
    {"t5encoder.enc.final_layer_norm", T5_NORM},
    {"t5encoder.down_proj", T5_DOWN_PROJ},
    {"t5encoder.down_proj_bias", T5_DOWN_PROJ_BIAS},
    {".attn_norm", T5_LAYER_ATTN_NORM},
    {".attn_q", T5_LAYER_ATTN_Q},
    {".attn_k", T5_LAYER_ATTN_K},
    {".attn_v", T5_LAYER_ATTN_V},
    {".attn_o", T5_LAYER_ATTN_O},
    {".attn_rel_b", T5_RELATIVE_BIAS},
    {".ffn_norm", T5_LAYER_OUT_NORM},
    {".ffn_gate", T5_LAYER_WI_1},
    {".ffn_down", T5_LAYER_WO},
    {".ffn_up", T5_LAYER_WI_0},
};
|
||||||
|
|
||||||
|
// Duplicate `tensor` into the model's context and attach it to the matching
// slot of `layer`. `name` is the per-layer suffix (e.g. ".attn_q").
// Aborts on names absent from T5_TENSOR_GGUF_LOOKUP.
void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name, ggml_tensor * tensor) {
    try {
        switch(T5_TENSOR_GGUF_LOOKUP.at(name)) {
            case T5_LAYER_ATTN_NORM:
                layer.attn_norm = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.attn_norm, tensor);
                break;
            case T5_LAYER_ATTN_Q:
                layer.q = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.q, tensor);
                break;
            case T5_LAYER_ATTN_K:
                layer.k = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.k, tensor);
                break;
            case T5_LAYER_ATTN_V:
                layer.v = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.v, tensor);
                break;
            case T5_LAYER_ATTN_O:
                layer.o = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.o, tensor);
                break;
            case T5_LAYER_OUT_NORM:
                layer.mlp_norm = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.mlp_norm, tensor);
                break;
            case T5_LAYER_WI_1:
                layer.wi_1 = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.wi_1, tensor);
                break;
            case T5_LAYER_WI_0:
                layer.wi_0 = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.wi_0, tensor);
                break;
            case T5_LAYER_WO:
                layer.wo = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(layer.wo, tensor);
                break;
            case T5_RELATIVE_BIAS:
                // Relative attention bias is stored on the model, not the layer
                // (T5 shares it across blocks) — each assignment overwrites it.
                model->relative_attn_bias = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(model->relative_attn_bias, tensor);
                break;
            default:
                // Model-level enum values can't appear for a layer suffix; log and skip.
                fprintf(stdout, "unassigned tensor %s\n", name.c_str());
                break;
        }
    } catch (const std::out_of_range& e) {
        TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str());
    }
}
|
||||||
|
|
||||||
|
// Route a loaded tensor either to a model-level slot (full-name match in the
// lookup table) or, for "t5encoder.blk.<N>..." names, to the Nth layer.
// Tensors with no data and unrecognized names are ignored.
void assign_to_t5_encoder(t5_encoder * model, const std::string name, ggml_tensor * tensor) {
    if (tensor->data == NULL) {
        return;
    }
    // First dotted segment decides the routing ("t5encoder" => per-layer path).
    std::string::size_type pos = name.find(".", 0);
    std::string top_level(name.substr(0, pos));
    if (T5_TENSOR_GGUF_LOOKUP.find(name) != T5_TENSOR_GGUF_LOOKUP.end()) {
        switch (T5_TENSOR_GGUF_LOOKUP.at(name)) {
            case T5_EMBD:
                model->embd = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(model->embd, tensor);
                break;
            case T5_NORM:
                model->out_norm = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(model->out_norm, tensor);
                break;
            case T5_DOWN_PROJ:
                model->down_proj = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(model->down_proj, tensor);
                break;
            case T5_DOWN_PROJ_BIAS:
                model->down_proj_bias = ggml_dup_tensor(model->ctx, tensor);
                model->set_tensor(model->down_proj_bias, tensor);
                break;
            default:
                fprintf(stdout, "unassigned tensor %s\n", name.c_str());
                break;
        }
    } else if (top_level == "t5encoder") {
        // parse_layer_count(name, 2) extracts the block index and the
        // remaining per-layer tensor suffix.
        auto pair = parse_layer_count(name, 2);
        int l = pair.first;
        std::string lt_name = pair.second;

        assign_to_t5_layer(model, model->layers[l], lt_name, tensor);
    } else {
        return;
    }
}
|
||||||
|
|
||||||
|
// Append n_layers default-constructed layer records; tensors are attached
// later during weight assignment. (meta is unused here — the layer count is
// read beforehand by prep_constants.)
void t5_encoder::prep_layers(gguf_context * meta) {
    layers.reserve(layers.size() + n_layers);
    for (uint32_t i = 0; i < n_layers; ++i) {
        layers.emplace_back();
    }
}
|
||||||
|
|
||||||
|
// Read the T5 encoder hyperparameters from the gguf metadata. All keys are
// optional (fields keep their defaults) except t5encoder.vocab_size, which
// aborts the load when missing.
void t5_encoder::prep_constants(gguf_context * meta) {
    if (int key = gguf_find_key(meta, "t5encoder.block_count"); key != -1) {
        n_layers = gguf_get_val_u32(meta, key);
    }

    if (int key = gguf_find_key(meta, "t5encoder.embedding_length"); key != -1) {
        hidden_size = gguf_get_val_u32(meta, key);
    }

    if (int key = gguf_find_key(meta, "t5encoder.attention.head_count"); key != -1) {
        n_attn_heads = gguf_get_val_u32(meta, key);
    }

    if (int key = gguf_find_key(meta, "t5encoder.context_length"); key != -1) {
        max_context_length = gguf_get_val_u32(meta, key);
    }

    if (int key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id"); key != -1) {
        bos_token_id = gguf_get_val_u32(meta, key);
    }

    if (int key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id"); key != -1) {
        eos_token_id = gguf_get_val_u32(meta, key);
    }

    // vocab_size is the only hard requirement.
    int vocab_size_key = gguf_find_key(meta, "t5encoder.vocab_size");
    if (vocab_size_key == -1) {
        TTS_ABORT("key 't5encoder.vocab_size' must be specified in gguf file.");
    }
    vocab_size = gguf_get_val_u32(meta, vocab_size_key);

    if (int key = gguf_find_key(meta, "t5encoder.output_size"); key != -1) {
        output_size = gguf_get_val_u32(meta, key);
    }
}
|
||||||
|
|
||||||
|
// Routes one named weight tensor from the gguf file into this model's tensor
// slots (delegates all name parsing to assign_to_t5_encoder).
void t5_encoder::assign_weight(std::string name, ggml_tensor * tensor) {
    assign_to_t5_encoder(this, name, tensor);
}
|
||||||
|
|
||||||
|
// Allocates and wires up a t5_context for `model`: optionally a Metal backend
// (only when compiled with GGML_USE_METAL and use_cpu is false), always a CPU
// backend, then the scheduler and the compute-meta scratch buffer sized for
// the model's worst-case node count. The caller owns the returned pointer
// (t5_runner's destructor deletes it).
struct t5_context * build_new_t5_context(struct t5_encoder * model, int n_threads, bool use_cpu) {
    t5_context * t5ctx = new t5_context(model, n_threads);
    if (!use_cpu) {
#ifdef GGML_USE_METAL
        t5ctx->backend = ggml_backend_metal_init();
#endif
    }
    // The CPU backend is always initialized; it also backs the output buffer.
    t5ctx->backend_cpu = ggml_backend_cpu_init();
    t5ctx->set_threads();
    t5ctx->build_schedule();
    // Scratch space for graph metadata: one tensor overhead per possible node
    // plus the graph bookkeeping itself.
    t5ctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
    return t5ctx;
}
|
||||||
|
|
||||||
|
// T5 uses RMSNorm (no mean-centering, no bias): normalize `cur` and scale by
// the learned weight. The epsilon is fixed for all versions of t5 flan.
static struct ggml_tensor * build_t5_norm(struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * weight) {
    constexpr float eps = 0.000001f;
    return ggml_mul(ctx, ggml_rms_norm(ctx, cur, eps), weight);
}
|
||||||
|
|
||||||
|
// Creates the (n_tokens x n_tokens) F32 attention-mask input tensor, stores it
// on the context so set_inputs can fill it, and returns it for graph wiring.
static struct ggml_tensor * build_t5_attn_mask(ggml_context * ctx, struct t5_context *t5ctx, const t5_ubatch & batch) {
    const int64_t n = (int64_t) batch.n_tokens;
    struct ggml_tensor * mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
    ggml_set_input(mask);
    t5ctx->attn_mask = mask;
    return mask;
}
|
||||||
|
|
||||||
|
// Turns the (n_tokens x n_tokens) relative-position bucket ids into the
// per-head additive attention bias: each bucket id selects a row of
// relative_attn_bias, and the result is reshaped/permuted so the head
// dimension leads. The view strides are hand-computed in element sizes, so
// the statement order here matters.
static struct ggml_tensor * build_t5_pos_bias(ggml_context * ctx, struct ggml_tensor * pos_bucket, struct ggml_tensor * relative_attn_bias) {
    // Flatten the bucket matrix so ggml_get_rows can gather one bias row per cell.
    struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0);
    struct ggml_tensor * pos_bias = ggml_get_rows(ctx, relative_attn_bias, pos_bucket_1d);

    // Re-expose the gathered rows as (n_heads, n_tokens, n_tokens) ...
    pos_bias = ggml_view_3d(ctx, pos_bias, pos_bias->ne[0], pos_bucket->ne[0], pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * pos_bucket->ne[0], 0);
    // ... then move the head dimension to the outermost position and make the
    // result contiguous so it can be added directly to the KQ scores.
    pos_bias = ggml_permute(ctx, pos_bias, 2, 1, 0, 3);
    pos_bias = ggml_cont(ctx, pos_bias);
    return pos_bias;
}
|
||||||
|
|
||||||
|
// Builds a batch at the maximum supported sequence length so the scheduler can
// reserve worst-case buffers before any real input arrives.
t5_ubatch t5_runner::build_worst_case_batch() {
    struct t5_ubatch batch;
    batch.n_tokens = model->max_context_length;
    // Fix: don't leave the token pointer indeterminate. Graph construction only
    // reads n_tokens, and a null pointer fails loudly if it is ever misused.
    batch.input_tokens = nullptr;
    return batch;
}
|
||||||
|
|
||||||
|
// Called once after the weights are loaded: builds a maximum-size graph and
// hands it to the scheduler so all backend buffers are sized for the worst case.
void t5_runner::prepare_post_load() {
    auto batch = build_worst_case_batch();
    auto gf = build_t5_graph(batch);
    t5ctx->prep_schedule(gf);
}
|
||||||
|
|
||||||
|
// Builds the full T5 encoder compute graph for one batch: token embedding
// lookup, n_layers of (RMSNorm -> self-attention with relative position bias
// -> residual -> RMSNorm -> gated-GELU MLP -> residual), a final norm, and an
// optional down projection to the decoder's hidden size. The graph's input
// tensors (tokens, position buckets, attention mask) are recorded on t5ctx so
// set_inputs can populate them after allocation.
struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
    init_build();
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

    // NOTE(review): the positions tensor is intentionally not created, so
    // t5ctx->positions stays unset — nothing downstream may dereference it.
    //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    //ggml_set_input(t5ctx->positions);

    // Relative-position bucket id for every (query, key) pair.
    t5ctx->inp_pos_bucket = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, batch.n_tokens, batch.n_tokens);
    ggml_set_input(t5ctx->inp_pos_bucket);

    t5ctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
    ggml_set_input(t5ctx->inp_tokens);

    // Embedding lookup: one hidden_size row per input token.
    inpL = ggml_get_rows(ctx, model->embd, t5ctx->inp_tokens);

    struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch);
    // The positional bias is shared by every layer.
    struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias);

    for (int l = 0; l < model->n_layers; l++) {
        struct ggml_tensor * residual = inpL;

        // Pre-norm before attention (T5 style).
        cur = build_t5_norm(ctx, inpL, model->layers[l].attn_norm);

        struct ggml_tensor * attn_out;

        // self-attention
        {
            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l].q, cur);
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l].k, cur);
            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l].v, cur);

            // Split hidden into (head_size, n_heads) per token.
            Qcur = ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens);
            Kcur = ggml_reshape_3d(ctx, Kcur, model->head_size, model->n_attn_heads, batch.n_tokens);

            struct ggml_tensor * q = ggml_permute(ctx, Qcur, 0, 2, 1, 3);
            struct ggml_tensor * k = ggml_cont(ctx, ggml_permute(ctx, Kcur, 0, 2, 1, 3));

            // Attention scores; T5 adds the relative position bias instead of
            // scaling by 1/sqrt(head_size) (scale below is 1.0f).
            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
            kq = ggml_add(ctx, kq, pos_bias);

            kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f, 0.0f);

            struct ggml_tensor * v = ggml_cont_3d(ctx, ggml_transpose(ctx, Vcur), batch.n_tokens, model->head_size, model->n_attn_heads);
            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
            // Merge heads back into a single hidden_size vector per token.
            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
            attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens);
            attn_out = ggml_mul_mat(ctx, model->layers[l].o, attn_out);
        }

        cur = ggml_add(ctx, attn_out, residual);
        struct ggml_tensor * residualmlp = cur;

        // mlp: gated GELU — gelu(wi_0 * x) elementwise-scaled by (wi_1 * x),
        // then projected back down by wo.
        {
            cur = build_t5_norm(ctx, cur, model->layers[l].mlp_norm);
            struct ggml_tensor * gate_proj = ggml_mul_mat(ctx, model->layers[l].wi_1, cur);
            cur = ggml_mul(ctx, ggml_gelu(ctx, ggml_mul_mat(ctx, model->layers[l].wi_0, cur)), gate_proj);
            cur = ggml_mul_mat(ctx, model->layers[l].wo, cur);
        }

        cur = ggml_add(ctx, cur, residualmlp);
        inpL = cur;
    }

    cur = build_t5_norm(ctx, cur, model->out_norm);

    // Optional projection from the encoder's hidden size to the decoder's.
    if (model->down_proj) {
        cur = ggml_mul_mat(ctx, model->down_proj, cur);
    }

    if (model->down_proj_bias) {
        cur = ggml_add(ctx, cur, model->down_proj_bias);
    }

    ggml_build_forward_expand(gf, cur);

    free_build();

    return gf;
}
|
||||||
|
|
||||||
|
// Fills the graph's input tensors for this batch: the token ids, a fully
// unmasked (all-zero) attention mask, and the T5 relative-position buckets
// (exact buckets for small distances, logarithmically spaced beyond
// `max_exact`, offset by n_buckets for forward-looking positions).
void t5_runner::set_inputs(t5_ubatch & batch) {
    ggml_backend_tensor_set(t5ctx->inp_tokens, batch.input_tokens, 0, batch.n_tokens*ggml_element_size(t5ctx->inp_tokens));
    float * attn_mask = (float *) t5ctx->attn_mask->data;
    uint32_t * pos_bucket = (uint32_t *) t5ctx->inp_pos_bucket->data;
    // Fix: do NOT touch t5ctx->positions here. That tensor is never created
    // (its allocation in build_t5_graph is commented out), so the previous
    // read of t5ctx->positions->data went through an uninitialized pointer and
    // crashed at runtime — and the resulting pointer was never used anyway.
    int n_buckets = (int) model->relative_attn_buckets / 2;
    int max_exact = (int) n_buckets / 2;
    float logarithmic_denominator = log(128.0 / max_exact);
    for (int i = 0; i < batch.n_tokens; i++) {
        for (int ii = 0; ii < batch.n_tokens; ii++) {
            int ab_rpos = abs(i - ii);
            int rpos = i - ii;
            attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f;
            // NOTE(review): `ab_rpos / max_exact` inside log() is integer
            // division; the HF T5 reference uses a float ratio, which gives
            // finer bucket resolution. Confirm against upstream before
            // changing, since it alters the encoder's numerics.
            pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact))));
        }
    }
}
|
||||||
|
|
||||||
|
// Encodes `input_tokens` (length `sequence_length`) and writes the hidden
// states into `outputs`. The output buffer is lazily (re)allocated on the CPU
// backend at worst-case size and reused across calls; `outputs->data` points
// into that buffer, so it is only valid until the next run() call.
void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs) {
    t5_ubatch batch;
    batch.input_tokens = input_tokens;
    batch.n_tokens = sequence_length;
    ggml_backend_sched_reset(t5ctx->sched);

    // Worst-case output size; grow (never shrink) the reusable output buffer.
    const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0;
    const size_t new_size = model->max_context_length * model->output_size * sizeof(float);

    if (!t5ctx->buf_output || prev_size < new_size) {
        if (t5ctx->buf_output) {
            ggml_backend_buffer_free(t5ctx->buf_output);
            t5ctx->buf_output = nullptr;
            t5ctx->logits = nullptr;
        }

        t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size);
    }

    outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output);
    ggml_backend_buffer_clear(t5ctx->buf_output, 0);
    struct ggml_cgraph * gf = NULL;
    gf = build_t5_graph(batch);
    // the output is always the last tensor in the graph
    struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
    // Inputs can only be written after the scheduler has allocated the graph.
    ggml_backend_sched_alloc_graph(t5ctx->sched, gf);
    set_inputs(batch);

    ggml_backend_sched_graph_compute_async(t5ctx->sched, gf);

    // Copies the result tensor into outputs->data (synchronizes as needed).
    t5ctx->get_ggml_node_data(result, outputs->data, batch.n_tokens*sizeof(float)*model->output_size);

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    ggml_backend_sched_reset(t5ctx->sched);
    outputs->n_outputs = sequence_length;
    outputs->hidden_size = model->output_size;
    return;
}
|
||||||
|
|
||||||
|
// Tokenizes `prompt`, appends the EOS terminator T5 expects, and runs the
// encoder. Always returns 0; errors abort inside the callees.
int t5_runner::generate(std::string prompt, tts_response *response) {
    std::vector<uint32_t> token_ids;
    tokenizer->tokenize(prompt, token_ids);
    token_ids.push_back(model->eos_token_id);
    const uint32_t sequence_length = (uint32_t) token_ids.size();
    run(token_ids.data(), sequence_length, response);
    return 0;
}
|
||||||
|
|
||||||
|
// Loads a T5 text encoder from a gguf file and returns a ready-to-use runner.
// If `tokenizer` is null, a unigram tokenizer is built from the same gguf
// metadata. The returned runner takes ownership of the model and context
// (freed in its destructor); gguf/weight contexts are released before return.
struct t5_runner * text_encoder_from_file(std::string file_path, int n_threads, unigram_tokenizer * tokenizer, bool cpu_only) {
    t5_encoder * model = new t5_encoder;
    ggml_context * weight_ctx = NULL;

    // no_alloc = false so tensor data is loaded into weight_ctx along with metadata.
    struct gguf_init_params params = {
        /*.no_alloc =*/ false,
        /*.ctx =*/ &weight_ctx,
    };
    gguf_context * meta_ctx = gguf_init_from_file(file_path.c_str(), params);
    if (!meta_ctx) {
        TTS_ABORT("%s failed for file %s\n", __func__, file_path.c_str());
    }
    if (!tokenizer) {
        tokenizer = unigram_tokenizer_from_gguf(meta_ctx);
    }
    if (!tokenizer->init) {
        tokenizer->initialize_tokenizer();
    }
    // Reads hyperparameters and allocates backend buffers for the weights.
    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);

    // TODO: change this weight assignment pattern to mirror llama.cpp
    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
        model->assign_weight(cur->name, cur);
    }

    struct t5_context * t5ctx = build_new_t5_context(model, n_threads, cpu_only);
    struct t5_runner * runner = new t5_runner(model, t5ctx, tokenizer);
    // Sizes the scheduler's buffers against a worst-case graph.
    runner->prepare_post_load();
    gguf_free(meta_ctx);
    ggml_free(weight_ctx);

    return runner;
}
|
130
otherarch/ttscpp/src/t5_encoder_model.h
Normal file
130
otherarch/ttscpp/src/t5_encoder_model.h
Normal file
|
@ -0,0 +1,130 @@
|
||||||
|
#ifndef t5_encoder_model_h
|
||||||
|
#define t5_encoder_model_h
|
||||||
|
|
||||||
|
#include "tts_model.h"
|
||||||
|
#include "tokenizer.h"
|
||||||
|
|
||||||
|
|
||||||
|
// Identifies which slot of the t5_encoder (or one of its layers) a gguf
// tensor name resolves to during weight assignment.
enum t5_tensor {
    T5_EMBD,
    T5_NORM,
    T5_DOWN_PROJ,
    T5_DOWN_PROJ_BIAS,
    T5_RELATIVE_BIAS,
    // Per-layer tensors follow.
    T5_LAYER_ATTN_Q,
    T5_LAYER_ATTN_K,
    T5_LAYER_ATTN_V,
    T5_LAYER_ATTN_O,
    T5_LAYER_ATTN_NORM,
    T5_LAYER_WI_0,
    T5_LAYER_WI_1,
    T5_LAYER_WO,
    T5_LAYER_OUT_NORM,
};
|
||||||
|
|
||||||
|
// Weight tensors for a single T5 encoder block.
struct t5_layer {
    // Self-attention projections.
    struct ggml_tensor * q;
    struct ggml_tensor * k;
    struct ggml_tensor * v;
    struct ggml_tensor * o;
    // RMSNorm weight applied before attention.
    struct ggml_tensor * attn_norm;
    // Gated-GELU MLP: gelu(wi_0 * x) * (wi_1 * x), projected down by wo.
    struct ggml_tensor * wi_0;
    struct ggml_tensor * wi_1;
    struct ggml_tensor * wo;
    // RMSNorm weight applied before the MLP.
    struct ggml_tensor * mlp_norm;
};
|
||||||
|
|
||||||
|
// this struct maintains the static tensors for a t5_encoder model
|
||||||
|
// the defautl configuration is form copied from standard configuration for
|
||||||
|
// flan-t5-xl. Note this model is slightly different from a standard t5 encoder.
|
||||||
|
// Specifically this model has a down projection which converts the text encoder's
|
||||||
|
// hidden size to the hidden size of the parler decoder.
|
||||||
|
struct t5_encoder : tts_model {
|
||||||
|
// These configs are essentially built for the 44khZ 8kbps standard DAC model audio encoder and decoder
|
||||||
|
uint32_t n_layers = 24;
|
||||||
|
uint32_t n_attn_heads = 32;
|
||||||
|
uint32_t head_size = 64;
|
||||||
|
uint32_t hidden_size = 2048;
|
||||||
|
uint32_t relative_attn_buckets = 32;
|
||||||
|
uint32_t eos_token_id = 1;
|
||||||
|
uint32_t bos_token_id = 0;
|
||||||
|
uint32_t max_context_length = 512;
|
||||||
|
uint32_t output_size = 1536;
|
||||||
|
uint32_t vocab_size;
|
||||||
|
|
||||||
|
struct ggml_tensor * embd;
|
||||||
|
struct ggml_tensor * relative_attn_bias;
|
||||||
|
struct ggml_tensor * out_norm;
|
||||||
|
struct ggml_tensor * down_proj = nullptr;
|
||||||
|
struct ggml_tensor * down_proj_bias = nullptr;
|
||||||
|
std::vector<t5_layer> layers;
|
||||||
|
|
||||||
|
void assign_weight(std::string name, ggml_tensor * tensor);
|
||||||
|
void prep_layers(gguf_context * meta);
|
||||||
|
void prep_constants(gguf_context * meta);
|
||||||
|
void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only = true) {
|
||||||
|
prep_constants(meta_ctx);
|
||||||
|
prep_layers(meta_ctx);
|
||||||
|
tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "t5encoder", 1.25);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// For assigning weights from the gguf file to the local model: the first
// routes top-level tensors by name, the second handles per-layer tensors.
void assign_to_t5_encoder(t5_encoder * model, const std::string name, ggml_tensor * tensor);
void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name, ggml_tensor * tensor);
|
||||||
|
|
||||||
|
struct t5_context : runner_context {
|
||||||
|
t5_context(t5_encoder * model, int n_threads): runner_context(n_threads), model(model) {};
|
||||||
|
|
||||||
|
struct t5_encoder * model;
|
||||||
|
|
||||||
|
struct ggml_tensor * inp_tokens;
|
||||||
|
struct ggml_tensor * positions;
|
||||||
|
struct ggml_tensor * attn_mask;
|
||||||
|
struct ggml_tensor * inp_pos_bucket;
|
||||||
|
|
||||||
|
void build_schedule() {
|
||||||
|
runner_context::build_schedule(model->max_nodes());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct t5_context * build_new_t5_context(struct t5_encoder * model, int n_threads, bool use_cpu = true);
|
||||||
|
|
||||||
|
// One micro-batch of encoder input. Fix: members are default-initialized so a
// freshly constructed batch never carries an indeterminate pointer or length.
struct t5_ubatch {
    size_t n_tokens = 0; // the number of tokens in our encoded sequence
    uint32_t * input_tokens = nullptr; // [n_tokens]
};
|
||||||
|
|
||||||
|
// NOTE(review): these are declared `static` in a header, so every translation
// unit including this file gets its own internal declaration (and possible
// unused-function warnings); consider moving the declarations into the .cpp.
static struct ggml_tensor * build_t5_norm(struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * weight);
static struct ggml_tensor * build_t5_attn_mask(ggml_context * ctx, struct t5_context *t5ctx, const t5_ubatch & batch);
|
||||||
|
|
||||||
|
// This struct is intended to manage the t5 encoder model's graph compilation
// and compute function. Ownership: the destructor frees and deletes `model`
// and deletes `t5ctx`, so the runner owns both; `tokenizer` is NOT deleted
// here — presumably it can be shared with other components (verify at call sites).
struct t5_runner : tts_runner {
    t5_runner(t5_encoder * model, t5_context * context, unigram_tokenizer * tokenizer): model(model), t5ctx(context), tokenizer(tokenizer) {};
    ~t5_runner() {
        if (ctx) {
            ggml_free(ctx);
        }
        model->free();
        delete model;
        delete t5ctx;
    }
    struct unigram_tokenizer * tokenizer;
    t5_encoder * model;
    t5_context * t5ctx;

    // Points graph construction at this runner's compute-meta scratch buffer.
    void init_build() {
        tts_runner::init_build(&t5ctx->buf_compute_meta);
    }

    void prepare_post_load();
    struct t5_ubatch build_worst_case_batch();
    void set_inputs(t5_ubatch & batch);
    struct ggml_cgraph * build_t5_graph(t5_ubatch & batch);
    void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);
    int generate(std::string prompt, struct tts_response * response);
};
|
||||||
|
|
||||||
|
struct t5_runner * text_encoder_from_file(std::string file_path, int n_threads, unigram_tokenizer * tokenizer, bool cpu_only = true);
|
||||||
|
|
||||||
|
#endif
|
331
otherarch/ttscpp/src/tokenizer.cpp
Normal file
331
otherarch/ttscpp/src/tokenizer.cpp
Normal file
|
@ -0,0 +1,331 @@
|
||||||
|
#include "tokenizer.h"
|
||||||
|
|
||||||
|
// Inserts `gram` into the trie, associating its final node with `token`.
void token_trie::add(const std::string & gram, uint32_t token) {
    _add(gram, token, 0);
}
|
||||||
|
|
||||||
|
void token_trie::_add(const std::string & gram, uint32_t new_token, size_t index) {
|
||||||
|
if (index >= gram.size()) {
|
||||||
|
has_value = true;
|
||||||
|
token = new_token;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const char c = gram[index];
|
||||||
|
auto res = children.find(c);
|
||||||
|
if (res != children.end()) {
|
||||||
|
res->second._add(gram, new_token, index + 1);
|
||||||
|
} else {
|
||||||
|
struct token_trie nt{};
|
||||||
|
nt._add(gram, new_token, index + 1);
|
||||||
|
children[c] = nt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the child node for character `c`, or NULL when no gram in the trie
// continues with that character.
const struct token_trie * token_trie::traverse(const char c) const {
    auto child = children.find(c);
    return child == children.end() ? NULL : &child->second;
}
|
||||||
|
|
||||||
|
// Returns the byte length of the UTF-8 sequence whose lead byte is `src`.
// The top four bits of the lead byte determine the length: 0xxx -> 1 (ASCII),
// 110x -> 2, 1110 -> 3, 11110 -> 4. Continuation bytes (10xx) map to 1 so a
// malformed stream still advances one byte at a time.
size_t unicode_len_utf8_tts(char src) {
    const size_t lookup[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    const uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}
|
||||||
|
|
||||||
|
void unigram_tokenizer::initialize_tokenizer() {
|
||||||
|
for (const auto it : vocab) {
|
||||||
|
root_trie.add(it.first, it.second);
|
||||||
|
}
|
||||||
|
init = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// the general approach here is to find the character grams that sum to the max possible value over the entire text sequence.
|
||||||
|
// The particular algorithm used here effectively works by walking the text and at each index storing the max value of all possible gram combinations
|
||||||
|
// we can then reverse that sequence to pick the best possible tokens.
|
||||||
|
void unigram_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & tokens) {
|
||||||
|
if (!init) {
|
||||||
|
TTS_ABORT("Error: %s\nTokenizer must be initialized before #tokenize is called.");
|
||||||
|
}
|
||||||
|
// the parler tokenizer's normalizer (i.e. the bert normalizer implemented by huggingface tokenizers libs) only deduplicates and strips extra spaces and
|
||||||
|
// optionally handles chinese characters and accents (neither of which are currently supported here).
|
||||||
|
std::string normalized = text;
|
||||||
|
if (dedupe_spaces) {
|
||||||
|
normalized = " " + std::regex_replace(text, duped_spaces, " ");
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t text_length = normalized.size();
|
||||||
|
|
||||||
|
// initialize score_sum to neg infinity so it will be always lower than sums of token scores
|
||||||
|
std::vector<struct result> results(text_length + 1, {unk_token, 0, -INFINITY});
|
||||||
|
results[0] = { unk_token, 0, 0 };
|
||||||
|
|
||||||
|
size_t offset = 0;
|
||||||
|
|
||||||
|
while (offset < text_length) {
|
||||||
|
size_t current_offset = offset;
|
||||||
|
// pulled this directly from llama.cpp; I suspect that this is for handling of non-utf8 steps (to be marked as unknown tokens)
|
||||||
|
size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8_tts(normalized[offset]), text_length - offset);
|
||||||
|
|
||||||
|
bool found_unknown = true;
|
||||||
|
const struct result & current_best = results[offset];
|
||||||
|
|
||||||
|
// find the current branch in the trie
|
||||||
|
const struct token_trie * node = root_trie.traverse(normalized[current_offset++]);
|
||||||
|
// search for the next token
|
||||||
|
while (current_offset <= text_length && node != NULL) {
|
||||||
|
// check if this is a complete token (it could just be an unkown step between two tokens).
|
||||||
|
if (node->has_value) {
|
||||||
|
// check if it corresponds to the whole utf8 step
|
||||||
|
if (current_offset - offset == n_utf8_code_units) {
|
||||||
|
found_unknown = false;
|
||||||
|
}
|
||||||
|
float score = current_best.score + scores[node->token];
|
||||||
|
struct result & current_champ = results[current_offset];
|
||||||
|
if (score > current_champ.score) {
|
||||||
|
struct result challenger = { node->token, offset, score };
|
||||||
|
current_champ = challenger;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
node = node->traverse(normalized[current_offset++]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we found an unknown token, process it
|
||||||
|
if (found_unknown) {
|
||||||
|
current_offset = offset + n_utf8_code_units;
|
||||||
|
struct result & current_champ = results[current_offset];
|
||||||
|
float score = current_best.score + unk_token_score;
|
||||||
|
if (score > current_champ.score) {
|
||||||
|
struct result challenger = { unk_token, offset, score };
|
||||||
|
current_champ = challenger;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// move one utf8 step
|
||||||
|
offset += n_utf8_code_units;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we have more than on unknown token in a row, we can join them.
|
||||||
|
bool is_prev_unknown = false;
|
||||||
|
// iterate from the last result backwards and get the best performing tokens
|
||||||
|
for (struct result & result = results[text_length]; ; result = results[result.offset]) {
|
||||||
|
bool is_unknown = result.token == unk_token;
|
||||||
|
if (!(is_prev_unknown && is_unknown)) {
|
||||||
|
tokens.push_back(result.token);
|
||||||
|
}
|
||||||
|
if (result.offset == 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
is_prev_unknown = is_unknown;
|
||||||
|
}
|
||||||
|
|
||||||
|
// reverse the tokens since we added tokens starting from the end of the input
|
||||||
|
std::reverse(tokens.begin(), tokens.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
// loading the vocab to the tokenizer from gguf file.
|
||||||
|
unigram_tokenizer * unigram_tokenizer_from_gguf(gguf_context * meta) {
|
||||||
|
std::unordered_map<std::string, uint32_t> vocab;
|
||||||
|
std::vector<float> scores;
|
||||||
|
int vocab_key = gguf_find_key(meta, "tokenizer.ggml.tokens");
|
||||||
|
int vocab_size = gguf_get_arr_n(meta, vocab_key);
|
||||||
|
scores.reserve(vocab_size);
|
||||||
|
for (int i = 0; i < vocab_size; i++) {
|
||||||
|
std::string val = gguf_get_arr_str(meta, vocab_key, i);
|
||||||
|
vocab[val] = (uint32_t) i;
|
||||||
|
}
|
||||||
|
int scores_key = gguf_find_key(meta, "tokenizer.ggml.scores");
|
||||||
|
int scores_size = gguf_get_arr_n(meta, scores_key);
|
||||||
|
assert(scores_size == vocab_size);
|
||||||
|
float * data = (float*) gguf_get_arr_data(meta, scores_key);
|
||||||
|
for (int i = 0; i < scores_size; i++) {
|
||||||
|
scores.push_back(data[i]);
|
||||||
|
}
|
||||||
|
int unkown_token_key = gguf_find_key(meta, "tokenizer.ggml.unknown_token_id");
|
||||||
|
uint32_t token = gguf_get_val_u32(meta, unkown_token_key);
|
||||||
|
|
||||||
|
auto tokenizer = new unigram_tokenizer(vocab, token, scores[token], scores);
|
||||||
|
|
||||||
|
uint32_t eos_token_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
|
||||||
|
if (eos_token_key != -1) {
|
||||||
|
tokenizer->eos_token = gguf_get_val_u32(meta, eos_token_key);
|
||||||
|
}
|
||||||
|
return tokenizer;
|
||||||
|
}
|
||||||
|
|
||||||
|
void single_pass_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & token_ids) {
|
||||||
|
std::string remaining = text;
|
||||||
|
while (remaining.size() > 0) {
|
||||||
|
uint32_t token_id = unknown_id;
|
||||||
|
for (int i = 1; i < std::min(remaining.size()+1, max_size+1); i++) {
|
||||||
|
std::string part = remaining.substr(0, i);
|
||||||
|
ptrdiff_t pos = std::distance(tokens.begin(), std::find(tokens.begin(), tokens.end(), part));
|
||||||
|
if (pos < tokens.size()) {
|
||||||
|
token_id = (uint32_t) pos;
|
||||||
|
remaining = remaining.substr(part.size(), remaining.size() - part.size());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (token_id == unknown_id) {
|
||||||
|
remaining = remaining.substr(1, remaining.size() - 1);
|
||||||
|
}
|
||||||
|
token_ids.push_back(token_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void single_pass_tokenizer::token_split(const std::string & text, std::vector<std::string> & tokens) {
|
||||||
|
std::string remaining = text;
|
||||||
|
while (remaining.size() > 0) {
|
||||||
|
// String copying is much slower than using a std::string_view, but the former is simpler to implement for now.
|
||||||
|
std::string token = remaining.substr(0, 1);
|
||||||
|
for (int i = 1; i < remaining.size(); i++) {
|
||||||
|
std::string part = remaining.substr(0, i+1);
|
||||||
|
if (token_vocab.find(part) == token_vocab.end()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
token = part;
|
||||||
|
}
|
||||||
|
tokens.push_back(token);
|
||||||
|
remaining = remaining.substr(token.size(), remaining.size() - token.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Constructs a single_pass_tokenizer from the string-array metadata stored
// under `key_name`; aborts when the key is absent.
struct single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * meta, std::string key_name) {
    const int tokens_key = gguf_find_key(meta, key_name.c_str());
    if (tokens_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support single pass tokenization.", key_name.c_str());
    }
    const int token_count = gguf_get_arr_n(meta, tokens_key);
    std::vector<std::string> vocab_entries;
    for (int idx = 0; idx < token_count; idx++) {
        vocab_entries.emplace_back(gguf_get_arr_str(meta, tokens_key, idx));
    }
    return new single_pass_tokenizer(vocab_entries);
}
|
||||||
|
|
||||||
|
void bpe_symbol::add_merges(std::priority_queue<bpe_merge, std::vector<bpe_merge>, bpe_merge_comp> & merges, std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map, bool only_forward) {
|
||||||
|
if (!only_forward && last) {
|
||||||
|
auto rid = std::make_pair<std::string, std::string>(last->as_str(), as_str());
|
||||||
|
if (rank_map.find(rid) != rank_map.end()) {
|
||||||
|
bpe_merge m{last, this, rank_map[rid], last->size + size};
|
||||||
|
merges.push(m);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (next) {
|
||||||
|
auto rid = std::make_pair<std::string, std::string>(as_str(), next->as_str());
|
||||||
|
if (rank_map.find(rid) != rank_map.end()) {
|
||||||
|
bpe_merge m{this, next, rank_map[rid], size + next->size};
|
||||||
|
merges.push(m);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string bpe_symbol::as_str() {
|
||||||
|
return std::string(token, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Priority-queue ordering for merges: lower rank wins; ties are broken by the
// left symbol's position in the text (earlier position first).
bool bpe_merge_comp::operator() (const bpe_merge & a, const bpe_merge & b) {
    if (a.rank != b.rank) {
        return a.rank > b.rank;
    }
    return a.a && b.a && a.a->pos > b.a->pos;
}
|
||||||
|
|
||||||
|
// Combines the std::hash of both strings; the shift keeps (a, b) and (b, a)
// from always colliding.
size_t pair_hash::operator() (const std::pair<std::string, std::string> & p) const {
    const size_t h_first = std::hash<std::string>{}(p.first);
    const size_t h_second = std::hash<std::string>{}(p.second);
    return h_first ^ (h_second << 1);
}
|
||||||
|
|
||||||
|
// Collapses `b` into `a`: since the two symbols are adjacent views of the same
// underlying buffer, growing a->size absorbs b's bytes. `b` is tombstoned with
// size = -1 (join_pairs skips merges touching dead symbols) and the linked
// list is re-stitched around it.
bpe_symbol * bpe_merge::merge() {
    a->size += b->size;
    b->size = -1;
    a->next = b->next;
    if (a->next) {
        a->next->last = a;
    }
    return a;
}
|
||||||
|
|
||||||
|
// Performs BPE merging over `parts`: seed the priority queue with every
// adjacent forward pair that has a rank, then repeatedly apply the
// lowest-ranked merge. Stale queue entries (referring to symbols already
// merged away) are detected lazily: a merge is valid only if both symbols are
// still alive (size > 0) and their combined size still equals the size
// recorded when the entry was pushed.
void pair_builder::join_pairs(std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map) {
    std::priority_queue<bpe_merge, std::vector<bpe_merge>, bpe_merge_comp> merges;
    for (auto part : parts) {
        part->add_merges(merges, rank_map, true);
    }
    while (!merges.empty()) {
        auto m = merges.top();
        merges.pop();
        if (m.a->size > 0 && m.b->size > 0 && m.new_size == m.a->size + m.b->size) {
            m.merge();
            // The merged symbol may now form new pairs with both neighbors.
            m.a->add_merges(merges, rank_map);
        }

    }
}
|
||||||
|
|
||||||
|
void bpe_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & token_ids) {
|
||||||
|
std::vector<std::string> chunks = split(text, " ", true);
|
||||||
|
bool space_prior = false;
|
||||||
|
for (auto chunk : chunks) {
|
||||||
|
if (chunk != " ") {
|
||||||
|
bpe_tokenize(space_prior ? "Ġ" + chunk : chunk, token_ids);
|
||||||
|
} else {
|
||||||
|
space_prior = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void bpe_tokenizer::bpe_tokenize(std::string chunk, std::vector<uint32_t> & token_ids) {
|
||||||
|
if (tokens_to_ids.find(chunk) != tokens_to_ids.end()) {
|
||||||
|
token_ids.push_back(tokens_to_ids[chunk]);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto pb = pair_builder{chunk};
|
||||||
|
pb.join_pairs(ranks);
|
||||||
|
bpe_symbol * next = pb.parts[0];
|
||||||
|
while (next) {
|
||||||
|
token_ids.push_back(tokens_to_ids[next->as_str()]);
|
||||||
|
next = next->next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Builds a bpe_tokenizer from gguf metadata stored under `base_name`
// (".tokens", ".merges", ".eos_token_id", ".bos_token_id" — all required;
// missing keys abort with the offending key name).
bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name) {
    int vocab_key = gguf_find_key(meta, (base_name + ".tokens").c_str());
    if (vocab_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".tokens").c_str());
    }
    int merges_key = gguf_find_key(meta, (base_name + ".merges").c_str());
    if (merges_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".merges").c_str());
    }
    int eos_token_id_key = gguf_find_key(meta, (base_name + ".eos_token_id").c_str());
    if (eos_token_id_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".eos_token_id").c_str());
    }
    int bos_token_id_key = gguf_find_key(meta, (base_name + ".bos_token_id").c_str());
    if (bos_token_id_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".bos_token_id").c_str());
    }

    uint32_t bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
    uint32_t eos_token_id = gguf_get_val_u32(meta, eos_token_id_key);

    // Vocab: token string -> id, where the id is the array index.
    std::unordered_map<std::string, uint32_t> vocab;
    int token_count = gguf_get_arr_n(meta, vocab_key);
    for (int i = 0; i < token_count; i++) {
        vocab[gguf_get_arr_str(meta, vocab_key, i)] = (uint32_t) i;
    }

    // Merge ranks: each merge entry is "left right"; earlier entries (lower
    // index) have higher merge priority.
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> ranks;
    int merge_count = gguf_get_arr_n(meta, merges_key);

    for (int i = 0; i < merge_count; i++) {
        auto raw_merge = gguf_get_arr_str(meta, merges_key, i);
        std::vector<std::string> pair = split(raw_merge, " ");
        if (pair.size() != 2) {
            TTS_ABORT("Invalid pair, '%s', found in BPE merges, '%s', at index %d.", raw_merge, (base_name + ".merges").c_str(), i);
        }
        ranks[std::make_pair<>(pair[0], pair[1])] = i;
    }

    return new bpe_tokenizer(vocab, ranks, bos_token_id, eos_token_id);
}
|
154
otherarch/ttscpp/src/tokenizer.h
Normal file
154
otherarch/ttscpp/src/tokenizer.h
Normal file
|
@ -0,0 +1,154 @@
|
||||||
|
#ifndef tokenizer_h
#define tokenizer_h

#include <unordered_map>
#include <stdint.h>
#include <map>
#include <unordered_set>
#include <regex>
#include <queue>
#include "ttsutil.h"

// Byte-level trie used by the unigram tokenizer for longest-prefix
// vocabulary lookups.
struct token_trie {
    bool has_value = false; // true when this node terminates a vocab entry
    uint32_t token;         // token id; only meaningful when has_value is true
    std::map<char, struct token_trie> children;

    void add(const std::string & gram, uint32_t token);
    void _add(const std::string & gram, uint32_t new_token, size_t index);
    const struct token_trie * traverse(const char c) const;
};

// NOTE(review): these are 'static' in a header, so every translation unit that
// includes this file constructs its own copies at static-init time.
static std::regex duped_spaces("\\s{2,}");
static std::regex spaces("\\s");

// One candidate segmentation step produced during unigram tokenization.
struct result {
    uint32_t token;
    size_t offset;
    float score;
};

// much of this is implemented in llama.cpp, but in order to simplify this for my use case, I reimplementing here.
// There are several important simplifications here:
// 1. I only implement unigram tokenization
// 2. I don't need to support detokenization
struct unigram_tokenizer {
    unigram_tokenizer(std::unordered_map<std::string, uint32_t> vocab, uint32_t unk_token, float unk_token_score, std::vector<float> scores): vocab(vocab), unk_token(unk_token), unk_token_score(unk_token_score), scores(scores) {};
    ~unigram_tokenizer() = default;

    std::unordered_map<std::string, uint32_t> vocab;
    std::vector<float> scores;   // per-token log-probabilities, indexed by id
    struct token_trie root_trie; // built by initialize_tokenizer()
    uint32_t unk_token;
    float unk_token_score;
    uint32_t eos_token = 1;
    bool dedupe_spaces = true;   // collapse runs of whitespace before tokenizing
    bool init = false;           // set once initialize_tokenizer() has run

    void initialize_tokenizer();
    void tokenize(const std::string & text, std::vector<uint32_t> & tokens);
};

// For intializing a new tokenizer from a gguf file meta
unigram_tokenizer * unigram_tokenizer_from_gguf(gguf_context * meta);

// While this functions like a tokenizer, no token ids are assigned as the token ids never need to be used in the context in which this is
// currently being used. This tokenizer pattern is currently being used by the phonemizer to break up a word into its relevant graphemes.
// As such, only the graphemes need to be returned.
struct single_pass_tokenizer {
    single_pass_tokenizer(std::vector<std::string> tkns): tokens(tkns) {
        max_size = 0;
        for (auto token : tkns) {
            token_vocab.insert(token);
            if (token.size() > max_size) {
                max_size = token.size();
            }
        }
    }
    size_t max_size; // length of the longest vocab entry, bounds the scan window
    uint32_t unknown_id = 0;
    std::vector<std::string> tokens;
    std::unordered_set<std::string> token_vocab;
    void tokenize(const std::string & text, std::vector<uint32_t> & token_ids);
    void token_split(const std::string & text, std::vector<std::string> & tokens);
};

single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * meta, std::string key_name = "phonemizer.graphemes");

struct bpe_symbol;

// A candidate merge of two adjacent symbols, ordered by rank in a
// priority queue (lowest rank merges first).
struct bpe_merge {
    bpe_symbol * a;
    bpe_symbol * b;
    int rank;
    int new_size; // combined byte size of a and b at the time the merge was queued

    bpe_symbol * merge();
};

struct bpe_merge_comp{
    bool operator() (const bpe_merge & a, const bpe_merge & b);
};

struct pair_hash {
    size_t operator() (const std::pair<std::string, std::string> & p) const;
};

// One utf-8 character (initially) in a doubly linked list of symbols; merges
// grow 'size' so a symbol can span multiple characters. 'token' is a
// non-owning pointer into the word being tokenized.
struct bpe_symbol {
    bpe_symbol(const char * token): token(token) {};
    const char* token;
    int size = 1;
    int pos;
    bpe_symbol * next = nullptr;
    bpe_symbol * last = nullptr;

    void add_merges(std::priority_queue<bpe_merge, std::vector<bpe_merge>, bpe_merge_comp> & merges, std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map, bool only_forward = false);
    std::string as_str();
};

// Splits a word into per-utf-8-character bpe_symbols and owns them.
// FIX: the word is now taken by const reference. Previously it was taken by
// value, and every bpe_symbol stored a pointer into that local copy, which is
// destroyed when the constructor returns — leaving all symbols dangling and
// crashing tokenization. The caller's string must outlive the pair_builder.
struct pair_builder {
    pair_builder(const std::string & word) {
        bpe_symbol * last = nullptr;
        for (int i = 0; i < word.size(); i++) {
            int increment = 0;
            // make sure we process each utf-8 character.
            while(i + increment + 1 < word.size() && (word[i+increment+1] & 0b11000000) == 0b10000000) {
                ++increment;
            }
            bpe_symbol * part = new bpe_symbol(word.data()+i);
            part->pos = i;
            part->size += increment;
            i += increment;
            if (last) {
                last->next = part;
                part->last = last;
            }
            last = part;
            parts.push_back(part);
        }
    }

    ~pair_builder() {
        for (auto p : parts) {
            delete p;
        }
    }

    void join_pairs(std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map);
    std::vector<bpe_symbol*> parts;
};

// GPT-2 style byte-pair-encoding tokenizer built from a vocab and merge ranks.
struct bpe_tokenizer {
    bpe_tokenizer(std::unordered_map<std::string, uint32_t> & tokens_to_ids, std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & ranks, uint32_t bos, uint32_t eos): tokens_to_ids(tokens_to_ids), ranks(ranks), eos_token_id(eos), bos_token_id(bos) {};
    std::unordered_map<std::string, uint32_t> tokens_to_ids;
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> ranks;
    uint32_t eos_token_id;
    uint32_t bos_token_id;

    void tokenize(const std::string & text, std::vector<uint32_t> & token_ids);
    void bpe_tokenize(std::string chunk, std::vector<uint32_t> & token_ids);
};

bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name = "tokenizer.ggml");

#endif
|
445
otherarch/ttscpp/src/tts.cpp
Normal file
445
otherarch/ttscpp/src/tts.cpp
Normal file
|
@ -0,0 +1,445 @@
|
||||||
|
#include "tts.h"
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
|
// A list of all of the top level GGUF names under kokoro.duration_predictor that have quantization compatible tensors.
// Consumed by kokoro_is_quantizable below when filtering tensors for quantization.
static constexpr std::array<const char *, 5> DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS = {
    "duration_proj",
    "encode",
    "shared_lstm",
    "duration_lstm",
    "layers"
};
|
||||||
|
|
||||||
|
// Build an Orpheus TTS runner (LLM + SNAC audio decoder + BPE tokenizer) from
// already-opened GGUF metadata and weight contexts.
// Takes ownership of meta_ctx and weight_ctx: both are freed before returning.
// The returned runner is heap-allocated and owned by the caller.
struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
    orpheus_model * model = new orpheus_model;
    snac_model * audio_model = new snac_model;
    bpe_tokenizer * bt = bpe_tokenizer_from_gguf(meta_ctx);
    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
    sampler * samp = new sampler;
    snac_context * sctx = build_new_snac_context(audio_model, n_threads, cpu_only);
    snac_runner * audio_decoder = new snac_runner(audio_model, sctx);
    orpheus_context * octx = build_new_orpheus_context(model, n_threads, cpu_only);
    orpheus_kv_cache * cache = new orpheus_kv_cache;
    orpheus_runner * runner = new orpheus_runner(model, audio_decoder, octx, bt, samp, cache);

    // hand every raw weight tensor to the runner so it lands on its backend buffers
    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
        runner->assign_weight(cur->name, cur);
    }

    runner->prepare_post_load();

    gguf_free(meta_ctx);
    ggml_free(weight_ctx);
    runner->arch = arch;

    return (tts_runner*)runner;
}
|
||||||
|
|
||||||
|
// Build a Parler-TTS runner (decoder model + DAC audio decoder + unigram
// tokenizer) from already-opened GGUF metadata and weight contexts.
// Takes ownership of meta_ctx and weight_ctx: both are freed before returning.
// The returned runner is heap-allocated and owned by the caller.
struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
    parler_tts_model * model = new parler_tts_model;
    dac_model * audio_model = new dac_model;
    unigram_tokenizer * ut = unigram_tokenizer_from_gguf(meta_ctx);
    ut->initialize_tokenizer();
    model->use_cross_attn = config->use_cross_attn;
    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
    struct sampler * samp = new sampler;
    struct dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only);
    struct dac_runner * audio_decoder = new dac_runner(audio_model, dctx);
    struct parler_context * pctx = build_new_parler_context(model, n_threads, cpu_only);
    struct parler_kv_cache * cache = new parler_kv_cache;
    struct parler_tts_runner * runner = new parler_tts_runner(model, audio_decoder, pctx, ut, samp, cache);

    // TODO: change this weight assignment pattern to mirror llama.cpp
    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
        runner->assign_weight(cur->name, cur);
    }

    // cross-attention key/values are precomputed once when cross attention is enabled
    if (config->use_cross_attn) {
        runner->model->prep_cross_key_values(n_threads);
    }

    runner->prepare_post_load();

    gguf_free(meta_ctx);
    ggml_free(weight_ctx);
    runner->arch = arch;

    return (tts_runner*)runner;
}
|
||||||
|
|
||||||
|
// Build a Kokoro runner (duration predictor + synthesis model + phonemizer)
// from already-opened GGUF metadata and weight contexts.
// Takes ownership of meta_ctx and weight_ctx: both are freed before returning.
// The returned runner is heap-allocated and owned by the caller.
struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
    kokoro_model * model = new kokoro_model;
    single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
    struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
    struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
    struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
    // if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English.
    std::string espeak_voice_id = config->espeak_voice_id;
    if (espeak_voice_id.empty()) {
        // the first character of a kokoro voice name encodes its language
        espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
    }
    struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
    struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);

    // TODO: change this weight assignment pattern to mirror llama.cpp
    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
        runner->assign_weight(cur->name, cur);
    }

    runner->prepare_post_load();

    gguf_free(meta_ctx);
    ggml_free(weight_ctx);
    runner->arch = arch;

    return (tts_runner*)runner;
}
|
||||||
|
|
||||||
|
// Build a Dia runner (encoder/decoder model + DAC audio decoder) from
// already-opened GGUF metadata and weight contexts.
// Takes ownership of meta_ctx and weight_ctx: both are freed before returning.
// The returned runner is heap-allocated and owned by the caller.
struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
    dia_model * model = new dia_model;
    dac_model * audio_model = new dac_model;
    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
    struct sampler * samp = new sampler;
    struct dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only);
    struct dac_runner * audio_decoder = new dac_runner(audio_model, dctx);
    struct dia_context * diactx = build_new_dia_context(model, n_threads, cpu_only);
    struct dia_kv_cache * cache = new dia_kv_cache;
    struct dia_runner * runner = new dia_runner(model, audio_decoder, diactx, samp, cache);

    // hand every raw weight tensor to the runner so it lands on its backend buffers
    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
        runner->assign_weight(cur->name, cur);
    }

    runner->prepare_post_load();

    gguf_free(meta_ctx);
    ggml_free(weight_ctx);
    runner->arch = arch;

    return (tts_runner*)runner;
}
|
||||||
|
|
||||||
|
// currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
// Open the GGUF file, read 'general.architecture', and dispatch to the
// architecture-specific loader. The loaders take ownership of the metadata
// and weight contexts and free them before returning. Aborts (TTS_ABORT) on
// missing file, missing architecture key, or unsupported architecture.
struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only) {
    ggml_context * weight_ctx = NULL;

    struct gguf_init_params params = {
        /*.no_alloc =*/ false,
        /*.ctx =*/ &weight_ctx,
    };
    gguf_context * meta_ctx = gguf_init_from_file(fname.c_str(), params);
    if (!meta_ctx) {
        TTS_ABORT("%s failed for file %s\n", __func__, fname.c_str());
    }
    int arch_key = gguf_find_key(meta_ctx, "general.architecture");
    if (arch_key == -1) {
        TTS_ABORT("%s failed for file %s. No architecture is set.\n", __func__, fname.c_str());
    }
    std::string arch = std::string(gguf_get_val_str(meta_ctx, arch_key));
    if (SUPPORTED_ARCHITECTURES.find(arch) == SUPPORTED_ARCHITECTURES.end()) {
        TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str());
    }
    tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch);
    switch(arch_type) {
        case PARLER_TTS_ARCH:
            return parler_tts_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
        case KOKORO_ARCH:
            return kokoro_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
        case DIA_ARCH:
            return dia_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
        case ORPHEUS_ARCH:
            return orpheus_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
        default:
            TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str());
    }
}
|
||||||
|
|
||||||
|
// Dispatch text-to-speech generation to the architecture-specific runner.
// Returns whatever status the underlying runner's generate() reports; aborts
// on an unsupported architecture.
int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {
    switch(runner->arch) {
        case PARLER_TTS_ARCH:
            ((parler_tts_runner*)runner)->configure_generation(config);
            return ((parler_tts_runner*)runner)->generate(sentence, response);
        case KOKORO_ARCH:
            // kokoro takes the voice ids directly instead of a generation config
            return ((kokoro_runner*)runner)->generate(sentence, response, config->voice, config->espeak_voice_id);
        case DIA_ARCH:
            ((dia_runner*)runner)->configure_generation(config);
            return ((dia_runner*)runner)->generate(sentence, response);
        case ORPHEUS_ARCH:
            ((orpheus_runner*)runner)->configure_generation(config);
            return ((orpheus_runner*)runner)->generate(sentence, response);
        default:
            TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, runner->arch);
    }
}
|
||||||
|
|
||||||
|
// Return the list of selectable voice names for the given runner.
// Only the Kokoro architecture ships multiple voices; every other
// architecture aborts.
std::vector<std::string> list_voices(tts_runner * runner) {
    switch(runner->arch) {
        case KOKORO_ARCH:
            return ((kokoro_runner*)runner)->list_voices();
        default:
            // fixed garbled abort message ("does not support #list_voices supported.")
            TTS_ABORT("%s failed. The architecture '%d' does not support #list_voices.", __func__, runner->arch);
    }
}
|
||||||
|
|
||||||
|
// Recompute the conditional text-prompt embedding used by Parler generation.
// NOTE(review): unconditionally casts to parler_tts_runner without checking
// runner->arch — callers must guarantee the runner is PARLER_TTS_ARCH; verify
// at the call sites.
void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
    int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
    ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
}
|
||||||
|
|
||||||
|
bool kokoro_is_f16_compatible(std::string name) {
|
||||||
|
return name.find("voice_tensors") == std::string::npos &&
|
||||||
|
name.find("bias") == std::string::npos &&
|
||||||
|
name.find("gamma") == std::string::npos &&
|
||||||
|
name.find("beta") == std::string::npos &&
|
||||||
|
name.find("alpha") == std::string::npos &&
|
||||||
|
!has_suffix(name, "embd") &&
|
||||||
|
!has_suffix(name, "norm");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true when a Kokoro tensor may be quantized: it must be f16
// compatible, and either belong to the albert / text-encoder lstm weights, or
// be a duration-predictor tensor whose second-level name appears in
// DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS.
bool kokoro_is_quantizable(std::string name, struct quantization_params * params) {
    if (kokoro_is_f16_compatible(name)) {
        if (has_prefix(name, "kokoro.albert") || has_prefix(name, "kokoro.text_encoder.lstm")) {
            return true;
        } else if (has_prefix(name, "kokoro.duration_predictor.")) {
            // parts[2] is the component name right after "kokoro.duration_predictor."
            std::vector<std::string> parts = split(name, ".");
            for (std::string part : DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS) {
                if (part == parts[2]) {
                    return true;
                }
            }
        }
    }
    return false;
}
|
||||||
|
|
||||||
|
bool dia_is_quantizable(std::string name, struct quantization_params * params) {
|
||||||
|
// The DAC audio encoder / decoder is not compatible with quantization and normalization tensors should not be quantized.
|
||||||
|
bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm");
|
||||||
|
if (!params->quantize_output_heads) {
|
||||||
|
quantizable = quantizable && !has_prefix(name, "dia.decoder.heads");
|
||||||
|
}
|
||||||
|
return quantizable;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decide whether a Parler tensor may be quantized.
// NOTE: the misspelled name ("quanitizable") is kept because callers
// reference it; renaming would be an interface change.
bool parler_is_quanitizable(std::string name, struct quantization_params * params) {
    // the DAC audio encoder / decoder is not compatible with quantization, normalization weight shouldn't be quantized, and the text encoding shouldn't be normalized.
    bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm.weight") && !has_suffix(name, "text_encoding") && !has_suffix(name, "positional_embed") && !has_suffix(name, "norm.bias");
    if (!params->quantize_output_heads) {
        quantizable = quantizable && !has_suffix(name, "weight.head");
    }
    if (!params->quantize_text_embeddings) {
        quantizable = quantizable && !has_suffix(name, "embed_prompts");
    }
    if (!params->quantize_cross_attn_kv) {
        quantizable = quantizable && !has_suffix(name, "encoder_attn.k_proj.weight") && !has_suffix(name, "encoder_attn.v_proj.weight");
    }
    return quantizable;
}
|
||||||
|
|
||||||
|
// Architecture dispatch for the per-tensor quantization predicate.
// Aborts for architectures without quantization support (e.g. Orpheus).
bool is_quantizable(tts_arch arch, std::string name, struct quantization_params * params) {
    switch(arch) {
        case PARLER_TTS_ARCH:
            return parler_is_quanitizable(name, params);
        case DIA_ARCH:
            return dia_is_quantizable(name, params);
        case KOKORO_ARCH:
            return kokoro_is_quantizable(name, params);
        default:
            TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, arch);
    }
}
|
||||||
|
|
||||||
|
size_t quantize_tensor(void * new_data, struct ggml_tensor * tensor, const float * imatrix, enum ggml_type qtype, uint32_t n_threads) {
|
||||||
|
// much of this is form copied from llama.cpp
|
||||||
|
int chunk_size_multiplier = 1;
|
||||||
|
if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8 || qtype == GGML_TYPE_Q4_0_8_8) {
|
||||||
|
if ((qtype == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) qtype = GGML_TYPE_Q4_0;
|
||||||
|
else if (tensor->ne[1] % 4 != 0) qtype = GGML_TYPE_Q4_0;
|
||||||
|
if (qtype == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
|
||||||
|
else if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
|
||||||
|
}
|
||||||
|
size_t out_size = 0;
|
||||||
|
const int32_t d3_step = tensor->ne[0] * tensor->ne[1];
|
||||||
|
const int32_t n_per_row = tensor->ne[0];
|
||||||
|
const int32_t nrows = tensor->ne[1];
|
||||||
|
static const int32_t min_chunk_size = 32 * 512;
|
||||||
|
const int32_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier;
|
||||||
|
uint32_t thread_count = std::max(1, std::min((int)n_threads, (int)(d3_step + chunk_size - 1) / chunk_size));
|
||||||
|
std::mutex mutex;
|
||||||
|
|
||||||
|
for (int32_t d3_index = 0; d3_index < tensor->ne[2]; d3_index++) {
|
||||||
|
const float * f32_data_d3 = ((float *) tensor->data) + d3_index * d3_step;
|
||||||
|
void * new_data_d3 = (char *)new_data + ggml_row_size(qtype, tensor->ne[0]) * d3_index * nrows;
|
||||||
|
const float * imatrix_03 = imatrix ? imatrix + d3_index * tensor->ne[0] : nullptr;
|
||||||
|
if (thread_count <= 1) {
|
||||||
|
// not threaded
|
||||||
|
out_size += ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, 0, nrows, n_per_row, imatrix);
|
||||||
|
} else {
|
||||||
|
std::vector <std::thread> threads;
|
||||||
|
int64_t counter = 0;
|
||||||
|
size_t new_size = 0;
|
||||||
|
bool valid = true;
|
||||||
|
for (uint32_t t = 0; t < thread_count; t++) {
|
||||||
|
auto func = [&mutex, &counter, &new_size, &valid, qtype, f32_data_d3, new_data_d3, chunk_size, nrows, n_per_row, imatrix]() {
|
||||||
|
const int64_t nrows_per_chunk = chunk_size / n_per_row;
|
||||||
|
size_t local_size = 0;
|
||||||
|
while (true) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex);
|
||||||
|
int64_t first_row = counter;
|
||||||
|
counter += nrows_per_chunk;
|
||||||
|
if (first_row >= nrows) {
|
||||||
|
if (local_size > 0) {
|
||||||
|
new_size += local_size;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
lock.unlock();
|
||||||
|
const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
||||||
|
size_t this_size = ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, first_row * n_per_row, this_nrow, n_per_row, imatrix);
|
||||||
|
local_size += this_size;
|
||||||
|
|
||||||
|
// validate the quantized data; I am not sure how this would occur, but there is always the safe fallback on doing this single threaded.
|
||||||
|
const size_t row_size = ggml_row_size(qtype, n_per_row);
|
||||||
|
void * this_data = (char *) new_data_d3 + first_row * row_size;
|
||||||
|
if (!ggml_validate_row_data(qtype, this_data, this_size)) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex);
|
||||||
|
valid = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
threads.push_back(std::thread(func));
|
||||||
|
}
|
||||||
|
for (auto & t : threads) t.join();
|
||||||
|
|
||||||
|
if (!valid) {
|
||||||
|
TTS_ABORT("Validation of quantized data failed. Please try again and/or switch to single thread quantization.\n");
|
||||||
|
}
|
||||||
|
out_size += new_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write n zero bytes to the stream; used to pad tensor data to the GGUF
// alignment and to reserve space for the metadata header.
static void zeros(std::ofstream & file, size_t n) {
    const char zero_byte = 0;
    size_t remaining = n;
    while (remaining > 0) {
        file.write(&zero_byte, 1);
        --remaining;
    }
}
|
||||||
|
|
||||||
|
// Wrapper whose default constructor deliberately leaves the value
// uninitialized, so std::vector<no_init<uint8_t>> scratch buffers can be
// resized without zero-initializing every element.
template <typename T>
struct no_init {
    T value;
    no_init() { /* do nothing */ }
};
|
||||||
|
|
||||||
|
// Read the GGUF model at 'ifile', quantize (or convert to f16) every eligible
// tensor per 'params', and write a new GGUF file to 'ofile'.
// Layout: a zero-filled placeholder is written first, tensor data is streamed
// with alignment padding, then the finished metadata is seeked back over the
// placeholder on close.
void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params) {
    ggml_context * weight_ctx = NULL;
    struct gguf_init_params gguf_params = {
        /*.no_alloc =*/ false,
        /*.ctx =*/ &weight_ctx,
    };
    gguf_context * meta_ctx = gguf_init_from_file(ifile.c_str(), gguf_params);
    std::string arch = "parler-tts"; // only parler-tts gguf files should lack an explicit architecture.

    int arch_key = gguf_find_key(meta_ctx, "general.architecture");
    if (arch_key != -1) {
        arch = std::string(gguf_get_val_str(meta_ctx, arch_key));
    }
    tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch);

    // only these target types have been exercised; anything else is best-effort
    if (params->quantize_type != GGML_TYPE_Q5_0 && params->quantize_type != GGML_TYPE_Q8_0 && params->quantize_type != GGML_TYPE_F16 && params->quantize_type != GGML_TYPE_Q4_0) {
        fprintf(stdout, "Warning, %s is untested for quantization type '%d'. Use at your own risk.\n", arch.c_str(), params->quantize_type);
    }

    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    gguf_context_ptr ctx_out { gguf_init_empty() };

    // copy the KV pairs from the input file
    gguf_set_kv(ctx_out.get(), meta_ctx);
    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION);
    gguf_set_val_u32(ctx_out.get(), "general.quantization_type", params->quantize_type);
    // register every named tensor in the output metadata up front
    for (ggml_tensor * tensor = ggml_get_first_tensor(weight_ctx); tensor; tensor = ggml_get_next_tensor(weight_ctx, tensor)) {
        std::string name = ggml_get_name(tensor);
        if (name.size() != 0) {
            gguf_add_tensor(ctx_out.get(), tensor);
        }
    }

    // reusable scratch buffer; no_init avoids zeroing it on every resize
    std::vector<no_init<uint8_t>> work;

    std::ofstream fout;
    auto close_ofstream = [&]() {
        // Write metadata and close file handler
        if (fout.is_open()) {
            fout.seekp(0);
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out.get()));
            gguf_get_meta_data(ctx_out.get(), data.data());
            fout.write((const char *) data.data(), data.size());
            fout.close();
        }
    };
    auto new_ofstream = [&]() {
        std::string fname = ofile;
        fout = std::ofstream(fname, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        const size_t meta_size = gguf_get_meta_size(ctx_out.get());
        // placeholder for the meta data
        ::zeros(fout, meta_size);
    };
    new_ofstream();
    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
        enum ggml_type new_type;
        void * new_data;
        size_t new_size;
        std::string name = ggml_get_name(cur);

        if (name.size() == 0) {
            continue;
        }

        if (is_quantizable(arch_type, name, params)) {
            if ((cur->type) != GGML_TYPE_F32) {
                TTS_ABORT("ERROR: All quantized tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type);
            }
            new_type = params->quantize_type;
            // IQ types need an importance matrix, which this tool does not collect
            if ((new_type >= GGML_TYPE_IQ2_XXS && new_type <= GGML_TYPE_IQ4_XS)) {
                TTS_ABORT("ERROR: Quantization type '%d' requires an importance matrix.\n", new_type);
            }
            const int64_t nelement_size = ggml_nelements(cur) * 4;
            if (work.size() < (size_t)nelement_size) {
                work.resize(nelement_size); // upper bound on size
            }
            new_data = work.data();
            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads);
        } else if ((params->convert_non_quantizable_to_f16 && kokoro_is_f16_compatible(name)) || (params->convert_dac_to_f16 && has_prefix(name, "audio_encoder") && !has_suffix(name, "alpha"))) {
            if ((cur->type) != GGML_TYPE_F32) {
                TTS_ABORT("ERROR: All converted tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type);
            }
            new_type = GGML_TYPE_F16;
            const int64_t nelement_size = ggml_nelements(cur) * 4;
            if (work.size() < (size_t)nelement_size) {
                work.resize(nelement_size); // upper bound on size
            }
            new_data = work.data();
            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads);
        } else {
            // pass the tensor through unchanged
            new_type = cur->type;
            new_data = cur->data;
            new_size = ggml_nbytes(cur);
        }

        gguf_set_tensor_type(ctx_out.get(), name.c_str(), new_type);
        gguf_set_tensor_data(ctx_out.get(), name.c_str(), new_data);
        fprintf(stdout, "At tensor: '%s' with new size: %zu bytes\n", name.c_str(), new_size);
        // write tensor data + padding
        fout.write((const char *) new_data, new_size);
        zeros(fout, GGML_PAD(new_size, align) - new_size);
    }
    close_ofstream();
}
|
157
otherarch/ttscpp/src/tts_model.cpp
Normal file
157
otherarch/ttscpp/src/tts_model.cpp
Normal file
|
@ -0,0 +1,157 @@
|
||||||
|
#include "tts_model.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
|
||||||
|
// Concatenate to_append's samples onto response, reallocating response->data
// so it holds both ranges contiguously. n_outputs counts float samples.
// NOTE(review): the previous response->data buffer is never freed here, so
// repeated appends leak it — before adding a free(), confirm response->data
// always owns heap memory and never aliases a backend buffer.
void append_to_response(struct tts_response * response, struct tts_response * to_append) {
    float * new_data = (float *) malloc((response->n_outputs + to_append->n_outputs) * sizeof(float));
    if (response->n_outputs > 0) {
        std::memcpy(new_data, response->data, response->n_outputs*sizeof(float));
    }
    if (to_append->n_outputs > 0) {
        float * next_loc = new_data + response->n_outputs;
        std::memcpy(next_loc, to_append->data, to_append->n_outputs*sizeof(float));
    }
    response->data = new_data;
    response->n_outputs += to_append->n_outputs;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Pulls output_size to prepped buffer 'output' from 'output_node' tensor. If no buffer is passed will default to the existing output buffer present
|
||||||
|
* on runner_context.
|
||||||
|
*/
|
||||||
|
void runner_context::get_ggml_node_data(struct ggml_tensor * output_node, float * output, size_t output_size, ggml_backend_buffer_t buffer) {
|
||||||
|
if (buffer == nullptr) {
|
||||||
|
buffer = buf_output;
|
||||||
|
}
|
||||||
|
if (ggml_backend_buffer_get_size(buffer) < output_size) {
|
||||||
|
TTS_ABORT("Output buffer overflow of %d / %d for output node '%s'\n", output_size, ggml_backend_buffer_get_size(buffer), ggml_get_name(output_node));
|
||||||
|
} else if (ggml_nbytes(output_node) < output_size) {
|
||||||
|
TTS_ABORT("Output node, '%s', with %d bytes is too small for #ggml_backend_tensor_get_async with size of %d.\n", ggml_get_name(output_node), ggml_nbytes(output_node), output_size);
|
||||||
|
}
|
||||||
|
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched, output_node);
|
||||||
|
ggml_backend_tensor_get_async(backend_res, output_node, output, 0, output_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
void runner_context::set_threads() {
|
||||||
|
if (backend != nullptr) {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
// this is form copied from llama.cpp, but has since been removed. I don't know if this should be tuned.
|
||||||
|
ggml_backend_metal_set_n_cb(backend, 1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (backend_cpu != nullptr) {
|
||||||
|
ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
|
||||||
|
struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
|
||||||
|
threadpool = ggml_threadpool_new(&ttp);
|
||||||
|
ggml_backend_cpu_set_threadpool(backend_cpu, threadpool);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void runner_context::build_schedule(size_t max_nodes) {
|
||||||
|
backend_cpu_buffer = ggml_backend_cpu_buffer_type();
|
||||||
|
if (backend != nullptr) {
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
backend_buffer = ggml_backend_metal_buffer_type();
|
||||||
|
#endif
|
||||||
|
std::vector<ggml_backend_buffer_type_t> bufs = {backend_buffer, backend_cpu_buffer};
|
||||||
|
std::vector<ggml_backend_t> backs = {backend, backend_cpu};
|
||||||
|
sched = ggml_backend_sched_new(backs.data(), bufs.data(), 2, max_nodes, false, false);
|
||||||
|
} else {
|
||||||
|
std::vector<ggml_backend_buffer_type_t> bufs = {backend_cpu_buffer};
|
||||||
|
std::vector<ggml_backend_t> backs = {backend_cpu};
|
||||||
|
sched = ggml_backend_sched_new(backs.data(), bufs.data(), 1, max_nodes, false, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Reserves compute buffers on the scheduler for graph 'gf'.
 * Returns false when the reservation fails.
 */
bool runner_context::prep_schedule(struct ggml_cgraph * gf) {
    return ggml_backend_sched_reserve(sched, gf);
}
|
||||||
|
|
||||||
|
void runner_context::prep_output_buffer(size_t new_size) {
|
||||||
|
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output) : 0;
|
||||||
|
if (!buf_output || prev_size < new_size) {
|
||||||
|
if (buf_output) {
|
||||||
|
ggml_backend_buffer_free(buf_output);
|
||||||
|
buf_output = nullptr;
|
||||||
|
logits = nullptr;
|
||||||
|
}
|
||||||
|
buf_output = ggml_backend_buft_alloc_buffer(backend_cpu_buffer, new_size);
|
||||||
|
}
|
||||||
|
logits = (float *) ggml_backend_buffer_get_base(buf_output);
|
||||||
|
}
|
||||||
|
|
||||||
|
void tts_runner::init_build(std::vector<uint8_t>* buf_compute_meta) {
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
/*.mem_size =*/ buf_compute_meta->size(),
|
||||||
|
/*.mem_buffer =*/ buf_compute_meta->data(),
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
|
||||||
|
ctx = ggml_init(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
void tts_runner::free_build() {
|
||||||
|
if (ctx) {
|
||||||
|
ggml_free(ctx);
|
||||||
|
ctx = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Allocates the model's ggml context and weight buffer.
 * size_offset scales the per-tensor context overhead (headroom for extra
 * tensor metadata); dedicated_add_on_size adds raw bytes to the weight buffer.
 */
void tts_model::prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size) {
    // currently DAC is only supported on cpu because the ops are not implemented on other devices;
    if (cpu_only) {
        backend = ggml_backend_cpu_init();
        buffer = ggml_backend_cpu_buffer_type();
    } else {
#ifdef GGML_USE_METAL
        backend = ggml_backend_metal_init();
        buffer = ggml_backend_metal_buffer_type();
#endif
        // if use metal is not installed then we need to warn here
        // (without GGML_USE_METAL neither member is assigned above, so this aborts)
        if (!backend || !buffer) {
            TTS_ABORT("'GGML_USE_METAL' is not defined either set the model to use CPU only or install ggml with metal support.");
        }
    }
    // no-alloc context: holds tensor metadata only — the weight bytes live in 'buf' below
    size_t ctx_size = ggml_tensor_overhead() * (tensor_meta.n_tensors * size_offset);
    struct ggml_init_params params = {
        /*.mem_size =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc =*/ true,
    };
    ctx = ggml_init(params);
    buf = ggml_backend_buft_alloc_buffer(buffer, tensor_meta.n_bytes + dedicated_add_on_size);
}
|
||||||
|
|
||||||
|
/*
 * Default weight-assignment hook. Implementations of tts_model must provide
 * their own version that maps named gguf tensors onto their members; reaching
 * this base implementation is always a programming error, so abort with the
 * offending tensor name.
 */
void tts_model::assign_weight(std::string name, ggml_tensor * tensor) {
    TTS_ABORT("%s received name, %s, tensor without being defined. %s must be defined for all implementations of tts_model. \n", __func__, name.c_str(), __func__);
}
|
||||||
|
|
||||||
|
void tts_model::set_tensor(struct ggml_tensor * tensor, struct ggml_tensor * target) {
|
||||||
|
tensor->buffer = buf;
|
||||||
|
tensor->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
|
||||||
|
size_t size = ggml_nbytes(target);
|
||||||
|
ggml_backend_tensor_set(tensor, target->data, 0, size);
|
||||||
|
ggml_set_name(tensor, target->name);
|
||||||
|
offset += size;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Computes this model's tensor metadata (count and byte size for tensors
 * under 'model_prefix') from the load context and then allocates its ggml
 * context and weight buffer.
 * NOTE(review): meta_ctx is unused in this body; presumably kept so callers/
 * subclasses share a signature that can read gguf metadata — confirm before
 * removing it.
 */
void tts_model::setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only, std::string model_prefix, float size_offset, uint32_t dedicated_add_on_size) {
    tensor_meta = compute_tensor_meta(model_prefix, load_context, compute_tensor_meta_cb);
    prep_buffers_and_context(cpu_only, size_offset, dedicated_add_on_size);
}
|
||||||
|
|
||||||
|
size_t tts_model::max_nodes() {
|
||||||
|
return std::max<size_t>(8192, tensor_meta.n_tensors*5);
|
||||||
|
}
|
||||||
|
|
||||||
|
void tts_model::free() {
|
||||||
|
if (ctx) {
|
||||||
|
ggml_free(ctx);
|
||||||
|
}
|
||||||
|
if (buf) {
|
||||||
|
ggml_backend_buffer_free(buf);
|
||||||
|
}
|
||||||
|
if (backend) {
|
||||||
|
ggml_backend_free(backend);
|
||||||
|
}
|
||||||
|
}
|
69
otherarch/ttscpp/src/tts_model.h
Normal file
69
otherarch/ttscpp/src/tts_model.h
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
#ifndef tts_model_h
|
||||||
|
#define tts_model_h
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <functional>
|
||||||
|
#include "ttsutil.h"
|
||||||
|
#include "ttscommon.h"
|
||||||
|
|
||||||
|
void append_to_response(struct tts_response * response, struct tts_response * to_append);
|
||||||
|
|
||||||
|
using tensor_meta_callback = std::function<void(ggml_tensor*)>*;
|
||||||
|
|
||||||
|
// Shared execution state for a graph-running TTS component: owns the
// backends, the scheduler, the CPU threadpool, and the CPU-side output buffer.
struct runner_context {
    runner_context(int n_threads): n_threads(n_threads) {};
    // NOTE(review): members may still be nullptr if the context is destroyed
    // before being fully configured — assumes the ggml free functions accept
    // nullptr; confirm against the ggml version in use.
    virtual ~runner_context() {
        ggml_backend_sched_free(sched);
        ggml_threadpool_free(threadpool);
        ggml_backend_free(backend_cpu);
        ggml_backend_free(backend);
        ggml_backend_buffer_free(buf_output);
    }
    // TODO: extend the backend and buffer support out to all devices
    ggml_backend_t backend = nullptr;                    // accelerator backend (set when available)
    ggml_backend_buffer_type_t backend_buffer = nullptr; // buffer type for the accelerator backend

    ggml_backend_t backend_cpu = nullptr;                    // CPU backend (always used as fallback)
    ggml_backend_buffer_type_t backend_cpu_buffer = nullptr; // CPU buffer type (set by build_schedule)

    std::vector<uint8_t> buf_compute_meta;     // scratch metadata buffer used while building graphs
    ggml_backend_buffer_t buf_output = nullptr; // CPU buffer for final outputs (see prep_output_buffer)
    ggml_backend_sched_t sched = nullptr;      // scheduler splitting graphs across the backends
    ggml_threadpool_t threadpool = nullptr;    // CPU threadpool installed by set_threads
    float * logits = nullptr;                  // points at the base of buf_output
    int n_threads;                             // CPU thread count used by set_threads

    // copies output_size bytes of output_tensor into 'output'; 'buffer' defaults to buf_output
    void get_ggml_node_data(struct ggml_tensor * output_tensor, float * output, size_t output_size, ggml_backend_buffer_t buffer = nullptr);
    void set_threads();
    void build_schedule(size_t max_nodes);
    bool prep_schedule(ggml_cgraph * gf);
    void prep_output_buffer(size_t new_size);
};
|
||||||
|
|
||||||
|
struct tts_model {
|
||||||
|
struct model_tensor_meta tensor_meta;
|
||||||
|
|
||||||
|
// this is the current byte offset into the model's buffer.
|
||||||
|
size_t offset = 0;
|
||||||
|
|
||||||
|
bool use_cross_attn = true;
|
||||||
|
|
||||||
|
ggml_backend_buffer_type_t buffer = nullptr;
|
||||||
|
ggml_backend_t backend = nullptr;
|
||||||
|
ggml_backend_buffer_t buf = nullptr;
|
||||||
|
|
||||||
|
// it is quite common for implementations of tts_model to need to update attributes or perform distinct operations
|
||||||
|
// when computing the tensor meta of the loaded model. This callback allows this as it will receive each processed tensor.
|
||||||
|
tensor_meta_callback compute_tensor_meta_cb = nullptr;
|
||||||
|
|
||||||
|
struct ggml_context * ctx;
|
||||||
|
|
||||||
|
void prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size);
|
||||||
|
void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only, std::string model_prefix, float size_offset = 1.4, uint32_t dedicated_add_on_size = 0);
|
||||||
|
void set_tensor(struct ggml_tensor * tensor, struct ggml_tensor * target);
|
||||||
|
size_t max_nodes();
|
||||||
|
void assign_weight(std::string name, ggml_tensor * tensor);
|
||||||
|
void free();
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
308
otherarch/ttscpp/src/ttsutil.cpp
Normal file
308
otherarch/ttscpp/src/ttsutil.cpp
Normal file
|
@ -0,0 +1,308 @@
|
||||||
|
#include "ttsutil.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#ifdef __APPLE__
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
#elif __linux__
|
||||||
|
#include <unistd.h>
|
||||||
|
#else
|
||||||
|
// windows stuff
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
 * Backing function for the TTS_ABORT macro: flushes stdout so buffered output
 * is not lost, prints "file:line: " plus the printf-style formatted message
 * to stderr, then terminates via abort().
 */
void tts_abort(const char * file, int line, const char * fmt, ...) {
    fflush(stdout);
    fprintf(stderr, "%s:%d: ", file, line);
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);
    abort();
}
|
||||||
|
|
||||||
|
/*
 * Extracts the first run of digits from a tensor name (optionally ignoring
 * the first 'skip' characters) and returns {layer_number, text after the
 * digits}. Returns {-1, name} when no digit is present past the skip point.
 */
std::pair<int, std::string> parse_layer_count(std::string name, int skip) {
    std::string digits;
    std::string tail;
    bool digits_done = false;
    const size_t first = skip > 0 ? (size_t) skip : 0;
    for (size_t i = first; i < name.size(); i++) {
        const char ch = name[i];
        if (digits_done) {
            tail += ch;
        } else if (std::isdigit((unsigned char) ch)) {
            digits += ch;
        } else if (!digits.empty()) {
            // first non-digit after the digit run: everything from here on is tail
            digits_done = true;
            tail += ch;
        }
        // characters before the first digit are discarded
    }
    if (digits.empty()) {
        return std::make_pair(-1, name);
    }
    return std::make_pair(std::stoi(digits), tail);
}
|
||||||
|
|
||||||
|
/*
 * Looks up each candidate key in the gguf metadata and returns the first
 * matching key id, or -1 when none of them is present.
 */
int search_for_gguf_keys(gguf_context * meta, std::vector<std::string> possible_keys) {
    for (const auto & candidate : possible_keys) {
        const int key_id = gguf_find_key(meta, candidate.c_str());
        if (key_id != -1) {
            return key_id;
        }
    }
    return -1;
}
|
||||||
|
|
||||||
|
/*
 * Fills tgt with count values drawn uniformly from [min, max).
 * The engine stays static so sequences remain reproducible across calls, but
 * the distribution is rebuilt per call: the original also made it static,
 * which froze (min, max) at whatever the *first* call passed and silently
 * ignored the parameters on every later call.
 */
void random_uniform_gen(int count, float * tgt, float min, float max) {
    static std::default_random_engine e;
    std::uniform_real_distribution<float> dis(min, max);
    for (int i = 0; i < count; i++) {
        tgt[i] = dis(e);
    }
}
|
||||||
|
|
||||||
|
/*
 * Fills tgt with count values drawn from a normal distribution N(mean, std).
 * As with random_uniform_gen, only the engine is static: a static
 * distribution (as originally written) locked mean/std to the first call's
 * arguments and ignored them afterwards.
 */
void random_normal_gen(int count, float * tgt, float mean, float std) {
    static std::default_random_engine e;
    std::normal_distribution<float> dis(mean, std);
    for (int i = 0; i < count; i++) {
        tgt[i] = dis(e);
    }
}
|
||||||
|
|
||||||
|
/*
 * Rounds v to 6 decimal places and returns the result as a float.
 * 1e6 is exactly representable and equals powl(10, 6), so the literal gives
 * identical results while avoiding two long-double pow evaluations per call.
 */
float round_to_float(double v) {
    return roundf(v * 1e6) / 1e6;
}
|
||||||
|
|
||||||
|
/*
 * Builds a graph node computing 1 / x for a column tensor x (x->ne[0] == 1),
 * implemented as a broadcast division of a constant 1.0f numerator by x.
 */
struct ggml_tensor * reciprocal(ggml_context * ctx, struct ggml_tensor * x) {
    TTS_ASSERT(x->ne[0] == 1);
    static constexpr float one = 1.0f;
    ggml_tensor * numerator = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, x->ne[1]);
    // stride trick so that the scalar numerator can be divided by x.
    numerator->nb[1] = 0;
    // NOTE(review): this points the tensor's data at a constexpr constant via
    // const_cast — it relies on ggml_div (and anything else in the graph)
    // only ever *reading* the numerator; confirm no op writes through it.
    numerator->data = const_cast<float *>(&one);
    return ggml_div(ctx, numerator, x);
}
|
||||||
|
|
||||||
|
// Described in https://arxiv.org/abs/2006.08195
|
||||||
|
// Snake1d is a common tunable activation function used in the DAC model.
|
||||||
|
struct ggml_tensor * snake_1d(ggml_context * ctx, struct ggml_tensor * alpha, struct ggml_tensor * a) {
|
||||||
|
assert(a->ne[2] == 1 && a->ne[3] == 1);
|
||||||
|
return ggml_add(ctx, a, ggml_mul(ctx, ggml_sqr(ctx, ggml_sin(ctx, ggml_mul(ctx, a, alpha))), reciprocal(ctx, alpha)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true when 'value' ends with 'suffix' (an empty suffix always matches).
bool has_suffix(std::string value, std::string suffix) {
    if (suffix.size() > value.size()) {
        return false;
    }
    // compare the two strings back-to-front over the suffix length
    return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin());
}
|
||||||
|
|
||||||
|
// Returns true when 'value' starts with 'prefix' (an empty prefix always matches).
bool has_prefix(std::string value, std::string prefix) {
    // rfind anchored at position 0 returns 0 only on an exact prefix match
    // (and npos when 'prefix' is longer than 'value').
    return value.rfind(prefix, 0) == 0;
}
|
||||||
|
|
||||||
|
/*
 * Wraps the ggml_stft op: validates the analysis window length and, for
 * one-sided output, trims the spectrum to n_fft / 2 + 1 bins.
 */
struct ggml_tensor * stft(ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * window, size_t n_fft, size_t hop, bool abs_and_angle, bool one_sided) {
    if (window->ne[0] != n_fft) {
        // report the length that was actually checked: the original printed
        // a->ne[0] (the wrong tensor) and used %d for size_t/int64_t varargs.
        TTS_ABORT("For #stft the window size, %lld, must be equal to n_fft, %zu.\n", (long long) window->ne[0], n_fft);
    }
    struct ggml_tensor * cur = ggml_stft(ctx, a, window, n_fft, hop, abs_and_angle);
    if (one_sided) {
        // keep only the non-redundant half of the symmetric spectrum
        cur = ggml_cont(ctx, ggml_view_4d(ctx, cur, ((int64_t) n_fft / 2) + 1, cur->ne[1], cur->ne[2], cur->ne[3], cur->nb[1], cur->nb[2], cur->nb[3], 0));
    }

    return cur;
}
|
||||||
|
|
||||||
|
/*
 * Wraps the ggml_istft op: validates the input bin count (n_fft, or
 * n_fft / 2 + 1 when one-sided) and normalizes by the accumulated squared
 * window (see compute_window_squared_sum).
 */
struct ggml_tensor * istft(ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * window_squared_sum, struct ggml_tensor * window, size_t n_fft, size_t hop, bool abs_and_angle, bool one_sided) {
    if ((!one_sided && a->ne[0] != n_fft) || (one_sided && a->ne[0] != n_fft / 2 + 1)) {
        // %lld/%zu: ne[0] is int64_t and n_fft is size_t — the original's %d
        // mis-sized the varargs on 64-bit targets.
        TTS_ABORT("For #istft the window_size, %lld, must be either equal to n_fft, %zu, or, when one sided, n_fft / 2 + 1, %zu.\n", (long long) a->ne[0], n_fft, n_fft/2+1);
    }
    struct ggml_tensor * cur = ggml_istft(ctx, a, window, n_fft, hop, abs_and_angle);
    cur = ggml_div(ctx, cur, window_squared_sum);
    return cur;
}
|
||||||
|
|
||||||
|
/*
 * Appends an n_fft-point Hann window, w[i] = sin^2(pi * i / n_fft), to tgt.
 * Uses a size_t index (the original mixed int with size_t) and reserves the
 * capacity up front so the loop performs no reallocation.
 */
void hann_window(size_t n_fft, std::vector<float> & tgt) {
    tgt.reserve(tgt.size() + n_fft);
    for (size_t i = 0; i < n_fft; i++) {
        const double s = sin(M_PI * (double) i / (double) n_fft);
        tgt.push_back((float)(s * s));
    }
}
|
||||||
|
|
||||||
|
// This is a custom map op for computing noise and relevant voiced sections.
// Scalar parameters are packed at the front of c->data:
//   [0] voice_threshold, [1] noise_std, [2] sin_amp, [3] sin_amp_div,
// followed by pre-generated random values used to scale the noise.
void uv_noise_compute(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata) {
    float voice_threshold = ((float *) c->data)[0];
    float noise_std = ((float *) c->data)[1];
    float sin_amp = ((float *) c->data)[2];
    float sin_amp_div = ((float *) c->data)[3];
    float * rand_init = ((float *) c->data) + 4;

    // split b's first dimension evenly across nth threads; this thread (ith)
    // handles positions [start, end)
    const int rpt = (b->ne[0] + nth - 1)/nth;
    const int start = ith * rpt;
    const int end = MIN((ith + 1) * rpt, b->ne[0]);

    // dst holds two planes: uv (voiced amplitude) at offset 0, noise at nb[2]
    float * uv_dst = (float *) dst->data;
    float * noise_dst = (float *)((char*)dst->data + dst->nb[2]);
    float * tgt = (float *) b->data;

    // NOTE(review): none of the indices below incorporate bt, so every batch
    // iteration rewrites the same plane — presumably b->ne[2] is always 1
    // for this op; confirm against the Kokoro call site.
    for(int bt = 0; bt < b->ne[2]; bt++) {
        for(int r = start; r < end; r++) {
            if (tgt[r] > voice_threshold) {
                // voiced frame: constant sine amplitude, noise scaled by noise_std
                for (int h = 0; h < a->ne[1]; h++) {
                    int index = h*dst->ne[0]+r;
                    uv_dst[index] = sin_amp;
                    noise_dst[index] = noise_std * rand_init[index];
                }
            } else {
                // unvoiced frame: zero amplitude, noise scaled by sin_amp_div
                for (int h = 0; h < a->ne[1]; h++) {
                    int index = h*dst->ne[0]+r;
                    uv_dst[index] = 0.0f;
                    noise_dst[index] = sin_amp_div * rand_init[index];
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
// This is a custom map op for applying cfg scale. It is used at the terminus of logit generation in Dia.
|
||||||
|
void cfg_scale(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata) {
|
||||||
|
const float scale = ((float *) userdata)[0];
|
||||||
|
const float max_output = ((float*) userdata)[1];
|
||||||
|
const int rpt = (b->ne[0] + nth - 1)/nth;
|
||||||
|
const int start = ith * rpt;
|
||||||
|
const int end = MIN((ith + 1) * rpt, b->ne[0]);
|
||||||
|
|
||||||
|
float * output = (float *) dst->data;
|
||||||
|
float * cond = (float *) a->data;
|
||||||
|
float * uncond = (float *) b->data;
|
||||||
|
|
||||||
|
for(int bt = 0; bt < b->ne[2]; bt++) {
|
||||||
|
for (int h = 0; h < b->ne[1]; h++) {
|
||||||
|
int i = (h * b->ne[0]) + (bt * b->ne[0] * b->ne[1]);
|
||||||
|
for(int r = start; r < end; r++) {
|
||||||
|
// only let the output heads yield tokens up to EOS
|
||||||
|
if (r > max_output) {
|
||||||
|
output[i+r] = -INFINITY;
|
||||||
|
}
|
||||||
|
const float cr = cond[i+r];
|
||||||
|
const float ur = uncond[i+r];
|
||||||
|
output[i+r] = cr + scale * (cr - ur);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// currently this assumes a center view in which the output vector is reflectively padded by n_fft / 2 on each side.
|
||||||
|
void compute_window_squared_sum(size_t n_fft, size_t hop, size_t n_frames, float * tgt, float * window) {
|
||||||
|
size_t cutoff = n_frames * hop;
|
||||||
|
size_t half = n_fft / 2;
|
||||||
|
std::memset(tgt, 0, cutoff*sizeof(float));
|
||||||
|
// istft applies half / hop steps before the beginning of the sequence. We need to account for these accumulated windows.
|
||||||
|
for (int i = 0; i < n_frames + (half / hop); i++) {
|
||||||
|
for (int ii = 0; ii < n_fft; ii++) {
|
||||||
|
int index = ii + i*hop - half;
|
||||||
|
if (index < 0 || index >= cutoff) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
tgt[index] += powf(window[ii], 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Splits 'target' on any character present in 'split_on'. Empty segments are
 * dropped; when include_split_characters is true, each delimiter is emitted
 * as its own one-character element in order.
 */
std::vector<std::string> split(std::string target, std::string split_on, bool include_split_characters) {
    std::vector<std::string> parts;
    size_t segment_start = 0;

    for (size_t i = 0; i < target.size(); i++) {
        if (split_on.find(target[i]) == std::string::npos) {
            continue;
        }
        // delimiter hit: flush any non-empty segment collected so far
        if (i > segment_start) {
            parts.push_back(target.substr(segment_start, i - segment_start));
        }
        if (include_split_characters) {
            parts.push_back(target.substr(i, 1));
        }
        segment_start = i + 1;
    }
    if (segment_start < target.size()) {
        parts.push_back(target.substr(segment_start));
    }

    return parts;
}
|
||||||
|
|
||||||
|
/*
 * Single-character overload: splits 'target' on 'split_on'. Empty segments
 * are dropped; when include_split_characters is true, each delimiter is
 * emitted as its own one-character element in order.
 */
std::vector<std::string> split(std::string target, const char split_on, bool include_split_characters) {
    std::vector<std::string> parts;
    size_t segment_start = 0;

    for (size_t i = 0; i < target.size(); i++) {
        if (target[i] != split_on) {
            continue;
        }
        // delimiter hit: flush any non-empty segment collected so far
        if (i > segment_start) {
            parts.push_back(target.substr(segment_start, i - segment_start));
        }
        if (include_split_characters) {
            parts.push_back(target.substr(i, 1));
        }
        segment_start = i + 1;
    }
    if (segment_start < target.size()) {
        parts.push_back(target.substr(segment_start));
    }

    return parts;
}
|
||||||
|
|
||||||
|
/*
 * Removes every leading and trailing character of 'target' that appears in
 * 'vals' (defaults to spaces via the header declaration) and returns the
 * trimmed copy.
 */
std::string strip(std::string target, std::string vals) {
    const auto keep = [&vals](unsigned char ch) {
        return vals.find(ch) == std::string::npos;
    };
    // trim the front up to the first kept character
    target.erase(target.begin(), std::find_if(target.begin(), target.end(), keep));
    // trim the back after the last kept character
    target.erase(std::find_if(target.rbegin(), target.rend(), keep).base(), target.end());
    return target;
}
|
||||||
|
|
||||||
|
/*
 * Replaces every occurrence of each character in 'to_replace' with the string
 * 'replacement'. The search resumes *after* each inserted replacement so a
 * replacement containing one of the target characters cannot retrigger
 * itself — the original restarted from position 0 each time and would loop
 * forever in that case (e.g. replacing "a" with "aa").
 */
std::string replace_any(std::string target, std::string to_replace, std::string replacement) {
    for (size_t i = 0; i < to_replace.size(); i++) {
        size_t position = target.find(to_replace[i]);
        while (position != std::string::npos) {
            target.replace(position, 1, replacement);
            position = target.find(to_replace[i], position + replacement.size());
        }
    }
    return target;
}
|
||||||
|
|
||||||
|
struct model_tensor_meta compute_tensor_meta(std::string name_prefix, ggml_context * weight_ctx, std::function<void(ggml_tensor*)>* callback) {
|
||||||
|
model_tensor_meta meta;
|
||||||
|
for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
|
||||||
|
if (callback) {
|
||||||
|
(*callback)(cur);
|
||||||
|
}
|
||||||
|
std::string::size_type pos = std::string(cur->name).find(".", 0);
|
||||||
|
std::string top_level(std::string(cur->name).substr(0, pos));
|
||||||
|
if (top_level == name_prefix) {
|
||||||
|
meta.n_tensors += 1;
|
||||||
|
meta.n_bytes += ggml_nbytes_pad(cur);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return meta;
|
||||||
|
}
|
71
otherarch/ttscpp/src/ttsutil.h
Normal file
71
otherarch/ttscpp/src/ttsutil.h
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
#ifndef util_h
|
||||||
|
#define util_h
|
||||||
|
|
||||||
|
#define _USE_MATH_DEFINES
|
||||||
|
#include <cmath>
|
||||||
|
#include <functional>
|
||||||
|
#include <random>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml-alloc.h"
|
||||||
|
#include "ggml-cpu.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "ggml-impl.h"
|
||||||
|
#include "ggml-cpp.h"
|
||||||
|
|
||||||
|
#define TTS_ABORT(...) tts_abort(__FILE__, __LINE__, __VA_ARGS__)
|
||||||
|
#define TTS_ASSERT(x) if (!(x)) TTS_ABORT("TTS_ASSERT(%s) failed", #x)
|
||||||
|
|
||||||
|
// Aggregate summary of the tensors belonging to one model prefix:
// how many there are and how many (padded) bytes their data occupies.
// Populated by compute_tensor_meta.
struct model_tensor_meta {
    uint32_t n_tensors = 0; // number of tensors whose name starts with the model prefix
    size_t n_bytes = 0;     // total padded byte size of those tensors (ggml_nbytes_pad)
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Both of these random fill the tgt array with count random floating point values.
|
||||||
|
* the default parameter values are consistent with pytorch random function defaults.
|
||||||
|
*/
|
||||||
|
void random_uniform_gen(int count, float * tgt, float min = 0.0f, float max = 1.0f);
|
||||||
|
void random_normal_gen(int count, float * tgt, float mean = 0.0f, float std = 1.0f);
|
||||||
|
|
||||||
|
std::pair<int, std::string> parse_layer_count(std::string name, int skip = 0);
|
||||||
|
|
||||||
|
struct model_tensor_meta compute_tensor_meta(std::string name_prefix, ggml_context * weight_ctx, std::function<void(ggml_tensor*)>* callback = nullptr);
|
||||||
|
struct ggml_tensor * snake_1d(ggml_context * ctx, struct ggml_tensor * alpha, struct ggml_tensor * a);
|
||||||
|
int search_for_gguf_keys(gguf_context * meta, std::vector<std::string> possible_keys);
|
||||||
|
|
||||||
|
// a simple window function for stft
|
||||||
|
void hann_window(size_t n_fft, std::vector<float>& tgt);
|
||||||
|
|
||||||
|
// currently this assumes a center view in which the output vector is reflectively padded by n_fft / 2 on each side.
|
||||||
|
void compute_window_squared_sum(size_t n_fft, size_t hop, size_t n_frames, float * tgt, float * window);
|
||||||
|
|
||||||
|
// these functions wrap the stft and istft ggml ops and compute the necessary view and division ops for their indepentent settings.
|
||||||
|
struct ggml_tensor * stft(ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * window, size_t n_fft, size_t hop, bool abs_and_angle, bool one_sided);
|
||||||
|
struct ggml_tensor * istft(ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * window_squared_sum, struct ggml_tensor * window, size_t n_fft, size_t hop, bool abs_and_angle, bool one_sided);
|
||||||
|
|
||||||
|
// This is a custom op for sine_generation in the Kokoro model.
|
||||||
|
void uv_noise_compute(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
|
||||||
|
|
||||||
|
// This is a custom op for logit correction in the Dia model.
|
||||||
|
void cfg_scale(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
|
||||||
|
|
||||||
|
struct ggml_tensor * reciprocal(ggml_context * ctx, struct ggml_tensor * x);
|
||||||
|
|
||||||
|
bool has_suffix(std::string value, std::string suffix);
|
||||||
|
bool has_prefix(std::string value, std::string prefix);
|
||||||
|
|
||||||
|
std::vector<std::string> split(std::string target, std::string split_on, bool include_split_characters = false);
|
||||||
|
std::vector<std::string> split(std::string target, const char split_on, bool include_split_characters = false);
|
||||||
|
std::string strip(std::string target, std::string vals = " ");
|
||||||
|
std::string replace_any(std::string target, std::string to_replace, std::string replacement);
|
||||||
|
|
||||||
|
[[noreturn]] void tts_abort(const char * file, int line, const char * fmt, ...);
|
||||||
|
|
||||||
|
#endif
|
Loading…
Add table
Add a link
Reference in a new issue