diff --git a/Makefile b/Makefile
index 7ee0cb9aa..9eed29485 100644
--- a/Makefile
+++ b/Makefile
@@ -55,8 +55,8 @@ ifdef KCPP_SANITIZE
 CFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
 CXXFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
 endif
-CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
-CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
+CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
+CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
 ifndef KCPP_DEBUG
 CFLAGS += -DNDEBUG -s
 CXXFLAGS += -DNDEBUG -s
@@ -729,6 +729,8 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+ttscppmain: otherarch/ttscpp/examples/cli/cli.cpp otherarch/ttscpp/examples/cli/playback.cpp otherarch/ttscpp/examples/cli/playback.h otherarch/ttscpp/examples/cli/write_file.cpp otherarch/ttscpp/examples/cli/write_file.h otherarch/ttscpp/examples/cli/vad.cpp otherarch/ttscpp/examples/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ggml/src/ggml-vulkan-shaders.cpp:
 ifdef VULKAN_BUILD
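The new `ttscppmain` target compiles the vendored TTS.cpp CLI sources directly against KoboldCpp's existing ggml/llama objects rather than building a separate library. For orientation, a minimal consumer of the same API surface looks roughly like this; a sketch based on the calls made in the cli.cpp added below, where the model path, thread count, and sampling values are illustrative placeholders:

```cpp
// Sketch only: mirrors the runner_from_file -> generate -> write_audio_file flow
// used by otherarch/ttscpp/examples/cli/cli.cpp below.
#include "tts.h"
#include "write_file.h"

int main() {
    // Arguments follow the order cli.cpp passes them: voice, top_k, temperature,
    // repetition penalty, use cross attention, espeak voice id, max tokens, top_p.
    generation_configuration * config = new generation_configuration(
        "af_alloy", 50, 1.0f, 1.0f, true, "", 0, 1.0f);
    // Final argument selects CPU-only operation (cli.cpp passes !--use-metal).
    tts_runner * runner = runner_from_file("/path/to/model.gguf", 4, config, true);
    tts_response data;
    generate(runner, "Hello from TTS.cpp inside KoboldCpp.", &data, config);
    if (data.n_outputs > 0) {
        write_audio_file(data, "out.wav", runner->sampling_rate);
    }
    return 0;
}
```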
diff --git a/README.md b/README.md
index 5e4ea3808..484cebc52 100644
--- a/README.md
+++ b/README.md
@@ -191,6 +191,7 @@ and it will install everything required. Alternatively, you can download the abo
 - KoboldCpp code and other files are also under the AGPL v3.0 License unless otherwise stated
 - Llama.cpp source repo is at https://github.com/ggml-org/llama.cpp (MIT)
 - Stable-diffusion.cpp source repo is at https://github.com/leejet/stable-diffusion.cpp (MIT)
+- TTS.cpp source repo is at https://github.com/mmwillet/TTS.cpp (MIT)
 - KoboldCpp source repo is at https://github.com/LostRuins/koboldcpp (AGPL)
 - KoboldAI Lite source repo is at https://github.com/LostRuins/lite.koboldai.net (AGPL)
 - For any further enquiries, contact @concedo on discord, or LostRuins on github.
diff --git a/otherarch/ttscpp/TTSCPP_LICENSE b/otherarch/ttscpp/TTSCPP_LICENSE
new file mode 100644
index 000000000..b5479530c
--- /dev/null
+++ b/otherarch/ttscpp/TTSCPP_LICENSE
@@ -0,0 +1,24 @@
+The original TTS.cpp is made by mmwillet, repo can be found at https://github.com/mmwillet/TTS.cpp
+KoboldCpp uses a minimal implementation with some files removed.
+
+MIT License
+
+Copyright (c) 2023-2024 The ggml authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
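The CLI and phonemize examples below are all built on TTS.cpp's small `arg_list` helper from ttsargs.h. A hedged sketch of the pattern follows; the flag names and default here are illustrative only, the real flags are defined in cli.cpp below:

```cpp
// Sketch of the arg_list pattern used by the examples below (types from ttsargs.h).
#include "ttsargs.h"
#include <cstdio>

int main(int argc, const char ** argv) {
    int default_threads = 4; // illustrative default
    arg_list args;
    args.add_argument(string_arg("--model-path", "path to the gguf model", "-mp", true));
    args.add_argument(int_arg("--n-threads", "cpu threads to use", "-nt", false, &default_threads));
    args.parse(argc, argv);
    if (args.for_help) {
        args.help();
        return 0;
    }
    args.validate(); // enforces required arguments, per its use in cli.cpp
    printf("model: %s, threads: %d\n",
           args.get_string_param("--model-path").c_str(),
           *args.get_int_param("--n-threads"));
    return 0;
}
```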
diff --git a/otherarch/ttscpp/examples/cli/cli.cpp b/otherarch/ttscpp/examples/cli/cli.cpp
new file mode 100644
index 000000000..5cb9bc2f9
--- /dev/null
+++ b/otherarch/ttscpp/examples/cli/cli.cpp
@@ -0,0 +1,96 @@
+#include "tts.h"
+#include "ttsargs.h"
+#include "ttscommon.h"
+#include "playback.h"
+#include "vad.h"
+#include "write_file.h"
+#include <thread>
+
+class tts_timing_printer {
+    const int64_t start_us{[] {
+        ggml_time_init();
+        return ggml_time_us();
+    }()};
+public:
+    ~tts_timing_printer() {
+        const int64_t end_us{ggml_time_us()};
+        // Just a simple "total time" for now before adding "load" / "prompt eval" / "eval" from llama_print_timings
+        printf("total time = %.2f ms\n", (end_us - start_us) / 1000.0f);
+    }
+};
+
+int main(int argc, const char ** argv) {
+    const tts_timing_printer _{};
+    float default_temperature = 1.0f;
+    int default_n_threads = std::max((int)std::thread::hardware_concurrency(), 1);
+    int default_top_k = 50;
+    int default_max_tokens = 0;
+    float default_repetition_penalty = 1.0f;
+    float default_top_p = 1.0f;
+    arg_list args;
+    args.add_argument(string_arg("--model-path", "(REQUIRED) The local path of the gguf model file for Parler TTS mini or large v1, Dia, or Kokoro.", "-mp", true));
+    args.add_argument(string_arg("--prompt", "(REQUIRED) The text prompt for which to generate audio, in quotation marks.", "-p", true));
+    args.add_argument(string_arg("--save-path", "(OPTIONAL) The path to save the audio output to in a .wav format. Defaults to TTS.cpp.wav", "-sp", false, "TTS.cpp.wav"));
+    args.add_argument(float_arg("--temperature", "The temperature to use when generating outputs. Defaults to 1.0.", "-t", false, &default_temperature));
+    args.add_argument(int_arg("--n-threads", "The number of cpu threads to run generation with. Defaults to hardware concurrency. If hardware concurrency cannot be determined then it defaults to 1.", "-nt", false, &default_n_threads));
+    args.add_argument(int_arg("--topk", "(OPTIONAL) When set to an integer value greater than 0 generation uses nucleus sampling over the top-k nucleus size. Defaults to 50.", "-tk", false, &default_top_k));
+    args.add_argument(float_arg("--repetition-penalty", "The per-channel repetition penalty to be applied to the sampled output of the model. Defaults to 1.0.", "-r", false, &default_repetition_penalty));
+    args.add_argument(bool_arg("--use-metal", "(OPTIONAL) Whether to use metal acceleration", "-m"));
+    args.add_argument(bool_arg("--no-cross-attn", "(OPTIONAL) Whether to not include cross attention", "-ca"));
+    args.add_argument(string_arg("--conditional-prompt", "(OPTIONAL) A distinct conditional prompt to use for generating. If none is provided the preencoded prompt is used. '--text-encoder-path' must be set to use conditional generation.", "-cp", false));
+    args.add_argument(string_arg("--text-encoder-path", "(OPTIONAL) The local path of the text encoder gguf model for conditional generation.", "-tep", false));
+    args.add_argument(string_arg("--voice", "(OPTIONAL) The voice to use to generate the audio. This is only used for models with voice packs.", "-v", false, "af_alloy"));
+    args.add_argument(bool_arg("--vad", "(OPTIONAL) Whether to apply voice inactivity detection (VAD) and strip silence from the end of the output (particularly useful for Parler TTS). By default, no VAD is applied.", "-va"));
+    args.add_argument(string_arg("--espeak-voice-id", "(OPTIONAL) The espeak voice id to use for phonemization. This should only be specified when the correct espeak voice cannot be inferred from the kokoro voice (see MultiLanguage Configuration in the README for more info).", "-eid", false));
+    args.add_argument(int_arg("--max-tokens", "(OPTIONAL) The max audio tokens or token batches to generate, where each represents approximately 11 ms of audio. Only applied to Dia generation. If set to zero (its default) the default max generation size is used. Warning: values under 15 are not supported.", "-mt", false, &default_max_tokens));
+    args.add_argument(float_arg("--top-p", "(OPTIONAL) The sum of probabilities to sample over. Must be a value between 0.0 and 1.0. Defaults to 1.0.", "-tp", false, &default_top_p));
+    register_play_tts_response_args(args);
+    args.parse(argc, argv);
+    if (args.for_help) {
+        args.help();
+        exit(0);
+    }
+    args.validate();
+
+    std::string conditional_prompt = args.get_string_param("--conditional-prompt");
+    std::string text_encoder_path = args.get_string_param("--text-encoder-path");
+    if (conditional_prompt.size() > 0 && text_encoder_path.size() <= 0) {
+        fprintf(stderr, "The '--text-encoder-path' must be specified when '--conditional-prompt' is passed.\n");
+        exit(1);
+    }
+
+    if (*args.get_float_param("--top-p") > 1.0f || *args.get_float_param("--top-p") <= 0.0f) {
+        fprintf(stderr, "The '--top-p' value must be between 0.0 and 1.0. It was set to '%.6f'.\n", *args.get_float_param("--top-p"));
+        exit(1);
+    }
+
+    generation_configuration * config = new generation_configuration(
+        args.get_string_param("--voice"),
+        *args.get_int_param("--topk"),
+        *args.get_float_param("--temperature"),
+        *args.get_float_param("--repetition-penalty"),
+        !args.get_bool_param("--no-cross-attn"),
+        args.get_string_param("--espeak-voice-id"),
+        *args.get_int_param("--max-tokens"),
+        *args.get_float_param("--top-p"));
+
+    struct tts_runner * runner = runner_from_file(args.get_string_param("--model-path"), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"));
+
+    if (conditional_prompt.size() > 0) {
+        update_conditional_prompt(runner, text_encoder_path, conditional_prompt, true);
+    }
+    tts_response data;
+
+    generate(runner, args.get_string_param("--prompt"), &data, config);
+    if (data.n_outputs == 0) {
+        fprintf(stderr, "Got empty response for prompt, '%s'.\n", args.get_string_param("--prompt").c_str());
+        exit(1);
+    }
+    if (args.get_bool_param("--vad")) {
+        apply_energy_voice_inactivity_detection(data, runner->sampling_rate);
+    }
+    if (!play_tts_response(args, data, runner->sampling_rate)) {
+        write_audio_file(data, args.get_string_param("--save-path"), runner->sampling_rate);
+    }
+    return 0;
+}
diff --git a/otherarch/ttscpp/examples/cli/playback.cpp b/otherarch/ttscpp/examples/cli/playback.cpp
new file mode 100644
index 000000000..1659c8583
--- /dev/null
+++ b/otherarch/ttscpp/examples/cli/playback.cpp
@@ -0,0 +1,62 @@
+#include <cstdio>
+#include "playback.h"
+
+#ifndef SDL2_INSTALL
+void register_play_tts_response_args(arg_list & args) {
+    // Hide --play
+}
+
+bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate) {
+    return false;
+}
+#else
+#include "SDL.h"
+void register_play_tts_response_args(arg_list & args) {
+    args.add_argument(bool_arg("--play", "(OPTIONAL) Whether to play back the audio immediately instead of saving it to file."));
+}
+
+bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate) {
+    if (!args.get_bool_param("--play")) {
+        return false;
+    }
+
+    if (SDL_Init(SDL_INIT_AUDIO)) {
"SDL_INIT failed\n"); + exit(1); + } + + const SDL_AudioSpec desired{ + .freq = static_cast(sample_rate), + .format = AUDIO_F32, + .channels = 1, + .silence = 0, + .padding = 0, + .size = static_cast(data.n_outputs), + .callback = nullptr, + .userdata = nullptr, + }; + const SDL_AudioDeviceID dev = SDL_OpenAudioDevice(nullptr, false, &desired, nullptr, 0); + if (!dev) { + fprintf(stderr, "SDL_OpenAudioDevice failed\n"); + exit(1); + } + + SDL_PauseAudioDevice(dev, false); + fprintf(stdout, "Playing %ld samples of audio\n", data.n_outputs); + if (SDL_QueueAudio(dev, data.data, data.n_outputs * sizeof(data.data[0]))) { + fprintf(stderr, "SDL_QueueAudio failed\n"); + exit(1); + } + + SDL_Event event; + while (SDL_GetQueuedAudioSize(dev)) { + if (SDL_PollEvent(&event) && event.type == SDL_QUIT) break; + SDL_Delay(100); + } + + SDL_CloseAudioDevice(dev); + SDL_Quit(); + + return true; +} +#endif diff --git a/otherarch/ttscpp/examples/cli/playback.h b/otherarch/ttscpp/examples/cli/playback.h new file mode 100644 index 000000000..346b67459 --- /dev/null +++ b/otherarch/ttscpp/examples/cli/playback.h @@ -0,0 +1,7 @@ +#pragma once + +#include "ttsargs.h" +#include "ttscommon.h" + +void register_play_tts_response_args(arg_list & args); +bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate); diff --git a/otherarch/ttscpp/examples/cli/vad.cpp b/otherarch/ttscpp/examples/cli/vad.cpp new file mode 100644 index 000000000..9468ef2fa --- /dev/null +++ b/otherarch/ttscpp/examples/cli/vad.cpp @@ -0,0 +1,68 @@ +#include "vad.h" + +float energy(float * chunk, int count) { + float en = 0.0f; + for (int i = 0; i < count; i++) { + en += powf(chunk[i], 2.0f); + } + return en; +} + +void apply_energy_voice_inactivity_detection( + tts_response & data, + float sample_rate, + int ms_per_frame, + int frame_threshold, + float normalized_energy_threshold, + int trailing_silent_frames, + int early_cutoff_seconds_threshold, + float early_cutoff_energy_threshold) { + int samples_per_frame = (int) (ms_per_frame * sample_rate / 1000.0f); + int n_frames = (int) (data.n_outputs / samples_per_frame); + int early_cuttoff_frames = (int)((early_cutoff_seconds_threshold * 1000) / ms_per_frame); + + // for min-max normalization + float max_energy = 0.0f; + float min_energy = 0.0f; + float * energies = (float *) malloc(n_frames * sizeof(float)); + int silent_frames = 0; + + // compute the energies and the necessary elements for min-max normalization + for (int i = 0; i < n_frames; i++) { + float * chunk = data.data + i * samples_per_frame; + energies[i] = energy(chunk, samples_per_frame); + if (i == 0) { + max_energy = energies[i]; + min_energy = energies[i]; + } else if (energies[i] > max_energy) { + max_energy = energies[i]; + } else if (energies[i] < min_energy) { + min_energy = energies[i]; + } + if (energies[i] <= early_cutoff_energy_threshold) { + silent_frames++; + } else { + silent_frames = 0; + } + if (silent_frames >= early_cuttoff_frames) { + data.n_outputs = (i + trailing_silent_frames - silent_frames) * samples_per_frame; + free(energies); + return; + } + } + + int concurrent_silent_frames = 0; + + for (int i = n_frames; i > 0; i--) { + float frame_energy = (energies[i-1] - min_energy) / (max_energy - min_energy); + if (frame_energy < normalized_energy_threshold) { + concurrent_silent_frames++; + } else { + break; + } + } + if (concurrent_silent_frames >= frame_threshold) { + data.n_outputs -= ((concurrent_silent_frames - trailing_silent_frames) * samples_per_frame); + } + free(energies); 
+}
diff --git a/otherarch/ttscpp/examples/cli/vad.h b/otherarch/ttscpp/examples/cli/vad.h
new file mode 100644
index 000000000..109da29b3
--- /dev/null
+++ b/otherarch/ttscpp/examples/cli/vad.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <math.h>
+#include "ttscommon.h"
+
+float energy(float * chunk, int count);
+
+/*
+ * This function is used to trim trailing silence at the end of audio data within the tts_response struct.
+ * It detects silence by min-max normalizing energy and trimming frames which fall under a relative threshold.
+ */
+void apply_energy_voice_inactivity_detection(
+    tts_response & data,
+    float sample_rate = 44100.0f,              // the sample rate of the audio
+    int ms_per_frame = 10,                     // the audio time per frame
+    int frame_threshold = 20,                  // the number of trailing empty frames upon which silence is clipped
+    float normalized_energy_threshold = 0.01f, // the normalized threshold to determine a silent frame
+    int trailing_silent_frames = 5,            // the number of frames of silence to allow
+    int early_cutoff_seconds_threshold = 3,    // the number of seconds of complete silence before terminating and cutting audio early
+    float early_cutoff_energy_threshold = 0.1f // the energy threshold for treating a frame as silent for early cutoff
+);
diff --git a/otherarch/ttscpp/examples/cli/write_file.cpp b/otherarch/ttscpp/examples/cli/write_file.cpp
new file mode 100644
index 000000000..939357277
--- /dev/null
+++ b/otherarch/ttscpp/examples/cli/write_file.cpp
@@ -0,0 +1,12 @@
+#include <cstdio>
+#include "write_file.h"
+#include "audio_file.h"
+
+void write_audio_file(const tts_response & data, std::string path, float sample_rate) {
+    fprintf(stdout, "Writing audio file: %s\n", path.c_str());
+    AudioFile<float> file;
+    file.setSampleRate(sample_rate);
+    file.samples[0] = std::vector<float>(data.data, data.data + data.n_outputs);
+    file.save(path, AudioFileFormat::Wave);
+    file.printSummary();
+}
diff --git a/otherarch/ttscpp/examples/cli/write_file.h b/otherarch/ttscpp/examples/cli/write_file.h
new file mode 100644
index 000000000..929391b1c
--- /dev/null
+++ b/otherarch/ttscpp/examples/cli/write_file.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "ttscommon.h"
+
+void write_audio_file(const tts_response & data, std::string path = "TTS.cpp.wav", float sample_rate = 44100.0f);
diff --git a/otherarch/ttscpp/examples/phonemize/phonemize.cpp b/otherarch/ttscpp/examples/phonemize/phonemize.cpp
new file mode 100644
index 000000000..a9f794bf5
--- /dev/null
+++ b/otherarch/ttscpp/examples/phonemize/phonemize.cpp
@@ -0,0 +1,31 @@
+#include "phonemizer.h"
+#include "ttsargs.h"
+#include <cstdio>
+
+int main(int argc, const char ** argv) {
+    arg_list args;
+    args.add_argument(string_arg("--phonemizer-path", "(OPTIONAL) The local path of the gguf phonemizer file for the TTS.cpp phonemizer. This is required if not using espeak.", "-mp"));
+    args.add_argument(string_arg("--prompt", "(REQUIRED) The text prompt to phonemize.", "-p", true));
+    args.add_argument(bool_arg("--use-espeak", "(OPTIONAL) Whether to use espeak to generate phonemes.", "-ue"));
+    args.add_argument(string_arg("--espeak-voice-id", "(OPTIONAL) The voice id to use for espeak phonemization. Defaults to 'gmw/en-US'.", "-eid", false, "gmw/en-US"));
+    args.parse(argc, argv);
+    if (args.for_help) {
+        args.help();
+        return 0;
+    }
+    args.validate();
+    if (!args.get_bool_param("--use-espeak") && args.get_string_param("--phonemizer-path") == "") {
+        fprintf(stderr, "The '--phonemizer-path' must be specified when '--use-espeak' is not true.\n");
+        exit(1);
+    }
+
+    phonemizer * ph;
+    if (args.get_bool_param("--use-espeak")) {
+        ph = espeak_phonemizer(false, args.get_string_param("--espeak-voice-id"));
+    } else {
+        ph = phonemizer_from_file(args.get_string_param("--phonemizer-path"));
+    }
+    std::string response = ph->text_to_phonemes(args.get_string_param("--prompt"));
+    fprintf(stdout, "%s\n", response.c_str());
+    return 0;
+}
diff --git a/otherarch/ttscpp/include/audio_file.h b/otherarch/ttscpp/include/audio_file.h
new file mode 100644
index 000000000..dd1f50fb3
--- /dev/null
+++ b/otherarch/ttscpp/include/audio_file.h
@@ -0,0 +1,1815 @@
+//=======================================================================
+/** @file AudioFile.h
+ *  @author Adam Stark
+ *  @copyright Copyright (C) 2017 Adam Stark
+ *
+ *  This file is part of the 'AudioFile' library
+ *
+ *  MIT License
+ *
+ *  Copyright (c) 2017 Adam Stark
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to deal
+ *  in the Software without restriction, including without limitation the rights
+ *  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ *  of the Software, and to permit persons to whom the Software is furnished to do so,
+ *  subject to the following conditions:
+ *
+ *  The above copyright notice and this permission notice shall be included in all
+ *  copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ *  INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ *  PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ *  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+//=======================================================================
+
+#ifndef _AS_AudioFile_h
+#define _AS_AudioFile_h
+
+#if defined (_MSC_VER)
+#undef max
+#undef min
+#define NOMINMAX
+#endif
+
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <string>
+#include <cstring>
+#include <fstream>
+#include <unordered_map>
+#include <iterator>
+#include <algorithm>
+#include <limits>
+
+// disable some warnings on Windows
+#if defined (_MSC_VER)
+    __pragma(warning (push))
+    __pragma(warning (disable : 4244))
+    __pragma(warning (disable : 4457))
+    __pragma(warning (disable : 4458))
+    __pragma(warning (disable : 4389))
+    __pragma(warning (disable : 4996))
+#elif defined (__GNUC__)
+    _Pragma("GCC diagnostic push")
+    _Pragma("GCC diagnostic ignored \"-Wconversion\"")
+    _Pragma("GCC diagnostic ignored \"-Wsign-compare\"")
+    _Pragma("GCC diagnostic ignored \"-Wshadow\"")
+#endif
+
+//=============================================================
+/** The different types of audio file, plus some other types to
+ * indicate a failure to load a file, or that one hasn't been
+ * loaded yet
+ */
+enum class AudioFileFormat
+{
+    Error,
+    NotLoaded,
+    Wave,
+    Aiff
+};
+
+//=============================================================
+template <class T>
+class AudioFile
+{
+public:
+
+    //=============================================================
+    typedef std::vector<std::vector<T>> AudioBuffer;
+
+    //=============================================================
+    /** Constructor */
+    AudioFile();
+
+    /** Constructor, using a given file path to load a file */
+    AudioFile (std::string filePath);
+
+    //=============================================================
+    /** Loads an audio file from a given file path.
+     * @Returns true if the file was successfully loaded
+     */
+    bool load (std::string filePath);
+
+    /** Saves an audio file to a given file path.
+     * @Returns true if the file was successfully saved
+     */
+    bool save (std::string filePath, AudioFileFormat format = AudioFileFormat::Wave);
+
+    /** Writes audio data to fileData.
+     * @Returns true if the write was successful
+     */
+    bool writeData (std::vector<uint8_t>& fileData, AudioFileFormat format = AudioFileFormat::Wave);
+
+    //=============================================================
+    /** Loads an audio file from data in memory */
+    bool loadFromMemory (std::vector<uint8_t>& fileData);
+
+    //=============================================================
+    /** @Returns the sample rate */
+    uint32_t getSampleRate() const;
+
+    /** @Returns the number of audio channels in the buffer */
+    int getNumChannels() const;
+
+    /** @Returns true if the audio file is mono */
+    bool isMono() const;
+
+    /** @Returns true if the audio file is stereo */
+    bool isStereo() const;
+
+    /** @Returns the bit depth of each sample */
+    int getBitDepth() const;
+
+    /** @Returns the number of samples per channel */
+    int getNumSamplesPerChannel() const;
+
+    /** @Returns the length in seconds of the audio file based on the number of samples and sample rate */
+    double getLengthInSeconds() const;
+
+    /** Prints a summary of the audio file to the console */
+    void printSummary() const;
+
+    //=============================================================
+
+    /** Set the audio buffer for this AudioFile by copying samples from another buffer.
+     * @Returns true if the buffer was copied successfully.
+     */
+    bool setAudioBuffer (AudioBuffer& newBuffer);
+
+    /** Sets the audio buffer to a given number of channels and number of samples per channel. This will try to preserve
+     * the existing audio, adding zeros to any new channels or new samples in a given channel.
+     */
+    void setAudioBufferSize (int numChannels, int numSamples);
+
+    /** Sets the number of samples per channel in the audio buffer. This will try to preserve
+     * the existing audio, adding zeros to new samples in a given channel if the number of samples is increased.
+     */
+    void setNumSamplesPerChannel (int numSamples);
+
+    /** Sets the number of channels. New channels will have the correct number of samples and be initialised to zero */
+    void setNumChannels (int numChannels);
+
+    /** Sets the bit depth for the audio file. If you use the save() function, this bit depth rate will be used */
+    void setBitDepth (int numBitsPerSample);
+
+    /** Sets the sample rate for the audio file. If you use the save() function, this sample rate will be used */
+    void setSampleRate (uint32_t newSampleRate);
+
+    //=============================================================
+    /** Sets whether the library should log error messages to the console. By default this is true */
+    void shouldLogErrorsToConsole (bool logErrors);
+
+    //=============================================================
+    /** A vector of vectors holding the audio samples for the AudioFile. You can
+     * access the samples by channel and then by sample index, i.e:
+     *
+     *      samples[channel][sampleIndex]
+     */
+    AudioBuffer samples;
+
+    //=============================================================
+    /** An optional iXML chunk that can be added to the AudioFile.
+     */
+    std::string iXMLChunk;
+
+private:
+
+    //=============================================================
+    enum class Endianness
+    {
+        LittleEndian,
+        BigEndian
+    };
+
+    //=============================================================
+    AudioFileFormat determineAudioFileFormat (std::vector<uint8_t>& fileData);
+    bool decodeWaveFile (std::vector<uint8_t>& fileData);
+    bool decodeAiffFile (std::vector<uint8_t>& fileData);
+
+    //=============================================================
+    bool writeToWaveData (std::vector<uint8_t>& fileData);
+    bool writeToAiffData (std::vector<uint8_t>& fileData);
+
+    //=============================================================
+    bool saveToWaveFile (std::string filePath);
+    bool saveToAiffFile (std::string filePath);
+
+    //=============================================================
+    void clearAudioBuffer();
+
+    //=============================================================
+    int32_t fourBytesToInt (std::vector<uint8_t>& source, int startIndex, Endianness endianness = Endianness::LittleEndian);
+    int16_t twoBytesToInt (std::vector<uint8_t>& source, int startIndex, Endianness endianness = Endianness::LittleEndian);
+    int getIndexOfString (std::vector<uint8_t>& source, std::string s);
+    int getIndexOfChunk (std::vector<uint8_t>& source, const std::string& chunkHeaderID, int startIndex, Endianness endianness = Endianness::LittleEndian);
+
+    //=============================================================
+    uint32_t getAiffSampleRate (std::vector<uint8_t>& fileData, int sampleRateStartIndex);
+    bool tenByteMatch (std::vector<uint8_t>& v1, int startIndex1, std::vector<uint8_t>& v2, int startIndex2);
+    void addSampleRateToAiffData (std::vector<uint8_t>& fileData, uint32_t sampleRate);
+
+    //=============================================================
+    void addStringToFileData (std::vector<uint8_t>& fileData, std::string s);
+    void addInt32ToFileData (std::vector<uint8_t>& fileData, int32_t i, Endianness endianness = Endianness::LittleEndian);
+    void addInt16ToFileData (std::vector<uint8_t>& fileData, int16_t i, Endianness endianness = Endianness::LittleEndian);
+
+    //=============================================================
+    bool writeDataToFile (std::vector<uint8_t>& fileData, std::string filePath);
+
+    //=============================================================
+    void reportError (std::string errorMessage);
+
+    //=============================================================
+    AudioFileFormat audioFileFormat;
+    uint32_t sampleRate;
+    int bitDepth;
+    bool logErrorsToConsole {true};
+};
+
+//=============================================================
+template <class T>
+struct AudioSampleConverter
+{
+    //=============================================================
+    /** Convert a signed 8-bit integer to an audio sample */
+    static T signedByteToSample (int8_t sample);
+
+    /** Convert an audio sample to a signed 8-bit representation */
+    static int8_t sampleToSignedByte (T sample);
+
+    //=============================================================
+    /** Convert an unsigned 8-bit integer to an audio sample */
+    static T unsignedByteToSample (uint8_t sample);
+
+    /** Convert an audio sample to an unsigned 8-bit representation */
+    static uint8_t sampleToUnsignedByte (T sample);
+
+    //=============================================================
+    /** Convert a 16-bit integer to an audio sample */
+    static T sixteenBitIntToSample (int16_t sample);
+
+    /** Convert an audio sample to a 16-bit integer */
+    static int16_t sampleToSixteenBitInt (T sample);
+
+    //=============================================================
+    /** Convert a 24-bit value (in a 32-bit int) to an audio sample */
+    static T twentyFourBitIntToSample (int32_t sample);
+
+    /** Convert an audio sample to a 24-bit value (in a 32-bit integer) */
+    static int32_t sampleToTwentyFourBitInt (T sample);
+
+    //=============================================================
+    /** Convert a 32-bit signed integer to an audio sample */
+    static T thirtyTwoBitIntToSample (int32_t sample);
+
+    /** Convert an audio sample to a 32-bit signed integer */
+    static int32_t sampleToThirtyTwoBitInt (T sample);
+
+    //=============================================================
+    /** Helper clamp function to enforce ranges */
+    static T clamp (T v1, T minValue, T maxValue);
+};
+
+//=============================================================
+// Pre-defined 10-byte representations of common sample rates
+static std::unordered_map<uint32_t, std::vector<uint8_t>> aiffSampleRateTable = {
+    {8000, {64, 11, 250, 0, 0, 0, 0, 0, 0, 0}},
+    {11025, {64, 12, 172, 68, 0, 0, 0, 0, 0, 0}},
+    {16000, {64, 12, 250, 0, 0, 0, 0, 0, 0, 0}},
+    {22050, {64, 13, 172, 68, 0, 0, 0, 0, 0, 0}},
+    {32000, {64, 13, 250, 0, 0, 0, 0, 0, 0, 0}},
+    {37800, {64, 14, 147, 168, 0, 0, 0, 0, 0, 0}},
+    {44056, {64, 14, 172, 24, 0, 0, 0, 0, 0, 0}},
+    {44100, {64, 14, 172, 68, 0, 0, 0, 0, 0, 0}},
+    {47250, {64, 14, 184, 146, 0, 0, 0, 0, 0, 0}},
+    {48000, {64, 14, 187, 128, 0, 0, 0, 0, 0, 0}},
+    {50000, {64, 14, 195, 80, 0, 0, 0, 0, 0, 0}},
+    {50400, {64, 14, 196, 224, 0, 0, 0, 0, 0, 0}},
+    {88200, {64, 15, 172, 68, 0, 0, 0, 0, 0, 0}},
+    {96000, {64, 15, 187, 128, 0, 0, 0, 0, 0, 0}},
+    {176400, {64, 16, 172, 68, 0, 0, 0, 0, 0, 0}},
+    {192000, {64, 16, 187, 128, 0, 0, 0, 0, 0, 0}},
+    {352800, {64, 17, 172, 68, 0, 0, 0, 0, 0, 0}},
+    {2822400, {64, 20, 172, 68, 0, 0, 0, 0, 0, 0}},
+    {5644800, {64, 21, 172, 68, 0, 0, 0, 0, 0, 0}}
+};
+
+//=============================================================
+enum WavAudioFormat
+{
+    PCM = 0x0001,
+    IEEEFloat = 0x0003,
+    ALaw = 0x0006,
+    MULaw = 0x0007,
+    Extensible = 0xFFFE
+};
+
+//=============================================================
+enum AIFFAudioFormat
+{
+    Uncompressed,
+    Compressed,
+    Error
+};
+
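Since the implementation half of this vendored header follows below, here is a brief sketch of how the declaration above is exercised, mirroring the calls KoboldCpp's write_file.cpp makes; the sample values and rate here are placeholders:

```cpp
// Illustrative use of the AudioFile<T> interface declared above.
#include "audio_file.h"

int main() {
    AudioFile<float> file;                  // defaults: mono, 16-bit, 44100 Hz
    file.setSampleRate(24000);              // placeholder rate
    file.samples[0] = {0.0f, 0.5f, -0.5f};  // channel 0; placeholder samples
    file.printSummary();
    return file.save("out.wav", AudioFileFormat::Wave) ? 0 : 1;
}
```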
+//============================================================= +/* IMPLEMENTATION */ +//============================================================= + +//============================================================= +template +AudioFile::AudioFile() +{ + bitDepth = 16; + sampleRate = 44100; + samples.resize (1); + samples[0].resize (0); + audioFileFormat = AudioFileFormat::NotLoaded; +} + +//============================================================= +template +AudioFile::AudioFile (std::string filePath) + : AudioFile() +{ + load (filePath); +} + +//============================================================= +template +uint32_t AudioFile::getSampleRate() const +{ + return sampleRate; +} + +//============================================================= +template +int AudioFile::getNumChannels() const +{ + return (int)samples.size(); +} + +//============================================================= +template +bool AudioFile::isMono() const +{ + return getNumChannels() == 1; +} + +//============================================================= +template +bool AudioFile::isStereo() const +{ + return getNumChannels() == 2; +} + +//============================================================= +template +int AudioFile::getBitDepth() const +{ + return bitDepth; +} + +//============================================================= +template +int AudioFile::getNumSamplesPerChannel() const +{ + if (samples.size() > 0) + return (int) samples[0].size(); + else + return 0; +} + +//============================================================= +template +double AudioFile::getLengthInSeconds() const +{ + return (double)getNumSamplesPerChannel() / (double)sampleRate; +} + +//============================================================= +template +void AudioFile::printSummary() const +{ + std::cout << "|======================================|" << std::endl; + std::cout << "Num Channels: " << getNumChannels() << std::endl; + std::cout << "Num Samples Per Channel: " << getNumSamplesPerChannel() << std::endl; + std::cout << "Sample Rate: " << sampleRate << std::endl; + std::cout << "Bit Depth: " << bitDepth << std::endl; + std::cout << "Length in Seconds: " << getLengthInSeconds() << std::endl; + std::cout << "|======================================|" << std::endl; +} + +//============================================================= +template +bool AudioFile::setAudioBuffer (AudioBuffer& newBuffer) +{ + int numChannels = (int)newBuffer.size(); + + if (numChannels <= 0) + { + assert (false && "The buffer you are trying to use has no channels"); + return false; + } + + size_t numSamples = newBuffer[0].size(); + + // set the number of channels + samples.resize (newBuffer.size()); + + for (int k = 0; k < getNumChannels(); k++) + { + assert (newBuffer[k].size() == numSamples); + + samples[k].resize (numSamples); + + for (size_t i = 0; i < numSamples; i++) + { + samples[k][i] = newBuffer[k][i]; + } + } + + return true; +} + +//============================================================= +template +void AudioFile::setAudioBufferSize (int numChannels, int numSamples) +{ + samples.resize (numChannels); + setNumSamplesPerChannel (numSamples); +} + +//============================================================= +template +void AudioFile::setNumSamplesPerChannel (int numSamples) +{ + int originalSize = getNumSamplesPerChannel(); + + for (int i = 0; i < getNumChannels();i++) + { + samples[i].resize (numSamples); + + // set any new samples to zero + if (numSamples > originalSize) + std::fill 
(samples[i].begin() + originalSize, samples[i].end(), (T)0.); + } +} + +//============================================================= +template +void AudioFile::setNumChannels (int numChannels) +{ + int originalNumChannels = getNumChannels(); + int originalNumSamplesPerChannel = getNumSamplesPerChannel(); + + samples.resize (numChannels); + + // make sure any new channels are set to the right size + // and filled with zeros + if (numChannels > originalNumChannels) + { + for (int i = originalNumChannels; i < numChannels; i++) + { + samples[i].resize (originalNumSamplesPerChannel); + std::fill (samples[i].begin(), samples[i].end(), (T)0.); + } + } +} + +//============================================================= +template +void AudioFile::setBitDepth (int numBitsPerSample) +{ + bitDepth = numBitsPerSample; +} + +//============================================================= +template +void AudioFile::setSampleRate (uint32_t newSampleRate) +{ + sampleRate = newSampleRate; +} + +//============================================================= +template +void AudioFile::shouldLogErrorsToConsole (bool logErrors) +{ + logErrorsToConsole = logErrors; +} + +//============================================================= +template +bool AudioFile::load (std::string filePath) +{ + std::ifstream file (filePath, std::ios::binary); + + // check the file exists + if (! file.good()) + { + reportError ("ERROR: File doesn't exist or otherwise can't load file\n" + filePath); + return false; + } + + std::vector fileData; + + file.unsetf (std::ios::skipws); + + file.seekg (0, std::ios::end); + size_t length = file.tellg(); + file.seekg (0, std::ios::beg); + + // allocate + fileData.resize (length); + + file.read(reinterpret_cast (fileData.data()), length); + file.close(); + + if (file.gcount() != length) + { + reportError ("ERROR: Couldn't read entire file\n" + filePath); + return false; + } + + // Handle very small files that will break our attempt to read the + // first header info from them + if (fileData.size() < 12) + { + reportError ("ERROR: File is not a valid audio file\n" + filePath); + return false; + } + else + { + return loadFromMemory (fileData); + } +} + +//============================================================= +template +bool AudioFile::loadFromMemory (std::vector& fileData) +{ + // get audio file format + audioFileFormat = determineAudioFileFormat (fileData); + + if (audioFileFormat == AudioFileFormat::Wave) + { + return decodeWaveFile (fileData); + } + else if (audioFileFormat == AudioFileFormat::Aiff) + { + return decodeAiffFile (fileData); + } + else + { + reportError ("Audio File Type: Error"); + return false; + } +} + +//============================================================= +template +bool AudioFile::decodeWaveFile (std::vector& fileData) +{ + // ----------------------------------------------------------- + // HEADER CHUNK + std::string headerChunkID (fileData.begin(), fileData.begin() + 4); + //int32_t fileSizeInBytes = fourBytesToInt (fileData, 4) + 8; + std::string format (fileData.begin() + 8, fileData.begin() + 12); + + // ----------------------------------------------------------- + // try and find the start points of key chunks + int indexOfDataChunk = getIndexOfChunk (fileData, "data", 12); + int indexOfFormatChunk = getIndexOfChunk (fileData, "fmt ", 12); + int indexOfXMLChunk = getIndexOfChunk (fileData, "iXML", 12); + + // if we can't find the data or format chunks, or the IDs/formats don't seem to be as expected + // then it is unlikely we'll able to read 
this file, so abort + if (indexOfDataChunk == -1 || indexOfFormatChunk == -1 || headerChunkID != "RIFF" || format != "WAVE") + { + reportError ("ERROR: this doesn't seem to be a valid .WAV file"); + return false; + } + + // ----------------------------------------------------------- + // FORMAT CHUNK + int f = indexOfFormatChunk; + std::string formatChunkID (fileData.begin() + f, fileData.begin() + f + 4); + //int32_t formatChunkSize = fourBytesToInt (fileData, f + 4); + uint16_t audioFormat = twoBytesToInt (fileData, f + 8); + uint16_t numChannels = twoBytesToInt (fileData, f + 10); + sampleRate = (uint32_t) fourBytesToInt (fileData, f + 12); + uint32_t numBytesPerSecond = fourBytesToInt (fileData, f + 16); + uint16_t numBytesPerBlock = twoBytesToInt (fileData, f + 20); + bitDepth = (int) twoBytesToInt (fileData, f + 22); + + if (bitDepth > sizeof (T) * 8) + { + std::string message = "ERROR: you are trying to read a "; + message += std::to_string (bitDepth); + message += "-bit file using a "; + message += std::to_string (sizeof (T) * 8); + message += "-bit sample type"; + reportError (message); + return false; + } + + uint16_t numBytesPerSample = static_cast (bitDepth) / 8; + + // check that the audio format is PCM or Float or extensible + if (audioFormat != WavAudioFormat::PCM && audioFormat != WavAudioFormat::IEEEFloat && audioFormat != WavAudioFormat::Extensible) + { + reportError ("ERROR: this .WAV file is encoded in a format that this library does not support at present"); + return false; + } + + // check the number of channels is mono or stereo + if (numChannels < 1 || numChannels > 128) + { + reportError ("ERROR: this WAV file seems to be an invalid number of channels (or corrupted?)"); + return false; + } + + // check header data is consistent + if (numBytesPerSecond != static_cast ((numChannels * sampleRate * bitDepth) / 8) || numBytesPerBlock != (numChannels * numBytesPerSample)) + { + reportError ("ERROR: the header data in this WAV file seems to be inconsistent"); + return false; + } + + // check bit depth is either 8, 16, 24 or 32 bit + if (bitDepth != 8 && bitDepth != 16 && bitDepth != 24 && bitDepth != 32) + { + reportError ("ERROR: this file has a bit depth that is not 8, 16, 24 or 32 bits"); + return false; + } + + // ----------------------------------------------------------- + // DATA CHUNK + int d = indexOfDataChunk; + std::string dataChunkID (fileData.begin() + d, fileData.begin() + d + 4); + int32_t dataChunkSize = fourBytesToInt (fileData, d + 4); + + int numSamples = dataChunkSize / (numChannels * bitDepth / 8); + int samplesStartIndex = indexOfDataChunk + 8; + + clearAudioBuffer(); + samples.resize (numChannels); + + for (int i = 0; i < numSamples; i++) + { + for (int channel = 0; channel < numChannels; channel++) + { + int sampleIndex = samplesStartIndex + (numBytesPerBlock * i) + channel * numBytesPerSample; + + if ((sampleIndex + (bitDepth / 8) - 1) >= fileData.size()) + { + reportError ("ERROR: read file error as the metadata indicates more samples than there are in the file data"); + return false; + } + + if (bitDepth == 8) + { + T sample = AudioSampleConverter::unsignedByteToSample (fileData[sampleIndex]); + samples[channel].push_back (sample); + } + else if (bitDepth == 16) + { + int16_t sampleAsInt = twoBytesToInt (fileData, sampleIndex); + T sample = AudioSampleConverter::sixteenBitIntToSample (sampleAsInt); + samples[channel].push_back (sample); + } + else if (bitDepth == 24) + { + int32_t sampleAsInt = 0; + sampleAsInt = (fileData[sampleIndex + 2] << 
16) | (fileData[sampleIndex + 1] << 8) | fileData[sampleIndex]; + + if (sampleAsInt & 0x800000) // if the 24th bit is set, this is a negative number in 24-bit world + sampleAsInt = sampleAsInt | ~0xFFFFFF; // so make sure sign is extended to the 32 bit float + + T sample = AudioSampleConverter::twentyFourBitIntToSample (sampleAsInt); + samples[channel].push_back (sample); + } + else if (bitDepth == 32) + { + int32_t sampleAsInt = fourBytesToInt (fileData, sampleIndex); + T sample; + + if (audioFormat == WavAudioFormat::IEEEFloat && std::is_floating_point_v) + { + float f; + memcpy (&f, &sampleAsInt, sizeof(int32_t)); + sample = (T)f; + } + else // assume PCM + { + sample = AudioSampleConverter::thirtyTwoBitIntToSample (sampleAsInt); + } + + samples[channel].push_back (sample); + } + else + { + assert (false); + } + } + } + + // ----------------------------------------------------------- + // iXML CHUNK + if (indexOfXMLChunk != -1) + { + int32_t chunkSize = fourBytesToInt (fileData, indexOfXMLChunk + 4); + iXMLChunk = std::string ((const char*) &fileData[indexOfXMLChunk + 8], chunkSize); + } + + return true; +} + +//============================================================= +template +bool AudioFile::decodeAiffFile (std::vector& fileData) +{ + // ----------------------------------------------------------- + // HEADER CHUNK + std::string headerChunkID (fileData.begin(), fileData.begin() + 4); + //int32_t fileSizeInBytes = fourBytesToInt (fileData, 4, Endianness::BigEndian) + 8; + std::string format (fileData.begin() + 8, fileData.begin() + 12); + + int audioFormat = format == "AIFF" ? AIFFAudioFormat::Uncompressed : format == "AIFC" ? AIFFAudioFormat::Compressed : AIFFAudioFormat::Error; + + // ----------------------------------------------------------- + // try and find the start points of key chunks + int indexOfCommChunk = getIndexOfChunk (fileData, "COMM", 12, Endianness::BigEndian); + int indexOfSoundDataChunk = getIndexOfChunk (fileData, "SSND", 12, Endianness::BigEndian); + int indexOfXMLChunk = getIndexOfChunk (fileData, "iXML", 12, Endianness::BigEndian); + + // if we can't find the data or format chunks, or the IDs/formats don't seem to be as expected + // then it is unlikely we'll able to read this file, so abort + if (indexOfSoundDataChunk == -1 || indexOfCommChunk == -1 || headerChunkID != "FORM" || audioFormat == AIFFAudioFormat::Error) + { + reportError ("ERROR: this doesn't seem to be a valid AIFF file"); + return false; + } + + // ----------------------------------------------------------- + // COMM CHUNK + int p = indexOfCommChunk; + std::string commChunkID (fileData.begin() + p, fileData.begin() + p + 4); + //int32_t commChunkSize = fourBytesToInt (fileData, p + 4, Endianness::BigEndian); + int16_t numChannels = twoBytesToInt (fileData, p + 8, Endianness::BigEndian); + int32_t numSamplesPerChannel = fourBytesToInt (fileData, p + 10, Endianness::BigEndian); + bitDepth = (int) twoBytesToInt (fileData, p + 14, Endianness::BigEndian); + sampleRate = getAiffSampleRate (fileData, p + 16); + + if (bitDepth > sizeof (T) * 8) + { + std::string message = "ERROR: you are trying to read a "; + message += std::to_string (bitDepth); + message += "-bit file using a "; + message += std::to_string (sizeof (T) * 8); + message += "-bit sample type"; + reportError (message); + return false; + } + + // check the sample rate was properly decoded + if (sampleRate == 0) + { + reportError ("ERROR: this AIFF file has an unsupported sample rate"); + return false; + } + + // check the number of 
channels is mono or stereo + if (numChannels < 1 ||numChannels > 2) + { + reportError ("ERROR: this AIFF file seems to be neither mono nor stereo (perhaps multi-track, or corrupted?)"); + return false; + } + + // check bit depth is either 8, 16, 24 or 32-bit + if (bitDepth != 8 && bitDepth != 16 && bitDepth != 24 && bitDepth != 32) + { + reportError ("ERROR: this file has a bit depth that is not 8, 16, 24 or 32 bits"); + return false; + } + + // ----------------------------------------------------------- + // SSND CHUNK + int s = indexOfSoundDataChunk; + std::string soundDataChunkID (fileData.begin() + s, fileData.begin() + s + 4); + int32_t soundDataChunkSize = fourBytesToInt (fileData, s + 4, Endianness::BigEndian); + int32_t offset = fourBytesToInt (fileData, s + 8, Endianness::BigEndian); + //int32_t blockSize = fourBytesToInt (fileData, s + 12, Endianness::BigEndian); + + int numBytesPerSample = bitDepth / 8; + int numBytesPerFrame = numBytesPerSample * numChannels; + int totalNumAudioSampleBytes = numSamplesPerChannel * numBytesPerFrame; + int samplesStartIndex = s + 16 + (int)offset; + + // sanity check the data + if ((soundDataChunkSize - 8) != totalNumAudioSampleBytes || totalNumAudioSampleBytes > static_cast(fileData.size() - samplesStartIndex)) + { + reportError ("ERROR: the metadatafor this file doesn't seem right"); + return false; + } + + clearAudioBuffer(); + samples.resize (numChannels); + + for (int i = 0; i < numSamplesPerChannel; i++) + { + for (int channel = 0; channel < numChannels; channel++) + { + int sampleIndex = samplesStartIndex + (numBytesPerFrame * i) + channel * numBytesPerSample; + + if ((sampleIndex + (bitDepth / 8) - 1) >= fileData.size()) + { + reportError ("ERROR: read file error as the metadata indicates more samples than there are in the file data"); + return false; + } + + if (bitDepth == 8) + { + T sample = AudioSampleConverter::signedByteToSample (static_cast (fileData[sampleIndex])); + samples[channel].push_back (sample); + } + else if (bitDepth == 16) + { + int16_t sampleAsInt = twoBytesToInt (fileData, sampleIndex, Endianness::BigEndian); + T sample = AudioSampleConverter::sixteenBitIntToSample (sampleAsInt); + samples[channel].push_back (sample); + } + else if (bitDepth == 24) + { + int32_t sampleAsInt = 0; + sampleAsInt = (fileData[sampleIndex] << 16) | (fileData[sampleIndex + 1] << 8) | fileData[sampleIndex + 2]; + + if (sampleAsInt & 0x800000) // if the 24th bit is set, this is a negative number in 24-bit world + sampleAsInt = sampleAsInt | ~0xFFFFFF; // so make sure sign is extended to the 32 bit float + + T sample = AudioSampleConverter::twentyFourBitIntToSample (sampleAsInt); + samples[channel].push_back (sample); + } + else if (bitDepth == 32) + { + int32_t sampleAsInt = fourBytesToInt (fileData, sampleIndex, Endianness::BigEndian); + T sample; + + if (audioFormat == AIFFAudioFormat::Compressed) + sample = (T)reinterpret_cast (sampleAsInt); + else // assume PCM + sample = AudioSampleConverter::thirtyTwoBitIntToSample (sampleAsInt); + + samples[channel].push_back (sample); + } + else + { + assert (false); + } + } + } + + // ----------------------------------------------------------- + // iXML CHUNK + if (indexOfXMLChunk != -1) + { + int32_t chunkSize = fourBytesToInt (fileData, indexOfXMLChunk + 4); + iXMLChunk = std::string ((const char*) &fileData[indexOfXMLChunk + 8], chunkSize); + } + + return true; +} + +//============================================================= +template +uint32_t AudioFile::getAiffSampleRate (std::vector& 
fileData, int sampleRateStartIndex) +{ + for (auto it : aiffSampleRateTable) + { + if (tenByteMatch (fileData, sampleRateStartIndex, it.second, 0)) + return it.first; + } + + return 0; +} + +//============================================================= +template +bool AudioFile::tenByteMatch (std::vector& v1, int startIndex1, std::vector& v2, int startIndex2) +{ + for (int i = 0; i < 10; i++) + { + if (v1[startIndex1 + i] != v2[startIndex2 + i]) + return false; + } + + return true; +} + +//============================================================= +template +void AudioFile::addSampleRateToAiffData (std::vector& fileData, uint32_t sampleRate) +{ + if (aiffSampleRateTable.count (sampleRate) > 0) + { + for (int i = 0; i < 10; i++) + fileData.push_back (aiffSampleRateTable[sampleRate][i]); + } +} + +//============================================================= +template +bool AudioFile::save (std::string filePath, AudioFileFormat format) +{ + if (format == AudioFileFormat::Wave) + { + return saveToWaveFile (filePath); + } + else if (format == AudioFileFormat::Aiff) + { + return saveToAiffFile (filePath); + } + + return false; +} + +//============================================================= +template +bool AudioFile::writeData (std::vector & fileData, AudioFileFormat format) +{ + if (format == AudioFileFormat::Wave) + { + return writeToWaveData (fileData); + } + else if (format == AudioFileFormat::Aiff) + { + return writeToAiffData (fileData); + } + + return false; +} + + +//============================================================= +template +bool AudioFile::writeToWaveData (std::vector & fileData) +{ + int32_t dataChunkSize = getNumSamplesPerChannel() * (getNumChannels() * bitDepth / 8); + int16_t audioFormat = bitDepth == 32 && std::is_floating_point_v ? WavAudioFormat::IEEEFloat : WavAudioFormat::PCM; + int32_t formatChunkSize = audioFormat == WavAudioFormat::PCM ? 
16 : 18; + int32_t iXMLChunkSize = static_cast (iXMLChunk.size()); + + // ----------------------------------------------------------- + // HEADER CHUNK + addStringToFileData (fileData, "RIFF"); + + // The file size in bytes is the header chunk size (4, not counting RIFF and WAVE) + the format + // chunk size (24) + the metadata part of the data chunk plus the actual data chunk size + int32_t fileSizeInBytes = 4 + formatChunkSize + 8 + 8 + dataChunkSize; + if (iXMLChunkSize > 0) + { + fileSizeInBytes += (8 + iXMLChunkSize); + } + + addInt32ToFileData (fileData, fileSizeInBytes); + + addStringToFileData (fileData, "WAVE"); + + // ----------------------------------------------------------- + // FORMAT CHUNK + addStringToFileData (fileData, "fmt "); + addInt32ToFileData (fileData, formatChunkSize); // format chunk size (16 for PCM) + addInt16ToFileData (fileData, audioFormat); // audio format + addInt16ToFileData (fileData, (int16_t)getNumChannels()); // num channels + addInt32ToFileData (fileData, (int32_t)sampleRate); // sample rate + + int32_t numBytesPerSecond = (int32_t) ((getNumChannels() * sampleRate * bitDepth) / 8); + addInt32ToFileData (fileData, numBytesPerSecond); + + int16_t numBytesPerBlock = getNumChannels() * (bitDepth / 8); + addInt16ToFileData (fileData, numBytesPerBlock); + + addInt16ToFileData (fileData, (int16_t)bitDepth); + + if (audioFormat == WavAudioFormat::IEEEFloat) + addInt16ToFileData (fileData, 0); // extension size + + // ----------------------------------------------------------- + // DATA CHUNK + addStringToFileData (fileData, "data"); + addInt32ToFileData (fileData, dataChunkSize); + + for (int i = 0; i < getNumSamplesPerChannel(); i++) + { + for (int channel = 0; channel < getNumChannels(); channel++) + { + if (bitDepth == 8) + { + uint8_t byte = AudioSampleConverter::sampleToUnsignedByte (samples[channel][i]); + fileData.push_back (byte); + } + else if (bitDepth == 16) + { + int16_t sampleAsInt = AudioSampleConverter::sampleToSixteenBitInt (samples[channel][i]); + addInt16ToFileData (fileData, sampleAsInt); + } + else if (bitDepth == 24) + { + int32_t sampleAsIntAgain = AudioSampleConverter::sampleToTwentyFourBitInt (samples[channel][i]); + + uint8_t bytes[3]; + bytes[2] = (uint8_t) (sampleAsIntAgain >> 16) & 0xFF; + bytes[1] = (uint8_t) (sampleAsIntAgain >> 8) & 0xFF; + bytes[0] = (uint8_t) sampleAsIntAgain & 0xFF; + + fileData.push_back (bytes[0]); + fileData.push_back (bytes[1]); + fileData.push_back (bytes[2]); + } + else if (bitDepth == 32) + { + int32_t sampleAsInt; + + if (audioFormat == WavAudioFormat::IEEEFloat) + sampleAsInt = (int32_t) reinterpret_cast (samples[channel][i]); + else // assume PCM + sampleAsInt = AudioSampleConverter::sampleToThirtyTwoBitInt (samples[channel][i]); + + addInt32ToFileData (fileData, sampleAsInt, Endianness::LittleEndian); + } + else + { + assert (false && "Trying to write data with unsupported bit depth"); + return false; + } + } + } + + // ----------------------------------------------------------- + // iXML CHUNK + if (iXMLChunkSize > 0) + { + addStringToFileData (fileData, "iXML"); + addInt32ToFileData (fileData, iXMLChunkSize); + addStringToFileData (fileData, iXMLChunk); + } + + return true; +} + +//============================================================= +template +bool AudioFile::writeToAiffData (std::vector & fileData) +{ + int32_t numBytesPerSample = bitDepth / 8; + int32_t numBytesPerFrame = numBytesPerSample * getNumChannels(); + int32_t totalNumAudioSampleBytes = getNumSamplesPerChannel() * 
numBytesPerFrame; + int32_t soundDataChunkSize = totalNumAudioSampleBytes + 8; + int32_t iXMLChunkSize = static_cast (iXMLChunk.size()); + + // ----------------------------------------------------------- + // HEADER CHUNK + addStringToFileData (fileData, "FORM"); + + // The file size in bytes is the header chunk size (4, not counting FORM and AIFF) + the COMM + // chunk size (26) + the metadata part of the SSND chunk plus the actual data chunk size + int32_t fileSizeInBytes = 4 + 26 + 16 + totalNumAudioSampleBytes; + if (iXMLChunkSize > 0) + { + fileSizeInBytes += (8 + iXMLChunkSize); + } + + addInt32ToFileData (fileData, fileSizeInBytes, Endianness::BigEndian); + + addStringToFileData (fileData, "AIFF"); + + // ----------------------------------------------------------- + // COMM CHUNK + addStringToFileData (fileData, "COMM"); + addInt32ToFileData (fileData, 18, Endianness::BigEndian); // commChunkSize + addInt16ToFileData (fileData, getNumChannels(), Endianness::BigEndian); // num channels + addInt32ToFileData (fileData, getNumSamplesPerChannel(), Endianness::BigEndian); // num samples per channel + addInt16ToFileData (fileData, bitDepth, Endianness::BigEndian); // bit depth + addSampleRateToAiffData (fileData, sampleRate); + + // ----------------------------------------------------------- + // SSND CHUNK + addStringToFileData (fileData, "SSND"); + addInt32ToFileData (fileData, soundDataChunkSize, Endianness::BigEndian); + addInt32ToFileData (fileData, 0, Endianness::BigEndian); // offset + addInt32ToFileData (fileData, 0, Endianness::BigEndian); // block size + + for (int i = 0; i < getNumSamplesPerChannel(); i++) + { + for (int channel = 0; channel < getNumChannels(); channel++) + { + if (bitDepth == 8) + { + uint8_t byte = static_cast (AudioSampleConverter::sampleToSignedByte (samples[channel][i])); + fileData.push_back (byte); + } + else if (bitDepth == 16) + { + int16_t sampleAsInt = AudioSampleConverter::sampleToSixteenBitInt (samples[channel][i]); + addInt16ToFileData (fileData, sampleAsInt, Endianness::BigEndian); + } + else if (bitDepth == 24) + { + int32_t sampleAsIntAgain = AudioSampleConverter::sampleToTwentyFourBitInt (samples[channel][i]); + + uint8_t bytes[3]; + bytes[0] = (uint8_t) (sampleAsIntAgain >> 16) & 0xFF; + bytes[1] = (uint8_t) (sampleAsIntAgain >> 8) & 0xFF; + bytes[2] = (uint8_t) sampleAsIntAgain & 0xFF; + + fileData.push_back (bytes[0]); + fileData.push_back (bytes[1]); + fileData.push_back (bytes[2]); + } + else if (bitDepth == 32) + { + // write samples as signed integers (no implementation yet for floating point, but looking at WAV implementation should help) + int32_t sampleAsInt = AudioSampleConverter::sampleToThirtyTwoBitInt (samples[channel][i]); + addInt32ToFileData (fileData, sampleAsInt, Endianness::BigEndian); + } + else + { + assert (false && "Trying to write data with unsupported bit depth"); + return false; + } + } + } + + // ----------------------------------------------------------- + // iXML CHUNK + if (iXMLChunkSize > 0) + { + addStringToFileData (fileData, "iXML"); + addInt32ToFileData (fileData, iXMLChunkSize, Endianness::BigEndian); + addStringToFileData (fileData, iXMLChunk); + } + return true; +} + +//============================================================= +template +bool AudioFile::saveToWaveFile (std::string filePath) +{ + std::vector fileData; + + int32_t dataChunkSize = getNumSamplesPerChannel() * (getNumChannels() * bitDepth / 8); + int16_t audioFormat = bitDepth == 32 && std::is_floating_point_v ? 
WavAudioFormat::IEEEFloat : WavAudioFormat::PCM; + int32_t formatChunkSize = audioFormat == WavAudioFormat::PCM ? 16 : 18; + int32_t iXMLChunkSize = static_cast (iXMLChunk.size()); + + // ----------------------------------------------------------- + // HEADER CHUNK + addStringToFileData (fileData, "RIFF"); + + // The file size in bytes is the header chunk size (4, not counting RIFF and WAVE) + the format + // chunk size (24) + the metadata part of the data chunk plus the actual data chunk size + int32_t fileSizeInBytes = 4 + formatChunkSize + 8 + 8 + dataChunkSize; + if (iXMLChunkSize > 0) + { + fileSizeInBytes += (8 + iXMLChunkSize); + } + + addInt32ToFileData (fileData, fileSizeInBytes); + + addStringToFileData (fileData, "WAVE"); + + // ----------------------------------------------------------- + // FORMAT CHUNK + addStringToFileData (fileData, "fmt "); + addInt32ToFileData (fileData, formatChunkSize); // format chunk size (16 for PCM) + addInt16ToFileData (fileData, audioFormat); // audio format + addInt16ToFileData (fileData, (int16_t)getNumChannels()); // num channels + addInt32ToFileData (fileData, (int32_t)sampleRate); // sample rate + + int32_t numBytesPerSecond = (int32_t) ((getNumChannels() * sampleRate * bitDepth) / 8); + addInt32ToFileData (fileData, numBytesPerSecond); + + int16_t numBytesPerBlock = getNumChannels() * (bitDepth / 8); + addInt16ToFileData (fileData, numBytesPerBlock); + + addInt16ToFileData (fileData, (int16_t)bitDepth); + + if (audioFormat == WavAudioFormat::IEEEFloat) + addInt16ToFileData (fileData, 0); // extension size + + // ----------------------------------------------------------- + // DATA CHUNK + addStringToFileData (fileData, "data"); + addInt32ToFileData (fileData, dataChunkSize); + + for (int i = 0; i < getNumSamplesPerChannel(); i++) + { + for (int channel = 0; channel < getNumChannels(); channel++) + { + if (bitDepth == 8) + { + uint8_t byte = AudioSampleConverter::sampleToUnsignedByte (samples[channel][i]); + fileData.push_back (byte); + } + else if (bitDepth == 16) + { + int16_t sampleAsInt = AudioSampleConverter::sampleToSixteenBitInt (samples[channel][i]); + addInt16ToFileData (fileData, sampleAsInt); + } + else if (bitDepth == 24) + { + int32_t sampleAsIntAgain = AudioSampleConverter::sampleToTwentyFourBitInt (samples[channel][i]); + + uint8_t bytes[3]; + bytes[2] = (uint8_t) (sampleAsIntAgain >> 16) & 0xFF; + bytes[1] = (uint8_t) (sampleAsIntAgain >> 8) & 0xFF; + bytes[0] = (uint8_t) sampleAsIntAgain & 0xFF; + + fileData.push_back (bytes[0]); + fileData.push_back (bytes[1]); + fileData.push_back (bytes[2]); + } + else if (bitDepth == 32) + { + int32_t sampleAsInt; + + if (audioFormat == WavAudioFormat::IEEEFloat) + sampleAsInt = (int32_t) reinterpret_cast (samples[channel][i]); + else // assume PCM + sampleAsInt = AudioSampleConverter::sampleToThirtyTwoBitInt (samples[channel][i]); + + addInt32ToFileData (fileData, sampleAsInt, Endianness::LittleEndian); + } + else + { + assert (false && "Trying to write a file with unsupported bit depth"); + return false; + } + } + } + + // ----------------------------------------------------------- + // iXML CHUNK + if (iXMLChunkSize > 0) + { + addStringToFileData (fileData, "iXML"); + addInt32ToFileData (fileData, iXMLChunkSize); + addStringToFileData (fileData, iXMLChunk); + } + + // check that the various sizes we put in the metadata are correct + if (fileSizeInBytes != static_cast (fileData.size() - 8) || dataChunkSize != (getNumSamplesPerChannel() * getNumChannels() * (bitDepth / 8))) + { + 
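+        // Reaching this branch means the writer itself is inconsistent (an internal
+        // bookkeeping bug), not that the disk write failed. Worked example of the
+        // invariant being checked: for 1 second of 16-bit stereo PCM at 44100 Hz,
+        // dataChunkSize = 44100 * 2 * (16 / 8) = 176400 and fileSizeInBytes =
+        // 4 ("WAVE") + 16 (fmt payload) + 8 (fmt header) + 8 ("data" header)
+        // + 176400 = 176436, which must equal fileData.size() - 8, since the
+        // leading "RIFF" id and the size field itself are not counted.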
reportError ("ERROR: couldn't save file to " + filePath); + return false; + } + + // try to write the file + return writeDataToFile (fileData, filePath); +} + +//============================================================= +template +bool AudioFile::saveToAiffFile (std::string filePath) +{ + std::vector fileData; + + int32_t numBytesPerSample = bitDepth / 8; + int32_t numBytesPerFrame = numBytesPerSample * getNumChannels(); + int32_t totalNumAudioSampleBytes = getNumSamplesPerChannel() * numBytesPerFrame; + int32_t soundDataChunkSize = totalNumAudioSampleBytes + 8; + int32_t iXMLChunkSize = static_cast (iXMLChunk.size()); + + // ----------------------------------------------------------- + // HEADER CHUNK + addStringToFileData (fileData, "FORM"); + + // The file size in bytes is the header chunk size (4, not counting FORM and AIFF) + the COMM + // chunk size (26) + the metadata part of the SSND chunk plus the actual data chunk size + int32_t fileSizeInBytes = 4 + 26 + 16 + totalNumAudioSampleBytes; + if (iXMLChunkSize > 0) + { + fileSizeInBytes += (8 + iXMLChunkSize); + } + + addInt32ToFileData (fileData, fileSizeInBytes, Endianness::BigEndian); + + addStringToFileData (fileData, "AIFF"); + + // ----------------------------------------------------------- + // COMM CHUNK + addStringToFileData (fileData, "COMM"); + addInt32ToFileData (fileData, 18, Endianness::BigEndian); // commChunkSize + addInt16ToFileData (fileData, getNumChannels(), Endianness::BigEndian); // num channels + addInt32ToFileData (fileData, getNumSamplesPerChannel(), Endianness::BigEndian); // num samples per channel + addInt16ToFileData (fileData, bitDepth, Endianness::BigEndian); // bit depth + addSampleRateToAiffData (fileData, sampleRate); + + // ----------------------------------------------------------- + // SSND CHUNK + addStringToFileData (fileData, "SSND"); + addInt32ToFileData (fileData, soundDataChunkSize, Endianness::BigEndian); + addInt32ToFileData (fileData, 0, Endianness::BigEndian); // offset + addInt32ToFileData (fileData, 0, Endianness::BigEndian); // block size + + for (int i = 0; i < getNumSamplesPerChannel(); i++) + { + for (int channel = 0; channel < getNumChannels(); channel++) + { + if (bitDepth == 8) + { + uint8_t byte = static_cast (AudioSampleConverter::sampleToSignedByte (samples[channel][i])); + fileData.push_back (byte); + } + else if (bitDepth == 16) + { + int16_t sampleAsInt = AudioSampleConverter::sampleToSixteenBitInt (samples[channel][i]); + addInt16ToFileData (fileData, sampleAsInt, Endianness::BigEndian); + } + else if (bitDepth == 24) + { + int32_t sampleAsIntAgain = AudioSampleConverter::sampleToTwentyFourBitInt (samples[channel][i]); + + uint8_t bytes[3]; + bytes[0] = (uint8_t) (sampleAsIntAgain >> 16) & 0xFF; + bytes[1] = (uint8_t) (sampleAsIntAgain >> 8) & 0xFF; + bytes[2] = (uint8_t) sampleAsIntAgain & 0xFF; + + fileData.push_back (bytes[0]); + fileData.push_back (bytes[1]); + fileData.push_back (bytes[2]); + } + else if (bitDepth == 32) + { + // write samples as signed integers (no implementation yet for floating point, but looking at WAV implementation should help) + int32_t sampleAsInt = AudioSampleConverter::sampleToThirtyTwoBitInt (samples[channel][i]); + addInt32ToFileData (fileData, sampleAsInt, Endianness::BigEndian); + } + else + { + assert (false && "Trying to write a file with unsupported bit depth"); + return false; + } + } + } + + // ----------------------------------------------------------- + // iXML CHUNK + if (iXMLChunkSize > 0) + { + addStringToFileData 
(fileData, "iXML"); + addInt32ToFileData (fileData, iXMLChunkSize, Endianness::BigEndian); + addStringToFileData (fileData, iXMLChunk); + } + + // check that the various sizes we put in the metadata are correct + if (fileSizeInBytes != static_cast (fileData.size() - 8) || soundDataChunkSize != getNumSamplesPerChannel() * numBytesPerFrame + 8) + { + reportError ("ERROR: couldn't save file to " + filePath); + return false; + } + + // try to write the file + return writeDataToFile (fileData, filePath); +} + +//============================================================= +template +bool AudioFile::writeDataToFile (std::vector& fileData, std::string filePath) +{ + std::ofstream outputFile (filePath, std::ios::binary); + + if (outputFile.is_open()) + { + for (size_t i = 0; i < fileData.size(); i++) + { + char value = (char) fileData[i]; + outputFile.write (&value, sizeof (char)); + } + + outputFile.close(); + + return true; + } + + return false; +} + +//============================================================= +template +void AudioFile::addStringToFileData (std::vector& fileData, std::string s) +{ + for (size_t i = 0; i < s.length();i++) + fileData.push_back ((uint8_t) s[i]); +} + +//============================================================= +template +void AudioFile::addInt32ToFileData (std::vector& fileData, int32_t i, Endianness endianness) +{ + uint8_t bytes[4]; + + if (endianness == Endianness::LittleEndian) + { + bytes[3] = (i >> 24) & 0xFF; + bytes[2] = (i >> 16) & 0xFF; + bytes[1] = (i >> 8) & 0xFF; + bytes[0] = i & 0xFF; + } + else + { + bytes[0] = (i >> 24) & 0xFF; + bytes[1] = (i >> 16) & 0xFF; + bytes[2] = (i >> 8) & 0xFF; + bytes[3] = i & 0xFF; + } + + for (int i = 0; i < 4; i++) + fileData.push_back (bytes[i]); +} + +//============================================================= +template +void AudioFile::addInt16ToFileData (std::vector& fileData, int16_t i, Endianness endianness) +{ + uint8_t bytes[2]; + + if (endianness == Endianness::LittleEndian) + { + bytes[1] = (i >> 8) & 0xFF; + bytes[0] = i & 0xFF; + } + else + { + bytes[0] = (i >> 8) & 0xFF; + bytes[1] = i & 0xFF; + } + + fileData.push_back (bytes[0]); + fileData.push_back (bytes[1]); +} + +//============================================================= +template +void AudioFile::clearAudioBuffer() +{ + for (size_t i = 0; i < samples.size();i++) + { + samples[i].clear(); + } + + samples.clear(); +} + +//============================================================= +template +AudioFileFormat AudioFile::determineAudioFileFormat (std::vector& fileData) +{ + std::string header (fileData.begin(), fileData.begin() + 4); + + if (header == "RIFF") + return AudioFileFormat::Wave; + else if (header == "FORM") + return AudioFileFormat::Aiff; + else + return AudioFileFormat::Error; +} + +//============================================================= +template +int32_t AudioFile::fourBytesToInt (std::vector& source, int startIndex, Endianness endianness) +{ + if (source.size() >= (startIndex + 4)) + { + int32_t result; + + if (endianness == Endianness::LittleEndian) + result = (source[startIndex + 3] << 24) | (source[startIndex + 2] << 16) | (source[startIndex + 1] << 8) | source[startIndex]; + else + result = (source[startIndex] << 24) | (source[startIndex + 1] << 16) | (source[startIndex + 2] << 8) | source[startIndex + 3]; + + return result; + } + else + { + assert (false && "Attempted to read four bytes from vector at position where out of bounds access would occur"); + return 0; // this is a dummy value as we don't have 
one to return + } +} + +//============================================================= +template +int16_t AudioFile::twoBytesToInt (std::vector& source, int startIndex, Endianness endianness) +{ + int16_t result; + + if (endianness == Endianness::LittleEndian) + result = (source[startIndex + 1] << 8) | source[startIndex]; + else + result = (source[startIndex] << 8) | source[startIndex + 1]; + + return result; +} + +//============================================================= +template +int AudioFile::getIndexOfString (std::vector& source, std::string stringToSearchFor) +{ + int index = -1; + int stringLength = (int)stringToSearchFor.length(); + + for (size_t i = 0; i < source.size() - stringLength;i++) + { + std::string section (source.begin() + i, source.begin() + i + stringLength); + + if (section == stringToSearchFor) + { + index = static_cast (i); + break; + } + } + + return index; +} + +//============================================================= +template +int AudioFile::getIndexOfChunk (std::vector& source, const std::string& chunkHeaderID, int startIndex, Endianness endianness) +{ + constexpr int dataLen = 4; + + if (chunkHeaderID.size() != dataLen) + { + assert (false && "Invalid chunk header ID string"); + return -1; + } + + int i = startIndex; + while (i < source.size() - dataLen) + { + if (memcmp (&source[i], chunkHeaderID.data(), dataLen) == 0) + { + return i; + } + + i += dataLen; + + // If somehow we don't have 4 bytes left to read, then exit with -1 + if ((i + 4) >= source.size()) + return -1; + + auto chunkSize = fourBytesToInt (source, i, endianness); + i += (dataLen + chunkSize); + } + + return -1; +} + +//============================================================= +template +void AudioFile::reportError (std::string errorMessage) +{ + if (logErrorsToConsole) + std::cout << errorMessage << std::endl; +} + +//============================================================= +template +typename std::make_unsigned::type convertSignedToUnsigned (SignedType signedValue) +{ + static_assert (std::is_signed::value, "The input value must be signed"); + + typename std::make_unsigned::type unsignedValue = static_cast::type> (1) + std::numeric_limits::max(); + + unsignedValue += signedValue; + return unsignedValue; +} + +//============================================================= +enum SampleLimit +{ + SignedInt16_Min = -32768, + SignedInt16_Max = 32767, + UnsignedInt16_Min = 0, + UnsignedInt16_Max = 65535, + SignedInt24_Min = -8388608, + SignedInt24_Max = 8388607, + UnsignedInt24_Min = 0, + UnsignedInt24_Max = 16777215 +}; + +//============================================================= +template +T AudioSampleConverter::thirtyTwoBitIntToSample (int32_t sample) +{ + if constexpr (std::is_floating_point::value) + { + return static_cast (sample) / static_cast (std::numeric_limits::max()); + } + else if (std::numeric_limits::is_integer) + { + if constexpr (std::is_signed_v) + return static_cast (sample); + else + return static_cast (clamp (static_cast (sample + 2147483648), 0, 4294967295)); + } +} + +//============================================================= +template +int32_t AudioSampleConverter::sampleToThirtyTwoBitInt (T sample) +{ + if constexpr (std::is_floating_point::value) + { + // multiplying a float by the max int32_t is problematic because + // of rounding errors which can cause wrong values to come out, so + // we use a different implementation here compared to other types + if constexpr (std::is_same_v) + { + if (sample >= 1.f) + return
std::numeric_limits::max(); + else if (sample <= -1.f) + return std::numeric_limits::lowest() + 1; // starting at 1 preserves symmetry + else + return static_cast (sample * std::numeric_limits::max()); + } + else + { + return static_cast (clamp (sample, -1., 1.) * std::numeric_limits::max()); + } + } + else + { + if constexpr (std::is_signed_v) + return static_cast (clamp (sample, -2147483648LL, 2147483647LL)); + else + return static_cast (clamp (sample, 0, 4294967295) - 2147483648); + } +} + +//============================================================= +template +T AudioSampleConverter::twentyFourBitIntToSample (int32_t sample) +{ + if constexpr (std::is_floating_point::value) + { + return static_cast (sample) / static_cast (8388607.); + } + else if (std::numeric_limits::is_integer) + { + if constexpr (std::is_signed_v) + return static_cast (clamp (sample, SignedInt24_Min, SignedInt24_Max)); + else + return static_cast (clamp (sample + 8388608, UnsignedInt24_Min, UnsignedInt24_Max)); + } +} + +//============================================================= +template +int32_t AudioSampleConverter::sampleToTwentyFourBitInt (T sample) +{ + if constexpr (std::is_floating_point::value) + { + sample = clamp (sample, -1., 1.); + return static_cast (sample * 8388607.); + } + else + { + if constexpr (std::is_signed_v) + return static_cast (clamp (sample, SignedInt24_Min, SignedInt24_Max)); + else + return static_cast (clamp (sample, UnsignedInt24_Min, UnsignedInt24_Max) + SignedInt24_Min); + } +} + +//============================================================= +template +T AudioSampleConverter::sixteenBitIntToSample (int16_t sample) +{ + if constexpr (std::is_floating_point::value) + { + return static_cast (sample) / static_cast (32767.); + } + else if constexpr (std::numeric_limits::is_integer) + { + if constexpr (std::is_signed_v) + return static_cast (sample); + else + return static_cast (convertSignedToUnsigned (sample)); + } +} + +//============================================================= +template +int16_t AudioSampleConverter::sampleToSixteenBitInt (T sample) +{ + if constexpr (std::is_floating_point::value) + { + sample = clamp (sample, -1., 1.); + return static_cast (sample * 32767.); + } + else + { + if constexpr (std::is_signed_v) + return static_cast (clamp (sample, SignedInt16_Min, SignedInt16_Max)); + else + return static_cast (clamp (sample, UnsignedInt16_Min, UnsignedInt16_Max) + SignedInt16_Min); + } +} + +//============================================================= +template +uint8_t AudioSampleConverter::sampleToUnsignedByte (T sample) +{ + if constexpr (std::is_floating_point::value) + { + sample = clamp (sample, -1., 1.); + sample = (sample + 1.) 
/ 2.; + return static_cast (1 + (sample * 254)); + } + else + { + if constexpr (std::is_signed_v) + return static_cast (clamp (sample, -128, 127) + 128); + else + return static_cast (clamp (sample, 0, 255)); + } +} + +//============================================================= +template +int8_t AudioSampleConverter::sampleToSignedByte (T sample) +{ + if constexpr (std::is_floating_point::value) + { + sample = clamp (sample, -1., 1.); + return static_cast (sample * (T)0x7F); + } + else + { + if constexpr (std::is_signed_v) + return static_cast (clamp (sample, -128, 127)); + else + return static_cast (clamp (sample, 0, 255) - 128); + } +} + +//============================================================= +template +T AudioSampleConverter::unsignedByteToSample (uint8_t sample) +{ + if constexpr (std::is_floating_point::value) + { + return static_cast (sample - 128) / static_cast (127.); + } + else if (std::numeric_limits::is_integer) + { + if constexpr (std::is_unsigned_v) + return static_cast (sample); + else + return static_cast (sample - 128); + } +} + +//============================================================= +template +T AudioSampleConverter::signedByteToSample (int8_t sample) +{ + if constexpr (std::is_floating_point::value) + { + return static_cast (sample) / static_cast (127.); + } + else if constexpr (std::numeric_limits::is_integer) + { + if constexpr (std::is_signed_v) + return static_cast (sample); + else + return static_cast (convertSignedToUnsigned (sample)); + } +} + +//============================================================= +template +T AudioSampleConverter::clamp (T value, T minValue, T maxValue) +{ + value = std::min (value, maxValue); + value = std::max (value, minValue); + return value; +} + +#if defined (_MSC_VER) + __pragma(warning (pop)) +#elif defined (__GNUC__) + _Pragma("GCC diagnostic pop") +#endif + +#endif /* AudioFile_h */ diff --git a/otherarch/ttscpp/include/phonemizer.h b/otherarch/ttscpp/include/phonemizer.h new file mode 100644 index 000000000..6167a6818 --- /dev/null +++ b/otherarch/ttscpp/include/phonemizer.h @@ -0,0 +1,533 @@ +#ifndef phonemizer_h +#define phonemizer_h + +#ifdef ESPEAK_INSTALL +# ifdef ESPEAK_INSTALL_LOCAL +# include "speak_lib.h" +# else +# include +# endif +#endif + +#include +#include +#include +#include "tokenizer.h" +#include +#include + +static const std::string ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; +static const std::string ACCENTED_A = "àãâäáåÀÃÂÄÁÅ"; +static const std::string ACCENTED_C = "çÇ"; +static const std::string ACCENTED_E = "èêëéÈÊËÉ"; +static const std::string ACCENTED_I = "ìîïíÌÎÏÍ"; +static const std::string ACCENTED_N = "ñÑ"; +static const std::string ACCENTED_O = "òõôöóøÒÕÔÖÓØ"; +static const std::string ACCENTED_U = "ùûüúÙÛÜÚ"; +static const std::string COMMON_ACCENTED_CHARACTERS = ACCENTED_A + ACCENTED_C + ACCENTED_E + ACCENTED_I + ACCENTED_N + ACCENTED_O + ACCENTED_U; +static const std::string WORD_CHARACTERS = ALPHABET + "." + COMMON_ACCENTED_CHARACTERS; +static const std::string NON_CLAUSE_WORD_CHARACTERS = ALPHABET + COMMON_ACCENTED_CHARACTERS + "'"; +static const std::string VOWELS = "aeiouy"; +static const std::unordered_set ONE_LETTER_WORDS = { + "a", + "i", +}; +/* + * The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words + * via several criteria: + * 1. All non-EN-US words have been removed + * 2. 
All three letter acronyms have been removed (as these lists are used to identify acronyms) + * 3. All archaic, deprecated, or poetic words have been removed. + * 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the + * last 10 years). + * + * After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US + * vernacular but was not identified as of American origin was reintroduced into the sets below. + */ +static const std::unordered_set TWO_LETTER_WORDS = { + "ab", "ah", "am", "an", "as", "at", "aw", "ax", "ay", "be", "bo", "br", + "by", "do", "eh", "er", "ew", "ex", "go", "ha", "he", "hi", "hm", "ho", + "id", "if", "in", "is", "it", "la", "lo", "ma", "me", "mm", "my", "na", + "no", "of", "oh", "oi", "on", "oo", "or", "ow", "ox", "oy", "pa", "qi", + "re", "sh", "so", "to", "uh", "um", "un", "up", "us", "we", "wo", "ya", + "ye", "yo", +}; +static const std::unordered_set THREE_LETTER_WORDS = { + "aah", "abs", "aby", "ace", "ach", "ack", "act", "add", "ado", "ads", "aft", "age", + "ago", "aha", "ahi", "aid", "ail", "aim", "air", "alb", "ale", "all", "alp", "alt", + "ama", "amp", "and", "ant", "any", "ape", "app", "apt", "arc", "are", "arf", "ark", + "arm", "art", "ash", "ask", "asp", "ass", "ate", "awe", "axe", "aye", "baa", "bad", + "bae", "bag", "bah", "bam", "ban", "bao", "bap", "bar", "bat", "bay", "bed", "bee", + "beg", "bet", "bez", "bib", "bid", "big", "bin", "bio", "bis", "bit", "biz", "boa", + "bod", "bog", "boi", "boo", "bop", "bot", "bow", "box", "boy", "bra", "bro", "brr", + "bub", "bud", "bug", "bum", "bun", "bur", "bus", "but", "buy", "bye", "cab", "caf", + "cam", "can", "cap", "car", "cat", "caw", "chi", "cig", "cis", "cly", "cob", "cod", + "cog", "col", "con", "coo", "cop", "cos", "cot", "cow", "cox", "coy", "cry", "cub", + "cue", "cum", "cup", "cur", "cut", "cuz", "dab", "dad", "dag", "dal", "dam", "dap", + "das", "daw", "day", "deb", "def", "del", "den", "dep", "dew", "dib", "did", "die", + "dif", "dig", "dim", "din", "dip", "dis", "div", "doc", "doe", "dog", "doh", "dom", + "don", "dos", "dot", "dox", "dry", "dub", "dud", "due", "dug", "duh", "dum", "dun", + "duo", "dup", "dur", "dye", "ear", "eat", "ebb", "eco", "eek", "eel", "egg", "ego", + "elf", "elk", "elm", "emo", "emu", "end", "eon", "era", "err", "est", "eve", "eww", + "eye", "fab", "fad", "fae", "fag", "fah", "fam", "fan", "fap", "far", "fat", "fav", + "fax", "fay", "fed", "fee", "feh", "fem", "fen", "few", "fey", "fez", "fib", "fid", + "fig", "fin", "fir", "fit", "fix", "flu", "fly", "fob", "foe", "fog", "foo", "fop", + "for", "fox", "fro", "fry", "fub", "fun", "fur", "gab", "gad", "gag", "gal", "gam", + "gap", "gas", "gay", "gee", "gel", "gem", "gen", "geo", "get", "gib", "gid", "gif", + "gig", "gin", "gip", "git", "goa", "gob", "god", "goo", "gor", "got", "gov", "grr", + "gum", "gun", "gup", "gut", "guy", "gym", "gyp", "had", "hag", "hah", "haj", "ham", + "hap", "has", "hat", "haw", "hay", "heh", "hem", "hen", "her", "hes", "hew", "hex", + "hey", "hic", "hid", "him", "hip", "his", "hit", "hmm", "hod", "hoe", "hog", "hop", + "hot", "how", "hoy", "hub", "hue", "hug", "huh", "hum", "hun", "hup", "hut", "ice", + "ich", "ick", "icy", "ids", "ifs", "ill", "imp", "ink", "inn", "int", "ion", "ire", + "irk", "ism", "its", "ivy", "jab", "jam", "jap", "jar", "jaw", "jay", "jet", "jib", + "jig", "jin", "job", "joe", "jog", "jot", "joy", "jug", "jut", "kat", "kaw", 
"kay", + "ked", "keg", "key", "kid", "kin", "kit", "kob", "koi", "lab", "lac", "lad", "lag", + "lam", "lap", "law", "lax", "lay", "led", "leg", "lei", "lek", "let", "lev", "lex", + "lib", "lid", "lie", "lip", "lit", "lob", "log", "loo", "lop", "lot", "low", "lug", + "luv", "lye", "mac", "mad", "mag", "mam", "man", "map", "mar", "mat", "maw", "max", + "may", "med", "meg", "meh", "mel", "men", "met", "mew", "mib", "mid", "mig", "mil", + "mix", "mmm", "mob", "mod", "mog", "mol", "mom", "mon", "moo", "mop", "mow", "mud", + "mug", "mum", "mut", "nab", "nag", "nah", "nan", "nap", "nat", "naw", "nay", "nef", + "neg", "net", "new", "nib", "nil", "nip", "nit", "nob", "nod", "nog", "noh", "nom", + "non", "noo", "nor", "not", "now", "noy", "nth", "nub", "nun", "nut", "nyx", "oaf", + "oak", "oar", "oat", "oba", "obs", "oca", "odd", "ode", "off", "oft", "ohm", "oil", + "oke", "old", "one", "oof", "ooh", "oom", "oop", "ops", "opt", "orb", "orc", "ore", + "org", "ort", "oud", "our", "out", "ova", "owe", "owl", "own", "oxy", "pad", "pah", + "pal", "pan", "par", "pas", "pat", "paw", "pax", "pay", "pea", "pec", "pee", "peg", + "pen", "pep", "per", "pes", "pet", "pew", "phi", "pho", "pht", "pic", "pie", "pig", + "pin", "pip", "pit", "pix", "ply", "pod", "poi", "pol", "poo", "pop", "pos", "pot", + "pow", "pox", "pre", "pro", "pry", "psi", "pst", "pub", "pug", "puh", "pul", "pun", + "pup", "pur", "pus", "put", "pwn", "pya", "pyx", "qat", "rad", "rag", "rai", "raj", + "ram", "ran", "rap", "rat", "raw", "ray", "reb", "rec", "red", "ref", "reg", "rem", + "res", "ret", "rex", "rez", "rho", "ria", "rib", "rid", "rig", "rim", "rin", "rip", + "rob", "roc", "rod", "roe", "rom", "rot", "row", "rub", "rue", "rug", "rum", "run", + "rut", "rya", "rye", "sac", "sad", "sag", "sal", "sap", "sat", "saw", "sax", "say", + "sea", "sec", "see", "seg", "sen", "set", "sew", "sex", "she", "shh", "shy", "sib", + "sic", "sig", "sim", "sin", "sip", "sir", "sis", "sit", "six", "ska", "ski", "sky", + "sly", "sob", "sod", "sol", "som", "son", "sop", "sot", "sou", "sow", "sox", "soy", + "spa", "spy", "sty", "sub", "sue", "sum", "sun", "sup", "sus", "tab", "tad", "tag", + "tai", "taj", "tan", "tao", "tap", "tar", "tat", "tau", "tav", "taw", "tax", "tea", + "tec", "tee", "teg", "tel", "ten", "tet", "tex", "the", "tho", "thy", "tic", "tie", + "til", "tin", "tip", "tis", "tit", "tod", "toe", "ton", "too", "top", "tor", "tot", + "tow", "toy", "try", "tsk", "tub", "tug", "tui", "tum", "tun", "tup", "tut", "tux", + "two", "ugh", "umm", "ump", "uni", "ups", "urd", "urn", "use", "uta", "ute", "utu", + "uwu", "vac", "van", "var", "vas", "vat", "vav", "vax", "vee", "veg", "vet", "vex", + "via", "vid", "vie", "vig", "vim", "vol", "vow", "vox", "vug", "wad", "wag", "wan", + "wap", "war", "was", "wat", "wax", "way", "web", "wed", "wee", "wen", "wet", "wey", + "who", "why", "wig", "win", "wit", "wiz", "woe", "wok", "won", "woo", "wop", "wow", + "wry", "wud", "wus", "yag", "yah", "yak", "yam", "yap", "yar", "yaw", "yay", "yea", + "yeh", "yen", "yep", "yes", "yet", "yew", "yin", "yip", "yok", "you", "yow", "yum", + "yup", "zag", "zap", "zax", "zed", "zee", "zen", "zig", "zip", "zit", "zoo", "zzz" +}; + +static const std::map LETTER_PHONEMES = { + {'a', "ˈeɪ"}, + {'b', "bˈiː"}, + {'c', "sˈiː"}, + {'d', "dˈiː"}, + {'e', "ˈiː"}, + {'f', "ˈɛf"}, + {'j', "dʒˈeɪ"}, + {'h', "ˈeɪtʃ"}, + {'i', "ˈaɪ"}, + {'j', "dʒˈeɪ"}, + {'k', "kˈeɪ"}, + {'l', "ˈɛl"}, + {'m', "ˈɛm"}, + {'n', "ˈɛn"}, + {'o', "ˈoʊ"}, + {'p', "pˈiː"}, + {'q', "kjˈuː"}, + {'r', "ˈɑːɹ"}, + {'s', "ˈɛs"}, 
+ {'t', "tˈiː"}, + {'u', "jˈuː"}, + {'v', "vˈiː"}, + {'w', "dˈʌbəljˌuː"}, + {'x', "ˈɛks"}, + {'y', "wˈaɪ"}, + {'z', "zˈiː"} +}; + +static const std::string SPACE_CHARACTERS = " \t\f\n"; +static const std::string NOOP_BREAKS = "{}[]():;,\""; +static const std::string CLAUSE_BREAKS = ".!?"; + +static const std::string TRILLION_PHONEME = "tɹˈɪliən"; +static const long long int TRILLION = 1000000000000; +static const std::string BILLION_PHONEME = "bˈɪliən"; +static const int BILLION = 1000000000; +static const std::string MILLION_PHONEME = "mˈɪliən"; +static const int MILLION = 1000000; +static const std::string POINT_PHONEME = "pˈɔɪnt"; +static const std::string THOUSAND_PHONEME = "θˈaʊzənd"; +static const std::string HUNDRED_PHONEME = "hˈʌndɹɪd"; +static const std::string NUMBER_CHARACTERS = "0123456789"; +static const std::string COMPATIBLE_NUMERICS = NUMBER_CHARACTERS + "., "; +static const long long int LARGEST_PRONOUNCABLE_NUMBER = 999999999999999; + +static const std::vector NUMBER_PHONEMES = { + "zˈiəɹoʊ", + "wˈʌn", + "tˈuː", + "θɹˈiː", + "fˈɔːɹ", + "fˈaɪv", + "sˈɪks", + "sˈɛvən", + "ˈeɪt", + "nˈaɪn", + "tˈɛn", + "ɪlˈɛvən", + "twˈɛlv", + "θˈɜːtiːn", + "fˈɔːɹtiːn", + "fˈɪftiːn", + "sˈɪkstiːn", + "sˈɛvəntˌiːn", + "ˈeɪtiːn", + "nˈaɪntiːn" +}; + +static const std::vector SUB_HUNDRED_NUMBERS = { + "twˈɛnti", + "θˈɜːɾi", + "fˈɔːɹɾi", + "fˈɪfti", + "sˈɪksti", + "sˈɛvənti", + "ˈeɪɾi", + "nˈaɪnti" +}; + +static const std::map REPLACEABLE = { + {"*", "ˈæstɚɹˌɪsk"}, + {"+", "plˈʌs"}, + {"&", "ˈænd"}, + {"%", "pɚsˈɛnt"}, + {"@", "ˈæt"}, + {"#", "hˈæʃ"}, + {"$", "dˈɑːlɚ"}, + {"~", "tˈɪldə"}, + {"¢", "sˈɛnts"}, + {"£", "pˈaʊnd"}, + {"¥", "jˈɛn"}, + {"₨", "ɹˈuːpiː"}, + {"€", "jˈʊɹɹoʊz"}, + {"₹", "ɹˈuːpiː"}, + {"♯", "ʃˈɑːɹp"}, + {"♭", "flˈæt"}, + {"≈", "ɐpɹˈɑːksɪmətli"}, + {"≠", "nˈɑːt ˈiːkwəl tʊ"}, + {"≤", "lˈɛs ɔːɹ ˈiːkwəl tʊ"}, + {"≥", "ɡɹˈeɪɾɚɹ ɔːɹ ˈiːkwəl tʊ"}, + {">", "ɡɹˈeɪɾɚ ðɐn"}, + {"<", "lˈɛs ðɐn"}, + {"=", "ˈiːkwəlz"}, + {"±", "plˈʌs ɔːɹ mˈaɪnəs"}, + {"×", "tˈaɪmz"}, + {"÷", "dᵻvˈaɪdᵻd bˈaɪ"}, + {"℞", "pɹɪskɹˈɪpʃən"}, + {"№", "nˈuːməˌoʊ"}, + {"°", "dᵻɡɹˈiːz"}, + {"∴", "ðˈɛɹfɔːɹ"}, + {"∵", "bɪkˈʌz"}, + {"√", "skwˈɛɹ ɹˈuːt"}, + {"∛", "kjˈuːb ɹˈuːt"}, + {"∑", "sˈʌm sˈaɪn"}, + {"∂", "dˈɛltə"}, + {"←", "lˈɛft ˈæɹoʊ"}, + {"↑", "ˈʌp ˈæɹoʊ"}, + {"→", "ɹˈaɪt ˈæɹoʊ"}, + {"↓", "dˈaʊn ˈæɹoʊ"}, + {"−", "mˈaɪnəs"}, + {"¶", "pˈæɹəɡɹˌæf"}, + {"§", "sˈɛkʃən"}, +}; + +static const std::string ROMAN_NUMERAL_CHARACTERS = "MDCLXVImdclxvi"; +static const std::map ROMAN_NUMERALS = { + {"m", 1000}, + {"mm", 2000}, + {"mmm", 3000}, + {"c", 100}, + {"cc", 200}, + {"ccc", 300}, + {"cd", 400}, + {"cm", 900}, + {"dc", 600}, + {"dcc", 700}, + {"dccc", 800}, + {"x", 10}, + {"xx", 20}, + {"xxx", 30}, + {"xl", 40}, + {"l", 50}, + {"lx", 60}, + {"lxx", 70}, + {"lxxx", 80}, + {"xc", 90}, + {"i", 1}, + {"ii", 2}, + {"iii", 3}, + {"iv", 4}, + {"v", 5}, + {"vi", 6}, + {"vii", 7}, + {"viii", 8}, + {"ix", 9}, +}; + +static const std::map CONTRACTION_PHONEMES = { + {"re", "r"}, + {"ve", "əv"}, + {"ll", "l"}, + {"d", "d"}, + {"t", "t"}, +}; + +// characters that Espeak-ng treats as stopping tokens. +static std::string STOPPING_TOKENS = ".,:;!?"; + +#ifdef ESPEAK_INSTALL +/** + * espeak-ng uses globals to persist and manage its state so it is not compatible with + * threaded parallelism (https://github.com/espeak-ng/espeak-ng/issues/1527). + * This singleton acts as a mutex wrapped provider for all espeak phonemization methods such + * that multiple instances of the kokoro_runner can be initialized and called in parallel. 
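+ *
+ * A minimal usage sketch (illustrative only; the flag constants are taken from
+ * espeak-ng's speak_lib.h and are an assumption of this comment, not part of
+ * the wrapper itself):
+ *
+ *   espeak_wrapper * ew = espeak_wrapper::get_instance();
+ *   ew->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, nullptr, 0);
+ *   ew->set_voice("gmw/en-US");
+ *   const void * text = (const void *) "hello world";
+ *   const char * ipa = ew->text_to_phonemes(&text, espeakCHARS_UTF8, espeakPHONEMES_IPA);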
+ */ +class espeak_wrapper { +private: + static espeak_wrapper * instance; + static std::mutex mutex; + +protected: + espeak_wrapper() {}; + ~espeak_wrapper() {}; + bool espeak_initialized = false; + +public: + // singletons aren't copyable + espeak_wrapper(espeak_wrapper &other) = delete; + + // singletons aren't assignable + void operator=(const espeak_wrapper &) = delete; + + static espeak_wrapper * get_instance(); + const espeak_VOICE ** list_voices(); + espeak_ERROR set_voice(const char * voice_code); + const char * text_to_phonemes(const void ** textptr, int textmode, int phonememode); + void initialize(espeak_AUDIO_OUTPUT output, int buflength, const char * path, int options); +}; +#endif + +enum lookup_code { + SUCCESS = 100, + SUCCESS_PARTIAL = 101, + FAILURE_UNFOUND = 200, + FAILURE_PHONETIC = 201, +}; + +enum phoneme_type { + IPA = 1, + ESPEAK_PHONEMES = 2, +}; + +enum phonemizer_type { + TTS_PHONEMIZER = 0, + ESPEAK = 1, +}; + +std::string parse_voice_code(std::string voice_code); +void update_voice(std::string voice_code); +const std::unordered_set inline_combine_sets(const std::vector> sets); +int upper_count(std::string word); +bool is_all_upper(std::string word); +bool is_roman_numeral(char letter); +bool can_be_roman_numeral(std::string word); +bool is_alphabetic(char letter); +bool is_numeric(char letter); + + +std::string replace_accents(std::string word); +std::string build_subthousand_phoneme(int value); +std::string build_number_phoneme(long long int remainder); + +// The conditions struct is used to track and describe stateful criteria while converting text to phonemes. +struct conditions { + bool hyphenated = false; + bool was_all_capitalized = false; + bool was_word = false; + bool was_punctuated_acronym = false; + bool was_number = false; + bool beginning_of_clause = true; + + void reset_for_clause_end(); + void reset_for_space(); + void update_for_word(std::string word,bool allow_for_upper_check = true); +}; + +/* + * The corpus struct is simply a small wrapper class that is used to perform simple look forward and backwards in the text + * which is being phonemized. This can be used to discern how to convert chunks of text in a consistent and protective fashion + * in order to accurately phonemize complicated text. + */ +struct corpus { + corpus(const char * text, size_t size): size(size), text(text) {}; + size_t location = 0; + size_t size; + const char * text; + + /* + * These all return strings because we are parsing in utf-8. As such the count variables passed to all the functions do not represent + * the byte offset to pull to but rather the number of full utf-8 characters to pull (this can include 2, 3, and 4 byte characters). + */ + std::string next(int count = 1); + std::string last(int count = 1); + std::string pop(int count = 1); + std::string after(int after = 1, int count = 1); + + // this is used for popping byte count rather than unique character count. 
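+    // Illustrative example (assuming size_pop returns the consumed bytes, as
+    // pop does): for the text "héllo", pop(2) consumes the two characters "hé"
+    // (three bytes, 'é' being a two-byte UTF-8 sequence), while size_pop(3)
+    // consumes three bytes and yields the same "hé".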
+ std::string size_pop(size_t pop_size); + + std::string next_in(std::string val, bool* has_accent = nullptr); + std::string pop_in(std::string val); + + std::string after_until(int after, std::string val); +}; + +/* + * The TTS phonemizer works by splitting each word into distinct graphemes, and for each grapheme the phonemizer will look at the grapheme that came + * before, after, and for any word specific exceptions in order to compile a + */ +struct phonemizer_rule { + ~phonemizer_rule() { + for (auto it : rules) { + delete it.second; + } + } + + std::unordered_map rules; + std::string value = ""; + std::string lookup_rule(std::vector & keys, int index); +}; + +typedef std::unordered_map rules_lookup; + +struct word_phonemizer { + word_phonemizer(struct single_pass_tokenizer * tokenizer): tokenizer(tokenizer) {}; + ~word_phonemizer() { + for (auto it : rules) { + delete it.second; + } + delete tokenizer; + } + + struct single_pass_tokenizer * tokenizer; + rules_lookup rules; + + std::string phonemize(std::string word); + void add_rule(std::vector keys, std::string phoneme); + +private: + std::string lookup_rule(std::string word, std::string current_token, std::string last_token, std::string next_token); +}; + +struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta); + +/* + * The general translation approach that espeak uses is to lookup words in the dictionary and return a list of possible matches per lookup. + * Each match contains flags which describe the match's conditions and limitations and optionally a pronunciation. When a pronunciation is not returned, + * it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a + * token representation of a different word (e.g. with numbers). + * + * Since it does not make sense to have the core lexer reperform this lookup operation with represented words or via distinct languages, those behaviors + * are managed by the lookup operation itself and thus the lookup operation will only fail when phonetic or acronym content should be produced. + */ +struct dictionary_response { + dictionary_response(lookup_code code, std::string value = ""): code(code), value(value) {} + std::string value; + lookup_code code; + bool expects_to_be_proceeded_by_number = false; + bool not_at_clause_end = false; + bool not_at_clause_start = false; + + std::string after_match = ""; + + bool is_successful(); + bool is_match(corpus* text, conditions* flags); +}; + +dictionary_response * response_from_string(std::string value, std::string key); + +struct phoneme_dictionary { + std::unordered_map> lookup_map; + dictionary_response* lookup(corpus* text,std::string value, conditions* flags); + dictionary_response* not_found_response = new dictionary_response(FAILURE_UNFOUND); + dictionary_response* phonetic_fallback_response = new dictionary_response(FAILURE_PHONETIC); +}; + +struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta); + +/* + * In general, I would like to avoid requiring the installation of otherwise broad and technically complicated libraries, + * like espeak, especially when they are only being used for a small portion of their overall functionality. While avoiding these + * requirements will keep the default installation cost of TTS.cpp down, it is also unlikely that TTS.cpp will support + * the level of variability in phonemization that espeak currently does. In this regard, I have chosen to optionally support usage of + * espeak. 
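+ * (Espeak integration is compiled in only when ESPEAK_INSTALL is defined, and
+ * all espeak calls are funneled through the espeak_wrapper singleton above,
+ * since espeak-ng keeps global state and is not thread safe.)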
As such, the phonemizer struct described below will support simple text to IPA phoneme functionality out of the box, + * while also optionally acting as an interface for espeak phonemization. + * + * Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context + * views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves + * effectively like a simple router lexer. It will only support utf-8 encoded text and English IPA conversion. + */ +struct phonemizer { + phonemizer(struct phoneme_dictionary * dict, struct word_phonemizer * phonetic_phonemizer, bool preserve_punctuation = true): dict(dict), phonetic_phonemizer(phonetic_phonemizer), preserve_punctuation(preserve_punctuation) {}; + ~phonemizer() { + delete dict; + delete phonetic_phonemizer; + } + const std::unordered_set small_english_words = inline_combine_sets({THREE_LETTER_WORDS, TWO_LETTER_WORDS, ONE_LETTER_WORDS}); + std::string separator = " "; + phoneme_type phoneme_mode = IPA; + phonemizer_type mode = TTS_PHONEMIZER; + bool preserve_punctuation = true; + + struct phoneme_dictionary * dict; + + struct word_phonemizer * phonetic_phonemizer; + + void text_to_phonemes(std::string text, std::string* output); + void text_to_phonemes(const char * text, size_t size, std::string* output); + std::string text_to_phonemes(std::string text); + std::string text_to_phonemes(const char * text, size_t size); + +#ifdef ESPEAK_INSTALL + std::string espeak_text_to_phonemes(const char * text); +#endif + + bool process_word(corpus* text, std::string* output, std::string word, conditions * flags, bool has_accent = false); + void append_numeric_series(std::string series, std::string* output, conditions * flags); + bool is_acronym_like(corpus* text, std::string word, conditions* flags); + + bool route(corpus* text, std::string* output, conditions* flags); + bool handle_space(corpus* text, std::string* output, conditions* flags); + bool handle_contraction(corpus* text, std::string* output, conditions* flags); + bool handle_possession_plural(corpus* text, std::string* output, conditions* flags); + bool handle_replacement(corpus* text, std::string next, std::string* output, conditions * flags); + bool handle_phonetic(corpus* text, std::string word, std::string* output, conditions* flags, size_t unaccented_size_difference); + bool handle_acronym(corpus* text, std::string word, std::string* output, conditions * flags); + bool handle_roman_numeral(corpus* text, std::string* output, conditions * flags); + bool handle_word(corpus* text, std::string* output, conditions* flags); + bool handle_numeric_series(corpus* text, std::string* output, conditions* flags); + bool handle_numeric(corpus* text, std::string* output, conditions* flags); + bool handle_punctuation(corpus* text, std::string next, std::string* output, conditions* flags); + bool handle_unknown(corpus* text); +}; + +struct phonemizer * phonemizer_from_gguf(gguf_context * meta, const std::string espeak_voice_code = "gmw/en-US"); +struct phonemizer * phonemizer_from_file(const std::string fname, const std::string espeak_voice_code = "gmw/en-US"); +struct phonemizer * espeak_phonemizer(bool use_espeak_phonemes = false, std::string espeak_voice_code = "gmw/en-US"); + +#endif diff --git a/otherarch/ttscpp/include/tts.h b/otherarch/ttscpp/include/tts.h new file mode 100644 index 000000000..30e98dc2e --- /dev/null +++ b/otherarch/ttscpp/include/tts.h @@ -0,0 +1,34 @@
+#ifndef tts_h +#define tts_h + +#include "parler_model.h" +#include "kokoro_model.h" +#include "dia_model.h" +#include "orpheus_model.h" +#include +#include +#include + +struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); +struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); +struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); +struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); +struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only = true); +int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config); +void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only = true); +std::vector list_voices(tts_runner * runner); + +struct quantization_params { + quantization_params(uint32_t n_threads, enum ggml_type quantize_type): n_threads(n_threads), quantize_type(quantize_type) {}; + uint32_t n_threads; + enum ggml_type quantize_type; // quantization type + bool quantize_output_heads = false; + bool quantize_text_embeddings = false; + bool quantize_cross_attn_kv = false; + bool convert_dac_to_f16 = false; + bool convert_non_quantizable_to_f16 = false; +}; + +void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params); + +#endif diff --git a/otherarch/ttscpp/include/ttsargs.h b/otherarch/ttscpp/include/ttsargs.h new file mode 100644 index 000000000..c89f384ca --- /dev/null +++ b/otherarch/ttscpp/include/ttsargs.h @@ -0,0 +1,115 @@ +#ifndef args_h +#define args_h + +#include +#include +#include + +struct arg { + std::string full_name; + std::string abbreviation = ""; + std::string description = ""; + bool required = false; + bool has_param = false; + + std::string help_text(); +}; + +struct bool_arg : public arg { + bool_arg(std::string fn, std::string desc = "", std::string abbr = "", bool req = false, bool val = false) { + full_name = fn; + description = desc; + abbreviation = abbr; + required = req; + value = val; + }; + + bool value = false; +}; + +struct string_arg : public arg { + string_arg(std::string fn, std::string desc = "", std::string abbr = "", bool req = false, std::string val = "") { + full_name = fn; + description = desc; + abbreviation = abbr; + required = req; + value = val; + }; + bool has_param = true; + std::string value; + + int parse(int argc, const char ** argv); +}; + +struct int_arg : public arg { + int_arg(std::string fn, std::string desc = "", std::string abbr = "", bool req = false, int * val = nullptr) { + full_name = fn; + description = desc; + abbreviation = abbr; + required = req; + value = val; + }; + + int * value; + + int parse(int argc, const char ** argv); + +}; + +struct float_arg : public arg { + float_arg(std::string fn, std::string desc = "", std::string abbr = "", bool req = false, float * val = nullptr) { + full_name = fn; + description = desc; + abbreviation = abbr; + required = req; + value = val; + }; + + bool has_param = true; + float * value; 
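+    // Note: 'value' points at caller-owned storage supplied to the constructor,
+    // so a successful parse() writes the result directly into the caller's float.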
+ + int parse(int argc, const char ** argv); +}; + +struct arg_list { + std::vector fargs; + std::vector iargs; + std::vector bargs; + std::vector sargs; + bool for_help = false; + + void add_argument(float_arg arg) { + fargs.push_back(arg); + } + + void add_argument(int_arg arg) { + iargs.push_back(arg); + } + + void add_argument(bool_arg arg) { + bargs.push_back(arg); + } + + void add_argument(string_arg arg) { + sargs.push_back(arg); + } + + void help(); + + void validate(); + + void parse(int argc, const char ** argv); + + int find_and_parse(std::string name, int argc, const char ** argv); + + std::string get_string_param(std::string full_name); + + int * get_int_param(std::string full_name); + + float * get_float_param(std::string full_name); + + bool get_bool_param(std::string full_name); +}; + +#endif + diff --git a/otherarch/ttscpp/include/ttscommon.h b/otherarch/ttscpp/include/ttscommon.h new file mode 100644 index 000000000..c3a1a1c80 --- /dev/null +++ b/otherarch/ttscpp/include/ttscommon.h @@ -0,0 +1,80 @@ +#ifndef common_h +#define common_h + +#include +#include +#include +#include + +// Using this simple struct as opposed to a common std::vector allows us to return the cpu buffer +// pointer directly rather than copying the contents of the buffer to a predefined std::vector. +struct tts_response { + float * data; + size_t n_outputs = 0; + uint32_t hidden_size; // this parameter is only currently used by the t5_encoder for which n_outputs corresponds to sequence length; +}; + +enum tts_arch { + PARLER_TTS_ARCH = 0, + KOKORO_ARCH = 1, + DIA_ARCH = 2, + ORPHEUS_ARCH = 3, +}; + +const std::map SUPPORTED_ARCHITECTURES = { + { "parler-tts", PARLER_TTS_ARCH }, + { "kokoro", KOKORO_ARCH }, + { "dia", DIA_ARCH }, + { "orpheus", ORPHEUS_ARCH } +}; + +/// Given a map from keys to values, creates a new map from values to keys +template +static std::map reverse_map(const std::map& m) { + std::map r; + for (const auto& kv : m) { + r[kv.second] = kv.first; + } + return r; +} + +const std::map ARCHITECTURE_NAMES = reverse_map(SUPPORTED_ARCHITECTURES); + +struct generation_configuration { + generation_configuration( + std::string voice = "", + int top_k = 50, + float temperature = 1.0, + float repetition_penalty = 1.0, + bool use_cross_attn = true, + std::string espeak_voice_id = "", + int max_tokens = 0, + float top_p = 1.0, + bool sample = true): top_k(top_k), temperature(temperature), repetition_penalty(repetition_penalty), use_cross_attn(use_cross_attn), sample(sample), voice(voice), espeak_voice_id(espeak_voice_id), max_tokens(max_tokens), top_p(top_p) {}; + + bool use_cross_attn; + float temperature; + float repetition_penalty; + float top_p; + int top_k; + int max_tokens; + std::string voice = ""; + bool sample = true; + std::string espeak_voice_id = ""; +}; + +struct tts_runner { + tts_arch arch; + struct ggml_context * ctx = nullptr; + float sampling_rate = 44100.0f; + bool supports_voices = false; + + std::string arch_name() { + return ARCHITECTURE_NAMES.at(arch); + } + + void init_build(std::vector* buf_compute_meta); + void free_build(); +}; + +#endif diff --git a/otherarch/ttscpp/src/args.cpp b/otherarch/ttscpp/src/args.cpp new file mode 100644 index 000000000..3bbc8b1f7 --- /dev/null +++ b/otherarch/ttscpp/src/args.cpp @@ -0,0 +1,164 @@ +#include "ttsargs.h" + +std::string arg::help_text() { + std::string htxt = full_name; + if (abbreviation != "") { + htxt += " (" + abbreviation + ")"; + } + htxt += ":\n "; + if (description != "") { + htxt += description + "\n"; + } else { + 
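+        // No description was given, so fall back to a generic line. The
+        // (std::string) cast is what makes operator+ concatenation legal here;
+        // without it this would be (const char *) + (const char *), which does
+        // not compile.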
htxt += "is a " + (std::string)(required ? "required " : "optional ") + "parameter.\n"; + } + return htxt; +} + +int string_arg::parse(int argc, const char ** argv) { + required = false; + value.assign(argv[0]); + return 1; +} + +int int_arg::parse(int argc, const char ** argv) { + if (required) { + required = false; + } + int val = atoi(argv[0]); + *value = val; + return 1; +} + +int float_arg::parse(int argc, const char ** argv) { + if (required) { + required = false; + } + float val = strtof(argv[0], nullptr); + *value = val; + return 1; +} + +void arg_list::help() { + std::string help_text = ""; + for (auto arg : fargs) { + help_text += arg.help_text(); + } + for (auto arg : iargs) { + help_text += arg.help_text(); + + } + for (auto arg : bargs) { + help_text += arg.help_text(); + + } + for (auto arg : sargs) { + help_text += arg.help_text(); + + } + fprintf(stdout, "%s", help_text.c_str()); +} + +void arg_list::validate() { + for (auto arg : fargs) { + if (arg.required) { + fprintf(stderr, "argument '%s' is required.\n", arg.full_name.c_str()); + exit(1); + } + } + for (auto arg : iargs) { + if (arg.required) { + fprintf(stderr, "argument '%s' is required.\n", arg.full_name.c_str()); + exit(1); + } + } + for (auto arg : bargs) { + if (arg.required) { + fprintf(stderr, "argument '%s' is required.\n", arg.full_name.c_str()); + exit(1); + } + } + for (auto arg : sargs) { + if (arg.required) { + fprintf(stderr, "argument '%s' is required.\n", arg.full_name.c_str()); + exit(1); + } + } +} + +void arg_list::parse(int argc, const char ** argv) { + int current_arg = 1; + while (current_arg < argc) { + std::string name(argv[current_arg]); + if (name == "--help") { + for_help = true; + return; + } + current_arg += 1; + current_arg += find_and_parse(name, argc - current_arg, argv + current_arg); + } +} + +int arg_list::find_and_parse(std::string name, int argc, const char ** argv) { + for (int i = 0; i < fargs.size(); i++) { + if (fargs[i].full_name == name || fargs[i].abbreviation == name) { + return fargs[i].parse(argc, argv); + } + } + for (int i = 0; i < iargs.size(); i++) { + if (iargs[i].full_name == name || iargs[i].abbreviation == name) { + return iargs[i].parse(argc, argv); + } + } + for (int i = 0; i < bargs.size(); i++) { + if (bargs[i].full_name == name || bargs[i].abbreviation == name) { + bargs[i].value = !bargs[i].value; + bargs[i].required = false; + return 0; + } + + } + for (int i = 0; i < sargs.size(); i++) { + if (sargs[i].full_name == name || sargs[i].abbreviation == name) { + return sargs[i].parse(argc, argv); + } + } + fprintf(stderr, "argument '%s' is not a valid argument. 
Call '--help' for information on all valid arguments.\n", name.c_str()); + exit(1); +} + +std::string arg_list::get_string_param(std::string full_name) { + for (auto arg : sargs) { + if (arg.full_name == full_name) { + return arg.value; + } + } + return ""; +} + +int * arg_list::get_int_param(std::string full_name) { + for (auto arg : iargs) { + if (arg.full_name == full_name) { + return arg.value; + } + } + return nullptr; +} + +float * arg_list::get_float_param(std::string full_name) { + for (auto arg : fargs) { + if (arg.full_name == full_name) { + return arg.value; + } + } + return nullptr; +} + +bool arg_list::get_bool_param(std::string full_name) { + for (auto arg : bargs) { + if (arg.full_name == full_name) { + return arg.value; + } + } + return false; +} + diff --git a/otherarch/ttscpp/src/dac_model.cpp b/otherarch/ttscpp/src/dac_model.cpp new file mode 100644 index 000000000..2ab640cec --- /dev/null +++ b/otherarch/ttscpp/src/dac_model.cpp @@ -0,0 +1,212 @@ +#include "dac_model.h" +#include +#include + +// For loading DAC model from gguf file. +static const std::map DAC_TENSOR_GGUF_LOOKUP = { + {"initial.bias", DAC_ENCODER_IN_BIAS}, + {"initial.weight", DAC_ENCODER_IN_KERNEL}, + {"final.bias", DAC_ENCODER_OUT_BIAS}, + {"final.weight", DAC_ENCODER_OUT_KERNEL}, + {"final.alpha", DAC_ENCODER_SNAKE_ALPHA}, +}; + +void dac_model::prep_constants(gguf_context * meta) { + int output_heads_key = search_for_gguf_keys(meta, {"parler-tts.decoder.output_heads", "output_heads", "dia.decoder.output_heads"}); + if (output_heads_key != -1) { + n_heads = gguf_get_val_u32(meta, output_heads_key); + } + + int sampling_factor_key = search_for_gguf_keys(meta, {"dac.up_sampling_factor", "up_sampling_factor"}); + if (sampling_factor_key != -1) { + up_sampling_factor = gguf_get_val_u32(meta, sampling_factor_key); + } + + int max_gen_key = search_for_gguf_keys(meta, {"parler-tts.decoder.max_generation", "max_generation", "dia.decoder.max_generation"}); + if (max_gen_key != -1) { + max_generation_size = gguf_get_val_u32(meta, max_gen_key); + } +} + +void dac_model::prep_layers(gguf_context * meta) { + for (int i = 0; i < n_heads; i++) { + quantizer_layers.push_back(general_neural_audio_codec::residual_vector_quantize_layer{}); + } + + for (int i = 0; i < n_layers; i++) { + std::string stride_key = "dac_layer_stride_" + std::to_string(i); + std::string padding_key = "dac_layer_padding_" + std::to_string(i); + int layer_stride_key = search_for_gguf_keys(meta, {"dac." + stride_key, stride_key}); + if (layer_stride_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the DAC audio decoder.", stride_key.c_str()); + } + int layer_padding_key = search_for_gguf_keys(meta, {"dac." 
+ padding_key, padding_key}); + if (layer_padding_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the DAC audio decoder.", padding_key.c_str()); + } + layers.push_back( + general_neural_audio_codec::layer{ + gguf_get_val_u32(meta, layer_padding_key), + gguf_get_val_u32(meta, layer_stride_key), + } + ); + } +} + +void dac_model::assign_weight(std::string name, ggml_tensor * tensor) { + assign_to_audio_encoder(this, name, tensor); +} + +void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * tensor) { + if (DAC_TENSOR_GGUF_LOOKUP.find(name) != DAC_TENSOR_GGUF_LOOKUP.end()) { + switch(DAC_TENSOR_GGUF_LOOKUP.at(name)) { + case DAC_ENCODER_IN_BIAS: + model->in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); + model->set_tensor(model->in_conv_bias, tensor); + break; + case DAC_ENCODER_IN_KERNEL: + model->in_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->in_conv_kernel, tensor); + break; + case DAC_ENCODER_OUT_BIAS: + model->out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); + model->set_tensor(model->out_conv_bias, tensor); + break; + case DAC_ENCODER_OUT_KERNEL: + model->out_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->out_conv_kernel, tensor); + break; + case DAC_ENCODER_SNAKE_ALPHA: + model->snake_alpha = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->snake_alpha, tensor); + break; + default: + fprintf(stdout, "unassigned tensor %s\n", name.c_str()); + break; + } + } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) { + auto pair = parse_layer_count(name); + int l = pair.first; + std::string lt_name = pair.second; + if (name.find("quantizers") != std::string::npos) { + general_neural_audio_codec::assign_to_quantize_layer((tts_model *) model, model->quantizer_layers[l], lt_name, tensor); + } else { + general_neural_audio_codec::assign_to_layer((tts_model *) model, model->layers[l - 1], lt_name, tensor); + } + } +} + +static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers) { + struct ggml_tensor * embd; + + dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length*dctx->model->n_heads); + ggml_set_input(dctx->inp_tokens); + + if (dctx->backend) { + ggml_backend_sched_set_tensor_backend(dctx->sched, dctx->inp_tokens, dctx->backend); + } + + for(int i = 0; i < dctx->model->n_heads; i++) { + auto quantize_layer = dctx->model->quantizer_layers[i]; + struct ggml_tensor * code = ggml_cont(ctx, ggml_view_2d(ctx, dctx->inp_tokens, 1, batch.sequence_length, dctx->model->n_heads*ggml_type_size(GGML_TYPE_I32), i*ggml_type_size(GGML_TYPE_I32))); + code = ggml_reshape_1d(ctx, code, batch.sequence_length); + code = general_neural_audio_codec::build_quantize_layer(ctx, code, quantize_layer); + + if (i == 0) { + embd = code; + } else { + embd = ggml_add(ctx, embd, code); + } + } + return embd; +} + +struct dac_context * build_new_dac_context(struct dac_model * model, int n_threads, bool use_cpu) { + dac_context * dctx = new dac_context(model, n_threads); + if (!use_cpu) { +#ifdef GGML_USE_METAL + dctx->backend = ggml_backend_metal_init(); +#endif + } + dctx->backend_cpu = ggml_backend_cpu_init(); + dctx->set_threads(); + dctx->build_schedule(); + dctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false)); 
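+    // Note: buf_compute_meta only stores ggml tensor and graph metadata; the
+    // actual compute buffers are reserved by the backend scheduler, so the size
+    // here is just max_nodes tensor overheads plus one graph's bookkeeping.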
+    return dctx;
+}
+
+void dac_runner::prepare_post_load() {
+    dac_ubatch batch;
+    batch.sequence_length = model->max_generation_size;
+    ggml_cgraph * gf = build_dac_graph(batch);
+    dctx->prep_schedule(gf);
+}
+
+struct ggml_cgraph * dac_runner::build_dac_graph(dac_ubatch & batch) {
+    init_build();
+    // splitting this out from the primary graph so that we can better manage streaming (i.e. sentence chunks are better performed this way)
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inputs;
+
+    inputs = dac_build_audio_inputs(ctx, dctx, batch, model->quantizer_layers);
+    ggml_set_name(inputs, "quantized_inputs");
+
+    // everything besides the inputs is just a forward pass
+    cur = ggml_conv_1d_tts(ctx, model->in_conv_kernel, inputs, 1, 3, 1);
+    cur = ggml_add(ctx, cur, model->in_conv_bias);
+    for (auto l : model->layers) {
+        cur = general_neural_audio_codec::build_layer(ctx, cur, l);
+    }
+    cur = snake_1d(ctx, model->snake_alpha, cur);
+    cur = ggml_conv_1d_tts(ctx, model->out_conv_kernel, cur, 1, 3, 1);
+    cur = ggml_add(ctx, cur, model->out_conv_bias);
+    cur = ggml_tanh(ctx, cur);
+    ggml_build_forward_expand(gf, cur);
+    free_build();
+    return gf;
+}
+
+void dac_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs) {
+    dac_ubatch batch;
+    batch.input_tokens = input_tokens;
+    batch.sequence_length = sequence_length;
+    ggml_backend_sched_reset(dctx->sched);
+
+    const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
+    const size_t new_size = model->max_generation_size * model->up_sampling_factor * sizeof(float);
+
+    if (!dctx->buf_output || prev_size < new_size) {
+        if (dctx->buf_output) {
+            ggml_backend_buffer_free(dctx->buf_output);
+            dctx->buf_output = nullptr;
+            dctx->logits = nullptr;
+        }
+
+        dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
+    }
+
+    outputs->data = (float *) ggml_backend_buffer_get_base(dctx->buf_output);
+    ggml_backend_buffer_clear(dctx->buf_output, 0);
+
+    struct ggml_cgraph * gf = NULL;
+    gf = build_dac_graph(batch);
+
+    // the output is always the last tensor in the graph
+    struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
+    ggml_backend_sched_alloc_graph(dctx->sched, gf);
+
+    ggml_backend_tensor_set(dctx->inp_tokens, batch.input_tokens, 0, batch.sequence_length*model->n_heads*ggml_element_size(dctx->inp_tokens));
+
+    ggml_backend_sched_graph_compute_async(dctx->sched, gf);
+
+    dctx->get_ggml_node_data(result, outputs->data, batch.sequence_length*sizeof(float)*model->up_sampling_factor);
+
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(dctx->sched);
+    outputs->n_outputs = sequence_length * model->up_sampling_factor;
+    return;
+}
+
diff --git a/otherarch/ttscpp/src/dac_model.h b/otherarch/ttscpp/src/dac_model.h
new file mode 100644
index 000000000..be43ad02d
--- /dev/null
+++ b/otherarch/ttscpp/src/dac_model.h
@@ -0,0 +1,98 @@
+#ifndef dac_model_h
+#define dac_model_h
+
+#include "general_neural_audio_codec.h"
+#include <vector>
+
+enum dac_tensor {
+    DAC_ENCODER_IN_KERNEL,
+    DAC_ENCODER_IN_BIAS,
+    DAC_ENCODER_OUT_KERNEL,
+    DAC_ENCODER_OUT_BIAS,
+    DAC_ENCODER_SNAKE_ALPHA,
+};
+
+struct dac_quantize_layer {
+    struct ggml_tensor * out_proj_kernel;
+    struct ggml_tensor * out_proj_bias;
+    struct ggml_tensor * codebook;
+};
+
+// DAC, Descript Audio Codec, is a channel token to audio autoencoder model (though we only use its decoder functionality).
+// this struct maintains the static tensors for the dac audio decoder graph.
+// As such, this is designed to contain basic configuration and ggml tensor support for DAC.
+// The dac_runner describes how the graph is built and run.
+struct dac_model : tts_model {
+    // These configs are essentially built for the 44kHz 8kbps standard DAC model audio encoder and decoder
+    uint32_t n_layers = 4;
+    uint32_t n_heads = 9;
+    uint32_t up_sampling_factor = 512;
+    uint32_t max_generation_size = 2580;
+
+    struct ggml_tensor * in_conv_kernel;
+    struct ggml_tensor * in_conv_bias;
+    struct ggml_tensor * out_conv_kernel;
+    struct ggml_tensor * out_conv_bias;
+    struct ggml_tensor * snake_alpha;
+    std::vector<general_neural_audio_codec::layer> layers;
+    std::vector<general_neural_audio_codec::residual_vector_quantize_layer> quantizer_layers;
+
+    void assign_weight(std::string name, ggml_tensor * weight);
+    void prep_constants(gguf_context * meta);
+    void prep_layers(gguf_context * meta);
+    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) {
+        prep_layers(meta_ctx);
+        prep_constants(meta_ctx);
+        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "audio_encoder");
+    }
+};
+
+// for loading DAC model from gguf file
+void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * tensor);
+
+// the context used for running the dac model
+struct dac_context : runner_context {
+    dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {};
+
+    struct dac_model * model;
+
+    struct ggml_tensor * inp_tokens;
+
+    void build_schedule() {
+        runner_context::build_schedule(model->max_nodes());
+    }
+};
+
+struct dac_context * build_new_dac_context(struct dac_model * model, int n_threads, bool use_cpu = true);
+
+struct dac_ubatch {
+    uint32_t * input_tokens;
+    uint32_t sequence_length;
+};
+
+static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector<general_neural_audio_codec::residual_vector_quantize_layer> layers);
+
+// This struct is intended to manage the dac model's graph compilation and compute function.
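+// A minimal usage sketch (hypothetical caller code, not part of this file; it assumes a
+// loader has already assigned all of the model weights):
+//   dac_model * model = new dac_model();
+//   model->setup_from_file(meta_ctx, load_ctx, /*cpu_only=*/ true);
+//   dac_runner runner(model, build_new_dac_context(model, n_threads));
+//   runner.prepare_post_load();
+//   tts_response out;
+//   runner.run(tokens, n_tokens, &out); // out.data then holds the decoded float samples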
+struct dac_runner : tts_runner {
+    dac_runner(dac_model * model, dac_context * context): model(model), dctx(context) {};
+    ~dac_runner() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+        model->free();
+        delete model;
+        delete dctx;
+    }
+    dac_model * model;
+    dac_context * dctx;
+
+    void init_build() {
+        tts_runner::init_build(&dctx->buf_compute_meta);
+    }
+
+    void prepare_post_load();
+    struct ggml_cgraph * build_dac_graph(dac_ubatch & batch);
+    void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);
+};
+
+#endif
diff --git a/otherarch/ttscpp/src/dia_model.cpp b/otherarch/ttscpp/src/dia_model.cpp
new file mode 100644
index 000000000..bd6dfd43a
--- /dev/null
+++ b/otherarch/ttscpp/src/dia_model.cpp
@@ -0,0 +1,911 @@
+#include "dia_model.h"
+
+void dia_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
+    std::vector<std::string> parts = split(name, ".");
+    TTS_ASSERT(parts.size() >= 3);
+
+    if (parts[1] == "encoder") {
+        assign_to_encoder(parts, tensor, name);
+    } else if (parts[1] == "decoder"){
+        assign_to_decoder(parts, tensor, name);
+    } else {
+        TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
+    }
+}
+
+void dia_model::assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name) {
+    if (parts[2] == "embedding") {
+        encoder->embedding = ggml_dup_tensor(ctx, tensor);
+        set_tensor(encoder->embedding, tensor);
+    } else if (parts[2] == "norm") {
+        encoder->norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(encoder->norm, tensor);
+    } else if (parts[2] == "layers") {
+        TTS_ASSERT(parts.size() >= 5);
+        int index = std::stoi(parts[3]);
+        TTS_ASSERT(index < encoder->layers.size());
+        assign_to_encoder_layer(parts[4], encoder->layers[index], tensor);
+    } else {
+        TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
+    }
+}
+
+void dia_model::assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name) {
+    if (parts[2] == "embeddings") {
+        TTS_ASSERT(parts.size() > 3);
+        int index = std::stoi(parts[3]);
+        TTS_ASSERT(index < decoder->embds.size());
+        decoder->embds[index] = ggml_dup_tensor(ctx, tensor);
+        set_tensor(decoder->embds[index], tensor);
+    } else if (parts[2] == "norm") {
+        decoder->norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(decoder->norm, tensor);
+    } else if (parts[2] == "heads") {
+        TTS_ASSERT(parts.size() > 3);
+        int index = std::stoi(parts[3]);
+        TTS_ASSERT(index < decoder->heads.size());
+        decoder->heads[index] = ggml_dup_tensor(ctx, tensor);
+        set_tensor(decoder->heads[index], tensor);
+    } else if (parts[2] == "layers") {
+        TTS_ASSERT(parts.size() >= 5);
+        int index = std::stoi(parts[3]);
+        TTS_ASSERT(index < decoder->layers.size());
+        assign_to_decoder_layer(parts[4], decoder->layers[index], tensor);
+    } else {
+        TTS_ABORT("Unrecognized tensor '%s' when loading Dia from GGUF file.", name.c_str());
+    }
+}
+
+void dia_model::assign_to_encoder_layer(std::string part, dia_encoder_layer * layer, struct ggml_tensor * tensor) {
+    if (part == "q_proj") {
+        layer->q = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->q, tensor);
+    } else if (part == "k_proj") {
+        layer->k = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->k, tensor);
+    } else if (part == "v_proj") {
+        layer->v = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->v, tensor);
+    } else if (part == "o_proj") {
+        layer->o = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->o, tensor);
+    } else if (part == "pre_sa_norm") {
+        layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->self_attn_norm, tensor);
+    } else if (part == "post_sa_norm") {
+        layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->mlp_norm, tensor);
+    } else if (part == "gate") {
+        layer->gate = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->gate, tensor);
+    } else if (part == "up") {
+        layer->up = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->up, tensor);
+    } else if (part == "wo") {
+        layer->out = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->out, tensor);
+    } else {
+        TTS_ABORT("Unrecognized tensor '%s' for encoder layer when loading Dia from GGUF file.", part.c_str());
+    }
+}
+
+void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * layer, struct ggml_tensor * tensor) {
+    if (part == "self_q_proj") {
+        layer->self_attn_q = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->self_attn_q, tensor);
+    } else if (part == "self_k_proj") {
+        layer->self_attn_k = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->self_attn_k, tensor);
+    } else if (part == "self_v_proj") {
+        layer->self_attn_v = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->self_attn_v, tensor);
+    } else if (part == "self_o_proj") {
+        layer->self_attn_o = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->self_attn_o, tensor);
+    } else if (part == "cross_q_proj") {
+        layer->cross_attn_q = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->cross_attn_q, tensor);
+    } else if (part == "cross_k_proj") {
+        layer->cross_attn_k = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->cross_attn_k, tensor);
+    } else if (part == "cross_v_proj") {
+        layer->cross_attn_v = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->cross_attn_v, tensor);
+    } else if (part == "cross_o_proj") {
+        layer->cross_attn_o = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->cross_attn_o, tensor);
+    } else if (part == "pre_sa_norm") {
+        layer->self_attn_norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->self_attn_norm, tensor);
+    } else if (part == "pre_mlp_norm") {
+        layer->mlp_norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->mlp_norm, tensor);
+    } else if (part == "pre_ca_norm") {
+        layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->cross_attn_norm, tensor);
+    } else if (part == "gate") {
+        layer->gate = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->gate, tensor);
+    } else if (part == "up") {
+        layer->up = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->up, tensor);
+    } else if (part == "wo") {
+        layer->out = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer->out, tensor);
+    } else {
+        TTS_ABORT("Unrecognized tensor '%s' for decoder layer when loading Dia from GGUF file.", part.c_str());
+    }
+}
+
+void dia_model::prep_layers() {
+    encoder = new dia_encoder;
+    decoder = new dia_decoder;
+    encoder->layers.reserve((size_t) n_encoder_layers);
+    for (int i = 0; i < (int) n_encoder_layers; i++) {
+        dia_encoder_layer * l = new dia_encoder_layer;
+        encoder->layers.push_back(l);
+    }
+
+    decoder->layers.reserve((size_t) n_decoder_layers);
+    for (int i = 0; i < (int) n_decoder_layers; i++) {
+        dia_decoder_layer * l = new dia_decoder_layer;
+        decoder->layers.push_back(l);
+    }
+
+    decoder->embds.reserve((size_t) n_output_heads);
+    decoder->heads.reserve((size_t) n_output_heads);
+    for (int i = 0; i < n_output_heads; i++) {
+        struct ggml_tensor * h = nullptr;
+        struct ggml_tensor * embd = nullptr;
+        decoder->embds.push_back(embd);
+        decoder->heads.push_back(h);
+    }
+}
+
+void dia_model::prep_constants(gguf_context * meta) {
+    int output_heads_key = gguf_find_key(meta, "dia.decoder.output_heads");
"dia.decoder.output_heads"); + if (output_heads_key != -1) { + n_output_heads = gguf_get_val_u32(meta, output_heads_key); + } + + int decoder_layers_key = gguf_find_key(meta, "dia.decoder.layers"); + if (decoder_layers_key != -1) { + n_decoder_layers = gguf_get_val_u32(meta, decoder_layers_key); + } + + int encoder_layers_key = gguf_find_key(meta, "dia.encoder.layers"); + if (encoder_layers_key != -1) { + n_encoder_layers = gguf_get_val_u32(meta, encoder_layers_key); + } + + int decoder_hidden_size_key = gguf_find_key(meta, "dia.decoder.hidden_size"); + if (decoder_hidden_size_key != -1) { + decoder_hidden_size = gguf_get_val_u32(meta, decoder_hidden_size_key); + } + + int decoder_attn_heads_key = gguf_find_key(meta, "dia.decoder.attn_heads"); + if (decoder_attn_heads_key != -1) { + decoder_attn_heads = gguf_get_val_u32(meta, decoder_attn_heads_key); + } + + int decoder_query_heads_key = gguf_find_key(meta, "dia.decoder.query_heads"); + if (decoder_query_heads_key != -1) { + decoder_query_heads = gguf_get_val_u32(meta, decoder_query_heads_key); + } + + int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads"); + if (encoder_attn_heads_key != -1) { + encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key); + } + + int head_size_key = gguf_find_key(meta, "dia.attn_head_size"); + if (head_size_key != -1) { + head_size = gguf_get_val_u32(meta, head_size_key); + } + + int eos_token_id_key = gguf_find_key(meta, "dia.eos_token_id"); + if (eos_token_id_key != -1) { + eos_token_id = gguf_get_val_u32(meta, eos_token_id_key); + } + + int bos_token_id_key = gguf_find_key(meta, "dia.bos_token_id"); + if (bos_token_id_key != -1) { + bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); + } + + int pad_token_id_key = gguf_find_key(meta, "dia.pad_token_id"); + if (pad_token_id_key != -1) { + pad_token_id = gguf_get_val_u32(meta, pad_token_id_key); + } + + int max_context_key = gguf_find_key(meta, "dia.encoder.max_context_length"); + if (max_context_key != -1) { + max_encoder_context_length = gguf_get_val_u32(meta, max_context_key); + } + + int output_vocab_size_key = gguf_find_key(meta, "dia.decoder.output_vocab_size"); + if (output_vocab_size_key != -1) { + output_vocab_size = gguf_get_val_u32(meta, output_vocab_size_key); + } + + int audio_vocab_size_key = gguf_find_key(meta, "dia.decoder.audio_vocab_size"); + if (audio_vocab_size_key != -1) { + audio_vocab_size = gguf_get_val_u32(meta, audio_vocab_size_key); + } + + int max_generation_size_key = gguf_find_key(meta, "dia.decoder.max_generation_size"); + if (max_generation_size_key != -1) { + max_generation_size = gguf_get_val_u32(meta, max_generation_size_key); + } + int max_delay_key = gguf_find_key(meta, "dia.max_delay"); + if (max_delay_key != -1) { + max_delay = gguf_get_val_u32(meta, max_delay_key); + } + + // please note that this value is not currently set in the gguf encoder as it effectively only exists as a default + // python parameter (rather than an attribute in the model config) for the python Dia model. 
+    int cfg_scale_key = gguf_find_key(meta, "dia.cfg_scale");
+    if (cfg_scale_key != -1) {
+        cfg_scale_data[0] = gguf_get_val_f32(meta, cfg_scale_key);
+    }
+}
+
+void dia_context::reset() {
+    current_position = 0;
+    prompt_size = 0;
+    output_tokens.clear();
+    delay_steps = -1;
+}
+
+struct dia_context * build_new_dia_context(struct dia_model * model, int n_threads, bool use_cpu) {
+    dia_context * dctx = new dia_context(model, n_threads);
+    if (!use_cpu) {
+#ifdef GGML_USE_METAL
+        dctx->backend = ggml_backend_metal_init();
+#endif
+    }
+    dctx->backend_cpu = ggml_backend_cpu_init();
+    dctx->set_threads();
+    dctx->build_schedule();
+    dctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
+    return dctx;
+}
+
+static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) {
+    ggml_backend_buffer_type_t buft = nullptr;
+    // this will only really support cpu or metal for the time being.
+    if (dctx->backend != nullptr) {
+#ifdef GGML_USE_METAL
+        buft = ggml_backend_metal_buffer_type();
+#endif
+    } else {
+        buft = ggml_backend_cpu_buffer_type();
+    }
+
+    struct ggml_init_params params = {
+        /*.mem_size =*/ (4u * model->n_decoder_layers + 1) * ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc =*/ true,
+    };
+    ggml_context * ctx = ggml_init(params);
+    if (!ctx) {
+        return false;
+    }
+    cache->ctx = ctx;
+
+    cache->k_l.reserve(model->n_decoder_layers);
+    cache->v_l.reserve(model->n_decoder_layers);
+    cache->cross_k_l.reserve(model->n_decoder_layers);
+    cache->cross_v_l.reserve(model->n_decoder_layers);
+
+    for (int i = 0; i < (int) model->n_decoder_layers; i++) {
+        struct ggml_tensor * k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
+        struct ggml_tensor * v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_generation_size * 2);
+        struct ggml_tensor * cross_k = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
+        struct ggml_tensor * cross_v = ggml_new_tensor_1d(cache->ctx, cache->tensor_type, model->head_size * model->decoder_attn_heads * model->max_encoder_context_length * 2);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        ggml_format_name(cross_k, "cache_cross_k_l%d", i);
+        ggml_format_name(cross_v, "cache_cross_v_l%d", i);
+        cache->k_l.push_back(k);
+        cache->v_l.push_back(v);
+        cache->cross_k_l.push_back(cross_k);
+        cache->cross_v_l.push_back(cross_v);
+    }
+
+    // allocate tensors and initialize the buffers to avoid NaNs in the padding
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(cache->ctx, buft);
+    if (!buf) {
+        return false;
+    }
+    ggml_backend_buffer_clear(buf, 0);
+    cache->buf = buf;
+
+    return true;
+}
+
+static struct ggml_tensor * build_dia_decoder_inp_embd(struct ggml_context * ctx, dia_context *dctx, dia_decoder * decoder, dia_ubatch & batch, uint32_t n_output_heads) {
+    struct ggml_tensor * input_embs;
+
+    dctx->audio_inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_output_heads * 2);
+    ggml_set_input(dctx->audio_inp_tokens);
+    for (int i = 0; i < n_output_heads; i++) {
+        struct ggml_tensor * view = ggml_view_1d(ctx, dctx->audio_inp_tokens, 2, i * ggml_element_size(dctx->audio_inp_tokens));
+        view->nb[0] = n_output_heads * ggml_element_size(dctx->audio_inp_tokens);
+        if (i == 0) {
+            input_embs = ggml_get_rows(ctx, decoder->embds[i], view);
+        } else {
+            input_embs = ggml_add(ctx, ggml_get_rows(ctx, decoder->embds[i], view), input_embs);
+        }
+    }
+    return input_embs;
+}
+
+static struct ggml_tensor * dia_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight) {
+    // dia always uses 1e-5 as the default eps
+    float eps = 0.00001;
+    inputs = ggml_rms_norm(ctx, inputs, eps);
+    return ggml_mul(ctx, inputs, weight);
+}
+
+static struct ggml_tensor * build_dia_encoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_model * model) {
+    dctx->encode_attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) model->max_encoder_context_length, (int64_t) model->max_encoder_context_length);
+    ggml_set_input(dctx->encode_attn_mask);
+
+    return dctx->encode_attn_mask;
+}
+
+static struct ggml_tensor * build_dia_head_outputs(struct ggml_context * ctx, dia_model * model, struct ggml_tensor * cur) {
+    // going to cat the heads together and then reshape them
+    struct ggml_tensor * out;
+    for (int i = 0; i < model->n_output_heads; i++) {
+        if (i == 0) {
+            out = ggml_mul_mat(ctx, model->decoder->heads[i], cur);
+        } else {
+            out = ggml_concat(ctx, out, ggml_mul_mat(ctx, model->decoder->heads[i], cur), 2);
+        }
+    }
+    struct ggml_tensor * cond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], 0));
+    struct ggml_tensor * uncond = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], out->ne[2], out->nb[2], out->nb[1]));
+    return ggml_map_custom2(ctx, cond, uncond, &cfg_scale, out->ne[0], &model->cfg_scale_data);
+}
+
+static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * model, dia_context * dctx, dia_ubatch & batch) {
+    dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length*2);
+    ggml_set_input(dctx->inp_tokens);
+
+    dctx->encode_positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model->max_encoder_context_length);
+    ggml_set_input(dctx->encode_positions);
+
+    struct ggml_tensor * attn_mask = build_dia_encoder_attn_mask(ctx, dctx, model);
+
+    struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2);
+    for (auto layer : model->encoder->layers) {
+        struct ggml_tensor * residual = cur;
+
+        cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
+        // self-attention
+        {
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->q, cur);
+            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->k, cur);
+            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->v, cur);
+
+            // Strangely, Dia follows the GPT-NeoX rotary positional embedding protocol
+            Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
+            Kcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Kcur, model->head_size, model->encoder_attn_heads, model->max_encoder_context_length, 2)), dctx->encode_positions, model->head_size, 2);
+            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
+            struct ggml_tensor * k = ggml_cont(ctx, ggml_permute(ctx, Kcur, 0, 2, 1, 3));
+            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+            kq = ggml_soft_max_ext(ctx, kq, attn_mask, 1.0f, 0.0f);
+            struct ggml_tensor * v = ggml_cont_4d(ctx, ggml_transpose(ctx, Vcur), model->max_encoder_context_length, model->head_size, model->encoder_attn_heads, 2);
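+            // the softmaxed scores are combined with the transposed value tensor below to
+            // finish the attention op before the heads are merged back together.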
+            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
+            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
+
+            // It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension
+            // then down project back to the encoder embedding dimension.
+            cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2);
+            cur = ggml_mul_mat(ctx, layer->o, cur);
+        }
+
+        cur = ggml_add(ctx, cur, residual);
+        struct ggml_tensor * residual_mlp = cur;
+
+        cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
+        // mlp
+        {
+            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
+            cur = ggml_mul_mat(ctx, layer->out, cur);
+        }
+
+        cur = ggml_add(ctx, cur, residual_mlp);
+    }
+
+    cur = dia_layer_norm(ctx, cur, model->encoder->norm);
+    return cur;
+}
+
+static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct ggml_tensor * a, int repeat) {
+    //return ggml_repeat(ctx, a, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], 4*a->ne[1], a->ne[2], a->ne[3]));
+    struct ggml_tensor * running;
+    for (int i = 0; i < a->ne[1]; i++) {
+        int offset = i * a->nb[1];
+        struct ggml_tensor * t = ggml_cont(ctx, ggml_view_4d(ctx, a, a->ne[0], 1, a->ne[2], a->ne[3], a->nb[1], a->nb[2], a->nb[3], offset));
+        t = ggml_repeat(ctx, t, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], repeat, a->ne[2], a->ne[3]));
+        if (i == 0) {
+            running = t;
+        } else {
+            running = ggml_concat(ctx, running, t, 1);
+        }
+    }
+    return running;
+}
+
+static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) {
+    int64_t attn_size = model->head_size * model->decoder_attn_heads;
+
+    struct ggml_tensor * k_cache_view =
+        ggml_view_2d(
+            ctx, kv->k_l[layer_index], attn_size, 2,
+            attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]),
+            attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index]));
+
+    k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
+    // Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
+    // If GGML supported a repeat_interleave op then it would be more optimal to store just the groups in the cache and interleave the attention heads after recalling
+    // from the cache
+    k = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);
+    k = ggml_cont(ctx, ggml_reshape_2d(ctx, k, attn_size, 2));
+
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));
+
+    struct ggml_tensor * v_cache_view = nullptr;
+
+    v_cache_view = ggml_view_2d(
+        ctx, kv->v_l[layer_index], attn_size, 2,
+        attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]),
+        attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index]));
+
+    // Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention.
+    // If GGML supported a repeat_interleave op then it would be more optimal to store just the groups in the cache and interleave the attention heads after recalling
+    // from the cache
+    v = repeat_interleave_dim1(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, v, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), model->decoder_query_heads);
+
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
+}
+
+static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) {
+    dia_decoder_layer * layer = model->decoder->layers[layer_index];
+    struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d(
+        ctx,
+        encoder_hidden_states,
+        model->encoder_hidden_size,
+        dctx->prompt_size,
+        2,
+        model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0));
+
+    struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view);
+    struct ggml_tensor * positions_view = ggml_view_1d(ctx, dctx->encode_positions, dctx->prompt_size, 0);
+
+    k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads, dctx->prompt_size, 2)), positions_view, model->head_size, 2);
+    k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 1, 3, 2));
+
+    struct ggml_tensor * k_cache_view =
+        ggml_view_4d(
+            ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size,
+            model->head_size*ggml_element_size(kv->cross_k_l[layer_index]),
+            model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]),
+            model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]),
+            0);
+
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, k, k_cache_view));
+
+    struct ggml_tensor * v = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, layer->cross_attn_v, encoder_hidden_states)));
+    v = ggml_cont_4d(ctx, v, model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2);
+
+    struct ggml_tensor * v_cache_view =
+        ggml_view_4d(
+            ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
+            model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
+            model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]),
+            model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]),
+            0);
+
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view));
+}
+
+static struct ggml_tensor * build_dia_decoder(
+        ggml_cgraph * gf,
+        ggml_context * ctx,
+        dia_model * model,
+        dia_context * dctx,
+        dia_kv_cache * cache,
+        dia_ubatch & batch,
+        struct ggml_tensor * encoder_hidden_states) {
+    dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length);
+    ggml_set_input(dctx->positions);
+    struct ggml_tensor * cur = build_dia_decoder_inp_embd(ctx, dctx, model->decoder, batch, model->n_output_heads);
+
+    for (int l = 0; l < model->decoder->layers.size(); l++){
+        dia_decoder_layer * layer = model->decoder->layers[l];
+        struct ggml_tensor * residual = cur;
+
+        cur = dia_layer_norm(ctx, cur, layer->self_attn_norm);
+        // self-attention
+        {
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, layer->self_attn_q, cur);
+            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, layer->self_attn_k, cur);
+            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, layer->self_attn_v, cur);
+
+            build_dia_self_kv_store(ctx, dctx, model, cache, gf, Kcur, Vcur, batch, l);
+            struct ggml_tensor * k =
+                ggml_view_4d(ctx, cache->k_l[l],
+                    model->head_size, model->decoder_attn_heads, dctx->current_position + 1, 2,
+                    ggml_element_size(cache->k_l[l]) * model->head_size,
+                    ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size,
+                    ggml_element_size(cache->k_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
+                    0);
+            k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));
+
+            struct ggml_tensor * v =
+                ggml_view_3d(ctx, cache->v_l[l],
+                    model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2,
+                    ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size,
+                    ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size,
+                    0);
+            v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2);
+
+            // As noted in the encoder, Dia uses the Neo-X protocol for RoPE.
+            Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
+            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
+            struct ggml_tensor * kq = ggml_mul_mat(ctx, ggml_cont(ctx, k), q);
+
+            // given that attention bias, scaling and masking are not used for decoding, it might be faster to prefer the #ggml_soft_max op here.
+            kq = ggml_soft_max_ext(ctx, kq, nullptr, 1.0f, 0.0f);
+            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
+            struct ggml_tensor * kqv_merged = ggml_cont(ctx, ggml_permute(ctx, kqv, 2, 0, 1, 3));
+            cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
+            cur = ggml_mul_mat(ctx, layer->self_attn_o, cur);
+        }
+
+
+        // if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
+        cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
+        cur = ggml_add(ctx, cur, residual);
+        struct ggml_tensor * residual_cross = cur;
+
+        cur = dia_layer_norm(ctx, cur, layer->cross_attn_norm);
+        // cross-attention
+        {
+            struct ggml_tensor * cross_Qcur = ggml_mul_mat(ctx, layer->cross_attn_q, cur);
+
+            // only load the cross attention kv store when performing the encoding step
+            if (batch.encoder_step) {
+                build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l);
+            }
+
+            struct ggml_tensor * cross_k =
+                ggml_view_4d(
+                    ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2,
+                    model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]),
+                    model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]),
+                    model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]),
+                    0);
+            // the double permute operation shouldn't be necessary here, but it seems that ggml permute currently only allows a single
+            // axis pair to be transposed.
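+            // (net effect of the two permutes: cross_k ends up ordered as
+            // [head_size, max_encoder_context_length, heads, 2], matching the query layout
+            // expected by the ggml_mul_mat call below.)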
+            cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3));
+
+            struct ggml_tensor * cross_v =
+                ggml_cont(ctx, ggml_view_4d(
+                    ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2,
+                    model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
+                    model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]),
+                    model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]),
+                    0));
+
+            // As noted in the encoder, Dia uses the Neo-X protocol for RoPE.
+            cross_Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, cross_Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2);
+            struct ggml_tensor * cross_q = ggml_cont(ctx, ggml_permute(ctx, cross_Qcur, 0, 2, 1, 3));
+            struct ggml_tensor * cross_kq = ggml_mul_mat(ctx, cross_k, cross_q);
+
+            // given that attention bias, scaling and masking are not used for decoding, it might be faster to prefer the #ggml_soft_max op here.
+            cross_kq = ggml_soft_max_ext(ctx, cross_kq, nullptr, 1.0f, 0.0f);
+            struct ggml_tensor * cross_kqv = ggml_mul_mat(ctx, cross_kq, cross_v);
+            struct ggml_tensor * cross_kqv_merged = ggml_cont(ctx, ggml_permute(ctx, cross_kqv, 2, 0, 1, 3));
+            cur = ggml_cont_3d(ctx, cross_kqv_merged, model->decoder_hidden_size, batch.sequence_length, 2);
+            cur = ggml_mul_mat(ctx, layer->cross_attn_o, cur);
+        }
+
+
+        // if we ever need to support multiple step decoder runs then this reshape will need to be replaced with permutation.
+        cur = ggml_cont_2d(ctx, cur, cur->ne[0], 2);
+        cur = ggml_add(ctx, cur, residual_cross);
+        struct ggml_tensor * residual_mlp = cur;
+
+        cur = dia_layer_norm(ctx, cur, layer->mlp_norm);
+        // mlp
+        {
+            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, layer->gate, cur)), ggml_mul_mat(ctx, layer->up, cur));
+            cur = ggml_mul_mat(ctx, layer->out, cur);
+        }
+
+        cur = ggml_add(ctx, cur, residual_mlp);
+    }
+
+    cur = dia_layer_norm(ctx, cur, model->decoder->norm);
+    cur = build_dia_head_outputs(ctx, model, cur);
+    return cur;
+}
+
+void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) {
+    // Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as
+    // a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to
+    // generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that
+    // proper adjustments can be performed at each generation step. This means that we need to pad the end of our tokens to the
+    // max context size for both the conditional and unconditional sequence.
+
+    // if the sentence isn't prepended by dialogue start tokens, [S1] or [S2], then prepend one.
+    sentence = strip(sentence);
+    std::string start = sentence.substr(0, 4);
+    if (start != "[S1]" && start != "[S2]") {
+        sentence = "[S1] " + sentence;
+    }
+    if (sentence[sentence.size() - 1] != '.') {
+        sentence += ".";
+    }
+
+    // [S1] and [S2] are special character sequences that are replaced with the special tokens 0x01 and 0x02 respectively.
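+    // e.g. "[S1] Hello there." becomes "\x01 Hello there." before each byte is used as a token id.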
+    std::string r1(1, 1);
+    std::string r2(1, 2);
+    while (sentence.find("[S1]") != std::string::npos) {
+        size_t pos = sentence.find("[S1]");
+        sentence.replace(pos, 4, r1);
+    }
+    while (sentence.find("[S2]") != std::string::npos) {
+        size_t pos = sentence.find("[S2]");
+        sentence.replace(pos, 4, r2);
+    }
+
+    if (sentence.size() > model->max_encoder_context_length) {
+        TTS_ABORT("Dia currently only supports a max of %u characters and received an input of %zu characters.", model->max_encoder_context_length, sentence.size());
+    }
+    batch.tokens.reserve(model->max_encoder_context_length * 2);
+    for (auto character : sentence) {
+        batch.tokens.push_back((uint32_t) character);
+    }
+    batch.sentence_length = batch.tokens.size();
+    // this 100 token warning is arbitrarily chosen based on spot checking small prompt performance
+    if (batch.sentence_length <= 100) {
+        fprintf(stdout, "Your prompt has fewer than 100 tokens. Please note that Dia's generation with prompts that are fewer than 100 tokens is highly inconsistent.\n");
+    }
+
+    for (int i = (int) batch.tokens.size(); i < model->max_encoder_context_length * 2; i++) {
+        batch.tokens.push_back(0u);
+    }
+}
+
+dia_ubatch dia_runner::batch_from_sentence(std::string sentence) {
+    // if we are generating a new batch from tokens then we need to run the encoder step.
+    struct dia_ubatch batch{ 1, true };
+    tokenize_sentence(sentence, batch);
+    batch.audio_tokens.reserve(model->n_output_heads);
+    for (int i = 0; i < model->n_output_heads; i++) {
+        batch.audio_tokens.push_back(model->bos_token_id);
+    }
+    return batch;
+}
+
+/*
+ * There are two unique features of Dia's model architecture:
+ *   1. Dia cleans its output generation by adding the difference between its text based output (its conditional output) and its unconditional output
+ *      to the conditional output before sampling. This is why the batch is set to two throughout the graph.
+ *
+ *   2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the
+ *      encoder sequence is always max length.
+ */
+struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
+    init_build();
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
+    struct ggml_tensor * encoded_states = nullptr;
+
+    if (batch.encoder_step) {
+        encoded_states = build_dia_encoder(ctx, model, dctx, batch);
+        ggml_build_forward_expand(gf, encoded_states);
+    }
+
+    struct ggml_tensor * cur = build_dia_decoder(gf, ctx, model, dctx, kv_cross_self, batch, encoded_states);
+    ggml_set_name(cur, "decoder_output");
+    ggml_build_forward_expand(gf, cur);
+    free_build();
+
+    return gf;
+}
+
+void dia_runner::configure_generation(generation_configuration * config) {
+    GGML_ASSERT(config->max_tokens == 0 || config->max_tokens > model->max_delay);
+    decode_sampler->temperature = config->temperature;
+    decode_sampler->repetition_penalty = config->repetition_penalty;
+    decode_sampler->do_sample = config->sample;
+    decode_sampler->top_k = config->top_k;
+    decode_sampler->top_p = config->top_p;
+    dctx->max_generation_size = config->max_tokens > model->max_delay ? config->max_tokens : model->max_generation_size;
+}
+
+void dia_runner::set_inputs(dia_ubatch & batch) {
+    if (batch.encoder_step) {
+        ggml_backend_tensor_set(dctx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(dctx->inp_tokens));
+        int32_t * ep = (int32_t*) dctx->encode_positions->data;
+        float * mask = (float*) dctx->encode_attn_mask->data;
+        for (int i = 0; i < model->max_encoder_context_length; i++) {
+            ep[i] = (int32_t) i;
+            for (int ii = 0; ii < model->max_encoder_context_length; ii++) {
+                if (i < batch.sentence_length) {
+                    mask[i*model->max_encoder_context_length + ii] = ii < batch.sentence_length ? 0.0 : -INFINITY;
+                } else {
+                    mask[i*model->max_encoder_context_length + ii] = ii >= batch.sentence_length ? 0.0 : -INFINITY;
+                }
+            }
+        }
+    }
+    // The audio tokens need to be repeated in the input in order to support cfg-scaling, i.e. we need duplicate inputs for conditional and unconditional logits.
+    ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), 0, batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
+    ggml_backend_tensor_set(dctx->audio_inp_tokens, batch.audio_tokens.data(), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens), batch.audio_tokens.size()*ggml_element_size(dctx->audio_inp_tokens));
+    ((int32_t*) dctx->positions->data)[0] = dctx->current_position;
+}
+
+int dia_runner::decode(dia_ubatch & batch) {
+    if (batch.encoder_step) {
+        dctx->prompt_size = batch.sentence_length;
+        dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads);
+    }
+    ggml_backend_sched_reset(dctx->sched);
+
+    const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads;
+    const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0;
+    const size_t new_size = logits_size * sizeof(float);
+
+    if (!dctx->buf_output || prev_size < new_size) {
+        if (dctx->buf_output) {
+            ggml_backend_buffer_free(dctx->buf_output);
+            dctx->buf_output = nullptr;
+            dctx->logits = nullptr;
+        }
+
+        dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size);
+    }
+
+    dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output);
+
+    ggml_cgraph * gf = build_dia_graph(batch);
+
+    // the output is always the last tensor in the graph
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    std::string resname = ggml_get_name(res);
+    ggml_backend_sched_alloc_graph(dctx->sched, gf);
+
+    set_inputs(batch);
+
+    ggml_backend_sched_graph_compute_async(dctx->sched, gf);
+
+    float * logits_out = dctx->logits + dctx->current_position * model->output_vocab_size * model->n_output_heads;
+    dctx->get_ggml_node_data(res, logits_out, model->output_vocab_size * model->n_output_heads * sizeof(float));
+
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(dctx->sched);
+
+    return 0;
+}
+
+dia_ubatch dia_runner::build_worst_case_batch() {
+    struct dia_ubatch batch{ 1, true };
+    batch.tokens.resize(model->max_encoder_context_length * 2);
+    batch.audio_tokens.resize(model->n_output_heads);
+    return batch;
+}
+
+void dia_runner::prepare_post_load() {
+    dac_runner->prepare_post_load();
+    dia_kv_cache_init(kv_cross_self, model, dctx);
+    auto batch = build_worst_case_batch();
+    batch.sentence_length = model->max_encoder_context_length;
+    dctx->prompt_size = model->max_encoder_context_length;
+    auto gf = build_dia_graph(batch);
+    dctx->prep_schedule(gf);
+}
+
+bool dia_runner::check_stopping(dia_ubatch & batch) {
+    if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) {
+        dctx->delay_steps = model->max_delay;
+    }
+
+    if (dctx->delay_steps > 0) {
+        int step_after_eos = model->max_delay - dctx->delay_steps;
+        for (int i = 0; i < model->delay_pattern.size(); i++) {
+            if (step_after_eos == model->delay_pattern[i]) {
+                batch.audio_tokens[i] = model->eos_token_id;
+            } else if (step_after_eos > model->delay_pattern[i]) {
+                batch.audio_tokens[i] = model->pad_token_id;
+            }
+        }
+        dctx->delay_steps -= 1;
+    }
+    return dctx->delay_steps == 0;
+}
+
+void dia_runner::adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered) {
+    // currently this is applying a sliding window over the heads and filtering out bad tokens.
+    // If we convert the DAC model's quantizer layers to support by row + column embeddings then we will need to transpose
+    // the heads and the sequence here, but right now simply using a strided view is more performant.
+    size_t size = output_tokens.size();
+    filtered.reserve(size);
+    for (int i = 0; i < (size / model->n_output_heads) - model->max_delay; i++) {
+        bool skip_step = false;
+        for (int ii = 0; ii < model->n_output_heads; ii++) {
+            int next_index = i*model->n_output_heads+model->delay_pattern[ii]*model->n_output_heads+ii;
+            if (next_index >= size || output_tokens[next_index] >= model->audio_vocab_size) {
+                skip_step = true;
+                break;
+            }
+        }
+        if (!skip_step) {
+            for (int ii = 0; ii < model->n_output_heads; ii++) {
+                int next_index = i*model->n_output_heads+model->delay_pattern[ii]*model->n_output_heads+ii;
+                filtered.push_back(output_tokens[next_index]);
+            }
+        }
+    }
+}
+
+int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * output) {
+    while (!check_stopping(batch)) {
+        int state = decode(batch);
+        if (state != 0) {
+            return state;
+        }
+        decode_sampler->sample(dctx->logits + dctx->current_position * model->n_output_heads * model->output_vocab_size, dctx->output_tokens);
+        dctx->current_position += batch.sequence_length;
+        batch = dia_ubatch{ 1 };
+        uint32_t * last_outputs = (dctx->output_tokens.data() + (int) dctx->output_tokens.size() - model->n_output_heads);
+        batch.audio_tokens.reserve(model->n_output_heads);
+        for (int i = 0; i < model->n_output_heads; i++) {
+            batch.audio_tokens.push_back(dctx->current_position > i ? last_outputs[i] : model->bos_token_id);
+        }
+    }
+
+    std::vector<uint32_t> filtered_output_tokens;
+    adjust_output_tokens(dctx->output_tokens, filtered_output_tokens);
+
+    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output);
+    return 0;
+}
+
+int dia_runner::generate(std::string sentence, struct tts_response * output) {
+    dia_ubatch batch = batch_from_sentence(sentence);
+    dctx->reset();
+    decode_sampler->reset();
+    dctx->current_position = 0;
+    if (!kv_cross_self) {
+        kv_cross_self = new dia_kv_cache;
+        if (!dia_kv_cache_init(kv_cross_self, model, dctx)) {
+            return 1;
+        }
+    }
+    return generate_from_batch(batch, output);
+}
+
+void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
+    if (tensor->data == NULL) {
+        return;
+    }
+
+    if (name.size() == 0) {
+        // handles the top level meta tensor
+        return;
+    }
+
+    if (name.size() > 14 && name.substr(0, 14) == "audio_encoder.") {
+        dac_runner->model->assign_weight(name.substr(14), tensor);
+    } else {
+        model->assign_weight(name, tensor);
+    }
+}
diff --git a/otherarch/ttscpp/src/dia_model.h b/otherarch/ttscpp/src/dia_model.h
new file mode 100644
index 000000000..bdca91d8c
--- /dev/null
+++ b/otherarch/ttscpp/src/dia_model.h
@@ -0,0 +1,206 @@
+#pragma once
+
+#include "dac_model.h"
+#include "sampler.h"
+
+struct dia_encoder_layer {
+    struct ggml_tensor * k;
+    struct ggml_tensor * q;
+    struct ggml_tensor * v;
+    struct ggml_tensor * o;
+    struct ggml_tensor * self_attn_norm;
+
+    struct ggml_tensor * gate;
+    struct ggml_tensor * up;
+    struct ggml_tensor * out;
+    struct ggml_tensor * mlp_norm;
+};
+
+struct dia_decoder_layer {
+    struct ggml_tensor * self_attn_k;
+    struct ggml_tensor * self_attn_q;
+    struct ggml_tensor * self_attn_v;
+    struct ggml_tensor * self_attn_o;
+    struct ggml_tensor * self_attn_norm;
+
+    struct ggml_tensor * cross_attn_k;
+    struct ggml_tensor * cross_attn_q;
+    struct ggml_tensor * cross_attn_v;
+    struct ggml_tensor * cross_attn_o;
+    struct ggml_tensor * cross_attn_norm;
+
+    struct ggml_tensor * gate;
+    struct ggml_tensor * up;
+    struct ggml_tensor * out;
+    struct ggml_tensor * mlp_norm;
+
+    struct ggml_tensor * pad_attn_values;
+};
+
+struct dia_encoder {
+    struct ggml_tensor * norm;
+    struct ggml_tensor * embedding;
+    std::vector<dia_encoder_layer *> layers;
+};
+
+struct dia_decoder {
+    struct ggml_tensor * norm;
+    std::vector<struct ggml_tensor *> embds;
+    std::vector<struct ggml_tensor *> heads;
+    std::vector<dia_decoder_layer *> layers;
+};
+
+struct dia_model : tts_model {
+    // These default configurations are based on the default configuration for the Dia 1.68b param model.
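+    // (any of these may be overridden by the matching "dia.*" keys read in prep_constants.)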
+    uint32_t n_output_heads = 9;
+    uint32_t n_encoder_layers = 12;
+    uint32_t n_decoder_layers = 18;
+    uint32_t encoder_hidden_size = 1024;
+    uint32_t decoder_hidden_size = 2048;
+    uint32_t encoder_attn_heads = 16;
+    uint32_t decoder_attn_heads = 16;
+    uint32_t decoder_query_heads = 4;
+    uint32_t head_size = 128;
+    uint32_t eos_token_id = 1024;
+    uint32_t pad_token_id = 1025;
+    uint32_t bos_token_id = 1026;
+    uint32_t output_vocab_size = 1028;
+    uint32_t audio_vocab_size = 1024;
+    uint32_t max_generation_size = 3072;
+    uint32_t max_encoder_context_length = 1024;
+
+
+    float cfg_scale_data[2] = {3.0, 1024.0};
+    uint32_t max_delay = 15;
+    std::vector<uint32_t> delay_pattern = {0, 8, 9, 10, 11, 12, 13, 14, 15};
+
+    dia_encoder * encoder;
+    dia_decoder * decoder;
+
+    void assign_weight(std::string name, ggml_tensor * tensor);
+    void assign_to_encoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
+    void assign_to_decoder(std::vector<std::string> parts, struct ggml_tensor * tensor, std::string name);
+    void assign_to_encoder_layer(std::string part, dia_encoder_layer * layer, struct ggml_tensor * tensor);
+    void assign_to_decoder_layer(std::string part, dia_decoder_layer * layer, struct ggml_tensor * tensor);
+    void prep_constants(gguf_context * meta);
+    void prep_layers();
+    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) {
+        prep_constants(meta_ctx);
+        prep_layers();
+        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "dia", 1.30);
+    }
+};
+
+struct dia_context : runner_context {
+    dia_context(dia_model * model, int n_threads): runner_context(n_threads), model(model) {
+        max_generation_size = model->max_generation_size;
+    };
+
+    uint32_t current_position = 0; // current position in the active sequence
+    int delay_steps = -1; // the max remaining steps to take before terminating; is set after an eos token is seen on the first output channel
+    size_t prompt_size = 0;
+
+    uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model.
+
+    std::vector<uint32_t> output_tokens;
+    struct dia_model * model;
+
+    struct ggml_tensor * inp_tokens;
+    struct ggml_tensor * audio_inp_tokens;
+    struct ggml_tensor * positions;
+    struct ggml_tensor * encode_positions;
+    struct ggml_tensor * encode_attn_mask;
+    struct ggml_tensor * cross_attn_mask;
+
+    void build_schedule() {
+        runner_context::build_schedule(model->max_nodes());
+    }
+    void reset();
+};
+
+struct dia_kv_cache {
+    ggml_type tensor_type = GGML_TYPE_F32;
+
+    std::vector<struct ggml_tensor *> cross_k_l;
+    std::vector<struct ggml_tensor *> cross_v_l;
+
+    std::vector<struct ggml_tensor *> k_l;
+    std::vector<struct ggml_tensor *> v_l;
+
+    struct ggml_context * ctx;
+    ggml_backend_buffer_type_t buft;
+    ggml_backend_buffer_t buf;
+
+    void free() {
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buf);
+    }
+
+    ~dia_kv_cache() {
+        free();
+    }
+};
+
+struct dia_ubatch {
+    dia_ubatch(size_t sequence_length, bool encoder_step = false): sequence_length(sequence_length), encoder_step(encoder_step) {};
+    bool encoder_step; // whether we are performing the prompt encoding in this step.
+    size_t sequence_length; // for just audio tokens the sequence length should be the total_tokens / num_heads; for normal generation this should always be 1.
+    size_t sentence_length; // the number of non-padded tokens in the conditional context
+    std::vector<uint32_t> tokens; // character tokens for the encoder
+    std::vector<uint32_t> audio_tokens; // audio tokens from the last generation
+};
+
+struct dia_context * build_new_dia_context(struct dia_model * model, int n_threads, bool use_cpu = true);
+static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx);
+static struct ggml_tensor * build_dia_decoder_inp_embd(struct ggml_context * ctx, dia_context *dctx, dia_decoder * decoder, dia_ubatch & batch, uint32_t n_output_heads);
+static struct ggml_tensor * dia_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight);
+static struct ggml_tensor * build_dia_encoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_model * model);
+static struct ggml_tensor * build_dia_decoder_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_ubatch & batch);
+static struct ggml_tensor * build_dia_decoder_cross_attn_mask(ggml_context * ctx, struct dia_context * dctx, dia_ubatch & batch);
+static struct ggml_tensor * build_dia_head_outputs(struct ggml_context * ctx, dia_model * model, struct ggml_tensor * cur);
+static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * model, dia_context * dctx, dia_ubatch & batch);
+static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index);
+static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index);
+static struct ggml_tensor * build_dia_decoder(ggml_cgraph * gf, ggml_context * ctx, dia_model * model, dia_context * dctx, dia_kv_cache * cache, dia_ubatch & batch, struct ggml_tensor * encoder_hidden_states);
+
+// This struct is intended to support end-to-end TTS generation for the Dia model. As such, it manages Dia's model compilation, compute, generation,
+// tokenization, and sampling process, and uses the dac_runner struct to decode audio outputs.
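+// A minimal generation sketch (hypothetical caller code; it assumes the weight loader has
+// already run and constructed the runner's dependencies):
+//   dia_runner runner(model, audio_decoder, dctx, samp, new dia_kv_cache);
+//   runner.prepare_post_load();          // after all weights are assigned
+//   runner.configure_generation(config);
+//   tts_response audio;
+//   runner.generate("[S1] Hello there.", &audio);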
+struct dia_runner : tts_runner {
+    dia_runner(dia_model * model, dac_runner * audio_decoder, dia_context * dctx, sampler * samp, dia_kv_cache * cache): model(model), dac_runner(audio_decoder), dctx(dctx), decode_sampler(samp), kv_cross_self(cache) {
+        decode_sampler->vocab_size = model->output_vocab_size;
+    };
+    ~dia_runner() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+        model->free();
+        delete model;
+        delete kv_cross_self;
+        delete dac_runner;
+        delete dctx;
+        delete decode_sampler;
+    }
+    struct dia_model * model;
+    struct dac_runner * dac_runner;
+    struct dia_context * dctx;
+    struct dia_kv_cache * kv_cross_self = nullptr;
+    struct sampler * decode_sampler;
+
+    void init_build() {
+        tts_runner::init_build(&dctx->buf_compute_meta);
+    }
+
+    void tokenize_sentence(std::string sentence, dia_ubatch & tokens);
+    dia_ubatch batch_from_sentence(std::string sentence);
+    void configure_generation(generation_configuration * config);
+    void assign_weight(std::string name, ggml_tensor * tensor);
+    dia_ubatch build_worst_case_batch();
+    struct ggml_cgraph * build_dia_graph(dia_ubatch & batch);
+    void set_inputs(dia_ubatch & batch);
+    int decode(dia_ubatch & batch);
+    void prepare_post_load();
+    int generate(std::string sentence, struct tts_response * response);
+    bool check_stopping(dia_ubatch & batch);
+    void adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered);
+    int generate_from_batch(dia_ubatch & batch, struct tts_response * output);
+};
diff --git a/otherarch/ttscpp/src/general_neural_audio_codec.cpp b/otherarch/ttscpp/src/general_neural_audio_codec.cpp
new file mode 100644
index 000000000..7978f135c
--- /dev/null
+++ b/otherarch/ttscpp/src/general_neural_audio_codec.cpp
@@ -0,0 +1,172 @@
+#include "general_neural_audio_codec.h"
+#include <map>
+#include <algorithm>
+#include <cctype>
+
+namespace general_neural_audio_codec {
+    // This contains a mapping between string names and gguf_tensor enum values for the purposes of assigning the weights from a gguf file
+    // to the general_neural_audio_codec::layer.
+    // Please note that some gguf_tensor values have multiple keys; this is to support backwards compatibility with original DAC settings.
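+    // e.g. both ".final.alpha" and ".alpha" resolve to LAYER_ALPHA below, so either naming
+    // scheme loads into the same tensor slot.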
+    static const std::map<std::string, gguf_tensor> GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP = {
+        {".final.alpha", LAYER_ALPHA},
+        {".final.bias", LAYER_INPUT_BIAS},
+        {".final.weight", LAYER_INPUT_KERNEL},
+        {".alpha", LAYER_ALPHA},
+        {".bias", LAYER_INPUT_BIAS},
+        {".weight", LAYER_INPUT_KERNEL},
+        {".noise_weight", LAYER_NOISE_KERNEL},
+        {".res.initial.alpha", RESIDUAL_UNIT_INPUT_ALPHA},
+        {".res.initial.bias", RESIDUAL_UNIT_INPUT_BIAS},
+        {".res.initial.weight", RESIDUAL_UNIT_INPUT_KERNEL},
+        {".res.final.alpha", RESIDUAL_UNIT_OUTPUT_ALPHA},
+        {".res.final.bias", RESIDUAL_UNIT_OUTPUT_BIAS},
+        {".res.final.weight", RESIDUAL_UNIT_OUTPUT_KERNEL},
+        {".in_alpha", RESIDUAL_UNIT_INPUT_ALPHA},
+        {".in_bias", RESIDUAL_UNIT_INPUT_BIAS},
+        {".in_weight", RESIDUAL_UNIT_INPUT_KERNEL},
+        {".out_alpha", RESIDUAL_UNIT_OUTPUT_ALPHA},
+        {".out_bias", RESIDUAL_UNIT_OUTPUT_BIAS},
+        {".out_weight", RESIDUAL_UNIT_OUTPUT_KERNEL},
+        {".out_proj.bias", QUANTIZER_LAYER_OUT_BIAS},
+        {".out_proj.weight", QUANTIZER_LAYER_OUT_KERNEL},
+        {".codebook.weight", QUANTIZER_LAYER_CODEBOOK},
+    };
+
+    void assign_to_residual_unit(tts_model * model, residual_unit & unit, std::string name, struct ggml_tensor * tensor) {
+        try {
+            gguf_tensor tensor_type = GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name);
+            switch (tensor_type) {
+                case RESIDUAL_UNIT_INPUT_ALPHA:
+                    unit.in_alpha = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(unit.in_alpha, tensor);
+                    break;
+                case RESIDUAL_UNIT_OUTPUT_ALPHA:
+                    unit.out_alpha = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(unit.out_alpha, tensor);
+                    break;
+                case RESIDUAL_UNIT_INPUT_KERNEL:
+                    unit.in_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(unit.in_conv_kernel, tensor);
+                    break;
+                case RESIDUAL_UNIT_OUTPUT_KERNEL:
+                    unit.out_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(unit.out_conv_kernel, tensor);
+                    break;
+                case RESIDUAL_UNIT_INPUT_BIAS:
+                    unit.in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
+                    model->set_tensor(unit.in_conv_bias, tensor);
+                    break;
+                case RESIDUAL_UNIT_OUTPUT_BIAS:
+                    unit.out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
+                    model->set_tensor(unit.out_conv_bias, tensor);
+                    break;
+                default:
+                    fprintf(stdout, "residual unit unassigned tensor %s\n", name.c_str());
+                    break;
+            }
+        } catch (const std::out_of_range& e) {
+            TTS_ABORT("Tensor, '%s', is not a valid general_neural_audio_codec::residual_unit tensor.", name.c_str());
+        }
+    }
+
+    void assign_to_layer(tts_model * model, layer & l, std::string name, struct ggml_tensor * tensor) {
+        if (GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.find(name) != GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.end()) {
+            switch(GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name)) {
+                case LAYER_ALPHA:
+                    l.in_alpha = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(l.in_alpha, tensor);
+                    break;
+                case LAYER_INPUT_KERNEL:
+                    l.in_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(l.in_conv_kernel, tensor);
+                    break;
+                case LAYER_INPUT_BIAS:
+                    l.in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
+                    model->set_tensor(l.in_conv_bias, tensor);
+                    break;
+                case LAYER_NOISE_KERNEL:
+                    l.noise_conv_kernel = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(l.noise_conv_kernel, tensor);
+                    break;
+                default:
+                    fprintf(stdout, "layer unassigned tensor %s\n", name.c_str());
+                    break;
+            }
+        } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) {
+            auto pair = parse_layer_count(name);
+            int i = pair.first;
+            std::string lt_name = pair.second;
+            assign_to_residual_unit(model, l.residual_blocks[i], lt_name, tensor);
+        } else {
+            TTS_ABORT("Tensor, '%s', is not a valid general_neural_audio_codec::layer tensor.", name.c_str());
+        }
+    }
+
+    void assign_to_quantize_layer(tts_model * model, residual_vector_quantize_layer & l, std::string name, struct ggml_tensor * tensor) {
+        try {
+            switch(GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name)) {
+                case QUANTIZER_LAYER_OUT_KERNEL:
+                    l.out_proj_kernel = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(l.out_proj_kernel, tensor);
+                    break;
+                case QUANTIZER_LAYER_OUT_BIAS:
+                    l.out_proj_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor));
+                    model->set_tensor(l.out_proj_bias, tensor);
+                    break;
+                case QUANTIZER_LAYER_CODEBOOK:
+                    l.codebook = ggml_dup_tensor(model->ctx, tensor);
+                    model->set_tensor(l.codebook, tensor);
+                    break;
+                default:
+                    fprintf(stdout, "quantized layer unassigned tensor %s\n", name.c_str());
+                    break;
+            }
+        } catch (const std::out_of_range& e) {
+            // older GGUF files still have the unused in_proj convolutional layer, so ignore it if we find it.
+            if (!has_prefix(name, ".in_proj")) {
+                TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str());
+            }
+        }
+    }
+
+    struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, residual_unit & unit) {
+        struct ggml_tensor * residual = cur;
+        cur = snake_1d(ctx, unit.in_alpha, cur);
+        if (unit.groups > 1) {
+            // A depthwise 1d convolution is equivalent to a grouped convolution in which the group count equals the channel count,
+            // so the kernel must carry exactly one filter per group (ne[1] == 1) for this path to be valid.
+            TTS_ASSERT(unit.in_conv_kernel->ne[1] == 1);
+            cur = ggml_conv_1d_dw_tts(ctx, unit.in_conv_kernel, cur, 1, unit.padding, unit.dilation);
+        } else {
+            cur = ggml_conv_1d_tts(ctx, unit.in_conv_kernel, cur, 1, unit.padding, unit.dilation);
+        }
+        cur = ggml_add(ctx, cur, unit.in_conv_bias);
+        cur = snake_1d(ctx, unit.out_alpha, cur);
+        cur = ggml_conv_1d_tts(ctx, unit.out_conv_kernel, cur, 1, 0, 1);
+        cur = ggml_add(ctx, cur, unit.out_conv_bias);
+        return ggml_add(ctx, cur, residual);
+    }
+
+    struct ggml_tensor * build_layer(ggml_context * ctx, struct ggml_tensor * cur, layer & l, struct ggml_tensor * noise) {
+        cur = snake_1d(ctx, l.in_alpha, cur);
+        cur = ggml_conv_transpose_1d_tts(ctx, l.in_conv_kernel, cur, l.stride, l.padding, 1, 0, 1);
+        cur = ggml_add(ctx, cur, l.in_conv_bias);
+        if (l.noise_conv_kernel && noise) {
+            struct ggml_tensor * x = ggml_conv_1d_tts(ctx, l.noise_conv_kernel, cur, 1, 0, 1);
+            x = ggml_mul(ctx, x, noise);
+            cur = ggml_add(ctx, cur, x);
+        }
+        for (int i = 0; i < l.residual_blocks.size(); i++) {
+            cur = build_residual_unit(ctx, cur, l.residual_blocks[i]);
+        }
+        return cur;
+    }
+
+    struct ggml_tensor * build_quantize_layer(ggml_context * ctx, struct ggml_tensor * cur, residual_vector_quantize_layer & l) {
+        cur = ggml_get_rows(ctx, l.codebook, cur);
+        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
+        cur = ggml_conv_1d_tts(ctx, l.out_proj_kernel, cur, 1, 0, 1);
+        cur = ggml_add(ctx, cur, l.out_proj_bias);
+        return cur;
+    }
+}
diff --git a/otherarch/ttscpp/src/general_neural_audio_codec.h b/otherarch/ttscpp/src/general_neural_audio_codec.h
new file mode 100644
index 000000000..1ec0a42b7
--- /dev/null
+++ b/otherarch/ttscpp/src/general_neural_audio_codec.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include "tts_model.h"
+
+// This namespace implements a general abstraction of the core functionality used in common neural audio codecs like DAC and SNAC.
+namespace general_neural_audio_codec {
+    enum gguf_tensor {
+        LAYER_ALPHA,
+        LAYER_INPUT_KERNEL,
+        LAYER_INPUT_BIAS,
+        LAYER_NOISE_KERNEL,
+        RESIDUAL_UNIT_INPUT_ALPHA,
+        RESIDUAL_UNIT_OUTPUT_ALPHA,
+        RESIDUAL_UNIT_INPUT_KERNEL,
+        RESIDUAL_UNIT_OUTPUT_KERNEL,
+        RESIDUAL_UNIT_INPUT_BIAS,
+        RESIDUAL_UNIT_OUTPUT_BIAS,
+        QUANTIZER_LAYER_OUT_KERNEL,
+        QUANTIZER_LAYER_OUT_BIAS,
+        QUANTIZER_LAYER_CODEBOOK
+    };
+
+    struct residual_vector_quantize_layer {
+        struct ggml_tensor * out_proj_kernel;
+        struct ggml_tensor * out_proj_bias;
+        struct ggml_tensor * codebook;
+    };
+
+    struct residual_unit {
+        residual_unit(uint32_t padding, uint32_t dilation, uint32_t groups = 1): padding(padding), dilation(dilation), groups(groups) {}
+        struct ggml_tensor * in_alpha;
+        struct ggml_tensor * in_conv_kernel;
+        struct ggml_tensor * in_conv_bias;
+        struct ggml_tensor * out_alpha;
+        struct ggml_tensor * out_conv_kernel;
+        struct ggml_tensor * out_conv_bias;
+
+        uint32_t padding;
+        uint32_t dilation;
+        uint32_t groups;
+    };
+
+    struct layer {
+        layer(uint32_t padding, uint32_t stride, uint32_t groups = 1): padding(padding), stride(stride) {
+            for (int i = 0; i < 3; i++) {
+                residual_blocks.push_back(residual_unit{(uint32_t) pow(3, (i + 1)), (uint32_t) pow(3, i), groups});
+            }
+        }
+        struct ggml_tensor * in_alpha;
+        struct ggml_tensor * in_conv_kernel;
+        struct ggml_tensor * in_conv_bias;
+        struct ggml_tensor * noise_conv_kernel = nullptr;
+
+        uint32_t padding;
+        uint32_t stride;
+
+        std::vector<residual_unit> residual_blocks;
+    };
+
+    void assign_to_residual_unit(tts_model * model, residual_unit & unit, std::string name, struct ggml_tensor * tensor);
+    void assign_to_layer(tts_model * model, layer & l, std::string name, struct ggml_tensor * tensor);
+    void assign_to_quantize_layer(tts_model * model, residual_vector_quantize_layer & l, std::string name, struct ggml_tensor * tensor);
+
+    struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, residual_unit & unit);
+    struct ggml_tensor * build_layer(ggml_context * ctx, struct ggml_tensor * cur, layer & l, struct ggml_tensor * noise = nullptr);
+    struct ggml_tensor * build_quantize_layer(ggml_context * ctx, struct ggml_tensor * cur, residual_vector_quantize_layer & l);
+}
diff --git a/otherarch/ttscpp/src/kokoro_model.cpp b/otherarch/ttscpp/src/kokoro_model.cpp
new file mode 100644
index 000000000..955d2a344
--- /dev/null
+++ b/otherarch/ttscpp/src/kokoro_model.cpp
@@ -0,0 +1,1484 @@
+#include "kokoro_model.h"
+
+static struct ggml_tensor * build_albert_attn_mask(ggml_context * ctx, struct kokoro_duration_context *kctx, const kokoro_ubatch & batch) {
+    kctx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) batch.n_tokens, (int64_t) batch.n_tokens);
+    ggml_set_input(kctx->attn_mask);
+
+    return kctx->attn_mask;
+}
+
+static struct ggml_tensor * build_albert_inputs(ggml_context * ctx, kokoro_model * model, ggml_tensor * input_tokens, ggml_tensor * positions, ggml_tensor * token_types) {
+    struct ggml_tensor * tinpts = ggml_cont(ctx, ggml_get_rows(ctx, model->token_embd, input_tokens));
+    struct ggml_tensor * pinpts = ggml_get_rows(ctx, model->position_embd, positions);
+
+    struct ggml_tensor * inpts = ggml_cont(ctx, ggml_add(ctx, tinpts, pinpts));
+    if (!model->static_token_types) {
+        // Token type embeddings are actually static for kokoro at the moment, so we should never need to compute this on the fly.
+        return ggml_add(ctx, inpts, ggml_get_rows(ctx, model->token_type_embd, token_types));
+    }
+    struct ggml_tensor * ainpts = ggml_add(ctx, inpts, model->static_token_type_values);
+
+    struct ggml_tensor * out = ggml_cont(ctx, build_albert_norm(ctx, ainpts, model->input_norm_weight, model->input_norm_bias));
+    return ggml_add(ctx, ggml_mul_mat(ctx, model->embd_hidden, out), model->embd_hidden_bias);
+}
+
+static struct ggml_tensor * build_albert_norm(ggml_context * ctx, ggml_tensor * cur, ggml_tensor * weight, ggml_tensor * bias) {
+    // this is the standard eps for Albert
+    float eps = 0.000000000001;
+    cur = ggml_norm(ctx, cur, eps);
+    cur = ggml_cont(ctx, ggml_add(ctx, ggml_mul(ctx, cur, weight), bias));
+    return cur;
+}
+
+static struct ggml_tensor * build_lstm_run(ggml_context * ctx, ggml_cgraph * gf, ggml_tensor * input, ggml_tensor * h_0, ggml_tensor * c_0, std::vector<ggml_tensor *> weights, std::vector<ggml_tensor *> biases, uint32_t sequence_length, bool reversed = false);
+
+static struct ggml_tensor * build_lstm(ggml_context * ctx, ggml_tensor * input, lstm * rnn, uint32_t sequence_length, ggml_cgraph * gf) {
+    struct ggml_tensor * resp = input;
+    struct ggml_tensor * reverse_resp = input;
+
+    // iterate over cells first so that at each pass to the next cell we have a fully formed vector (this improves performance as well as allocation for stacked lstms)
+    for (int c = 0; c < rnn->cells.size(); c++) {
+        ggml_build_forward_expand(gf, resp);
+        resp = build_lstm_run(ctx, gf, resp, rnn->hidden[c], rnn->states[c], rnn->cells[c]->weights, rnn->cells[c]->biases, sequence_length);
+        if (rnn->bidirectional) {
+            reverse_resp = build_lstm_run(ctx, gf, reverse_resp, rnn->hidden[c], rnn->states[c], rnn->cells[c]->reverse_weights, rnn->cells[c]->reverse_biases, sequence_length, true);
+        }
+    }
+    if (rnn->bidirectional) {
+        resp = ggml_concat(ctx, resp, reverse_resp, 0);
+    }
+    return resp;
+}
+
+static struct ggml_tensor * build_lstm_run(ggml_context * ctx, ggml_cgraph * gf, ggml_tensor * input, ggml_tensor * h_0, ggml_tensor * c_0, std::vector<ggml_tensor *> weights, std::vector<ggml_tensor *> biases, uint32_t sequence_length, bool reversed) {
+    struct ggml_tensor * I = ggml_add(ctx, ggml_mul_mat(ctx, weights[0], input), biases[0]);
+    struct ggml_tensor * F = ggml_add(ctx, ggml_mul_mat(ctx, weights[2], input), biases[2]);
+    struct ggml_tensor * G = ggml_add(ctx, ggml_mul_mat(ctx, weights[4], input), biases[4]);
+    struct ggml_tensor * O = ggml_add(ctx, ggml_mul_mat(ctx, weights[6], input), biases[6]);
+
+    struct ggml_tensor * outputs;
+
+    for (int index = 0; index < sequence_length; index++) {
+        int i = reversed ?
sequence_length - 1 - index : index; + struct ggml_tensor * I_cur = ggml_view_3d(ctx, I, I->ne[0], 1, I->ne[2], I->nb[0], I->nb[1], I->nb[1]*i); + I_cur = ggml_sigmoid(ctx, ggml_add(ctx, I_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[1], h_0), biases[1]))); + + struct ggml_tensor * F_cur = ggml_view_3d(ctx, F, F->ne[0], 1, F->ne[2], F->nb[0], F->nb[1], F->nb[1]*i); + F_cur = ggml_sigmoid(ctx, ggml_add(ctx, F_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[3], h_0), biases[3]))); + + struct ggml_tensor * G_cur = ggml_view_3d(ctx, G, G->ne[0], 1, G->ne[2], G->nb[0], G->nb[1], G->nb[1]*i); + G_cur = ggml_tanh(ctx, ggml_add(ctx, G_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[5], h_0), biases[5]))); + + struct ggml_tensor * O_cur = ggml_view_3d(ctx, O, O->ne[0], 1, O->ne[2], O->nb[0], O->nb[1], O->nb[1]*i); + O_cur = ggml_sigmoid(ctx, ggml_add(ctx, O_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[7], h_0), biases[7]))); + + c_0 = ggml_add(ctx, ggml_mul(ctx, F_cur, c_0), ggml_mul(ctx, I_cur, G_cur)); + h_0 = ggml_mul(ctx, ggml_tanh(ctx, c_0), O_cur); + + if (index == 0) { + outputs = h_0; + } else { + outputs = reversed ? ggml_concat(ctx, h_0, outputs, 1) : ggml_concat(ctx, outputs, h_0, 1); + } + ggml_build_forward_expand(gf, outputs); + } + return outputs; +} + +static struct ggml_tensor * build_ada_residual_conv(ggml_context * ctx, struct ggml_tensor * x, ada_residual_conv_block * block, struct ggml_tensor * style, struct ggml_tensor * sqrt_tensor) { + struct ggml_tensor * cur = x; + struct ggml_tensor * gamma; + struct ggml_tensor * beta; + + gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_gamma, style), block->norm1_gamma_bias); + beta = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_beta, style), block->norm1_beta_bias); + cur = ggml_norm(ctx, x, 0.00001); + + // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance. + // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model. + cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma))); + cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta)); + cur = ggml_leaky_relu(ctx, cur, 0.2f, false); + + if (block->pool) { + cur = ggml_conv_transpose_1d_tts(ctx, block->pool, cur, 2, 1, 1, 1, cur->ne[1]); + cur = ggml_add(ctx, cur, block->pool_bias); + } + + cur = ggml_conv_1d_tts(ctx, block->conv1, cur, 1, 1, 1); + + cur = ggml_add(ctx, cur, block->conv1_bias); + gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm2_gamma, style), block->norm2_gamma_bias); + beta = ggml_add(ctx, ggml_mul_mat(ctx, block->norm2_beta, style), block->norm2_beta_bias); + cur = ggml_norm(ctx, cur, 0.00001); + + // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance. + // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model. 
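+    // In effect the two adds below compute the usual adaptive-norm affine transform,
+    // out = (1 + gamma) * norm(x) + beta, with the "+1" realized as the extra addition of cur.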
+ cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma))); + cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta)); + cur = ggml_leaky_relu(ctx, cur, 0.2f, false); + cur = ggml_add(ctx, ggml_conv_1d_tts(ctx, block->conv2, cur, 1, 1, 1), block->conv2_bias); + + struct ggml_tensor * res = cur; + cur = x; + if (block->upsample) { + cur = ggml_cont(ctx, ggml_transpose(ctx, cur)); + if (block->pool) { + cur = ggml_upscale_ext(ctx, cur, cur->ne[0], cur->ne[1]*2, cur->ne[2], cur->ne[3],GGML_SCALE_MODE_NEAREST); + } + cur = ggml_mul_mat(ctx, block->upsample, cur); + cur = ggml_cont(ctx, ggml_transpose(ctx, cur)); + } + cur = ggml_div(ctx, ggml_add(ctx, res, cur), sqrt_tensor); + return cur; +} + +static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * style, kokoro_generator_residual_block * block) { + struct ggml_tensor * cur; + struct ggml_tensor * gamma; + struct ggml_tensor * beta; + struct ggml_tensor * inpl = x; + for (int i = 0; i < block->convs1_weights.size(); i++) { + gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->adain1d_1_gamma_weights[i], style), block->adain1d_1_gamma_biases[i]); + beta = ggml_add(ctx, ggml_mul_mat(ctx, block->adain1d_1_beta_weights[i], style), block->adain1d_1_beta_biases[i]); + cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_norm(ctx, inpl, 0.00001))); + + // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance. + // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model. + cur = ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta); + cur = snake_1d(ctx, block->input_alphas[i], ggml_cont(ctx, ggml_transpose(ctx, cur))); + + cur = ggml_add(ctx, ggml_conv_1d_tts(ctx, block->convs1_weights[i], cur, 1, block->conv1_paddings[i], block->conv1_dilations[i]), block->convs1_biases[i]); + gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->adain1d_2_gamma_weights[i], style), block->adain1d_2_gamma_biases[i]); + beta = ggml_add(ctx, ggml_mul_mat(ctx, block->adain1d_2_beta_weights[i], style), block->adain1d_2_beta_biases[i]); + cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_norm(ctx, cur, 0.00001))); + + // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance. + // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model. 
+        cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta)));
+
+        cur = snake_1d(ctx, block->output_alphas[i], cur);
+        cur = ggml_add(ctx, ggml_conv_1d_tts(ctx, block->convs2_weights[i], cur, 1, block->conv1_paddings[0], 1), block->convs2_biases[i]);
+        inpl = ggml_add(ctx, inpl, cur);
+    }
+    return inpl;
+}
+
+static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style) {
+    // This conv_1d seems replaceable with a squeezed and transposed ggml_mul_mat, but s0 and p0 are dynamic
+    ggml_tensor * cur = ggml_add(ctx, ggml_conv_1d_tts(ctx, block->input_conv, x, block->input_conv_stride, block->input_conv_padding, 1), block->input_conv_bias);
+    return build_kokoro_generator_res_block(ctx, cur, style, block->res_block);
+}
+
+static struct ggml_tensor * build_sin_gen(ggml_context * ctx, kokoro_model * model, kokoro_context * kctx, struct ggml_tensor * x, int harmonic_num, int sequence_length, float voice_threshold, float sin_amp, float noise_std) {
+    struct ggml_tensor * cur = ggml_mul(ctx, ggml_repeat(ctx, x, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, x->ne[0], harmonic_num)), model->harmonic_sampling_norm);
+    cur = ggml_mul(ctx, ggml_cumsum(ctx, ggml_mod(ctx, cur, 1.0f)), model->sampling_factor_scalar);
+    cur = ggml_upscale_linear(ctx, cur, 300);
+    struct ggml_tensor * upscaled = ggml_upscale_ext(ctx, x, x->ne[0]*300, x->ne[1], x->ne[2], x->ne[3], GGML_SCALE_MODE_NEAREST);
+
+    kctx->uv_noise_data = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sequence_length*harmonic_num+4);
+    ggml_set_input(kctx->uv_noise_data);
+
+    struct ggml_tensor * fake = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, sequence_length, harmonic_num, 2);
+
+    // ggml doesn't support boolean tensors, nor does it support greater-than or roll ops. As a result, we represent these boolean tensors as 1.0 or 0.0,
+    // or simply perform the multiplications in place via a custom map.
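+    // A sketch of the assumed uv_noise_compute behavior (NSF-style source modelling; the exact
+    // constants live in that custom op, defined elsewhere): per element, uv is 1.0 where the
+    // upscaled f0 exceeds voice_threshold and 0.0 otherwise, while noise is random with std
+    // noise_std for voiced positions and a larger floor for unvoiced ones. Both are packed into
+    // one [sequence_length, harmonic_num, 2] tensor that the views below unpack.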
+ struct ggml_tensor * uv_noise = ggml_map_custom3(ctx, fake, upscaled, kctx->uv_noise_data, &uv_noise_compute, sequence_length, nullptr); + + + struct ggml_tensor * noise = ggml_cont(ctx, ggml_view_2d(ctx, uv_noise, uv_noise->ne[0], uv_noise->ne[1], uv_noise->nb[1], uv_noise->nb[2])); + struct ggml_tensor * uv = ggml_cont(ctx, ggml_view_2d(ctx, uv_noise, uv_noise->ne[0], uv_noise->ne[1], uv_noise->nb[1], 0)); + + return ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_mul(ctx, ggml_sin(ctx, cur), uv), noise))); +} + +static struct ggml_tensor * build_generator(ggml_context * ctx, kokoro_model * model, kokoro_context * kctx, struct ggml_tensor * x, struct ggml_tensor * style, struct ggml_tensor * f0_curve, kokoro_generator* generator, int sequence_length, struct ggml_tensor * window_sq_sum, ggml_cgraph * gf) { + struct ggml_tensor * sing = build_sin_gen(ctx, model, kctx, f0_curve, model->harmonic_num + 1, f0_curve->ne[0] * 300, model->voice_threshold, model->sin_amp, model->noise_std); + struct ggml_tensor * har = ggml_tanh(ctx, ggml_add(ctx, ggml_mul_mat(ctx, generator->m_source_weight, sing), generator->m_source_bias)); + + har = stft(ctx, ggml_cont(ctx, ggml_transpose(ctx, har)), generator->window, model->true_n_fft, model->stft_hop, true, true); + + // stft returns a vector of shape [nfft, frames, batch, 2] where the final shape (2) separates the magnitude and the phase + // kokoro concatenates the n_fft from the magnitude and the phase together so we have to split them up and concatenate + // along the n_fft axis + struct ggml_tensor * mhar = ggml_cont(ctx, ggml_view_3d(ctx, har, har->ne[0], har->ne[1], har->ne[2], har->nb[1], har->nb[2], 0)); + struct ggml_tensor * phhar = ggml_cont(ctx, ggml_view_3d(ctx, har, har->ne[0], har->ne[1], har->ne[2], har->nb[1], har->nb[2], har->nb[3])); + struct ggml_tensor * combined_har = ggml_cont(ctx, ggml_transpose(ctx, ggml_concat(ctx, mhar, phhar, 0))); + + struct ggml_tensor * cur = x; + for (int i = 0; i < generator->ups.size(); i++) { + cur = ggml_leaky_relu(ctx, cur, 0.1f, false); + cur = ggml_add(ctx, ggml_conv_transpose_1d_tts(ctx, generator->ups[i]->upsample_weight, ggml_cont(ctx, ggml_transpose(ctx, cur)), generator->ups[i]->stride, generator->ups[i]->padding, 1, 0, 1), generator->ups[i]->upsample_bias); + if (i == generator->ups.size() - 1) { + // This is a hacky way of implementing the simple reflection padding used here. + // In general, ggml should eventually be built to support expressive reflective padding but for such simple front padding this makes more sense. 
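+            // Front reflection pad of width 1: [x0, x1, x2, ...] becomes [x1, x0, x1, x2, ...];
+            // the view below grabs element 1 and the concat prepends it.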
+ struct ggml_tensor * temp = ggml_cont(ctx, ggml_view_3d(ctx, cur, 1, cur->ne[1], cur->ne[2], cur->nb[1], cur->nb[2], cur->nb[0])); + cur = ggml_concat(ctx, temp, cur, 0); + } + struct ggml_tensor * x_source = build_noise_block(ctx, generator->noise_blocks[i], ggml_cont(ctx, combined_har), style); + cur = ggml_add(ctx, cur, x_source); + struct ggml_tensor * x = cur; + for (int ii = 0; ii < model->n_kernels; ii++) { + if (ii == 0) { + cur = build_kokoro_generator_res_block(ctx, x, style, generator->res_blocks[i*model->n_kernels+ii]); + } else { + cur = ggml_add(ctx, cur, build_kokoro_generator_res_block(ctx, x, style, generator->res_blocks[i*model->n_kernels+ii])); + } + } + cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_div(ctx, cur, model->n_kernels_tensor))); + ggml_build_forward_expand(gf, cur); + } + + cur = ggml_leaky_relu(ctx, cur, 0.01f, false); + cur = ggml_add(ctx, ggml_conv_1d_tts(ctx, generator->out_conv_weight, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, model->out_conv_padding, 1), generator->out_conv_bias); + + struct ggml_tensor * spec = ggml_view_3d(ctx, cur, cur->ne[0], model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], 0); + struct ggml_tensor * phase = ggml_view_3d(ctx, cur, cur->ne[0], cur->ne[1] - model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], cur->nb[1] * model->post_n_fft); + phase = ggml_sin(ctx, phase); + spec = ggml_exp(ctx, spec); + + cur = ggml_concat(ctx, spec, phase, 3); // istft expects the magnitude and phase concatenated after the batch; + cur = istft(ctx, ggml_cont(ctx, ggml_transpose(ctx, cur)), window_sq_sum, generator->window, model->true_n_fft, model->stft_hop, true, true); + ggml_set_name(cur, "after_res_gen"); + return cur; +} + +static struct kokoro_generator_residual_block * build_res_block_from_file(gguf_context * meta, std::string base_config_key) { + struct kokoro_generator_residual_block * grb = new struct kokoro_generator_residual_block; + // these residual blocks always have 3 convolutional layers + for (int i = 0; i < 3; i++) { + grb->adain1d_1_gamma_weights.push_back(nullptr); + grb->adain1d_2_gamma_weights.push_back(nullptr); + grb->adain1d_1_gamma_biases.push_back(nullptr); + grb->adain1d_2_gamma_biases.push_back(nullptr); + grb->adain1d_1_beta_weights.push_back(nullptr); + grb->adain1d_2_beta_weights.push_back(nullptr); + grb->adain1d_1_beta_biases.push_back(nullptr); + grb->adain1d_2_beta_biases.push_back(nullptr); + grb->input_alphas.push_back(nullptr); + grb->output_alphas.push_back(nullptr); + grb->convs1_weights.push_back(nullptr); + grb->convs1_biases.push_back(nullptr); + grb->convs2_weights.push_back(nullptr); + grb->convs2_biases.push_back(nullptr); + int padding_key = gguf_find_key(meta, (base_config_key + "." + std::to_string(i) + ".padding").c_str()); + int dilation_key = gguf_find_key(meta, (base_config_key + "." + std::to_string(i) + ".dilation").c_str()); + if (padding_key == -1 || dilation_key == -1) { + TTS_ABORT("Could not find dilation and padding for generator residual block at key, '%s.%d'.", base_config_key.c_str(), i); + } + grb->conv1_dilations.push_back(gguf_get_val_u32(meta, dilation_key)); + grb->conv1_paddings.push_back(gguf_get_val_u32(meta, padding_key)); + } + return grb; +} + +static struct kokoro_noise_residual_block * build_noise_block_from_file(gguf_context * meta, int index) { + struct kokoro_noise_residual_block * nb = new struct kokoro_noise_residual_block; + std::string base = "kokoro.decoder.generator.noise_blocks." 
+ std::to_string(index);
+    nb->res_block = build_res_block_from_file(meta, base + ".res_block");
+    int stride_key = gguf_find_key(meta, (base + ".stride").c_str());
+    int padding_key = gguf_find_key(meta, (base + ".padding").c_str());
+    if (padding_key == -1 || stride_key == -1) {
+        TTS_ABORT("Both padding and stride keys must be assigned in order to initialize a kokoro noise block.");
+    }
+    nb->input_conv_stride = gguf_get_val_u32(meta, stride_key);
+    nb->input_conv_padding = gguf_get_val_u32(meta, padding_key);
+    return nb;
+}
+
+static struct kokoro_generator_upsample_block * kokoro_generator_upsample_block(gguf_context * meta, int index) {
+    struct kokoro_generator_upsample_block * usb = new struct kokoro_generator_upsample_block;
+    std::string base = "kokoro.decoder.generator.up_convs." + std::to_string(index);
+    int stride_key = gguf_find_key(meta, (base + ".stride").c_str());
+    int padding_key = gguf_find_key(meta, (base + ".padding").c_str());
+    if (padding_key == -1 || stride_key == -1) {
+        TTS_ABORT("Both padding and stride keys must be assigned in order to initialize a kokoro upsample block.");
+    }
+    usb->stride = gguf_get_val_u32(meta, stride_key);
+    usb->padding = gguf_get_val_u32(meta, padding_key);
+    return usb;
+}
+
+size_t kokoro_model::max_gen_nodes() {
+    return std::max<size_t>(8192, generation_node_counter*2);
+}
+
+size_t kokoro_model::max_duration_nodes() {
+    return std::max<size_t>(8192, duration_node_counter*2);
+}
+
+void kokoro_model::post_load_assign() {
+    size_t original_offset = offset;
+    n_kernels_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    n_kernels_tensor->buffer = buf;
+    n_kernels_tensor->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
+    size_t size = ggml_nbytes(n_kernels_tensor);
+    float nker = (float) n_kernels;
+    ggml_backend_tensor_set(n_kernels_tensor, &nker, 0, size);
+    offset += size;
+
+    sqrt_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    sqrt_tensor->buffer = buf;
+    sqrt_tensor->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
+    size = ggml_nbytes(sqrt_tensor);
+    float sqrt2 = sqrtf(2.0f);
+    ggml_backend_tensor_set(sqrt_tensor, &sqrt2, 0, size);
+    offset += size;
+
+    std::vector<float> data{};
+    for (int l = 0; l < lstms.size(); l++) {
+        lstm * rnn = lstms[l];
+        const int32_t hidden_size = rnn->cells[0]->biases[0]->ne[0];
+        data.resize(hidden_size);
+
+        for (int i = 0; i < rnn->cells.size(); i++) {
+            struct ggml_tensor * h = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+            struct ggml_tensor * s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+            h->buffer = buf;
+            h->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
+            size_t size = ggml_nbytes(h);
+            ggml_backend_tensor_set(h, data.data(), 0, size);
+            ggml_format_name(h, "lstm%d_hidden", l);
+            offset += size;
+            s->buffer = buf;
+            s->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
+            ggml_backend_tensor_set(s, data.data(), 0, size);
+            ggml_format_name(s, "lstm%d_state", l);
+            offset += size;
+            rnn->hidden.push_back(h);
+            rnn->states.push_back(s);
+        }
+        data.clear();
+    }
+
+    if (window == "hann") {
+        std::vector<float> wdata;
+        wdata.reserve(true_n_fft);
+        hann_window(true_n_fft, wdata);
+        decoder->generator->window = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, true_n_fft);
+        decoder->generator->window->buffer = buf;
+        decoder->generator->window->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
+        size_t size = ggml_nbytes(decoder->generator->window);
+        ggml_backend_tensor_set(decoder->generator->window, wdata.data(), 0, size);
+        ggml_set_name(decoder->generator->window, "stft_window");
+        offset += size;
+        wdata.clear();
+    } else {
+        TTS_ABORT("Window of type %s is not supported.", window.c_str());
+    }
+
+    harmonic_sampling_norm = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, harmonic_num + 1);
+    harmonic_sampling_norm->buffer = buf;
+    harmonic_sampling_norm->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
+    std::vector<float> hdata;
+    hdata.reserve(harmonic_num + 1);
+    for (int i = 0; i < harmonic_num + 1; i++) {
+        hdata.push_back(((float)i + 1.0f) / sample_rate);
+    }
+    size_t hsize = ggml_nbytes(harmonic_sampling_norm);
+    ggml_backend_tensor_set(harmonic_sampling_norm, hdata.data(), 0, hsize);
+    hdata.clear();
+    offset += hsize;
+
+    sampling_factor_scalar = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    sampling_factor_scalar->buffer = buf;
+    sampling_factor_scalar->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
+    size_t scsize = ggml_nbytes(sampling_factor_scalar);
+    // While it might appear that the upsampling_rate could be used here, the interpolation rate (i.e. the upsampling scale) is actually independent in the kokoro model implementation.
+    float sample_scalar = upsample_scale*2.0f*M_PI;
+    ggml_backend_tensor_set(sampling_factor_scalar, &sample_scalar, 0, scsize);
+    offset += scsize;
+    post_load_tensor_bytes = 300 + offset - original_offset;
+}
+
+void kokoro_model::assign_lstm(lstm * rnn, std::string name, ggml_tensor * tensor) {
+    std::vector<std::string> parts = split(name, ".");
+    int i = std::stoi(parts[0]);
+    int ii = std::stoi(parts[2]);
+    if (parts[1] == "weights") {
+        rnn->cells[i]->weights[ii] = ggml_dup_tensor(ctx, tensor);
+        set_tensor(rnn->cells[i]->weights[ii], tensor);
+    } else if (parts[1] == "biases") {
+        rnn->cells[i]->biases[ii] = ggml_dup_tensor(ctx, tensor);
+        set_tensor(rnn->cells[i]->biases[ii], tensor);
+    } else if (parts[1] == "reverse_weights") {
+        rnn->cells[i]->reverse_weights[ii] = ggml_dup_tensor(ctx, tensor);
+        set_tensor(rnn->cells[i]->reverse_weights[ii], tensor);
+    } else if (parts[1] == "reverse_biases") {
+        rnn->cells[i]->reverse_biases[ii] = ggml_dup_tensor(ctx, tensor);
+        set_tensor(rnn->cells[i]->reverse_biases[ii], tensor);
+    }
+}
+
+void kokoro_model::assign_weight(std::string name, ggml_tensor * tensor) {
+    // all kokoro tensors are prepended by "kokoro" so let's trim that off and assign based on the module
+    std::vector<std::string> parts = split(name, ".");
+    if (parts.size() < 2) {
+        return; // handle the null context tensor
+    }
+    if (parts[1] == "albert") {
+        assign_albert_weight(name.substr(7+parts[1].size()+1), tensor);
+    } else if (parts[1] == "duration_predictor") {
+        assign_duration_weight(name.substr(7+parts[1].size()+1), tensor);
+    } else if (parts[1] == "text_encoder") {
+        assign_text_encoder_weight(name.substr(7+parts[1].size()+1), tensor);
+    } else if (parts[1] == "decoder") {
+        assign_decoder_weight(name.substr(7+parts[1].size()+1), tensor);
+    } else if (parts[1] == "voice_tensors") {
+        voices[parts[2]] = ggml_dup_tensor(ctx, tensor);
+        set_tensor(voices[parts[2]], tensor);
+    }
+}
+
+void kokoro_model::assign_generator_weight(kokoro_generator * generator, std::string name, ggml_tensor * tensor) {
+    if (name == "m_source_weight") {
+        generator->m_source_weight = ggml_dup_tensor(ctx, tensor);
+        set_tensor(generator->m_source_weight, tensor);
+    } else if (name == "m_source_bias") {
+        generator->m_source_bias = ggml_dup_tensor(ctx, tensor);
+        
set_tensor(generator->m_source_bias, tensor); + } else if (name == "conv_post_weight") { + generator->out_conv_weight = ggml_dup_tensor(ctx, tensor); + set_tensor(generator->out_conv_weight, tensor); + } else if (name == "conv_post_bias") { + generator->out_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(generator->out_conv_bias, tensor); + } else { + std::vector parts = split(name, "."); + int i = std::stoi(parts[1]); + if (parts[0] == "noise_blocks") { + if (parts[2] == "conv_weight") { + generator->noise_blocks[i]->input_conv = ggml_dup_tensor(ctx, tensor); + set_tensor(generator->noise_blocks[i]->input_conv, tensor); + } else if (parts[2] == "conv_bias") { + generator->noise_blocks[i]->input_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(generator->noise_blocks[i]->input_conv_bias, tensor); + } else if (parts[2] == "resblock") { + assign_gen_resblock(generator->noise_blocks[i]->res_block, name.substr(parts[0].size()+parts[1].size()+parts[2].size()+3), tensor); + } + } else if (parts[0] == "resblocks") { + assign_gen_resblock(generator->res_blocks[i], name.substr(parts[0].size()+parts[1].size()+2), tensor); + } else if (parts[0] == "ups") { + if (parts[2] == "weight") { + generator->ups[i]->upsample_weight = ggml_dup_tensor(ctx, tensor); + set_tensor(generator->ups[i]->upsample_weight, tensor); + } else if (parts[2] == "bias") { + generator->ups[i]->upsample_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(generator->ups[i]->upsample_bias, tensor); + } + } + } +} + +void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block, std::string name, ggml_tensor * tensor) { + std::vector parts = split(name, "."); + int i = std::stoi(parts[0]); + if (parts[1] == "gamma1_weight") { + block->adain1d_1_gamma_weights[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->adain1d_1_gamma_weights[i], tensor); + } else if (parts[1] == "gamma2_weight") { + block->adain1d_2_gamma_weights[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->adain1d_2_gamma_weights[i], tensor); + } else if (parts[1] == "gamma1_bias") { + block->adain1d_1_gamma_biases[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->adain1d_1_gamma_biases[i], tensor); + } else if (parts[1] == "gamma2_bias") { + block->adain1d_2_gamma_biases[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->adain1d_2_gamma_biases[i], tensor); + } else if (parts[1] == "beta1_weight") { + block->adain1d_1_beta_weights[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->adain1d_1_beta_weights[i], tensor); + } else if (parts[1] == "beta2_weight") { + block->adain1d_2_beta_weights[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->adain1d_2_beta_weights[i], tensor); + } else if (parts[1] == "beta1_bias") { + block->adain1d_1_beta_biases[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->adain1d_1_beta_biases[i], tensor); + } else if (parts[1] == "beta2_bias") { + block->adain1d_2_beta_biases[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->adain1d_2_beta_biases[i], tensor); + } else if (parts[1] == "convs1_weight") { + block->convs1_weights[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->convs1_weights[i], tensor); + } else if (parts[1] == "convs2_weight") { + block->convs2_weights[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->convs2_weights[i], tensor); + } else if (parts[1] == "convs1_bias") { + block->convs1_biases[i] = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(block->convs1_biases[i], tensor); + 
} else if (parts[1] == "convs2_bias") { + block->convs2_biases[i] = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(block->convs2_biases[i], tensor); + } else if (parts[1] == "alpha1") { + block->input_alphas[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->input_alphas[i], tensor); + } else if (parts[1] == "alpha2") { + block->output_alphas[i] = ggml_dup_tensor(ctx, tensor); + set_tensor(block->output_alphas[i], tensor); + } +} + +/** + * Removes the last axis, for cases where it's redundantly of length 1. + * assert x.ndim == 3; numpy.squeeze(x, axis=-1) + */ +static ggml_tensor * squeeze_3d_2d_e0(ggml_context * ctx, ggml_tensor * x) { + TTS_ASSERT(x->ne[0] == 1); + TTS_ASSERT(ggml_is_contiguous(x)); + return ggml_reshape_2d(ctx, x, x->ne[1], x->ne[2]); +} + +void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::string name, ggml_tensor * tensor) { + if (name == "norm1_gamma_weight") { + block->norm1_gamma = ggml_dup_tensor(ctx, tensor); + set_tensor(block->norm1_gamma, tensor); + } else if (name == "norm2_gamma_weight") { + block->norm2_gamma = ggml_dup_tensor(ctx, tensor); + set_tensor(block->norm2_gamma, tensor); + } else if (name == "norm1_gamma_bias") { + block->norm1_gamma_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(block->norm1_gamma_bias, tensor); + } else if (name == "norm2_gamma_bias") { + block->norm2_gamma_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(block->norm2_gamma_bias, tensor); + } else if (name == "norm1_beta_weight") { + block->norm1_beta = ggml_dup_tensor(ctx, tensor); + set_tensor(block->norm1_beta, tensor); + } else if (name == "norm2_beta_weight") { + block->norm2_beta = ggml_dup_tensor(ctx, tensor); + set_tensor(block->norm2_beta, tensor); + } else if (name == "norm1_beta_bias") { + block->norm1_beta_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(block->norm1_beta_bias, tensor); + } else if (name == "norm2_beta_bias") { + block->norm2_beta_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(block->norm2_beta_bias, tensor); + } else if (name == "conv1_weight") { + block->conv1 = ggml_dup_tensor(ctx, tensor); + set_tensor(block->conv1, tensor); + } else if (name == "conv2_weight") { + block->conv2 = ggml_dup_tensor(ctx, tensor); + set_tensor(block->conv2, tensor); + } else if (name == "conv1_bias") { + block->conv1_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(block->conv1_bias, tensor); + } else if (name == "conv2_bias") { + block->conv2_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(block->conv2_bias, tensor); + } else if (name == "pool_weight") { + block->pool = ggml_dup_tensor(ctx, tensor); + set_tensor(block->pool, tensor); + } else if (name == "pool_bias") { + block->pool_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(block->pool_bias, tensor); + } else if (name == "conv1x1_weight") { + tensor = squeeze_3d_2d_e0(ctx, tensor); + block->upsample = ggml_dup_tensor(ctx, tensor); + set_tensor(block->upsample, tensor); + } else if (name == "conv1x1_bias") { + block->upsample_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(block->upsample_bias, tensor); + } +} + +void kokoro_model::assign_decoder_weight(std::string name, ggml_tensor * tensor) { + if (name == "f0_conv_weight") { + decoder->f0_conv = ggml_dup_tensor(ctx, tensor); + set_tensor(decoder->f0_conv, tensor); + } else if (name == "f0_conv_bias") { + decoder->f0_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + 
set_tensor(decoder->f0_conv_bias, tensor); + } else if (name == "n_conv_weight") { + decoder->n_conv = ggml_dup_tensor(ctx, tensor); + set_tensor(decoder->n_conv, tensor); + } else if (name == "n_conv_bias") { + decoder->n_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(decoder->n_conv_bias, tensor); + } else if (name == "asr_conv_weight") { + tensor = squeeze_3d_2d_e0(ctx, tensor); + decoder->asr_conv = ggml_dup_tensor(ctx, tensor); + set_tensor(decoder->asr_conv, tensor); + } else if (name == "asr_conv_bias") { + decoder->asr_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(decoder->asr_conv_bias, tensor); + } else if (has_prefix(name, "decoder_blocks")) { + std::vector parts = split(name, "."); + int i = std::stoi(parts[1]); + assign_ada_res_block(decoder->decoder_blocks[i], parts[2], tensor); + } else if (has_prefix(name, "encoder_block")) { + std::vector parts = split(name, "."); + assign_ada_res_block(decoder->encoder_block, parts[1], tensor); + } else if (has_prefix(name, "generator")) { + assign_generator_weight(decoder->generator, name.substr(10), tensor); + } +} + +void kokoro_model::assign_duration_weight(std::string name, ggml_tensor * tensor) { + if (name == "encode") { + prosody_pred->albert_encode = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->albert_encode , tensor); + } else if (name == "encode_bias") { + prosody_pred->albert_encode_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->albert_encode_bias, tensor); + } else if (name == "duration_proj") { + prosody_pred->duration_proj = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->duration_proj, tensor); + } else if (name == "duration_proj_bias") { + prosody_pred->duration_proj_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->duration_proj_bias, tensor); + } else if (name == "n_proj_kernel") { + tensor = squeeze_3d_2d_e0(ctx, tensor); + prosody_pred->n_proj_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->n_proj_kernel, tensor); + } else if (name == "n_proj_bias") { + prosody_pred->n_proj_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(prosody_pred->n_proj_bias, tensor); + } else if (name == "f0_proj_kernel") { + tensor = squeeze_3d_2d_e0(ctx, tensor); + prosody_pred->f0_proj_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->f0_proj_kernel, tensor); + } else if (name == "f0_proj_bias") { + prosody_pred->f0_proj_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(prosody_pred->f0_proj_bias, tensor); + } else { + std::vector parts = split(name, "."); + if (parts[0] == "shared_lstm") { + assign_lstm(prosody_pred->shared_lstm, name.substr(parts[0].size()+1), tensor); + } else if (parts[0] == "duration_lstm") { + assign_lstm(prosody_pred->duration_proj_lstm, name.substr(parts[0].size()+1), tensor); + } else if (parts[0] == "f0_blocks") { + int i = std::stoi(parts[1]); + assign_ada_res_block(prosody_pred->f0_blocks[i], parts[2], tensor); + } else if (parts[0] == "n_blocks") { + int i = std::stoi(parts[1]); + assign_ada_res_block(prosody_pred->n_blocks[i], parts[2], tensor); + } else if (parts[0] == "layers") { + int i = std::stoi(parts[1]); + i = i / 2; + if (parts[2] == "gamma_weight") { + prosody_pred->layers[i]->ada_norm_gamma_weight = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->layers[i]->ada_norm_gamma_weight , tensor); + } else if (parts[2] == "gamma_bias") { + prosody_pred->layers[i]->ada_norm_gamma_bias = ggml_dup_tensor(ctx, tensor); 
+ set_tensor(prosody_pred->layers[i]->ada_norm_gamma_bias , tensor); + } else if (parts[2] == "beta_weight") { + prosody_pred->layers[i]->ada_norm_beta_weight = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->layers[i]->ada_norm_beta_weight , tensor); + } else if (parts[2] == "beta_bias") { + prosody_pred->layers[i]->ada_norm_beta_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(prosody_pred->layers[i]->ada_norm_beta_bias , tensor); + } else if (parts[2] == "lstm") { + assign_lstm(prosody_pred->layers[i]->rnn, name.substr(parts[0].size()+parts[1].size()+parts[2].size()+3), tensor); + } + } + } +} + +void kokoro_model::assign_text_encoder_weight(std::string name, ggml_tensor * tensor) { + if (name == "embedding_weight") { + text_encoder->embd = ggml_dup_tensor(ctx, tensor); + set_tensor(text_encoder->embd, tensor); + } else if (has_prefix(name, "lstm")) { + assign_lstm(text_encoder->out_lstm, name.substr(5), tensor); + } else if (has_prefix(name, "layers")) { + std::vector parts = split(name, "."); + int i = std::stoi(parts[1]); + if (parts[2] == "gamma") { + text_encoder->conv_layers[i]->norm_gamma = ggml_dup_tensor(ctx, tensor); + set_tensor(text_encoder->conv_layers[i]->norm_gamma, tensor); + } else if (parts[2] == "beta") { + text_encoder->conv_layers[i]->norm_beta = ggml_dup_tensor(ctx, tensor); + set_tensor(text_encoder->conv_layers[i]->norm_beta, tensor); + } else if (parts[2] == "weight") { + text_encoder->conv_layers[i]->conv_weight = ggml_dup_tensor(ctx, tensor); + set_tensor(text_encoder->conv_layers[i]->conv_weight, tensor); + } else if (parts[2] == "bias") { + text_encoder->conv_layers[i]->conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(text_encoder->conv_layers[i]->conv_bias, tensor); + } + } +} + +void kokoro_model::assign_albert_weight(std::string name, ggml_tensor * tensor) { + if (name == "embd") { + embd_hidden = ggml_dup_tensor(ctx, tensor); + set_tensor(embd_hidden, tensor); + } else if (name == "embd_bias") { + embd_hidden_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(embd_hidden_bias, tensor); + } else if (name == "token_embd") { + token_embd = ggml_dup_tensor(ctx, tensor); + set_tensor(token_embd, tensor); + } else if (name == "position_embd") { + position_embd = ggml_dup_tensor(ctx, tensor); + set_tensor(position_embd, tensor); + } else if (name == "norm") { + input_norm_weight = ggml_dup_tensor(ctx, tensor); + set_tensor(input_norm_weight, tensor); + } else if (name == "norm_bias") { + input_norm_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(input_norm_bias, tensor); + } else if (name == "token_type_embd") { + static_token_type_values = ggml_dup_tensor(ctx, tensor); + set_tensor(static_token_type_values, tensor); + } else if (has_prefix(name, "layer")) { + std::vector parts = split(name, '.'); + int i = std::stoi(parts[1]); + if (parts[2] == "ffn") { + layers[i]->ffn = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->ffn, tensor); + } else if (parts[2] == "ffn_bias") { + layers[i]->ffn_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->ffn_bias, tensor); + } else if (parts[2] == "ffn_out") { + layers[i]->ffn_out = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->ffn_out, tensor); + } else if (parts[2] == "ffn_out_bias") { + layers[i]->ffn_out_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->ffn_out_bias, tensor); + } else if (parts[2] == "attn_norm") { + layers[i]->layer_output_norm_weight = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->layer_output_norm_weight, tensor); + } else 
if (parts[2] == "attn_norm_bias") { + layers[i]->layer_output_norm_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->layer_output_norm_bias, tensor); + } else if (parts[2] == "q") { + layers[i]->q = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->q, tensor); + } else if (parts[2] == "k") { + layers[i]->k = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->k, tensor); + } else if (parts[2] == "v") { + layers[i]->v = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->v, tensor); + } else if (parts[2] == "o") { + layers[i]->o = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->o, tensor); + } else if (parts[2] == "q_bias") { + layers[i]->q_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->q_bias, tensor); + } else if (parts[2] == "k_bias") { + layers[i]->k_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->k_bias, tensor); + } else if (parts[2] == "v_bias") { + layers[i]->v_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->v_bias, tensor); + } else if (parts[2] == "o_bias") { + layers[i]->o_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->o_bias, tensor); + } else if (parts[2] == "ffn_norm") { + layers[i]->attn_norm_weight = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->attn_norm_weight, tensor); + } else if (parts[2] == "ffn_norm_bias") { + layers[i]->attn_norm_bias = ggml_dup_tensor(ctx, tensor); + set_tensor(layers[i]->attn_norm_bias, tensor); + } + } +} + +lstm * kokoro_model::prep_lstm() { + lstm * rnn = new lstm; + lstm_cell * cell = new lstm_cell; + for (int i = 0; i < 8; i++) { + cell->weights.push_back(nullptr); + cell->biases.push_back(nullptr); + cell->reverse_weights.push_back(nullptr); + cell->reverse_biases.push_back(nullptr); + } + rnn->cells.push_back(cell); + rnn->bidirectional = true; + lstms.push_back(rnn); + return rnn; +} + +void kokoro_model::prep_layers(gguf_context * meta) { + prosody_pred = new duration_predictor; + prosody_pred->shared_lstm = prep_lstm(); + prosody_pred->duration_proj_lstm = prep_lstm(); + text_encoder = new kokoro_text_encoder; + decoder = new kokoro_decoder; + decoder->generator = new kokoro_generator; + decoder->encoder_block = new ada_residual_conv_block; + text_encoder->out_lstm = prep_lstm(); + + for (int i = 0; i < n_layers; i++) { + layers.push_back(new albert_layer); + } + + for (int i = 0; i < f0_n_blocks; i++) { + ada_residual_conv_block * f0 = new ada_residual_conv_block; + ada_residual_conv_block * n = new ada_residual_conv_block; + prosody_pred->f0_blocks.push_back(f0); + prosody_pred->n_blocks.push_back(n); + } + + for (int i = 0; i < n_duration_prediction_layers; i++) { + duration_predictor_layer* dpl = new duration_predictor_layer; + dpl->rnn = prep_lstm(); + prosody_pred->layers.push_back(dpl); + } + + for (int i = 0; i < n_decoder_blocks; i++) { + decoder->decoder_blocks.push_back(new ada_residual_conv_block); + } + + for (int i = 0; i < n_noise_blocks; i++) { + struct kokoro_noise_residual_block * nb = build_noise_block_from_file(meta, i); + decoder->generator->noise_blocks.push_back(nb); + } + + for (int i = 0; i < n_upsamples; i++) { + struct kokoro_generator_upsample_block * ub = kokoro_generator_upsample_block(meta, i); + decoder->generator->ups.push_back(ub); + } + + for (int i = 0; i < n_res_blocks; i++) { + struct kokoro_generator_residual_block* rb = build_res_block_from_file(meta, "kokoro.decoder.generator.res_blocks." 
+ std::to_string(i)); + decoder->generator->res_blocks.push_back(rb); + } + + for (int i = 0; i < n_conv_layers; i++) { + text_encoder->conv_layers.push_back(new kokoro_text_encoder_conv_layer); + } +} + +void kokoro_model::prep_constants(gguf_context * meta) { + // get constants for the Albert duration prediction model + int context_size_key = gguf_find_key(meta, "kokoro.duration_predictor.albert.context_length"); + if (context_size_key != -1) { + max_context_length = gguf_get_val_u32(meta, context_size_key);; + } + + int vocab_size_key = gguf_find_key(meta, "kokoro.tokenizer.vocab_size"); + if (vocab_size_key != -1) { + vocab_size = gguf_get_val_u32(meta, vocab_size_key); + } + + int hidden_size_key = gguf_find_key(meta, "kokoro.duration_predictor.albert.hidden_size"); + if (hidden_size_key != -1) { + hidden_size = gguf_get_val_u32(meta, hidden_size_key); + } + + int attn_heads_key = gguf_find_key(meta, "kokoro.duration_predictor.albert.attn_heads"); + if (attn_heads_key != -1) { + n_attn_heads = gguf_get_val_u32(meta, attn_heads_key); + head_size = (uint32_t) hidden_size / n_attn_heads; + } + + int albert_layers_key = gguf_find_key(meta, "kokoro.duration_predictor.albert.layers"); + if (albert_layers_key != -1) { + n_layers = gguf_get_val_u32(meta, albert_layers_key); + } + + int recurrence_key = gguf_find_key(meta, "kokoro.duration_predictor.albert.recurrence"); + if (recurrence_key != -1) { + n_recurrence = gguf_get_val_u32(meta, recurrence_key); + } + + int duration_hidden_key = gguf_find_key(meta, "kokoro.duration_predictor.hidden_size"); + if (duration_hidden_key != -1) { + duration_hidden_size = gguf_get_val_u32(meta, duration_hidden_key); + } + + int up_sampling_factor_key = gguf_find_key(meta, "kokoro.decoder.generator.up_sampling_factor"); + if (up_sampling_factor_key != -1) { + up_sampling_factor = gguf_get_val_u32(meta, up_sampling_factor_key); + } + + int f0_n_blocks_key = gguf_find_key(meta, "kokoro.duration_predictor.f0_n_blocks"); + if (f0_n_blocks_key != -1) { + f0_n_blocks = gguf_get_val_u32(meta, f0_n_blocks_key); + } + + int duration_pred_layers_key = gguf_find_key(meta, "kokoro.duration_predictor.layers"); + if (duration_pred_layers_key != -1) { + n_duration_prediction_layers = gguf_get_val_u32(meta, duration_pred_layers_key); + } + + // get text and decoding configuration for generation + int n_conv_layers_key = gguf_find_key(meta, "kokoro.text_encoder.layers"); + if (n_conv_layers_key != -1) { + n_conv_layers = gguf_get_val_u32(meta, n_conv_layers_key); + } + + int n_kernels_key = gguf_find_key(meta, "kokoro.decoder.generator.kernels"); + if (n_kernels_key != -1) { + n_kernels = gguf_get_val_u32(meta, n_kernels_key); + } + + int n_upsamples_key = gguf_find_key(meta, "kokoro.decoder.generator.upsamples"); + if (n_upsamples_key != -1) { + n_upsamples = gguf_get_val_u32(meta, n_upsamples_key); + } + + int n_decoder_blocks_key = gguf_find_key(meta, "kokoro.decoder.generator.layers"); + if (n_decoder_blocks_key != -1) { + n_decoder_blocks = gguf_get_val_u32(meta, n_decoder_blocks_key); + } + + int out_conv_padding_key = gguf_find_key(meta, "kokoro.decoder.generator.padding"); + if (out_conv_padding_key != -1) { + out_conv_padding = gguf_get_val_u32(meta, out_conv_padding_key); + } + + int n_fft_key = gguf_find_key(meta, "kokoro.decoder.generator.n_fft"); + if (n_fft_key != -1) { + true_n_fft = gguf_get_val_u32(meta, n_fft_key); + post_n_fft = (uint32_t) true_n_fft / 2 + 1; + } + + int stft_hop_key = gguf_find_key(meta, "kokoro.decoder.generator.hop"); + if (stft_hop_key 
 != -1) {
+        stft_hop = gguf_get_val_u32(meta, stft_hop_key);
+    }
+}
+
+kokoro_ubatch kokoro_duration_runner::build_worst_case_batch() {
+    kokoro_ubatch batch;
+    batch.n_tokens = model->max_context_length;
+    return batch;
+}
+
+struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_ubatch & batch) {
+    init_build();
+    // This '110000' number comes from the number of nodes necessary for the longest possible sequence computed by the graph.
+    // While it may be possible to precompute this by determining the longest possible duration against the maximum context length of the model,
+    // it is not easily performed given that nodes do not necessarily line up predictably with the number of tensors in the model or its submodels.
+    // In order to side step this problem I computed the graph and determined the size in advance and use that constant value here.
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 110000, false);
+
+    struct ggml_tensor * voice = model->voices[kctx->voice];
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    kctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+    ggml_set_input(kctx->inp_tokens);
+
+    if (!model->static_token_types) {
+        kctx->token_types = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+        ggml_set_input(kctx->token_types);
+    }
+
+    kctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+    ggml_set_input(kctx->positions);
+
+    inpL = build_albert_inputs(ctx, model, kctx->inp_tokens, kctx->positions, kctx->token_types);
+    ggml_set_name(inpL, "albert_embeddings");
+    cur = inpL;
+
+    struct ggml_tensor * KQ_mask_dec = build_albert_attn_mask(ctx, kctx, batch);
+
+    for (int r = 0; r < model->n_recurrence; r++) {
+        for (int l = 0; l < model->n_layers; l++) {
+            struct ggml_tensor * residual = cur;
+            struct ggml_tensor * attn_out;
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = ggml_add(ctx, ggml_mul_mat(ctx, model->layers[l]->q, cur), model->layers[l]->q_bias);
+                struct ggml_tensor * Kcur = ggml_add(ctx, ggml_mul_mat(ctx, model->layers[l]->k, cur), model->layers[l]->k_bias);
+                struct ggml_tensor * Vcur = ggml_add(ctx, ggml_mul_mat(ctx, model->layers[l]->v, cur), model->layers[l]->v_bias);
+
+                Qcur = ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens);
+                Kcur = ggml_reshape_3d(ctx, Kcur, model->head_size, model->n_attn_heads, batch.n_tokens);
+
+                struct ggml_tensor * q = ggml_permute(ctx, Qcur, 0, 2, 1, 3);
+                struct ggml_tensor * k = ggml_cont(ctx, ggml_permute(ctx, Kcur, 0, 2, 1, 3));
+                struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+
+                kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, model->scale, 0.0f);
+
+                struct ggml_tensor * v = ggml_cont_3d(ctx, ggml_transpose(ctx, Vcur), batch.n_tokens, model->head_size, model->n_attn_heads);
+                struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
+                struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
+                attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens);
+                attn_out = ggml_add(ctx, ggml_mul_mat(ctx, model->layers[l]->o, attn_out), model->layers[l]->o_bias);
+            }
+            cur = ggml_add(ctx, attn_out, residual);
+            cur = build_albert_norm(ctx, cur, model->layers[l]->attn_norm_weight, model->layers[l]->attn_norm_bias);
+
+            struct ggml_tensor * residualffn = cur;
+
+            // ffn
+            {
+                cur = ggml_gelu(ctx, ggml_add(ctx, ggml_mul_mat(ctx, model->layers[l]->ffn, cur), model->layers[l]->ffn_bias));
+                cur = ggml_add(ctx, ggml_mul_mat(ctx, model->layers[l]->ffn_out, cur), model->layers[l]->ffn_out_bias);
+            }
+
+            cur = ggml_add(ctx, cur, residualffn);
+            cur = build_albert_norm(ctx, cur, model->layers[l]->layer_output_norm_weight, model->layers[l]->layer_output_norm_bias);
+        }
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    // duration / prosody prediction
+    cur = ggml_add(ctx, ggml_mul_mat(ctx, model->prosody_pred->albert_encode, cur), model->prosody_pred->albert_encode_bias);
+
+    struct ggml_tensor * style_half = ggml_cont(ctx, ggml_view_1d(ctx, voice, voice->ne[0]/2, voice->ne[0] / 2 * voice->nb[0] + (batch.n_tokens - 3) * voice->nb[1]));
+
+    cur = ggml_concat(ctx, cur, ggml_repeat(ctx, style_half, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, style_half->ne[0], cur->ne[1])), 0);
+
+    for (auto l : model->prosody_pred->layers) {
+        cur = build_lstm(ctx, cur, l->rnn, batch.n_tokens, gf);
+
+        struct ggml_tensor * gamma = ggml_add(ctx, ggml_mul_mat(ctx, l->ada_norm_gamma_weight, style_half), l->ada_norm_gamma_bias);
+        struct ggml_tensor * beta = ggml_add(ctx, ggml_mul_mat(ctx, l->ada_norm_beta_weight, style_half), l->ada_norm_beta_bias);
+
+        cur = ggml_norm(ctx, cur, 0.00001);
+
+        // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance.
+        // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model.
+        cur = ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta);
+        cur = ggml_concat(ctx, cur, ggml_repeat(ctx, style_half, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, style_half->ne[0], cur->ne[1])), 0);
+    }
+
+    struct ggml_tensor * d = ggml_cont(ctx, cur);
+    ggml_set_name(d, "duration_hidden_states");
+    ggml_build_forward_expand(gf, d);
+
+    struct ggml_tensor * len;
+    cur = build_lstm(ctx, cur, model->prosody_pred->duration_proj_lstm, batch.n_tokens, gf);
+    cur = ggml_sigmoid(ctx, ggml_add(ctx, ggml_mul_mat(ctx, model->prosody_pred->duration_proj, cur), model->prosody_pred->duration_proj_bias));
+    // If we were to support speed we would add a constant tensor for the speed and divide here.
+    len = ggml_clamp(ctx, ggml_round(ctx, ggml_sum_rows(ctx, cur)), 1.0f, 50.0f);
+
+    ggml_build_forward_expand(gf, len);
+
+    free_build();
+
+    return gf;
+}
+
+void kokoro_duration_runner::prepare_post_load() {
+    auto batch = build_worst_case_batch();
+    auto gf = build_kokoro_duration_graph(batch);
+    kctx->prep_schedule(gf);
+}
+
+void kokoro_duration_runner::set_inputs(kokoro_ubatch & batch) {
+    ggml_backend_tensor_set(kctx->inp_tokens, batch.input_tokens, 0, batch.n_tokens*ggml_element_size(kctx->inp_tokens));
+    uint32_t * positions_d = (uint32_t *) kctx->positions->data;
+    float * attn_d = (float *) kctx->attn_mask->data;
+    for (uint32_t i = 0; i < batch.n_tokens; i++) {
+        positions_d[i] = i;
+        for (uint32_t ii = 0; ii < batch.n_tokens; ii++) {
+            attn_d[i*batch.n_tokens + ii] = 0.0f; // Kokoro doesn't use causal attention as it isn't an autoregressive generative model.
+        }
+    }
+}
+
+void kokoro_duration_runner::run(kokoro_ubatch & batch) {
+    ggml_backend_sched_reset(kctx->sched);
+
+    size_t prev_size = kctx->buf_output ? ggml_backend_buffer_get_size(kctx->buf_output) : 0;
ggml_backend_buffer_get_size(kctx->buf_output) : 0;
+    size_t new_size = model->max_context_length * (model->duration_hidden_size + model->style_half_size) * sizeof(float);
+
+    if (!kctx->buf_output || prev_size < new_size) {
+        if (kctx->buf_output) {
+            ggml_backend_buffer_free(kctx->buf_output);
+            kctx->buf_output = nullptr;
+            kctx->logits = nullptr;
+        }
+        kctx->buf_output = ggml_backend_buft_alloc_buffer(kctx->backend_cpu_buffer, new_size);
+    }
+
+    prev_size = kctx->buf_len_output ? ggml_backend_buffer_get_size(kctx->buf_len_output) : 0;
+    new_size = model->max_context_length * sizeof(float);
+
+    if (!kctx->buf_len_output || prev_size < new_size) {
+        if (kctx->buf_len_output) {
+            ggml_backend_buffer_free(kctx->buf_len_output);
+            kctx->buf_len_output = nullptr;
+            kctx->lens = nullptr;
+        }
+
+        kctx->buf_len_output = ggml_backend_buft_alloc_buffer(kctx->backend_cpu_buffer, new_size);
+    }
+
+    batch.resp->hidden_states = (float *) ggml_backend_buffer_get_base(kctx->buf_output);
+    ggml_backend_buffer_clear(kctx->buf_output, 0);
+    batch.resp->lengths = (float *) ggml_backend_buffer_get_base(kctx->buf_len_output);
+    ggml_backend_buffer_clear(kctx->buf_len_output, 0);
+
+    struct ggml_cgraph * gf = NULL;
+    gf = build_kokoro_duration_graph(batch);
+
+    // the output is always the last tensor in the graph
+    struct ggml_tensor * lens = gf->nodes[gf->n_nodes - 1];
+    // the reused duration hidden states are computed before a node chunk whose size is sequence-length dependent
+    struct ggml_tensor * hidden_states = gf->nodes[gf->n_nodes - 22 - 52 * batch.n_tokens];
+    ggml_backend_sched_alloc_graph(kctx->sched, gf);
+
+    set_inputs(batch);
+
+    ggml_backend_sched_graph_compute_async(kctx->sched, gf);
+
+    kctx->get_ggml_node_data(lens, batch.resp->lengths, batch.n_tokens*sizeof(float), kctx->buf_len_output);
+    kctx->get_ggml_node_data(hidden_states, batch.resp->hidden_states, batch.n_tokens*(model->duration_hidden_size+model->style_half_size)*sizeof(float));
+
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(kctx->sched);
+    batch.resp->n_outputs = batch.n_tokens;
+}
+
+kokoro_ubatch kokoro_runner::build_worst_case_batch() {
+    kokoro_ubatch batch;
+    batch.n_tokens = model->max_context_length;
+    batch.resp = new kokoro_duration_response;
+    batch.resp->n_outputs = model->max_context_length;
+    kctx->total_duration = model->max_context_length * model->max_duration_per_token;
+    kctx->sequence_length = model->max_context_length;
+    std::vector<float> lengths;
+    lengths.reserve(model->max_context_length);
+    for (int i = 0; i < model->max_context_length; i++) {
+        lengths.push_back(50.0f);
+    }
+    batch.resp->lengths = lengths.data();
+    return batch;
+}
+
+struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
+    init_build();
+    // This '570000' number comes from the number of nodes necessary for the longest possible sequence computed by the graph.
+    // While it may be possible to precompute this by determining the longest possible duration against the maximum context length of the model,
+    // it is not easily performed given that nodes do not necessarily line up predictably with the number of tensors in the model or its submodels.
+    // In order to sidestep this problem I computed the graph in advance and use its measured size as the constant here.
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 570000, false);
+
+    struct ggml_tensor * voice = model->voices[kctx->voice];
+    struct ggml_tensor * style_half = ggml_view_1d(ctx, voice, voice->ne[0]/2, voice->ne[0] / 2 * voice->nb[0] + (batch.n_tokens - 3) * voice->nb[1]);
+    struct ggml_tensor * cur;
+
+    kctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+    ggml_set_input(kctx->inp_tokens);
+
+    kctx->duration_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, kctx->total_duration, kctx->sequence_length);
+    ggml_set_input(kctx->duration_mask);
+
+    kctx->duration_pred = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model->duration_hidden_size + model->style_half_size, kctx->sequence_length);
+    ggml_set_input(kctx->duration_pred);
+
+    // seeing as we are setting the inputs for these, we shouldn't need to perform transpositions here
+    cur = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, kctx->duration_mask)), ggml_cont(ctx, ggml_transpose(ctx, kctx->duration_pred)));
+    cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
+
+    cur = build_lstm(ctx, cur, model->prosody_pred->shared_lstm, cur->ne[1], gf);
+
+    struct ggml_tensor * f0_curve = cur;
+    f0_curve = ggml_cont(ctx, ggml_transpose(ctx, f0_curve));
+    for (auto block : model->prosody_pred->f0_blocks) {
+        f0_curve = build_ada_residual_conv(ctx, f0_curve, block, style_half, model->sqrt_tensor);
+    }
+    f0_curve = ggml_cont(ctx, ggml_transpose(ctx, f0_curve));
+    f0_curve = ggml_mul_mat(ctx, model->prosody_pred->f0_proj_kernel, f0_curve);
+    f0_curve = squeeze_3d_2d_e0(ctx, f0_curve);
+    f0_curve = ggml_add(ctx, f0_curve, model->prosody_pred->f0_proj_bias);
+    ggml_set_name(f0_curve, "f0_out");
+
+    struct ggml_tensor * n = cur;
+    n = ggml_cont(ctx, ggml_transpose(ctx, n));
+    for (auto block : model->prosody_pred->n_blocks) {
+        n = build_ada_residual_conv(ctx, n, block, style_half, model->sqrt_tensor);
+    }
+    n = ggml_cont(ctx, ggml_transpose(ctx, n));
+    n = ggml_mul_mat(ctx, model->prosody_pred->n_proj_kernel, n);
+    n = squeeze_3d_2d_e0(ctx, n);
+    n = ggml_add(ctx, n, model->prosody_pred->n_proj_bias);
+    ggml_set_name(n, "n_out");
+    ggml_build_forward_expand(gf, n);
+
+    // Kokoro text encoding
+    struct ggml_tensor * asr;
+    {
+        cur = ggml_get_rows(ctx, model->text_encoder->embd, kctx->inp_tokens);
+
+        for (auto l : model->text_encoder->conv_layers) {
+            cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d_tts(ctx, l->conv_weight, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, 2, 1), l->conv_bias)));
+            cur = ggml_norm(ctx, cur, 0.00001);
+            cur = ggml_add(ctx, ggml_mul(ctx, cur, l->norm_gamma), l->norm_beta);
+            cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
+        }
+
+        cur = build_lstm(ctx, cur, model->text_encoder->out_lstm, kctx->sequence_length, gf);
+        asr = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, cur)), ggml_cont(ctx, ggml_transpose(ctx, kctx->duration_mask)));
+    }
+
+    // decoding and generation prep
+    struct ggml_tensor * asr_res;
+    struct ggml_tensor * f0;
+    struct ggml_tensor * n_base;
+    struct ggml_tensor * style_half2 = ggml_view_1d(ctx, voice, voice->ne[0]/2, (batch.n_tokens - 3) * voice->nb[1]);
+
+    {
+        f0 = ggml_add(ctx, ggml_conv_1d_tts(ctx, model->decoder->f0_conv, f0_curve, 2, 1, 1), model->decoder->f0_conv_bias);
+        n_base = ggml_add(ctx, ggml_conv_1d_tts(ctx, model->decoder->n_conv, n, 2, 1, 1), model->decoder->n_conv_bias);
+        cur = ggml_concat(ctx, ggml_concat(ctx, ggml_cont(ctx, ggml_transpose(ctx, asr)), f0, 1), n_base, 1);
+        cur = build_ada_residual_conv(ctx, cur,
model->decoder->encoder_block, style_half2, model->sqrt_tensor);
+        ggml_build_forward_expand(gf, cur);
+
+        asr_res = ggml_mul_mat(ctx, model->decoder->asr_conv, asr);
+        asr_res = ggml_add(ctx, asr_res, ggml_transpose(ctx, model->decoder->asr_conv_bias));
+
+        asr_res = ggml_cont(ctx, ggml_transpose(ctx, asr_res));
+        for (auto l : model->decoder->decoder_blocks) {
+            cur = ggml_concat(ctx, ggml_concat(ctx, ggml_concat(ctx, cur, asr_res, 1), f0, 1), n_base, 1);
+            cur = build_ada_residual_conv(ctx, cur, l, style_half2, model->sqrt_tensor);
+            ggml_build_forward_expand(gf, cur);
+        }
+        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
+    }
+
+    kctx->window_sq_sum = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, kctx->total_duration*model->up_sampling_factor);
+    ggml_set_input(kctx->window_sq_sum);
+
+    // run generation
+    cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
+    ggml_build_forward_expand(gf, cur);
+    free_build();
+    return gf;
+}
+
+void kokoro_runner::prepare_post_load() {
+    model->post_load_assign();
+    drunner->prepare_post_load();
+    auto batch = build_worst_case_batch();
+    auto gf = build_kokoro_graph(batch);
+    kctx->prep_schedule(gf);
+    delete batch.resp; // allocated with new in build_worst_case_batch, so delete rather than free
+}
+
+void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) {
+    random_uniform_gen(total_size * model->up_sampling_factor * (model->harmonic_num + 1), ((float*)kctx->uv_noise_data->data) + 4);
+    ((float*) kctx->uv_noise_data->data)[0] = model->voice_threshold;
+    ((float*) kctx->uv_noise_data->data)[1] = model->noise_std;
+    ((float*) kctx->uv_noise_data->data)[2] = model->sin_amp;
+    ((float*) kctx->uv_noise_data->data)[3] = model->sin_amp / 3.0f;
+    compute_window_squared_sum(model->true_n_fft, model->stft_hop, total_size*model->up_sampling_factor/model->stft_hop, (float*) kctx->window_sq_sum->data, (float*) model->decoder->generator->window->data);
+    kctx->sequence_length = batch.n_tokens;
+    kctx->total_duration = total_size;
+    ggml_backend_tensor_set(kctx->inp_tokens, batch.input_tokens, 0, batch.n_tokens*ggml_element_size(kctx->inp_tokens));
+    ggml_backend_tensor_set(kctx->duration_pred, batch.resp->hidden_states, 0, batch.n_tokens*(model->duration_hidden_size + model->style_half_size)*ggml_element_size(kctx->duration_pred));
+    float * d = nullptr;
+    float running = 0;
+    d = (float *) kctx->duration_mask->data;
+    for (uint32_t i = 0; i < batch.n_tokens; i++) {
+        float next_running = running + batch.resp->lengths[i];
+        for (uint32_t ii = 0; ii < total_size; ii++) {
+            d[i*total_size+ii] = ii >= running && ii < next_running ? 1.0f : 0.0f;
+        }
+        running = next_running;
+    }
+}
+
+void kokoro_runner::run(kokoro_ubatch & batch, tts_response * outputs) {
+    batch.resp = new kokoro_duration_response;
+    drunner->run(batch);
+
+    ggml_backend_sched_reset(kctx->sched);
+
+    const size_t prev_size = kctx->buf_output ?
ggml_backend_buffer_get_size(kctx->buf_output) : 0;
+    uint32_t total_length = 0;
+    for (int i = 0; i < batch.resp->n_outputs; i++) {
+        total_length += (uint32_t) batch.resp->lengths[i];
+    }
+    const size_t new_size = total_length * model->up_sampling_factor * sizeof(float);
+
+    if (!kctx->buf_output || prev_size < new_size) {
+        if (kctx->buf_output) {
+            ggml_backend_buffer_free(kctx->buf_output);
+            kctx->buf_output = nullptr;
+            kctx->logits = nullptr;
+        }
+        kctx->buf_output = ggml_backend_buft_alloc_buffer(kctx->backend_cpu_buffer, new_size);
+    }
+
+    outputs->data = (float *) ggml_backend_buffer_get_base(kctx->buf_output);
+    ggml_backend_buffer_clear(kctx->buf_output, 0);
+
+    kctx->sequence_length = batch.n_tokens;
+    kctx->total_duration = total_length;
+
+    struct ggml_cgraph * gf = NULL;
+    gf = build_kokoro_graph(batch);
+
+    // the output is always the last tensor in the graph
+    struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1];
+
+    ggml_backend_sched_alloc_graph(kctx->sched, gf);
+
+    set_inputs(batch, total_length);
+
+    ggml_backend_sched_graph_compute_async(kctx->sched, gf);
+
+    kctx->get_ggml_node_data(output, outputs->data, new_size);
+
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(kctx->sched);
+    outputs->n_outputs = total_length*model->up_sampling_factor;
+    delete batch.resp; // allocated with new at the top of this function, so delete rather than free
+}
+
+void kokoro_runner::assign_weight(std::string name, ggml_tensor * tensor) {
+    model->assign_weight(name, tensor);
+}
+
+/*
+ * #tokenize_chunks is used to split a larger-than-max-context-size (512) token prompt into discrete
+ * blocks for generation. This solution, in accordance with Kokoro's PyTorch implementation, splits
+ * the prompt by sentence when possible (this can result in slower inference but generally produces cleaner
+ * speech). If a distinct sentence is too long, then it splits at the nearest space.
+ */
+std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<std::string> clauses) {
+    std::vector<std::vector<uint32_t>> chunks;
+    for (auto clause : clauses) {
+        clause = strip(clause);
+        if (clause.empty()) {
+            continue;
+        }
+        std::vector<uint32_t> tokens;
+        tokens.push_back(model->bos_token_id);
+        tokenizer->tokenize(clause, tokens);
+        // if there are more clause tokens than the max context length then try to split by space tokens.
+        // To be protective, split mid-word when there are no spaces (this should never happen).
+        if (tokens.size() > model->max_context_length - 2) {
+            // we skip the first token here because it is the bos token.
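+            // Illustrative walk-through (hypothetical token ids): with max_context_length = 8,
+            // tokens = [BOS, t1, t2, SP, t3, t4, SP, t5] is emitted as [BOS, t1, t2, SP, t3, t4, EOS]
+            // when the running chunk fills up, and the remainder becomes its own [BOS, SP, t5, EOS]
+            // chunk below, so every chunk fits the context window with its own bos/eos pair.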
+            int last_space_token = 1;
+            int last_split = 1;
+            for (int i = 1; i < tokens.size(); i++) {
+                if (tokens[i] == model->space_token_id) {
+                    last_space_token = i;
+                }
+                // leave room for the chunk's bos and eos tokens
+                if ((i - last_split) + 3 >= model->max_context_length) {
+                    if (last_space_token > last_split) {
+                        std::vector<uint32_t> portion = { model->bos_token_id };
+                        portion.insert(portion.end(), tokens.begin() + last_split, tokens.begin() + last_space_token);
+                        portion.push_back(model->eos_token_id);
+                        chunks.push_back(portion);
+                        last_split = last_space_token;
+                    } else {
+                        std::vector<uint32_t> portion = { model->bos_token_id };
+                        portion.insert(portion.end(), tokens.begin() + last_split, tokens.begin() + i + 1);
+                        portion.push_back(model->eos_token_id);
+                        chunks.push_back(portion);
+                        last_split = i + 1;
+                    }
+                }
+            }
+            if (last_split + 1 < tokens.size()) {
+                std::vector<uint32_t> portion = { model->bos_token_id };
+                portion.insert(portion.end(), tokens.begin() + last_split, tokens.end());
+                portion.push_back(model->eos_token_id);
+                chunks.push_back(portion);
+            }
+        } else {
+            tokens.push_back(model->eos_token_id);
+            chunks.push_back(tokens);
+        }
+    }
+    return chunks;
+}
+
+int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) {
+    if (model->voices.find(voice) == model->voices.end()) {
+        TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", voice.c_str());
+    } else {
+        // if the language changed then we should change the phonemization voice
+        if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) {
+            if (voice_code.empty()) {
+                voice_code = get_espeak_id_from_kokoro_voice(voice);
+            }
+            update_voice(voice_code);
+        }
+        kctx->voice = voice;
+        drunner->kctx->voice = voice;
+    }
+    // replace all non-sentence-terminating punctuation with '--', which espeak will treat as a pause.
+    // We preserve the other punctuation for cleaner chunking pre-tokenization.
+    prompt = replace_any(prompt, ",;:", "--");
+    prompt = replace_any(prompt, "\n", " ");
+    std::string phonemized_prompt = phmzr->text_to_phonemes(prompt);
+
+    // Kokoro uses a UTF-8 single-character tokenizer, so if the size of the prompt is smaller than the max context length without the
+    // beginning of sentence and end of sentence tokens then we can compute it all at once.
+    if (phonemized_prompt.size() < model->max_context_length - 2) {
+        // we preserved punctuation and Kokoro interprets these tokens as end of sentence tokens, so we have to remove them for all-at-once compute.
+        phonemized_prompt = strip(replace_any(phonemized_prompt, ".!?", ""));
+        if (phonemized_prompt.empty()) {
+            return 0;
+        }
+        std::vector<uint32_t> tokens;
+        tokens.push_back(model->bos_token_id);
+        tokenizer->tokenize(phonemized_prompt, tokens);
+        tokens.push_back(model->eos_token_id);
+        kokoro_ubatch batch;
+        batch.n_tokens = tokens.size();
+        batch.input_tokens = tokens.data();
+        run(batch, response);
+    } else {
+        // TODO: determine the performance to memory trade-off of a batched compute approach versus this chunking approach.
+        // This approach is likely to be slower than a batched approach, but given the already huge memory overhead of Kokoro's graph it
+        // might be preferable to use this chunking approach.
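+        // e.g. a prompt phonemized to "h@loU. haU a@ ju?" (illustrative espeak output) yields
+        // two clauses here; each one is tokenized, synthesized with run(), and stitched back
+        // onto `response` via append_to_response below.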
+        std::vector<std::string> clauses = split(phonemized_prompt, ".!?");
+        for (auto tokens : tokenize_chunks(clauses)) {
+            kokoro_ubatch batch;
+            batch.n_tokens = tokens.size();
+            batch.input_tokens = tokens.data();
+            struct tts_response * partial = new tts_response;
+            run(batch, partial);
+            append_to_response(response, partial);
+            delete partial; // the appended samples are copied, so only the struct is freed here
+        }
+    }
+    return 0;
+}
+
+std::vector<std::string> kokoro_runner::list_voices() {
+    std::vector<std::string> voices;
+    voices.reserve(model->voices.size());
+    for (const auto & voice : model->voices) {
+        voices.push_back(voice.first);
+    }
+    return voices;
+}
+
+
+std::string get_espeak_id_from_kokoro_voice(std::string voice) {
+    return !voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(voice[0]) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[voice[0]] : "gmw/en-US";
+}
+
+struct kokoro_duration_context * build_new_duration_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu) {
+    kokoro_duration_context * kctx = new kokoro_duration_context(model, n_threads);
+    if (!use_cpu) {
+#ifdef GGML_USE_METAL
+        kctx->backend = ggml_backend_metal_init();
+#endif
+    }
+    kctx->backend_cpu = ggml_backend_cpu_init();
+    kctx->set_threads();
+    kctx->build_schedule();
+    kctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_duration_nodes()*5 + ggml_graph_overhead_custom(model->max_duration_nodes()*5, false));
+    return kctx;
+}
+
+
+struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu) {
+    kokoro_context * kctx = new kokoro_context(model, n_threads);
+    if (!use_cpu) {
+#ifdef GGML_USE_METAL
+        kctx->backend = ggml_backend_metal_init();
+#endif
+    }
+    kctx->backend_cpu = ggml_backend_cpu_init();
+    kctx->set_threads();
+    kctx->build_schedule();
+    kctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_gen_nodes()*30 + ggml_graph_overhead_custom(model->max_gen_nodes()*30, false));
+    return kctx;
+}
diff --git a/otherarch/ttscpp/src/kokoro_model.h b/otherarch/ttscpp/src/kokoro_model.h
new file mode 100644
index 000000000..b4f4f9671
--- /dev/null
+++ b/otherarch/ttscpp/src/kokoro_model.h
@@ -0,0 +1,462 @@
+#ifndef kokoro_model_h
+#define kokoro_model_h
+
+#include <map>
+#include "tts_model.h"
+#include "tokenizer.h"
+#include "phonemizer.h"
+
+// Rather than using ISO 639-2 language codes, Kokoro voice packs specify their corresponding language via their first letter.
+// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the
+// appropriate phonemization protocol can be inferred from the Kokoro voice.
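+// e.g. the default voice "af_alloy" starts with 'a' and therefore phonemizes with
+// "gmw/en-US"; get_espeak_id_from_kokoro_voice in kokoro_model.cpp also falls back to
+// "gmw/en-US" for any prefix missing from this map.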
+static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
+    {'a', "gmw/en-US"},
+    {'b', "gmw/en"},
+    {'e', "roa/es"},
+    {'f', "roa/fr"},
+    {'h', "inc/hi"},
+    {'i', "roa/it"},
+    {'j', "jpx/ja"},
+    {'p', "roa/pt-BR"},
+    {'z', "sit/cmn"}
+};
+
+struct lstm_cell {
+    std::vector<struct ggml_tensor *> weights;
+    std::vector<struct ggml_tensor *> biases;
+    std::vector<struct ggml_tensor *> reverse_weights;
+    std::vector<struct ggml_tensor *> reverse_biases;
+};
+
+struct lstm {
+    std::vector<struct ggml_tensor *> hidden;
+    std::vector<struct ggml_tensor *> states;
+
+    bool bidirectional = false;
+    std::vector<lstm_cell> cells;
+};
+
+struct duration_predictor_layer {
+    lstm * rnn;
+    struct ggml_tensor * ada_norm_gamma_weight;
+    struct ggml_tensor * ada_norm_gamma_bias;
+    struct ggml_tensor * ada_norm_beta_weight;
+    struct ggml_tensor * ada_norm_beta_bias;
+};
+
+struct ada_residual_conv_block {
+    struct ggml_tensor * conv1;
+    struct ggml_tensor * conv1_bias;
+    struct ggml_tensor * conv2;
+    struct ggml_tensor * conv2_bias;
+    struct ggml_tensor * norm1_gamma;
+    struct ggml_tensor * norm1_gamma_bias;
+    struct ggml_tensor * norm1_beta;
+    struct ggml_tensor * norm1_beta_bias;
+    struct ggml_tensor * norm2_gamma;
+    struct ggml_tensor * norm2_gamma_bias;
+    struct ggml_tensor * norm2_beta;
+    struct ggml_tensor * norm2_beta_bias;
+    struct ggml_tensor * pool = nullptr;
+    struct ggml_tensor * pool_bias = nullptr;
+    struct ggml_tensor * upsample = nullptr;
+    struct ggml_tensor * upsample_bias = nullptr;
+};
+
+struct duration_predictor {
+    struct ggml_tensor * albert_encode;
+    struct ggml_tensor * albert_encode_bias;
+    std::vector<duration_predictor_layer *> layers;
+    lstm * duration_proj_lstm;
+    struct ggml_tensor * duration_proj;
+    struct ggml_tensor * duration_proj_bias;
+    struct ggml_tensor * n_proj_kernel;
+    struct ggml_tensor * n_proj_bias;
+    struct ggml_tensor * f0_proj_kernel;
+    struct ggml_tensor * f0_proj_bias;
+    lstm * shared_lstm;
+    std::vector<ada_residual_conv_block *> f0_blocks;
+    std::vector<ada_residual_conv_block *> n_blocks;
+};
+
+struct kokoro_text_encoder_conv_layer {
+    struct ggml_tensor * norm_gamma;
+    struct ggml_tensor * norm_beta;
+    struct ggml_tensor * conv_weight;
+    struct ggml_tensor * conv_bias;
+};
+
+struct kokoro_text_encoder {
+    struct ggml_tensor * embd;
+    std::vector<kokoro_text_encoder_conv_layer *> conv_layers;
+    lstm * out_lstm;
+};
+
+struct kokoro_generator_residual_block {
+    std::vector<uint32_t> conv1_dilations;
+    std::vector<uint32_t> conv1_paddings;
+
+    std::vector<struct ggml_tensor *> adain1d_1_gamma_weights;
+    std::vector<struct ggml_tensor *> adain1d_2_gamma_weights;
+    std::vector<struct ggml_tensor *> adain1d_1_gamma_biases;
+    std::vector<struct ggml_tensor *> adain1d_2_gamma_biases;
+    std::vector<struct ggml_tensor *> adain1d_1_beta_weights;
+    std::vector<struct ggml_tensor *> adain1d_2_beta_weights;
+    std::vector<struct ggml_tensor *> adain1d_1_beta_biases;
+    std::vector<struct ggml_tensor *> adain1d_2_beta_biases;
+    std::vector<struct ggml_tensor *> input_alphas;
+    std::vector<struct ggml_tensor *> output_alphas;
+    std::vector<struct ggml_tensor *> convs1_weights;
+    std::vector<struct ggml_tensor *> convs1_biases;
+    std::vector<struct ggml_tensor *> convs2_weights;
+    std::vector<struct ggml_tensor *> convs2_biases;
+};
+
+struct kokoro_noise_residual_block {
+    uint32_t input_conv_stride;
+    uint32_t input_conv_padding;
+
+    struct ggml_tensor * input_conv;
+    struct ggml_tensor * input_conv_bias;
+    struct kokoro_generator_residual_block * res_block;
+};
+
+struct kokoro_generator_upsample_block {
+    uint32_t padding;
+    uint32_t stride;
+
+    // these are just conv transpose layers
+    struct ggml_tensor * upsample_weight;
+    struct ggml_tensor * upsample_bias;
+};
+
+struct kokoro_generator {
+    // unfortunately the squared sum of the windows needs to be computed dynamically per run because it is dependent
+    // on the sequence size of the generation and the hop is typically less than half the size of our window.
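+    // A sketch of that normalization (an illustrative loop, not the actual kernel):
+    //   for (int frame = 0; frame * hop + n_fft <= n_out; frame++)
+    //       for (int n = 0; n < n_fft; n++)
+    //           window_sq_sum[frame * hop + n] += window[n] * window[n];
+    // compute_window_squared_sum fills kctx->window_sq_sum with this before each run.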
+    struct ggml_tensor * window;
+
+    struct ggml_tensor * m_source_weight;
+    struct ggml_tensor * m_source_bias;
+    struct ggml_tensor * out_conv_weight;
+    struct ggml_tensor * out_conv_bias;
+    std::vector<kokoro_noise_residual_block *> noise_blocks;
+    std::vector<kokoro_generator_residual_block *> res_blocks;
+    std::vector<kokoro_generator_upsample_block *> ups;
+};
+
+struct kokoro_decoder {
+    struct ggml_tensor * f0_conv;
+    struct ggml_tensor * f0_conv_bias;
+    struct ggml_tensor * n_conv;
+    struct ggml_tensor * n_conv_bias;
+    struct ggml_tensor * asr_conv;
+    struct ggml_tensor * asr_conv_bias;
+    std::vector<ada_residual_conv_block *> decoder_blocks;
+    ada_residual_conv_block * encoder_block;
+    kokoro_generator * generator;
+};
+
+struct albert_layer {
+    struct ggml_tensor * ffn;
+    struct ggml_tensor * ffn_out;
+    struct ggml_tensor * ffn_bias;
+    struct ggml_tensor * ffn_out_bias;
+    struct ggml_tensor * layer_output_norm_weight;
+    struct ggml_tensor * layer_output_norm_bias;
+    struct ggml_tensor * q;
+    struct ggml_tensor * k;
+    struct ggml_tensor * v;
+    struct ggml_tensor * o;
+    struct ggml_tensor * q_bias;
+    struct ggml_tensor * k_bias;
+    struct ggml_tensor * v_bias;
+    struct ggml_tensor * o_bias;
+    struct ggml_tensor * attn_norm_weight;
+    struct ggml_tensor * attn_norm_bias;
+};
+
+struct kokoro_model : tts_model {
+    // standard configuration for Kokoro's Albert model
+    // tokenization
+    uint32_t bos_token_id = 0;
+    uint32_t eos_token_id = 0;
+    uint32_t space_token_id = 16;
+    // duration prediction
+    uint32_t max_context_length = 512;
+    uint32_t vocab_size = 178;
+    uint32_t hidden_size = 768;
+    uint32_t n_attn_heads = 12;
+    uint32_t n_layers = 1;
+    uint32_t n_recurrence = 12;
+    uint32_t head_size = 64;
+    uint32_t duration_hidden_size = 512;
+    uint32_t up_sampling_factor;
+    float upsample_scale = 300.0f;
+    float scale = 0.125f;
+
+    // standard configuration for duration prediction
+    uint32_t f0_n_blocks = 3;
+    uint32_t n_duration_prediction_layers = 3;
+    // while it is technically possible for the duration predictor to assign 50 values per token, there is no practical need to
+    // allocate that many items to the sequence, as it is impossible for all tokens to require such long durations, and each
+    // allocation increases node allocation size by O(N)
+    uint32_t max_duration_per_token = 20;
+    uint32_t style_half_size = 128;
+
+    // standard text encoding configuration
+    uint32_t n_conv_layers = 3;
+
+    // standard decoder configuration
+    uint32_t n_kernels = 3;
+    uint32_t n_upsamples = 2;
+    uint32_t n_decoder_blocks = 4;
+    uint32_t n_res_blocks = 6;
+    uint32_t n_noise_blocks = 2;
+    uint32_t out_conv_padding = 3;
+    uint32_t post_n_fft = 11;
+    uint32_t true_n_fft = 20;
+    uint32_t stft_hop = 5;
+    uint32_t harmonic_num = 8;
+    float sin_amp = 0.1f;
+    float noise_std = 0.003f;
+    float voice_threshold = 10.0f;
+    float sample_rate = 24000.0f;
+    std::string window = "hann";
+
+    // It is really annoying that ggml doesn't allow using non-ggml tensors as operands for simple math ops.
+    // This is just the constant defined above as a tensor.
+    struct ggml_tensor * n_kernels_tensor;
+
+    // Kokoro loads albert with use_pooling = true but doesn't use the pooling outputs.
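+    // (The pooled output would only matter for a classification-style head; Kokoro only
+    // consumes the per-token hidden states, so any pooling tensors are loaded but unused.)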
+    bool uses_pooling = false;
+    bool static_token_types = true;
+
+    std::map<std::string, struct ggml_tensor *> voices;
+
+    // Albert portion of the model
+    struct ggml_tensor * embd_hidden;
+    struct ggml_tensor * embd_hidden_bias;
+    struct ggml_tensor * token_type_embd = nullptr;
+    struct ggml_tensor * token_embd;
+    struct ggml_tensor * position_embd;
+    struct ggml_tensor * input_norm_weight;
+    struct ggml_tensor * input_norm_bias;
+    struct ggml_tensor * static_token_type_values = nullptr;
+    struct ggml_tensor * pool = nullptr;
+    struct ggml_tensor * pool_bias = nullptr;
+    std::vector<albert_layer *> layers;
+
+    struct ggml_tensor * harmonic_sampling_norm = nullptr; // a static 1x9 harmonic multiplier
+    struct ggml_tensor * sampling_factor_scalar = nullptr; // a static scalar
+    struct ggml_tensor * sqrt_tensor = nullptr; // static tensor for constant division
+
+    // Prosody Predictor portion of the model
+    struct duration_predictor * prosody_pred;
+
+    // Text encoding portion of the model
+    struct kokoro_text_encoder * text_encoder;
+
+    // Decoding and Generation portion of the model
+    struct kokoro_decoder * decoder;
+
+    // the default hidden states need to be initialized
+    std::vector<lstm *> lstms;
+
+    size_t duration_node_counter = 0;
+    size_t generation_node_counter = 0;
+    // setting this is likely unnecessary as it is precomputed by the post load function.
+    uint32_t post_load_tensor_bytes = 13000;
+
+    size_t max_gen_nodes();
+    size_t max_duration_nodes();
+
+    lstm * prep_lstm();
+    // helper functions for assigning tensors to substructs
+    void assign_lstm(lstm * rnn, std::string name, ggml_tensor * tensor);
+    void assign_generator_weight(kokoro_generator * generator, std::string name, ggml_tensor * tensor);
+    void assign_gen_resblock(kokoro_generator_residual_block * block, std::string name, ggml_tensor * tensor);
+    void assign_ada_res_block(ada_residual_conv_block * block, std::string name, ggml_tensor * tensor);
+    void assign_decoder_weight(std::string name, ggml_tensor * tensor);
+    void assign_duration_weight(std::string name, ggml_tensor * tensor);
+    void assign_text_encoder_weight(std::string name, ggml_tensor * tensor);
+    void assign_albert_weight(std::string name, ggml_tensor * tensor);
+
+    void post_load_assign();
+    void assign_weight(std::string name, ggml_tensor * tensor);
+    void prep_layers(gguf_context * meta);
+    void prep_constants(gguf_context * meta);
+    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only = true) {
+        std::function<void(ggml_tensor *)> fn = ([&](ggml_tensor * cur) {
+            std::string name = ggml_get_name(cur);
+            size_t increment = 1;
+            if (name.find("lstm") != std::string::npos) {
+                increment = max_context_length;
+            }
+            if (name.find("duration_predictor") != std::string::npos) {
+                duration_node_counter += increment;
+            } else {
+                generation_node_counter += increment;
+            }
+        });
+        compute_tensor_meta_cb = &fn;
+        prep_constants(meta_ctx);
+        prep_layers(meta_ctx);
+        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "kokoro", 1.6, post_load_tensor_bytes);
+    }
+};
+
+struct kokoro_ubatch {
+    size_t n_tokens; // the number of tokens in our encoded sequence
+    uint32_t * input_tokens; // [n_tokens]
+    struct kokoro_duration_response * resp = nullptr;
+};
+
+struct kokoro_duration_context : runner_context {
+    kokoro_duration_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
+    ~kokoro_duration_context() {
+        ggml_backend_buffer_free(buf_len_output);
+    }
+
+    std::string voice = "af_alloy";
+    struct kokoro_model * model;
+    ggml_backend_buffer_t buf_len_output =
nullptr;
+
+    size_t logits_size = 0; // capacity (of floats) for logits
+    float * lens = nullptr;
+
+    struct ggml_tensor * inp_tokens;
+    struct ggml_tensor * positions;
+    struct ggml_tensor * attn_mask;
+    struct ggml_tensor * token_types = nullptr;
+
+    void build_schedule() {
+        runner_context::build_schedule(model->max_duration_nodes()*5);
+    }
+};
+
+static struct ggml_tensor * build_albert_attn_mask(ggml_context * ctx, struct kokoro_duration_context * kctx, const kokoro_ubatch & batch);
+static struct ggml_tensor * build_albert_inputs(ggml_context * ctx, kokoro_model * model, ggml_tensor * input_tokens, ggml_tensor * positions, ggml_tensor * token_types);
+static struct ggml_tensor * build_albert_norm(ggml_context * ctx, ggml_tensor * cur, ggml_tensor * weight, ggml_tensor * bias);
+static struct ggml_tensor * build_ada_residual_conv(ggml_context * ctx, struct ggml_tensor * x, ada_residual_conv_block * block, struct ggml_tensor * style, struct ggml_tensor * sqrt_tensor);
+static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * style, kokoro_generator_residual_block * block);
+static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style);
+static kokoro_generator_residual_block * build_res_block_from_file(gguf_context * meta, std::string base_config_key);
+static kokoro_noise_residual_block * build_noise_block_from_file(gguf_context * meta, int index);
+static kokoro_generator_upsample_block * kokoro_generator_upsample_block(gguf_context * meta, int index);
+
+std::string get_espeak_id_from_kokoro_voice(std::string voice);
+struct kokoro_duration_context * build_new_duration_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true);
+
+struct kokoro_duration_response {
+    size_t n_outputs;
+    float * lengths;
+    float * hidden_states;
+};
+
+// This struct is intended to manage graph and compute for the duration prediction portion of the Kokoro model.
+// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't
+// support the tensor-dependent views that would otherwise be necessary.
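+// Concretely, kokoro_duration_runner::run writes per-token durations (resp->lengths) and
+// hidden states (resp->hidden_states) into CPU buffers, and kokoro_runner then feeds both
+// back in as plain graph inputs (kctx->duration_pred and the expanded kctx->duration_mask),
+// so the generation graph never needs views whose shapes depend on runtime tensor values.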
+struct kokoro_duration_runner : tts_runner {
+    kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {};
+    ~kokoro_duration_runner() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+        model->free();
+        delete model;
+        delete kctx;
+    }
+    struct single_pass_tokenizer * tokenizer;
+    kokoro_model * model;
+    kokoro_duration_context * kctx;
+
+    void init_build() {
+        tts_runner::init_build(&kctx->buf_compute_meta);
+    }
+
+    void prepare_post_load();
+    struct kokoro_ubatch build_worst_case_batch();
+    void set_inputs(kokoro_ubatch & batch);
+    struct ggml_cgraph * build_kokoro_duration_graph(kokoro_ubatch & batch);
+    void run(kokoro_ubatch & ubatch);
+};
+
+struct kokoro_context : runner_context {
+    kokoro_context(kokoro_model * model, int n_threads): runner_context(n_threads), model(model) {};
+    ~kokoro_context() {
+        ggml_backend_sched_free(sched);
+        ggml_backend_free(backend_cpu);
+        if (backend) {
+            ggml_backend_free(backend);
+        }
+        if (buf_output) {
+            ggml_backend_buffer_free(buf_output);
+        }
+    }
+
+    std::string voice = "af_alloy";
+
+    struct kokoro_model * model;
+
+    uint32_t total_duration;
+    uint32_t sequence_length;
+
+    struct ggml_tensor * inp_tokens;
+    struct ggml_tensor * duration_pred;
+    struct ggml_tensor * duration_mask;
+    struct ggml_tensor * window_sq_sum; // needs to be calculated from the generator window.
+    struct ggml_tensor * uv_noise_data;
+
+    void build_schedule() {
+        runner_context::build_schedule(model->max_gen_nodes()*30);
+    }
+};
+
+// TODO: now that we are passing the context down to these methods we should clean up their parameters
+static struct ggml_tensor * build_generator(ggml_context * ctx, kokoro_model * model, kokoro_context * kctx, struct ggml_tensor * x, struct ggml_tensor * style, struct ggml_tensor * f0_curve, kokoro_generator * generator, int sequence_length, struct ggml_tensor * window_sq_sum, ggml_cgraph * gf);
+static struct ggml_tensor * build_sin_gen(ggml_context * ctx, kokoro_model * model, kokoro_context * kctx, struct ggml_tensor * x, int harmonic_num, int sequence_length, float voice_threshold, float sin_amp, float noise_std);
+
+struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true);
+
+// This manages graph compilation and compute for the Kokoro model.
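+// Rough pipeline, as implemented by generate() in kokoro_model.cpp: phonemize the prompt,
+// tokenize it (chunking by sentence when it exceeds the max context of 512 tokens), predict
+// per-token durations with the kokoro_duration_runner, then run the generation graph to
+// produce PCM float samples in the tts_response.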
+struct kokoro_runner : tts_runner {
+    kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
+        tts_runner::sampling_rate = 24000.0f;
+        tts_runner::supports_voices = true;
+    };
+    ~kokoro_runner() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+        delete drunner;
+        model->free();
+        delete model;
+        delete kctx;
+        delete phmzr;
+    }
+    struct single_pass_tokenizer * tokenizer;
+    kokoro_model * model;
+    kokoro_context * kctx;
+    kokoro_duration_runner * drunner;
+    phonemizer * phmzr;
+
+    std::string default_voice = "af_alloy";
+
+    void init_build() {
+        tts_runner::init_build(&kctx->buf_compute_meta);
+    }
+
+    std::vector<std::string> list_voices();
+    std::vector<std::vector<uint32_t>> tokenize_chunks(std::vector<std::string> clauses);
+    void assign_weight(std::string name, ggml_tensor * tensor);
+    void prepare_post_load();
+    kokoro_ubatch build_worst_case_batch();
+    void set_inputs(kokoro_ubatch & batch, uint32_t total_size);
+    struct ggml_cgraph * build_kokoro_graph(kokoro_ubatch & batch);
+    void run(kokoro_ubatch & batch, struct tts_response * outputs);
+    int generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code = "");
+};
+
+#endif
diff --git a/otherarch/ttscpp/src/orpheus_model.cpp b/otherarch/ttscpp/src/orpheus_model.cpp
new file mode 100644
index 000000000..4866af208
--- /dev/null
+++ b/otherarch/ttscpp/src/orpheus_model.cpp
@@ -0,0 +1,475 @@
+#include "orpheus_model.h"
+
+#include <algorithm>
+
+// These tokens and variables aren't defined in Orpheus' model configuration but instead are defined inline in various Python functions.
+// As such, they are not discoverable, so defining them as unconfigurable constants should be fine.
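+// A prompt is therefore framed as (hypothetical text token ids shown as t0..tn):
+//   [128259, 128000, t0 ... tn, 128009, 128260, 128261, 128257]
+// with the voice name spliced into the text itself ("zoe: <prompt>") by batch_from_sentence.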
+static constexpr std::array<const char *, 7> orpheus_voices{"zoe", "zac", "jess", "leo", "mia", "julia", "leah"};
+static constexpr std::array<uint32_t, 2> orpheus_prepended_tokens = { 128259, 128000 };
+static constexpr std::array<uint32_t, 4> orpheus_appended_tokens = { 128009, 128260, 128261, 128257 };
+
+void orpheus_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
+    if (name == "norm") {
+        output_norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(output_norm, tensor);
+    } else if (name == "lm_head") {
+        head = ggml_dup_tensor(ctx, tensor);
+        set_tensor(head, tensor);
+    } else if (name == "embed_tokens") {
+        embd = ggml_dup_tensor(ctx, tensor);
+        set_tensor(embd, tensor);
+    } else if (name == "rope_frequencies") {
+        rope_frequencies = ggml_dup_tensor(ctx, tensor);
+        set_tensor(rope_frequencies, tensor);
+    } else if (has_prefix(name, "layers")) {
+        auto lpair = parse_layer_count(name);
+        int l = lpair.first;
+        std::string lt_name = lpair.second;
+        assign_to_layer(lt_name, layers[l], tensor);
+    }
+}
+
+void orpheus_model::assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor) {
+    if (part == ".self_attn.k_proj") {
+        layer.k = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.k, tensor);
+    } else if (part == ".self_attn.q_proj") {
+        layer.q = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.q, tensor);
+    } else if (part == ".self_attn.v_proj") {
+        layer.v = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.v, tensor);
+    } else if (part == ".self_attn.o_proj") {
+        layer.o = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.o, tensor);
+    } else if (part == ".mlp.gate_proj") {
+        layer.gate = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.gate, tensor);
+    } else if (part == ".mlp.up_proj") {
+        layer.up = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.up, tensor);
+    } else if (part == ".mlp.down_proj") {
+        layer.down = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.down, tensor);
+    } else if (part == ".input_layernorm") {
+        layer.input_norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.input_norm, tensor);
+    } else if (part == ".post_attention_layernorm") {
+        layer.post_attention_norm = ggml_dup_tensor(ctx, tensor);
+        set_tensor(layer.post_attention_norm, tensor);
+    }
+}
+
+void orpheus_model::prep_constants(gguf_context * meta) {
+    // get constants for orpheus
+    int vocab_size_key = gguf_find_key(meta, "orpheus.vocab_size");
+    if (vocab_size_key != -1) {
+        vocab_size = gguf_get_val_u32(meta, vocab_size_key);
+    }
+
+    int attn_heads_key = gguf_find_key(meta, "orpheus.attn_heads");
+    if (attn_heads_key != -1) {
+        n_attn_heads = gguf_get_val_u32(meta, attn_heads_key);
+    }
+
+    int kv_attn_heads_key = gguf_find_key(meta, "orpheus.kv_attn_heads");
+    if (kv_attn_heads_key != -1) {
+        n_kv_attn_heads = gguf_get_val_u32(meta, kv_attn_heads_key);
+    }
+
+    int head_size_key = gguf_find_key(meta, "orpheus.head_dim");
+    if (head_size_key != -1) {
+        head_size = gguf_get_val_u32(meta, head_size_key);
+    }
+
+    int stopping_token_key = gguf_find_key(meta, "orpheus.stopping_token_id");
+    if (stopping_token_key != -1) {
+        stopping_token_id = gguf_get_val_u32(meta, stopping_token_key);
+    }
+
+    int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
+    if (eos_token_id_key != -1) {
+        eos_token_id = gguf_get_val_u32(meta, eos_token_id_key);
+    }
+
+    int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id");
+    if (bos_token_id_key != -1) {
+        bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
+    }
+
+    int hidden_size_key = gguf_find_key(meta, "orpheus.hidden_size");
+    if (hidden_size_key != -1) {
+        hidden_size = gguf_get_val_u32(meta, hidden_size_key);
+    }
+
+    int kv_hidden_size_key = gguf_find_key(meta, "orpheus.kv_hidden_size");
+    if (kv_hidden_size_key != -1) {
+        kv_hidden_size = gguf_get_val_u32(meta, kv_hidden_size_key);
+    }
+}
+
+void orpheus_model::prep_layers(gguf_context * meta) {
+    int n_layers_key = gguf_find_key(meta, "orpheus.layers");
+    if (n_layers_key == -1) {
+        TTS_ABORT("'orpheus.layers' must be specified in the GGUF file.");
+    }
+    n_layers = (int) gguf_get_val_u32(meta, n_layers_key);
+    for (int i = 0; i < n_layers; i++) {
+        layers.push_back(orpheus_layer{});
+    }
+}
+
+struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight) {
+    float eps = 0.00001;
+    return ggml_mul(ctx, ggml_rms_norm(ctx, x, eps), weight);
+}
+
+struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch) {
+    octx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) octx->current_position + batch.n_tokens, (int64_t) octx->current_position + batch.n_tokens);
+    ggml_set_input(octx->attn_mask);
+    return octx->attn_mask;
+}
+
+void orpheus_context::reset() {
+    output_tokens.clear();
+    current_position = 0;
+    n_outputs = 0;
+}
+
+orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads, bool use_cpu) {
+    orpheus_context * octx = new orpheus_context(model, n_threads);
+    if (!use_cpu) {
+#ifdef GGML_USE_METAL
+        octx->backend = ggml_backend_metal_init();
+#endif
+    }
+    octx->backend_cpu = ggml_backend_cpu_init();
+    octx->set_threads();
+    octx->build_schedule();
+    octx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false));
+    return octx;
+}
+
+void orpheus_runner::orpheus_kv_cache_init() {
+    ggml_backend_buffer_type_t buft = nullptr;
+    if (octx->backend != nullptr) {
+#ifdef GGML_USE_METAL
+        buft = ggml_backend_metal_buffer_type();
+#endif
+    } else {
+        buft = ggml_backend_cpu_buffer_type();
+    }
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ (2u * model->layers.size() + 1)*ggml_tensor_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context * ctx = ggml_init(params);
+    if (!ctx) {
+        TTS_ABORT("%s: failed to initialize ggml context for key value cache.\n", __func__);
+    }
+    if (!kv_self) {
+        kv_self = new orpheus_kv_cache;
+    }
+    kv_self->ctx = ctx;
+    kv_self->k_l.reserve(model->layers.size());
+    kv_self->v_l.reserve(model->layers.size());
+
+    for (int i = 0; i < (int) model->layers.size(); i++) {
+        ggml_tensor * k = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size));
+        ggml_tensor * v = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size));
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        kv_self->k_l.push_back(k);
+        kv_self->v_l.push_back(v);
+    }
+
+    // allocate tensors and initialize the buffers to avoid NaNs in the padding
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(kv_self->ctx, buft);
+    ggml_backend_buffer_clear(buf, 0);
+    kv_self->buf = buf;
+}
+
+void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) {
+    k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx,
k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies,
+        model->head_size, 2, 0, 500000.0f,
+        1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+
+    // A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave,
+    // and performing the repeat operation upfront before performing a single copy is needed in order to better optimize this function.
+    // Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us
+    // from incrementally larger transpositions with generation.
+    for (int i = 0; i < repeat; i++) {
+        struct ggml_tensor * k_cache_view = ggml_view_3d(
+            ctx,
+            kv_self->k_l[index],
+            model->head_size,
+            model->n_kv_attn_heads,
+            n_tokens,
+            ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
+            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
+            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
+        );
+        ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+        struct ggml_tensor * v_cache_view = ggml_view_3d(
+            ctx,
+            kv_self->v_l[index],
+            model->head_size,
+            model->n_kv_attn_heads,
+            n_tokens,
+            ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat,
+            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size,
+            ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size
+        );
+        ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
+    }
+}
+
+struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) {
+    init_build();
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens;
+    octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+    ggml_set_input(octx->positions);
+    octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+    ggml_set_input(octx->inp_tokens);
+    inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens);
+
+    struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch);
+
+    for (int l = 0; l < model->n_layers; l++) {
+        struct ggml_tensor * residual = inpL;
+        cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm);
+
+        struct ggml_tensor * attn_out;
+
+        // self-attention
+        {
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l].q, cur);
+            struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l].k, cur);
+            struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l].v, cur);
+
+            orpheus_build_kv_store(ctx, gf, Kcur, Vcur, l, batch.n_tokens, 3);
+            struct ggml_tensor * k =
+                ggml_cont(ctx, ggml_view_3d(ctx, kv_self->k_l[l],
+                    model->head_size, full_sequence_length, model->n_attn_heads,
+                    ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size,
+                    ggml_element_size(kv_self->k_l[l]) * model->head_size,
+                    0));
+
+            struct ggml_tensor * v =
+                ggml_view_2d(ctx, kv_self->v_l[l],
+                    model->hidden_size, full_sequence_length,
+                    ggml_element_size(kv_self->k_l[l]) * model->hidden_size,
+                    0);
+
+            v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size,
model->n_attn_heads);
+
+            Qcur = ggml_rope_ext(
+                ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)),
+                octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta
+                1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+
+            struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3));
+            struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
+            kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f/sqrtf(model->head_size), 0.0f);
+            struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v);
+            struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3);
+            attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens);
+            attn_out = ggml_mul_mat(ctx, model->layers[l].o, attn_out);
+        }
+
+        cur = ggml_add(ctx, attn_out, residual);
+
+        struct ggml_tensor * residualffn = cur;
+
+        // mlp
+        {
+            cur = orpheus_build_layer_norm(ctx, cur, model->layers[l].post_attention_norm);
+            cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, model->layers[l].gate, cur)), ggml_mul_mat(ctx, model->layers[l].up, cur));
+            cur = ggml_mul_mat(ctx, model->layers[l].down, cur);
+        }
+        cur = ggml_add(ctx, cur, residualffn);
+        inpL = cur;
+    }
+
+    cur = orpheus_build_layer_norm(ctx, cur, model->output_norm);
+    // only about 40k entries of the output head are actually used for generation purposes. Ideally the head tensor should be shrunk and sampled token ids offset accordingly.
+    cur = ggml_mul_mat(ctx, model->head, cur);
+    if (batch.n_tokens > 1) {
+        cur = ggml_cont(ctx, ggml_view_1d(ctx, cur, model->vocab_size, ggml_element_size(cur) * (cur->ne[1] - 1) * model->vocab_size));
+    }
+    ggml_build_forward_expand(gf, cur);
+    free_build();
+
+    return gf;
+}
+
+void orpheus_runner::decode(orpheus_ubatch & batch) {
+    ggml_backend_sched_reset(octx->sched);
+
+    octx->output_tokens.reserve(model->max_generation_size);
+
+    const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float);
+    octx->prep_output_buffer(new_size);
+
+    ggml_cgraph * gf = build_orpheus_graph(batch);
+
+    // the output is always the last tensor in the graph
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    ggml_backend_sched_alloc_graph(octx->sched, gf);
+
+    set_inputs(batch);
+    ggml_backend_sched_graph_compute_async(octx->sched, gf);
+
+    float * logits_out = octx->logits + octx->n_outputs * model->vocab_size;
+    octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float));
+
+    // update the total number of outputs retrieved and the current position
+    octx->current_position += batch.n_tokens;
+
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(octx->sched);
+}
+
+void orpheus_runner::set_inputs(orpheus_ubatch & batch) {
+    ggml_backend_tensor_set(octx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(octx->inp_tokens));
+    int32_t * pos = (int32_t*) octx->positions->data;
+    float * mask = (float*) octx->attn_mask->data;
+    uint32_t max_pos = octx->current_position + batch.n_tokens;
+    for (int i = 0; i < batch.n_tokens; i++) {
+        pos[i] = (int32_t) octx->current_position + i;
+        for (int ii = 0; ii < max_pos; ii++) {
+            mask[i*max_pos + ii] = ii > pos[i] ?
-INFINITY : 0.0f;
+        }
+    }
+}
+
+orpheus_ubatch orpheus_runner::batch_from_sentence(std::string sentence) {
+    struct orpheus_ubatch batch;
+    for (auto t : orpheus_prepended_tokens) {
+        batch.tokens.push_back(t);
+    }
+    if (!octx->voice.empty()) {
+        sentence = octx->voice + ": " + sentence;
+    }
+    tokenizer->tokenize(sentence, batch.tokens);
+    for (auto t : orpheus_appended_tokens) {
+        batch.tokens.push_back(t);
+    }
+    batch.n_tokens = batch.tokens.size();
+    return batch;
+}
+
+std::vector<std::vector<uint32_t>> orpheus_runner::prepare_output_tokens() {
+    size_t chunks = octx->output_tokens.size() / 7;
+    std::vector<std::vector<uint32_t>> output_tokens;
+    for (int i = 0; i < model->audio_heads; i++) {
+        output_tokens.push_back(std::vector<uint32_t>{});
+    }
+    for (int i = 0; i < chunks; i++) {
+        for (int ii = 0; ii < 7; ii++) {
+            uint32_t thead = model->heads[ii];
+            // the manipulations below are not configurable because they are performed inline via undocumented constants in the Orpheus codebase.
+            // Essentially this is how Orpheus converts discrete samples from the output shape to the audio input shape.
+            uint32_t t = octx->output_tokens[i*7 + ii] - 128266 - ((ii % 7) * 4096);
+            output_tokens[thead].push_back(t);
+        }
+    }
+    return output_tokens;
+}
+
+void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_response * output) {
+    while ((octx->output_tokens.size() == 0 || octx->output_tokens.back() != model->stopping_token_id) && octx->output_tokens.size() < model->max_generation_size) {
+        decode(batch);
+        generation_sampler->sample(octx->logits + octx->n_outputs * model->vocab_size, octx->output_tokens);
+        // only increment the output count after sampling
+        octx->n_outputs++;
+        batch = orpheus_ubatch{
+            1, {octx->output_tokens.back()}
+        };
+    }
+    // this case could be better addressed by adding splitting to the generation process.
+    if (octx->output_tokens.size() >= model->max_generation_size) {
+        fprintf(stdout, "Warning: generation hit its max default length. The generated audio may not contain the entire prompt.\n");
+    }
+    std::vector<std::vector<uint32_t>> processed_output_tokens = prepare_output_tokens();
+    srunner->run(processed_output_tokens, output);
+}
+
+int orpheus_runner::generate(std::string sentence, struct tts_response * response) {
+    orpheus_ubatch batch = batch_from_sentence(sentence);
+    // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
+    // surpass the default size.
+    if (batch.tokens.size() > model->max_context_length) {
+        TTS_ABORT("The prompt was too large for the default context window.
Try splitting up or shortening the prompt.");
+    }
+    octx->reset();
+    generation_sampler->reset();
+    if (!kv_self) {
+        orpheus_kv_cache_init();
+    }
+    generate_from_batch(batch, response);
+    return 0;
+}
+
+void orpheus_runner::configure_generation(generation_configuration * config) {
+    generation_sampler->temperature = config->temperature;
+    generation_sampler->repetition_penalty = config->repetition_penalty;
+    generation_sampler->do_sample = config->sample;
+    generation_sampler->top_k = config->top_k;
+    generation_sampler->top_p = config->top_p;
+    if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {
+        TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());
+    }
+    octx->voice = config->voice;
+}
+
+orpheus_ubatch orpheus_runner::build_worst_case_batch() {
+    orpheus_ubatch batch;
+    batch.n_tokens = model->max_context_length;
+    return batch;
+}
+
+void orpheus_runner::assign_weight(std::string name, ggml_tensor * tensor) {
+    if (tensor->data == NULL) {
+        return;
+    }
+
+    if (name.size() == 0) {
+        // handles the top level meta tensor
+        return;
+    }
+
+    if (name.size() > 5 && name.substr(0, 5) == "snac.") {
+        srunner->model->assign_weight(name.substr(5), tensor);
+    } else if (name.size() > 8 && name.substr(0, 8) == "orpheus.") {
+        model->assign_weight(name.substr(8), tensor);
+    } else {
+        fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name.c_str());
+    }
+}
+
+void orpheus_runner::prepare_post_load() {
+    srunner->prepare_post_load();
+    orpheus_kv_cache_init();
+    auto batch = build_worst_case_batch();
+    auto gf = build_orpheus_graph(batch);
+    octx->prep_schedule(gf);
+}
+
+std::vector<std::string> orpheus_runner::list_voices() {
+    std::vector<std::string> voices;
+    voices.reserve(orpheus_voices.size());
+    for (auto voice : orpheus_voices) {
+        voices.push_back(voice);
+    }
+    return voices;
+}
diff --git a/otherarch/ttscpp/src/orpheus_model.h b/otherarch/ttscpp/src/orpheus_model.h
new file mode 100644
index 000000000..9f02d7697
--- /dev/null
+++ b/otherarch/ttscpp/src/orpheus_model.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include "sampler.h"
+#include "tokenizer.h"
+#include "snac_model.h"
+
+// Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads.
+
+struct orpheus_layer {
+    struct ggml_tensor * input_norm;
+    struct ggml_tensor * post_attention_norm;
+    struct ggml_tensor * q;
+    struct ggml_tensor * k;
+    struct ggml_tensor * v;
+    struct ggml_tensor * o;
+    struct ggml_tensor * gate;
+    struct ggml_tensor * up;
+    struct ggml_tensor * down;
+};
+
+struct orpheus_model : tts_model {
+    uint32_t vocab_size = 156940;
+    uint32_t n_attn_heads = 24;
+    uint32_t n_kv_attn_heads = 8;
+    uint32_t head_size = 128;
+    uint32_t max_context_length = 1024;
+    // the generation size is technically arbitrary as the model can handle a large context. This size comes out to roughly 25.6 seconds of audio.
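+    // 2100 codes / 7 codes per frame = 300 frames; the 25.6 s figure then implies roughly
+    // 2048 output samples per frame at 24 kHz (inferred from these numbers, not read from
+    // the SNAC configuration).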
+    uint32_t max_generation_size = 2100;
+    uint32_t stopping_token_id = 128258;
+    uint32_t eos_token_id = 128001;
+    uint32_t bos_token_id = 128000;
+    uint32_t hidden_size = 3072;
+    uint32_t kv_hidden_size = 1024;
+    uint32_t audio_heads = 3;
+    uint32_t heads[7] = {0, 1, 2, 2, 1, 2, 2};
+
+    int n_layers = 28;
+
+    std::vector<orpheus_layer> layers;
+    struct ggml_tensor * head;
+    struct ggml_tensor * embd;
+    struct ggml_tensor * output_norm;
+    struct ggml_tensor * rope_frequencies;
+
+    void assign_weight(std::string name, ggml_tensor * tensor);
+    void assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor);
+    void prep_constants(gguf_context * meta);
+    void prep_layers(gguf_context * meta);
+    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) {
+        prep_constants(meta_ctx);
+        prep_layers(meta_ctx);
+        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "orpheus", 1.30);
+    }
+};
+
+struct orpheus_context : runner_context {
+    orpheus_context(orpheus_model * model, int n_threads): runner_context(n_threads), model(model) {};
+    struct orpheus_model * model;
+
+    uint32_t current_position = 0; // current position in the active sequence
+    uint32_t n_outputs = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating)
+    std::string voice;
+
+    std::vector<uint32_t> output_tokens;
+
+    void reset();
+    void build_schedule() {
+        runner_context::build_schedule(model->max_nodes());
+    }
+
+    struct ggml_tensor * inp_tokens;
+    struct ggml_tensor * attn_mask;
+    struct ggml_tensor * positions;
+};
+
+struct orpheus_kv_cache {
+    ggml_type cache_type = GGML_TYPE_F32;
+
+    std::vector<struct ggml_tensor *> k_l;
+    std::vector<struct ggml_tensor *> v_l;
+
+    struct ggml_context * ctx;
+    ggml_backend_buffer_type_t buft;
+    ggml_backend_buffer_t buf;
+
+    void free() {
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buf);
+    }
+
+    ~orpheus_kv_cache() {
+        free();
+    }
+};
+
+struct orpheus_context * build_new_orpheus_context(struct orpheus_model * model, int n_threads, bool use_cpu = true);
+
+struct orpheus_ubatch {
+    orpheus_ubatch() = default;
+    orpheus_ubatch(size_t n_tokens, std::vector<uint32_t> tokens): n_tokens(n_tokens), tokens(tokens) {};
+    size_t n_tokens; // total sentence tokens
+    std::vector<uint32_t> tokens; // [n_tokens]
+};
+
+struct orpheus_runner : tts_runner {
+    orpheus_runner(
+        orpheus_model * model,
+        snac_runner * audio_decoder,
+        orpheus_context * octx,
+        bpe_tokenizer * bt,
+        sampler * samp,
+        orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) {
+        tts_runner::sampling_rate = 24000.0f;
+        generation_sampler->n_output_heads = 1;
+        generation_sampler->vocab_size = model->vocab_size;
+        generation_sampler->eos_token_id = model->eos_token_id;
+    }
+    orpheus_model * model;
+    snac_runner * srunner;
+    orpheus_context * octx;
+    bpe_tokenizer * tokenizer;
+    orpheus_kv_cache * kv_self;
+    sampler * generation_sampler;
+
+    void init_build() {
+        tts_runner::init_build(&octx->buf_compute_meta);
+    }
+
+    std::vector<std::string> list_voices();
+    struct ggml_cgraph * build_orpheus_graph(orpheus_ubatch & batch);
+    void orpheus_kv_cache_init();
+    void orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat);
+    void configure_generation(generation_configuration * config);
+    void assign_weight(std::string name, ggml_tensor * tensor);
+    std::vector<std::vector<uint32_t>> prepare_output_tokens();
+    orpheus_ubatch
build_worst_case_batch(); + orpheus_ubatch batch_from_sentence(std::string sentence); + void set_inputs(orpheus_ubatch & batch); + void decode(orpheus_ubatch & batch); + void prepare_post_load(); + int generate(std::string sentence, struct tts_response * response); + void generate_from_batch(orpheus_ubatch & batch, struct tts_response * output); +}; + +static struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight); +static struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch); diff --git a/otherarch/ttscpp/src/parler_model.cpp b/otherarch/ttscpp/src/parler_model.cpp new file mode 100644 index 000000000..78cbebdfc --- /dev/null +++ b/otherarch/ttscpp/src/parler_model.cpp @@ -0,0 +1,874 @@ +#include "parler_model.h" + +// For loading parler model from gguf file. +static const std::map<std::string, parler_tensor> PARLER_TENSOR_GGUF_LOOKUP = { + {"layer_norm.weight", PARLER_NORM}, + {"layer_norm.bias", PARLER_NORM_BIAS}, + {"embed_prompts", PARLER_EMBD_PROMPTS}, + {"text_encoding", PARLER_TEXT_ENCODING}, + {"positional_embed", PARLER_POSITIONAL_EMBD}, + {".self_attn.q_proj.weight", PARLER_LAYER_SELF_ATTN_Q}, + {".self_attn.k_proj.weight", PARLER_LAYER_SELF_ATTN_K}, + {".self_attn.v_proj.weight", PARLER_LAYER_SELF_ATTN_V}, + {".self_attn.out_proj.weight", PARLER_LAYER_SELF_ATTN_O}, + {".self_attn_layer_norm.weight", PARLER_LAYER_SELF_ATTN_NORM}, + {".self_attn_layer_norm.bias", PARLER_LAYER_SELF_ATTN_NORM_BIAS}, + {".encoder_attn.q_proj.weight", PARLER_LAYER_ATTN_Q}, + {".encoder_attn.k_proj.weight", PARLER_LAYER_ATTN_K}, + {".encoder_attn.v_proj.weight", PARLER_LAYER_ATTN_V}, + {".encoder_attn.out_proj.weight", PARLER_LAYER_ATTN_O}, + {".encoder_attn_layer_norm.weight", PARLER_LAYER_ATTN_NORM}, + {".encoder_attn_layer_norm.bias", PARLER_LAYER_ATTN_NORM_BIAS}, + {".fc1.weight", PARLER_LAYER_FC1}, + {".fc2.weight", PARLER_LAYER_FC2}, + {".final_layer_norm.weight", PARLER_LAYER_OUT_NORM}, + {".final_layer_norm.bias", PARLER_LAYER_OUT_NORM_BIAS}, + {".weight", PARLER_EMBD}, + {".weight.head", PARLER_HEAD} +}; + +void parler_tts_model::assign_weight(std::string name, ggml_tensor * tensor) { + assign_to_decoder(this, name, tensor); +} + +void parler_tts_model::prep_layers(gguf_context * meta_ctx) { + layers.reserve((size_t) n_layers); + for (int i = 0; i < (int) n_layers; i++) { + parler_layer * l = new parler_layer{}; + layers.push_back(l); + } + + embds.reserve((size_t) n_output_heads); + heads.reserve((size_t) n_output_heads); + for (int i = 0; i < n_output_heads; i++) { + struct ggml_tensor * h = nullptr; + struct ggml_tensor * embd = nullptr; + embds.push_back(embd); + heads.push_back(h); + } +} + +void parler_tts_model::prep_constants(gguf_context * meta) { + int encode_length_key = search_for_gguf_keys(meta, {"parler-tts.decoder.encode_length", "encode_length"}); + if (encode_length_key == -1) { + TTS_ABORT("key 'parler-tts.decoder.encode_length' must be specified in gguf file."); + } + n_encode_length = gguf_get_val_u32(meta, encode_length_key); + + int hidden_size_key = search_for_gguf_keys(meta, {"parler-tts.decoder.hidden_size", "hidden_size"}); + if (hidden_size_key != -1) { + hidden_size = gguf_get_val_u32(meta, hidden_size_key); + } + + int output_heads_key = search_for_gguf_keys(meta, {"parler-tts.decoder.output_heads", "output_heads"}); + if (output_heads_key != -1) { + n_output_heads = gguf_get_val_u32(meta, output_heads_key); + } + int ctx_length_key = search_for_gguf_keys(meta, 
{"parler-tts.decoder.context_length", "ctx_length"}); + if (ctx_length_key != -1) { + max_ctx_length = gguf_get_val_u32(meta, ctx_length_key); + } + + int attn_heads_key = search_for_gguf_keys(meta, {"parler-tts.decoder.attention.head_count", "attn_heads"}); + if (attn_heads_key != -1) { + n_attn_heads = gguf_get_val_u32(meta, attn_heads_key); + } + head_size = hidden_size / n_attn_heads; + max_cross_nodes = n_attn_heads * 2; + + int output_vocab_size_key = search_for_gguf_keys(meta, {"parler-tts.decoder.out_vocab_size", "out_vocab_size"}); + if (output_vocab_size_key != -1) { + output_vocab_size = gguf_get_val_u32(meta, output_vocab_size_key); + } + + int audio_vocab_size_key = search_for_gguf_keys(meta, {"parler-tts.decoder.audio_vocab_size", "audio_vocab_size"}); + if (audio_vocab_size_key != -1) { + audio_vocab_size = gguf_get_val_u32(meta, audio_vocab_size_key); + } + + int max_gen_key = search_for_gguf_keys(meta, {"parler-tts.decoder.max_generation", "max_generation"}); + if (max_gen_key != -1) { + max_generation_size = gguf_get_val_u32(meta, max_gen_key); + } + + int n_layers_key = search_for_gguf_keys(meta, {"parler-tts.decoder.num_hidden_layers", "num_hidden_layers"}); + if (n_layers_key != -1) { + n_layers = gguf_get_val_u32(meta, n_layers_key); + } + + int bos_token_id_key = search_for_gguf_keys(meta, {"audio.bos_token_id", "bos_token_id"}); + if (bos_token_id_key != -1) { + bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); + } + + int eos_token_id_key = search_for_gguf_keys(meta, {"audio.eos_token_id", "eos_token_id"}); + if (eos_token_id_key != -1) { + eos_token_id = gguf_get_val_u32(meta, eos_token_id_key); + } +} + +void parler_tts_model::prep_cross_key_values(int n_threads, struct tts_response * conditional_prompt) { + ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + ggml_backend_buffer_type_t backend_cpu_buffer = ggml_backend_cpu_buffer_type(); + // Let it create a disposable threadpool just this once + ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); + std::vector bufs = {backend_cpu_buffer}; + std::vector backs = {backend_cpu}; + ggml_backend_sched_t sched = ggml_backend_sched_new(backs.data(), bufs.data(), 1, max_cross_nodes*n_layers, false, false); + + std::vector buf_compute_meta; + buf_compute_meta.resize(max_cross_nodes*n_layers*ggml_tensor_overhead() + ggml_graph_overhead_custom(max_cross_nodes*n_layers, false)); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * cctx = ggml_init(params); + struct ggml_cgraph * gf = ggml_new_graph_custom(cctx, 4096, false); + if (conditional_prompt) { + // If we are updating the conditional prompt then we have to reset the tensor offsets into the ggml_context otherwise we could overflow the assigned buffer and lose our prompt. + // These offsets are assigned by #set_tensor below. 
+ offset -= n_encode_length*hidden_size*sizeof(float)*n_layers*2; + precomputed_input_emb = ggml_new_tensor_2d(cctx, GGML_TYPE_F32, conditional_prompt->hidden_size, conditional_prompt->n_outputs); + ggml_set_input(precomputed_input_emb); + n_encode_length = conditional_prompt->n_outputs; + } + + for (int i = 0; i < layers.size(); i++) { + struct ggml_tensor * Kcur = ggml_mul_mat(cctx, layers[i]->attn_k_proj, precomputed_input_emb); + struct ggml_tensor * Vcur = ggml_mul_mat(cctx, layers[i]->attn_v_proj, precomputed_input_emb); + + Kcur = ggml_reshape_3d(cctx, Kcur, head_size, n_attn_heads, n_encode_length); + Vcur = ggml_transpose(cctx, Vcur); + + struct ggml_tensor * k = ggml_cont(cctx, ggml_permute(cctx, Kcur, 0, 2, 1, 3)); + ggml_set_name(k, ("cross_key_" + std::to_string(i)).c_str()); + ggml_build_forward_expand(gf, k); + + struct ggml_tensor * v = ggml_cont_3d(cctx, Vcur, n_encode_length, head_size, n_attn_heads); + ggml_set_name(v, ("cross_value_" + std::to_string(i)).c_str()); + ggml_build_forward_expand(gf, v); + } + + ggml_free(cctx); + ggml_backend_sched_reserve(sched, gf); + ggml_backend_sched_alloc_graph(sched, gf); + if (conditional_prompt) { + ggml_backend_tensor_set(precomputed_input_emb, conditional_prompt->data, 0, conditional_prompt->n_outputs*conditional_prompt->hidden_size*ggml_element_size(precomputed_input_emb)); + } + + ggml_backend_sched_graph_compute_async(sched, gf); + + for (int i = 0; i < layers.size(); i++) { + struct ggml_tensor * k = ggml_graph_get_tensor(gf, ("cross_key_" + std::to_string(i)).c_str()); + layers[i]->cross_k = ggml_dup_tensor(ctx, k); + set_tensor(layers[i]->cross_k, k); + struct ggml_tensor * v = ggml_graph_get_tensor(gf, ("cross_value_" + std::to_string(i)).c_str()); + layers[i]->cross_v = ggml_dup_tensor(ctx, v); + set_tensor(layers[i]->cross_v, v); + } + ggml_backend_sched_free(sched); + ggml_backend_free(backend_cpu); +} + +void assign_parler_layer(parler_tts_model * model, parler_layer * layer, std::string name, ggml_tensor * tensor) { + try { + switch(PARLER_TENSOR_GGUF_LOOKUP.at(name)) { + case PARLER_LAYER_SELF_ATTN_Q: + layer->self_attn_q_proj = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->self_attn_q_proj, tensor); + break; + case PARLER_LAYER_SELF_ATTN_K: + layer->self_attn_k_proj = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->self_attn_k_proj, tensor); + break; + case PARLER_LAYER_SELF_ATTN_V: + layer->self_attn_v_proj = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->self_attn_v_proj, tensor); + break; + case PARLER_LAYER_SELF_ATTN_O: + layer->self_attn_o_proj = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->self_attn_o_proj, tensor); + break; + case PARLER_LAYER_SELF_ATTN_NORM: + layer->self_attn_norm = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->self_attn_norm, tensor); + break; + case PARLER_LAYER_SELF_ATTN_NORM_BIAS: + layer->self_attn_norm_bias = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->self_attn_norm_bias, tensor); + break; + case PARLER_LAYER_ATTN_Q: + if (model->use_cross_attn) { + layer->attn_q_proj = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->attn_q_proj, tensor); + } + break; + case PARLER_LAYER_ATTN_K: + if (model->use_cross_attn) { + layer->attn_k_proj = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->attn_k_proj, tensor); + } + break; + case PARLER_LAYER_ATTN_V: + if (model->use_cross_attn) { + layer->attn_v_proj = ggml_dup_tensor(model->ctx, tensor); + 
model->set_tensor(layer->attn_v_proj, tensor); + } + break; + case PARLER_LAYER_ATTN_O: + if (model->use_cross_attn) { + layer->attn_o_proj = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->attn_o_proj, tensor); + } + break; + case PARLER_LAYER_ATTN_NORM: + if (model->use_cross_attn) { + layer->attn_norm = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->attn_norm, tensor); + } + break; + case PARLER_LAYER_ATTN_NORM_BIAS: + if (model->use_cross_attn) { + layer->attn_norm_bias = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->attn_norm_bias, tensor); + } + break; + case PARLER_LAYER_FC1: + layer->fc1 = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->fc1, tensor); + break; + case PARLER_LAYER_FC2: + layer->fc2 = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->fc2, tensor); + break; + case PARLER_LAYER_OUT_NORM: + layer->final_norm = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->final_norm, tensor); + break; + case PARLER_LAYER_OUT_NORM_BIAS: + layer->final_norm_bias = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer->final_norm_bias, tensor); + break; + default: + fprintf(stdout, "unassigned tensor %s\n", name.c_str()); + break; + } + } catch (const std::out_of_range& e) { + TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); + } +} + +void assign_to_decoder(parler_tts_model * model, const std::string name, ggml_tensor * tensor) { + if (PARLER_TENSOR_GGUF_LOOKUP.find(name) != PARLER_TENSOR_GGUF_LOOKUP.end()) { + try { + switch (PARLER_TENSOR_GGUF_LOOKUP.at(name)) { + case PARLER_NORM: + model->layer_norm = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->layer_norm, tensor); + break; + case PARLER_NORM_BIAS: + model->layer_norm_bias = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->layer_norm_bias, tensor); + break; + case PARLER_EMBD_PROMPTS: + model->prompt_embd = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->prompt_embd, tensor); + break; + case PARLER_TEXT_ENCODING: + if (model->use_cross_attn) { + model->precomputed_input_emb = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->precomputed_input_emb, tensor); + } + break; + case PARLER_POSITIONAL_EMBD: + model->precomputed_positional_embds = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->precomputed_positional_embds, tensor); + break; + default: + fprintf(stdout, "unassigned tensor %s\n", name.c_str()); + break; + } + } catch (const std::out_of_range& e) { + TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); + } + } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) { + auto pair = parse_layer_count(name); + int layer = pair.first; + std::string lt_name = pair.second; + if (name.find("embed_tokens") != std::string::npos) { + model->embds[layer] = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->embds[layer], tensor); + } else if (name.find("lm_heads") != std::string::npos) { + model->heads[layer] = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->heads[layer], tensor); + } else { + assign_parler_layer(model, model->layers[layer], lt_name, tensor); + } + } +} + +void parler_context::reset(int32_t n_output_heads) { + n_outputs = 0; + prompt_end_position = 0; + current_position = 0; + output_size = 0; + output_tokens.clear(); + eos_seen.clear(); + for (int i = 0; i < (int) n_output_heads; i++) { + eos_seen.push_back(false); + } +} + 
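+// A reading of the function below, not upstream documentation: building a context selects the backend (Metal only when compiled with GGML_USE_METAL and use_cpu is false, the CPU backend otherwise), sets the thread count, builds the backend schedule, and sizes buf_compute_meta from the model's worst-case node count.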
+struct parler_context * build_new_parler_context(struct parler_tts_model * model, int n_threads, bool use_cpu) { + parler_context * pctx = new parler_context(model, n_threads); + if (!use_cpu) { +#ifdef GGML_USE_METAL + pctx->backend = ggml_backend_metal_init(); +#endif + } + pctx->eos_seen.reserve(model->n_output_heads); + pctx->backend_cpu = ggml_backend_cpu_init(); + pctx->set_threads(); + pctx->build_schedule(); + pctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false)); + return pctx; +} + +static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx, int32_t seq_id) { + const int64_t n_layer = (int64_t) model->layers.size(); + cache->seq_id = seq_id; + + ggml_backend_buffer_type_t buft = nullptr; + // this will only really support cpu or metal for the time being; + if (pctx->backend != nullptr) { +#ifdef GGML_USE_METAL + buft = ggml_backend_metal_buffer_type(); +#endif + } else { + buft = ggml_backend_cpu_buffer_type(); + } + + struct ggml_init_params params = { + /*.mem_size =*/ (2u*model->n_layers+1)*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return false; + } + cache->ctx = ctx; + + + cache->k_l.reserve(n_layer); + cache->v_l.reserve(n_layer); + + for (int i = 0; i < (int) n_layer; i++) { + ggml_tensor * k = ggml_new_tensor_1d(cache->ctx, cache->type_k, model->hidden_size*model->max_ctx_length); + ggml_tensor * v = ggml_new_tensor_1d(cache->ctx, cache->type_v, model->hidden_size*model->max_ctx_length); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + cache->k_l.push_back(k); + cache->v_l.push_back(v); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(cache->ctx, buft); + if (!buf) { + return false; + } + ggml_backend_buffer_clear(buf, 0); + cache->buf = buf; + + return true; +} + +struct ggml_tensor * parler_build_inp_embd(struct ggml_context * ctx, struct parler_context * pctx, parler_tts_model * model, parler_ubatch & batch) { + // Parler has two embedding schemas: one for the text input and one for generative audio tokens. These two schemas have effectively distinct shapes (i.e. [batch_size, sequence_length] and [batch_size, sequence_length, num_codebooks] respectively).
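+ // (Illustrative, assuming the default n_output_heads = 9: one audio-mode decode step consumes 9 codebook tokens whose per-head embeddings are summed below into a single hidden vector per position.)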
+ // This means that depending on where we are in generation we need to follow a distinct pattern + struct ggml_tensor * input_embs; + pctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length); + ggml_set_input(pctx->positions); + if (batch.audio_generation) { + pctx->audio_inp_tokens = ggml_reshape_2d(ctx, ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_audio_tokens), batch.n_audio_tokens / model->n_output_heads, model->n_output_heads); + ggml_set_input(pctx->audio_inp_tokens); + struct ggml_tensor * audio_tokens = ggml_reshape_2d(ctx, pctx->audio_inp_tokens, batch.n_audio_tokens / model->n_output_heads, model->n_output_heads); + for (int i = 0; i < model->n_output_heads; i++) { + if (i == 0) { + input_embs = ggml_get_rows(ctx, model->embds[i], ggml_view_2d(ctx, audio_tokens, 1, batch.n_audio_tokens / model->n_output_heads, audio_tokens->nb[1], i*sizeof(int32_t))); + } else { + input_embs = ggml_add(ctx, ggml_get_rows(ctx, model->embds[i], ggml_view_2d(ctx, audio_tokens, 1, batch.n_audio_tokens / model->n_output_heads, audio_tokens->nb[1], i*sizeof(int32_t))), input_embs); + } + } + } else { + pctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + ggml_set_input(pctx->inp_tokens); + input_embs = ggml_get_rows(ctx, model->prompt_embd, pctx->inp_tokens); + } + return ggml_add(ctx, input_embs, ggml_get_rows(ctx, model->precomputed_positional_embds, pctx->positions)); +} + +struct ggml_tensor * parler_build_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight, struct ggml_tensor * bias) { + // parler always uses default eps + float eps = 0.00001; + inputs = ggml_norm(ctx, inputs, eps); + inputs = ggml_mul(ctx, inputs, weight); + return ggml_add(ctx, inputs, bias); +} + +void parler_build_kv_store(struct ggml_context * ctx, parler_kv_cache * kv, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int32_t n_tokens, int32_t kv_head, int32_t index, int32_t n_embd_gqa) { + // this is the max context size; + const int64_t n_ctx = 4096; + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv->k_l[index], n_tokens*n_embd_gqa, ggml_row_size(kv->k_l[index]->type, n_embd_gqa)*kv_head); + + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + + assert(v_cur->ne[0] == n_embd_gqa && v_cur->ne[1] == n_tokens); + + struct ggml_tensor * v_cache_view = nullptr; + + v_cache_view = ggml_view_2d(ctx, kv->v_l[index], n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv->v_l[index]), + (kv_head)*ggml_element_size(kv->v_l[index])); + + v_cur = ggml_cont(ctx, ggml_transpose(ctx, v_cur)); + + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); +} + +struct ggml_tensor * parler_build_head_outputs(struct ggml_context * ctx, parler_tts_model * model, struct ggml_tensor * cur) { + // going to cat the heads together and then reshape them; + // honestly ggml doesn't provide good support for stacking and discrete tensor access + struct ggml_tensor * out; + for (int i = 0; i < model->n_output_heads; i++) { + if (i == 0) { + out = ggml_mul_mat(ctx, model->heads[i], cur); + } else { + out = ggml_concat(ctx, out, ggml_mul_mat(ctx, model->heads[i], cur), 1); + } + } + ggml_set_name(out, "final_out"); + //out = ggml_cont(ctx, ggml_transpose(ctx, out)); + + int32_t sql_len = (int32_t) (ggml_nelements(out) / (model->output_vocab_size * model->n_output_heads)); + return ggml_cont_3d(ctx, out, model->output_vocab_size, sql_len, model->n_output_heads); +} + +struct ggml_tensor * 
build_attn_mask(ggml_context * ctx, parler_context * pctx, parler_ubatch & batch) { + pctx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) pctx->current_position + batch.sequence_length, (int64_t) pctx->current_position + batch.sequence_length); + ggml_set_input(pctx->attn_mask); + + return pctx->attn_mask; +} + +struct ggml_tensor * build_attn_mask_cross(ggml_context * ctx, parler_context * pctx, parler_tts_model * model, parler_ubatch & batch) { + pctx->attn_mask_cross = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) model->n_encode_length, (int64_t) batch.sequence_length); + ggml_set_input(pctx->attn_mask_cross); + + return pctx->attn_mask_cross; +} + +static struct parler_ubatch batch_from_sentence(std::string sentence, parler_tts_model * model, unigram_tokenizer * tokenizer) { + struct parler_ubatch batch; + batch.audio_generation = false; + std::vector<uint32_t> * token_ids = new std::vector<uint32_t>; + tokenizer->tokenize(sentence, *token_ids); + token_ids->push_back(tokenizer->eos_token); + batch.current_step = 0; + batch.n_tokens = token_ids->size(); + batch.n_audio_tokens = 0; + batch.sequence_length = batch.n_tokens; // sequence_length is equal to the number of tokens for non-audio generation + std::vector<uint32_t> * position = new std::vector<uint32_t>; + for (uint32_t i = 0; i < batch.sequence_length; i++) { + position->push_back(i); + } + std::vector<uint32_t> * order = new std::vector<uint32_t>; + for (int i = 0; i < batch.sequence_length; i++) { + if (i >= batch.sequence_length - 1) { + order->push_back(0); + } else { + order->push_back(i+1); + } + } + batch.positions = position->data(); + batch.tokens = token_ids->data(); + return batch; +} + +void parler_tts_runner::assign_weight(std::string name, ggml_tensor * tensor) { + std::string::size_type pos = name.find(".", 0); + std::string top_level(name.substr(0, pos)); + std::string value(name.substr(pos + 1)); + if (tensor->data == NULL) { + return; + } + if (top_level == "audio_encoder") { + dac_runner->model->assign_weight(value, tensor); + } else if (top_level == "decoder") { + model->assign_weight(value, tensor); + } else { + return; + } +} + +void parler_tts_runner::update_conditional_prompt(const std::string file_path, const std::string prompt, int n_threads, bool cpu_only) { + t5_runner * text_encoder = text_encoder_from_file(file_path, n_threads, tokenizer, cpu_only); + tts_response * response = new tts_response; // allocate before generating so we never pass an uninitialized pointer + text_encoder->generate(prompt, response); + model->prep_cross_key_values(n_threads, response); + delete text_encoder; + return; +} + + +struct ggml_cgraph * parler_tts_runner::build_parler_graph(parler_ubatch & batch) { + init_build(); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + const int32_t full_sequence_length = pctx->current_position + (uint32_t) batch.sequence_length; + + inpL = parler_build_inp_embd(ctx, pctx, model, batch); + + struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, pctx, batch); + struct ggml_tensor * KQ_mask_cross = build_attn_mask_cross(ctx, pctx, model, batch); + + for (int l = 0; l < model->n_layers; l++) { + struct ggml_tensor * residual = inpL; + ggml_set_name(inpL, ("layer_" + std::to_string(l) + "_input").c_str()); + + cur = parler_build_layer_norm(ctx, inpL, model->layers[l]->self_attn_norm, model->layers[l]->self_attn_norm_bias); + + struct ggml_tensor * attn_out; + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l]->self_attn_q_proj, cur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx, 
model->layers[l]->self_attn_k_proj, cur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l]->self_attn_v_proj, cur); + + parler_build_kv_store(ctx, kv_self, gf, Kcur, Vcur, (int32_t) batch.sequence_length, pctx->current_position, l, model->hidden_size); + struct ggml_tensor * k = + ggml_view_3d(ctx, kv_self->k_l[l], + model->head_size, full_sequence_length, model->n_attn_heads, + ggml_row_size(kv_self->k_l[l]->type, model->hidden_size), + ggml_row_size(kv_self->k_l[l]->type, model->head_size), + 0); + + + struct ggml_tensor * v = + ggml_view_3d(ctx, kv_self->v_l[l], + full_sequence_length, model->head_size, model->n_attn_heads, + ggml_element_size(kv_self->v_l[l])*model->max_ctx_length, + ggml_element_size(kv_self->v_l[l])*model->max_ctx_length*model->head_size, + 0); + + Qcur = ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.sequence_length); + struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3)); + struct ggml_tensor * kq = ggml_mul_mat(ctx, ggml_cont(ctx, k), q); + kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f/sqrtf(model->head_size), 0.0f); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v); + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3); + attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.sequence_length); + attn_out = ggml_mul_mat(ctx, model->layers[l]->self_attn_o_proj, attn_out); + } + + cur = ggml_add(ctx, attn_out, residual); + + if (model->use_cross_attn) { + struct ggml_tensor * residuala = cur; + + // norm + cur = parler_build_layer_norm(ctx, cur, model->layers[l]->attn_norm, model->layers[l]->attn_norm_bias); + + //cross-attention + struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l]->attn_q_proj, cur); + Qcur = ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.sequence_length); + + struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, model->layers[l]->cross_k, q); + kq = ggml_soft_max_ext(ctx, kq, KQ_mask_cross, 1.0f/sqrtf(model->head_size), 0.0f); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, model->layers[l]->cross_v); + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3); + cur = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.sequence_length); + cur = ggml_mul_mat(ctx, model->layers[l]->attn_o_proj, cur); + cur = ggml_add(ctx, cur, residuala); + } + + struct ggml_tensor * residualffn = cur; + + cur = parler_build_layer_norm(ctx, cur, model->layers[l]->final_norm, model->layers[l]->final_norm_bias); + cur = ggml_mul_mat(ctx, model->layers[l]->fc1, cur); + cur = ggml_gelu(ctx, cur); + cur = ggml_mul_mat(ctx, model->layers[l]->fc2, cur); + cur = ggml_add(ctx, cur, residualffn); + inpL = cur; + } + + cur = parler_build_layer_norm(ctx, cur, model->layer_norm, model->layer_norm_bias); + cur = parler_build_head_outputs(ctx, model, cur); + ggml_build_forward_expand(gf, cur); + free_build(); + + return gf; +} + +void parler_tts_runner::configure_generation(generation_configuration * config) { + sampler->temperature = config->temperature; + sampler->repetition_penalty = config->repetition_penalty; + sampler->do_sample = config->sample; + sampler->top_k = config->top_k; + sampler->top_p = config->top_p; + model->use_cross_attn = config->use_cross_attn; +} + +void parler_tts_runner::set_inputs(parler_ubatch & batch) { + if (batch.audio_generation) { + ggml_backend_tensor_set(pctx->audio_inp_tokens, batch.audio_tokens, 0, 
batch.n_audio_tokens*ggml_element_size(pctx->audio_inp_tokens)); + } else { + ggml_backend_tensor_set(pctx->inp_tokens, batch.tokens, 0, batch.n_tokens*ggml_element_size(pctx->inp_tokens)); + } + ggml_backend_tensor_set(pctx->positions, batch.positions, 0, batch.sequence_length*ggml_element_size(pctx->positions)); + float * d = nullptr; + d = (float *) pctx->attn_mask->data; + uint32_t max_pos = pctx->current_position + batch.sequence_length; + for (int i = 0; i < batch.sequence_length; i++) { + uint32_t pos = batch.positions[i]; + for (int ii = 0; ii < max_pos; ii++) { + d[i*max_pos + ii] = ii > pos ? -INFINITY : 0.0f; + } + } + + if (model->use_cross_attn) { + float * d2 = nullptr; + d2 = (float *) pctx->attn_mask_cross->data; + for (int i = 0; i < model->n_encode_length; i++) { + for (int ii = 0; ii < batch.sequence_length; ii++) { + d2[i*batch.sequence_length + ii] = 0.0f; + } + } + } + +} +void parler_tts_runner::parler_graph_compute(ggml_cgraph * gf) { + ggml_backend_sched_graph_compute_async(pctx->sched, gf); +} + +int parler_tts_runner::decode(parler_ubatch & batch) { + ggml_backend_sched_reset(pctx->sched); + + pctx->output_tokens.reserve(model->max_generation_size); + + const size_t logits_size = model->output_vocab_size*model->max_generation_size*model->n_output_heads; + const size_t prev_size = pctx->buf_output ? ggml_backend_buffer_get_size(pctx->buf_output) : 0; + const size_t new_size = logits_size * sizeof(float); + + if (!pctx->buf_output || prev_size < new_size) { + if (pctx->buf_output) { + ggml_backend_buffer_free(pctx->buf_output); + pctx->buf_output = nullptr; + pctx->logits = nullptr; + } + + pctx->buf_output = ggml_backend_buft_alloc_buffer(pctx->backend_cpu_buffer, new_size); + } + + pctx->logits = (float *) ggml_backend_buffer_get_base(pctx->buf_output); + //ggml_backend_buffer_clear(pctx->buf_output, 0); + + ggml_cgraph * gf = build_parler_graph(batch); + + // the output is always the last tensor in the graph + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + ggml_backend_sched_alloc_graph(pctx->sched, gf); + + // use the sequence_length variable here so that audio input tokens are handled correctly. + size_t n_outputs_new = batch.sequence_length; + + set_inputs(batch); + parler_graph_compute(gf); + + float * logits_out = pctx->logits + pctx->n_outputs * model->output_vocab_size * model->n_output_heads; + pctx->get_ggml_node_data(res, logits_out, n_outputs_new*model->output_vocab_size*model->n_output_heads*sizeof(float)); + + // set to total number of outputs in the batch + pctx->n_outputs += n_outputs_new; + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
ggml_backend_sched_reset(pctx->sched); + + return 0; +} + +parler_ubatch parler_tts_runner::build_worst_case_batch() { + struct parler_ubatch batch; + batch.audio_generation = false; + batch.n_tokens = model->max_ctx_length; + batch.n_audio_tokens = 0; + batch.sequence_length = model->max_ctx_length; + return batch; +} + +void parler_tts_runner::prepare_post_load() { + dac_runner->prepare_post_load(); + parler_kv_cache_init(kv_self, model, pctx, std::mt19937(std::random_device{}())()); + auto batch = build_worst_case_batch(); + auto gf = build_parler_graph(batch); + pctx->prep_schedule(gf); +} + +bool parler_tts_runner::adjust_for_sequence_continuation(struct parler_ubatch & batch) { + return false; // not implemented +} + +bool parler_tts_runner::check_stopping() { + int32_t token_position = (int32_t) pctx->output_tokens.size() - (int32_t) model->n_output_heads; + if (token_position < 0) { + return false; + } + if (pctx->current_position >= model->max_generation_size) { + return true; + } + + bool channels_complete = true; + for (int i = 0; i < model->n_output_heads; i++) { + pctx->eos_seen[i] = pctx->eos_seen[i] || pctx->output_tokens[token_position+i] == model->eos_token_id; + if (channels_complete) { + channels_complete = pctx->eos_seen[i]; + } + } + return channels_complete; +} + +void parler_tts_runner::adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered) { + // currently this is applying a sliding window over the heads and filtering out bad tokens. + // If we convert the DAC model's quantizer layers to support by row + column embeddings then we will need to transpose + // the heads and the sequence here, but right now simply using a strided view is more performant. + size_t size = output_tokens.size(); + filtered.reserve(size); + for (int i = 0; i < size / model->n_output_heads; i++) { + bool remove = false; + for (int ii = 0; ii < model->n_output_heads; ii++) { + int next_index = i*model->n_output_heads+ii*model->n_output_heads+ii; + if (next_index >= size || output_tokens[next_index] >= model->audio_vocab_size) { + remove = true; + break; + } + } + if (!remove) { + for (int ii = 0; ii < model->n_output_heads; ii++) { + int next_index = i*model->n_output_heads+ii*model->n_output_heads+ii; + if (next_index >= size) { + filtered.push_back(model->eos_token_id); + } else { + filtered.push_back(output_tokens[next_index]); + } + } + } + } +} + +int parler_tts_runner::generate_from_batch(parler_ubatch & batch, struct tts_response * output) { + std::vector<uint32_t> next_decoder_token_ids; + next_decoder_token_ids.reserve(model->n_output_heads); + + while (!check_stopping()) { + int state = decode(batch); + if (state != 0) { + return state; + } + if (!batch.audio_generation) { + pctx->prompt_end_position += batch.sequence_length; + } + if (batch.audio_generation) { + sampler->sample(pctx->logits + pctx->current_position * model->n_output_heads * model->output_vocab_size, pctx->output_tokens); + } + pctx->current_position += batch.sequence_length; + next_decoder_token_ids.clear(); + uint32_t * last_outputs = (pctx->output_tokens.data() + (int) pctx->output_tokens.size() - model->n_output_heads); + for (int i = 0; i < model->n_output_heads; i++) { + next_decoder_token_ids.push_back(batch.current_step > i ? pctx->eos_seen[i] ? 
model->eos_token_id : last_outputs[i] : model->bos_token_id); + } + batch = parler_ubatch{ + true, 0, 9, 1, nullptr, next_decoder_token_ids.data(), &pctx->current_position, nullptr, batch.current_step+1 + }; + } + + std::vector<uint32_t> filtered_output_tokens; + adjust_output_tokens(pctx->output_tokens, filtered_output_tokens); + dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output); + return 0; +} + +int parler_tts_runner::generate_audio_tokens(std::string sentence) { + parler_ubatch batch = batch_from_sentence(sentence, model, tokenizer); + pctx->reset(model->n_output_heads); + sampler->reset(); + int32_t seq_id = std::mt19937(std::random_device{}())(); + if (!kv_self) { + kv_self = new parler_kv_cache; + if (!parler_kv_cache_init(kv_self, model, pctx, seq_id)) { + return 1; + } + } + + std::vector<uint32_t> next_decoder_token_ids; + next_decoder_token_ids.reserve(model->n_output_heads); + + while (!check_stopping()) { + int state = decode(batch); + if (state != 0) { + return state; + } + if (!batch.audio_generation) { + pctx->prompt_end_position += batch.sequence_length; + } + if (batch.audio_generation) { + sampler->sample(pctx->logits + pctx->current_position * model->n_output_heads * model->output_vocab_size, pctx->output_tokens); + } + pctx->current_position += batch.sequence_length; + next_decoder_token_ids.clear(); + uint32_t * last_outputs = (pctx->output_tokens.data() + (int) pctx->output_tokens.size() - model->n_output_heads); + for (int i = 0; i < model->n_output_heads; i++) { + next_decoder_token_ids.push_back(batch.current_step > i ? pctx->eos_seen[i] ? model->eos_token_id : last_outputs[i] : model->bos_token_id); + } + batch = parler_ubatch{ + true, 0, 9, 1, nullptr, next_decoder_token_ids.data(), &pctx->current_position, nullptr, batch.current_step+1 + }; + } + + return 0; +} + +void parler_tts_runner::just_audio_token_decode(uint32_t * tokens, int32_t sq_len, struct tts_response * outputs) { + dac_runner->run(tokens, sq_len, outputs); +} + +int parler_tts_runner::generate(std::string sentence, struct tts_response * output, int32_t seq_id) { + parler_ubatch batch = batch_from_sentence(sentence, model, tokenizer); + pctx->reset(model->n_output_heads); + sampler->reset(); + if (pctx->seq_id != seq_id || seq_id == -1) { + seq_id = std::mt19937(std::random_device{}())(); + pctx->current_position = 0; + if (!kv_self) { + kv_self = new parler_kv_cache; + if (!parler_kv_cache_init(kv_self, model, pctx, seq_id)) { + return 1; + } + } + } else { + if (!adjust_for_sequence_continuation(batch)) { + return 2; + } + } + return generate_from_batch(batch, output); +} diff --git a/otherarch/ttscpp/src/parler_model.h b/otherarch/ttscpp/src/parler_model.h new file mode 100644 index 000000000..463910f49 --- /dev/null +++ b/otherarch/ttscpp/src/parler_model.h @@ -0,0 +1,225 @@ +#ifndef parler_model_h +#define parler_model_h + +#include "dac_model.h" +#include "t5_encoder_model.h" +#include "sampler.h" + +enum parler_tensor { + PARLER_EMBD, + PARLER_EMBD_PROMPTS, + PARLER_TEXT_ENCODING, + PARLER_POSITIONAL_EMBD, + PARLER_HEAD, + PARLER_NORM, + PARLER_NORM_BIAS, + PARLER_LAYER_SELF_ATTN_Q, + PARLER_LAYER_SELF_ATTN_K, + PARLER_LAYER_SELF_ATTN_V, + PARLER_LAYER_SELF_ATTN_O, + PARLER_LAYER_SELF_ATTN_NORM, + PARLER_LAYER_SELF_ATTN_NORM_BIAS, + PARLER_LAYER_ATTN_Q, + PARLER_LAYER_ATTN_K, + PARLER_LAYER_ATTN_V, + PARLER_LAYER_ATTN_O, + PARLER_LAYER_ATTN_NORM, + PARLER_LAYER_ATTN_NORM_BIAS, + PARLER_LAYER_FC1, + PARLER_LAYER_FC2, + PARLER_LAYER_OUT_NORM, 
+ PARLER_LAYER_OUT_NORM_BIAS, +}; + +struct parler_layer { + struct ggml_tensor * self_attn_k_proj; + struct ggml_tensor * self_attn_q_proj; + struct ggml_tensor * self_attn_v_proj; + struct ggml_tensor * self_attn_o_proj; + struct ggml_tensor * self_attn_norm; + struct ggml_tensor * self_attn_norm_bias; + + struct ggml_tensor * attn_k_proj; + struct ggml_tensor * attn_q_proj; + struct ggml_tensor * attn_v_proj; + struct ggml_tensor * attn_o_proj; + struct ggml_tensor * attn_norm; + struct ggml_tensor * attn_norm_bias; + + struct ggml_tensor * cross_k; + struct ggml_tensor * cross_v; + + struct ggml_tensor * fc1; + struct ggml_tensor * fc2; + struct ggml_tensor * final_norm; + struct ggml_tensor * final_norm_bias; +}; + +struct parler_tts_model : tts_model { + // These default configurations are based on the configuration of Parler TTS Mini (version 1.0) + uint32_t n_output_heads = 9; + uint32_t n_encode_length; + uint32_t max_encode_length = 512; // This corresponds with the max token length of the conditional prompt + uint32_t hidden_size = 1024; + uint32_t max_ctx_length = 4096; + uint32_t n_attn_heads = 16; + uint32_t head_size = 64; + uint32_t output_vocab_size = 1088; + uint32_t eos_token_id = 1024; + uint32_t audio_vocab_size = 1024; + uint32_t max_generation_size = 2580; + uint32_t n_layers = 24; + uint32_t bos_token_id = 1025; + uint32_t max_cross_nodes = 32; + uint32_t prompt_vocab_size; + + bool use_cross_attn = true; + + std::vector<struct ggml_tensor *> embds; + std::vector<parler_layer *> layers; + std::vector<struct ggml_tensor *> heads; + + struct ggml_tensor * precomputed_input_emb; + struct ggml_tensor * precomputed_positional_embds; + + struct ggml_tensor * layer_norm; + struct ggml_tensor * layer_norm_bias; + struct ggml_tensor * prompt_embd; + + void assign_weight(std::string name, ggml_tensor * tensor); + void prep_constants(gguf_context * meta); + void prep_layers(gguf_context * meta); + void prep_cross_key_values(int n_threads, struct tts_response * conditional_prompt = nullptr); + void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) { + prep_constants(meta_ctx); + prep_layers(meta_ctx); + tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "decoder", 1.30, max_encode_length*hidden_size*sizeof(float)*n_layers*2); + } +}; + +// For assigning weights to the parler model from a gguf file. +void assign_parler_layer(parler_tts_model * model, parler_layer * layer, std::string name, ggml_tensor * tensor); +void assign_to_decoder(parler_tts_model * model, const std::string name, ggml_tensor * tensor); + +struct parler_context : runner_context { + parler_context(parler_tts_model * model, int n_threads): runner_context(n_threads), model(model) {}; + struct parler_tts_model * model; + std::vector<bool> eos_seen; + + bool use_cache = true; + + size_t output_size = 0; // capacity (of tokens positions) for the output buffers + int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch + uint32_t current_position = 0; // current position in the active sequence + uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating) + int32_t seq_id; // a unique identifier associated with the active sequence. 
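+ // Layout note (inferred from check_stopping and adjust_output_tokens, not documented upstream): output_tokens grows by n_output_heads entries per audio step, so the tokens sampled at step t occupy indices [t * n_output_heads, (t + 1) * n_output_heads).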
+ + std::vector<uint32_t> output_tokens; + + struct ggml_tensor * inp_tokens; + struct ggml_tensor * audio_inp_tokens; + struct ggml_tensor * positions; + struct ggml_tensor * attn_mask; + struct ggml_tensor * attn_mask_cross; + + void build_schedule() { + runner_context::build_schedule(model->max_nodes()); + } + void reset(int32_t n_output_heads); +}; + +struct parler_kv_cache { + int32_t seq_id; + + ggml_type type_k = GGML_TYPE_F32; + ggml_type type_v = GGML_TYPE_F32; + + std::vector<struct ggml_tensor *> k_l; + std::vector<struct ggml_tensor *> v_l; + + struct ggml_context * ctx; + ggml_backend_buffer_type_t buft; + ggml_backend_buffer_t buf; + + void free() { + ggml_free(ctx); + ggml_backend_buffer_free(buf); + } + + ~parler_kv_cache() { + free(); + } +}; + +struct parler_ubatch { + parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length, + uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order, + int current_step): audio_generation(audio_generation), n_tokens(n_tokens), n_audio_tokens(n_audio_tokens), sequence_length(sequence_length), tokens(tokens), audio_tokens(audio_tokens), positions(positions), true_order(true_order), current_step(current_step) {}; + parler_ubatch() {}; + bool audio_generation; // whether we are receiving codebook decoded tokens or text tokens + size_t n_tokens; // total sentence tokens + size_t n_audio_tokens; // total audio tokens + size_t sequence_length; // for just audio tokens the sequence length should be the total_tokens / num_heads; in general this should be n_tokens + n_audio_tokens / num_heads + uint32_t * tokens; // [n_tokens] + uint32_t * audio_tokens; // [n_audio_tokens] + uint32_t * positions; // [sequence_length] + uint32_t * true_order; + int current_step = 0; // total_generations +}; + +struct parler_context * build_new_parler_context(struct parler_tts_model * model, int n_threads, bool use_cpu = true); +static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx, int32_t seq_id); + +struct ggml_tensor * parler_build_inp_embd(struct ggml_context * ctx, struct parler_context * pctx, parler_tts_model * model, parler_ubatch & batch); +struct ggml_tensor * parler_build_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight, struct ggml_tensor * bias); +void parler_build_kv_store(struct ggml_context * ctx, parler_kv_cache * kv, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int32_t n_tokens, int32_t kv_head, int32_t index, int32_t n_embd_gqa); +struct ggml_tensor * parler_build_head_outputs(struct ggml_context * ctx, parler_tts_model * model, struct ggml_tensor * cur); +struct ggml_tensor * build_attn_mask(ggml_context * ctx, parler_context * pctx, parler_ubatch & batch); +struct ggml_tensor * build_attn_mask_cross(ggml_context * ctx, parler_context * pctx, parler_tts_model * model, parler_ubatch & batch); +static struct parler_ubatch batch_from_sentence(std::string sentence, parler_tts_model * model, unigram_tokenizer * tokenizer); + +// This struct is intended to support end-to-end TTS generation. As such, it manages the parler tts model compilation, compute and generation process, +// the tokenization and sampling process, and uses the dac_runner struct to decode output tokens into audio. 
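+// A minimal usage sketch, assuming some loader has already produced a fully initialized parler_tts_runner * runner (hypothetical setup values; generation_configuration is the struct consumed by configure_generation below): +// generation_configuration config; +// config.temperature = 0.7f; +// config.sample = true; +// config.use_cross_attn = true; +// runner->configure_generation(&config); +// tts_response audio; +// if (runner->generate("Hello world.", &audio) == 0) { +// // audio.data now holds audio.n_outputs samples at tts_runner::sampling_rate +// }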
+struct parler_tts_runner : tts_runner { + parler_tts_runner(parler_tts_model * model, dac_runner * audio_decoder, parler_context * pctx, unigram_tokenizer * ut, sampler * samp, parler_kv_cache * cache): model(model), dac_runner(audio_decoder), pctx(pctx), tokenizer(ut), sampler(samp), kv_self(cache) {}; + ~parler_tts_runner() { + if (ctx) { + ggml_free(ctx); + } + model->free(); + delete model; + delete kv_self; + delete dac_runner; + delete pctx; + delete sampler; + } + struct parler_tts_model * model; + struct dac_runner * dac_runner; + struct parler_context * pctx; + struct unigram_tokenizer * tokenizer; + struct parler_kv_cache * kv_self = nullptr; + struct sampler * sampler; + + void init_build() { + tts_runner::init_build(&pctx->buf_compute_meta); + } + + void configure_generation(generation_configuration * config); + void assign_weight(std::string name, ggml_tensor * tensor); + parler_ubatch build_worst_case_batch(); + struct ggml_cgraph * build_parler_graph(parler_ubatch & batch); + void set_inputs(parler_ubatch & batch); + int decode(parler_ubatch & batch); + void prepare_post_load(); + bool adjust_for_sequence_continuation(struct parler_ubatch & batch); + int generate(std::string sentence, struct tts_response * response, int32_t seq_id = -1); + bool check_stopping(); + void adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered); + int generate_from_batch(parler_ubatch & batch, struct tts_response * output); + void parler_graph_compute(ggml_cgraph * gf); + void just_audio_token_decode(uint32_t * tokens, int32_t sq_len, struct tts_response * output); + int generate_audio_tokens(std::string sentence); + void update_conditional_prompt(const std::string file_path, const std::string prompt, int n_threads, bool cpu_only = true); +}; + +#endif diff --git a/otherarch/ttscpp/src/phonemizer.cpp b/otherarch/ttscpp/src/phonemizer.cpp new file mode 100644 index 000000000..36da56723 --- /dev/null +++ b/otherarch/ttscpp/src/phonemizer.cpp @@ -0,0 +1,1180 @@ +#include "phonemizer.h" + +#ifdef ESPEAK_INSTALL +/** + * espeak_wrapper functions and assignments + * + * The espeak_wrapper is a singleton which wraps threaded calls to espeak-ng with a shared mutex + */ + +// non-const static members must be initialized out of line +espeak_wrapper* espeak_wrapper::instance{nullptr}; +std::mutex espeak_wrapper::mutex; + +espeak_wrapper * espeak_wrapper::get_instance() { + if (!instance) { + instance = new espeak_wrapper; + } + return instance; +} + +const espeak_VOICE ** espeak_wrapper::list_voices() { + std::lock_guard<std::mutex> lock(mutex); + return espeak_ListVoices(nullptr); +} + +espeak_ERROR espeak_wrapper::set_voice(const char * voice_code) { + std::lock_guard<std::mutex> lock(mutex); + return espeak_SetVoiceByName(voice_code); +} + +const char * espeak_wrapper::text_to_phonemes(const void ** textptr, int textmode, int phonememode) { + std::lock_guard<std::mutex> lock(mutex); + return espeak_TextToPhonemes(textptr, textmode, phonememode); +} + +void espeak_wrapper::initialize(espeak_AUDIO_OUTPUT output, int buflength, const char * path, int options) { + std::lock_guard<std::mutex> lock(mutex); + if (!espeak_initialized) { + espeak_initialized = true; + espeak_Initialize(output, buflength, path, options); + } +} +#endif + +/** + * Helper functions for string parsing + */ +const std::unordered_set<std::string> inline_combine_sets(const std::vector<std::unordered_set<std::string>> sets) { + std::unordered_set<std::string> combined; + for (auto set : sets) { + combined.insert(set.begin(), set.end()); + } + return combined; +} + +std::string replace(std::string target, char to_replace, 
char replacement) { + for (int i = 0; i < target.size(); i++) { + if (target[i] == to_replace) { + target[i] = replacement; + } + } + return target; +} + +std::string to_lower(std::string word) { + std::transform(word.begin(), word.end(), word.begin(), + [](unsigned char c){ return std::tolower(c); + }); + return word; +} + +std::string to_upper(std::string word) { + std::transform(word.begin(), word.end(), word.begin(), + [](unsigned char c){ return std::toupper(c); + }); + return word; +} + +std::string replace_accents(std::string word) { + std::string new_word; + for (int i = 0; i < word.size();) { + int grab = 0; + while(i+grab+1 < word.size() && (word[i+grab + 1] & 0b11000000) == 0b10000000) { + ++grab; + } + ++grab; + + if (grab > 1) { + std::string accent = word.substr(i, grab); + if (ACCENTED_A.find(accent) != std::string::npos) { + new_word.push_back('a'); + } else if (ACCENTED_C.find(accent) != std::string::npos) { + new_word.push_back('c'); + } else if (ACCENTED_E.find(accent) != std::string::npos) { + new_word.push_back('e'); + } else if (ACCENTED_I.find(accent) != std::string::npos) { + new_word.push_back('i'); + } else if (ACCENTED_N.find(accent) != std::string::npos) { + new_word.push_back('n'); + } else if (ACCENTED_O.find(accent) != std::string::npos) { + new_word.push_back('o'); + } else if (ACCENTED_U.find(accent) != std::string::npos) { + new_word.push_back('u'); + } else { + // non-accented multibyte characters in a word string shouldn't really be possible, but for the sake of keeping this function pure + // just put the multibyte character back; + new_word.append(accent); + + } + } else { + new_word.push_back(word[i]); + } + i += grab; + } + return new_word; +} + +int upper_count(std::string word) { + int count = 0; + for (char letter : word) { + if (isupper(letter)) { + count += 1; + } + } + return count; +} + +bool is_all_upper(std::string word) { + for (char letter : word) { + if (!isupper(letter)) { + return false; + } + } + return true; +} + +/* + * Text condition checks + */ +bool is_roman_numeral(char letter) { + return ROMAN_NUMERAL_CHARACTERS.find(letter) != std::string::npos; +} + +bool can_be_roman_numeral(std::string word) { + for (int i = 0; i < word.size(); i++) { + if (!is_roman_numeral(word[i])) { + return false; + } + } + return true; +} + +bool is_alphabetic(char letter) { + return ALPHABET.find(letter) != std::string::npos; +} + +bool is_numeric(char letter) { + int val = (int) letter; + return val >= 48 && val <= 57; +} + + +std::string parse_voice_code(std::string voice_code) { +#ifdef ESPEAK_INSTALL + voice_code = to_lower(voice_code); + const espeak_VOICE * primary_match = nullptr; + const espeak_VOICE * secondary_match = nullptr; + bool search_by_lc = voice_code.size() == 2; + bool search_by_lfc = !search_by_lc && voice_code.size() == 3; + bool search_by_id = !search_by_lfc && voice_code.find("/") != std::string::npos; + // It is common for locales to be '_' separated rather than '-' separated. Check for both. + bool search_by_lcc = !search_by_id && (voice_code.find("-") != std::string::npos || voice_code.find("_") != std::string::npos); + if (search_by_id || search_by_lcc) { + voice_code = replace(voice_code, '_', '-'); + } + const espeak_VOICE** espeak_voices = espeak_wrapper::get_instance()->list_voices(); + // ideally we'd use the espeak voice scores which order voices by preference, but they are only returned when a voice_spec is passed to the list api and + // the voice spec isn't compatible with partials (e.g. 
country codes, language family code, etc) + int i = 0; + while (espeak_voices[i] != nullptr) { + auto identifier_parts = split(espeak_voices[i]->identifier, "/"); + // it is possible to add languages to espeak-ng without following their identifier pattern; if we run into such a language just try to match against + // the identifier and otherwise continue (advancing the index either way so the loop always makes progress); + if (identifier_parts.size() == 1) { + if (voice_code == identifier_parts[0] || voice_code == espeak_voices[i]->name) { + primary_match = espeak_voices[i]; + } + i++; + continue; + } + if (search_by_lc) { + std::string language_part = identifier_parts[1]; + if (language_part == voice_code) { + primary_match = espeak_voices[i]; + break; // if we have an exact match then we can exit + } else if (has_prefix(language_part, voice_code) && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { + // prefer the smaller codes as longer codes typically refer to more specific locales + primary_match = espeak_voices[i]; + } else { + auto subparts = split(language_part, "-"); + if (subparts.size() > 1 && to_lower(subparts[1]) == voice_code && (!secondary_match || strlen(secondary_match->identifier) > strlen(espeak_voices[i]->identifier))) { + // country codes are typically capitalized in espeak-ng + secondary_match = espeak_voices[i]; + } + } + } else if (search_by_lfc) { + // espeak-ng uses language family codes in their identifiers, but also uses ISO 639-3 language codes for some languages. + // Since language codes are more specific, attempt to match against the language code as the primary and match against the language family + // code as the secondary. + if (has_prefix(identifier_parts[1], voice_code) && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { + primary_match = espeak_voices[i]; + } else if (identifier_parts[0] == voice_code && (!secondary_match || strlen(secondary_match->identifier) > strlen(espeak_voices[i]->identifier))) { + secondary_match = espeak_voices[i]; + } + } else if (search_by_id && has_prefix(to_lower(espeak_voices[i]->identifier), voice_code) && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { + primary_match = espeak_voices[i]; + } else if (search_by_lcc && has_prefix(to_lower(identifier_parts[1]), voice_code) && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { + primary_match = espeak_voices[i]; + } else if (to_lower(espeak_voices[i]->name).find(voice_code) != std::string::npos && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { + primary_match = espeak_voices[i]; + } + i++; + } + if (!primary_match && !secondary_match) { + TTS_ABORT("Failed to match espeak voice code '%s' to known espeak voices.\n", voice_code.c_str()); + } + if (!primary_match) { + primary_match = secondary_match; + } + fprintf(stdout, "Passed Espeak Voice Code '%s' doesn't directly match any known Espeak Voice IDs. 
Nearest match with name '%s' and id '%s' will be used instead.\n", voice_code.c_str(), primary_match->name, primary_match->identifier); + return std::string(primary_match->identifier); +#else + TTS_ABORT("Attempted to list voices without espeak-ng installed."); +#endif +} + +void update_voice(std::string voice_code) { +#ifdef ESPEAK_INSTALL + espeak_ERROR e = espeak_wrapper::get_instance()->set_voice(voice_code.c_str()); + if (e != EE_OK) { + voice_code = parse_voice_code(voice_code); + espeak_wrapper::get_instance()->set_voice(voice_code.c_str()); + } +#else + TTS_ABORT("Attempted to set voice without espeak-ng installed."); +#endif +} + + +void conditions::reset_for_clause_end() { + hyphenated = false; + was_punctuated_acronym = false; + beginning_of_clause = true; + was_number = false; +} + +void conditions::reset_for_space() { + hyphenated = false; + was_punctuated_acronym = false; + was_word = false; +} + +void conditions::update_for_word(std::string word, bool allow_for_upper_check) { + if (allow_for_upper_check && !is_all_upper(word)) { + was_all_capitalized = false; + } + was_word = true; + beginning_of_clause = false; + hyphenated = false; + was_number = false; +} + +std::string corpus::next(int count) { + if (location == size || count == 0) { + return ""; + } + int final_loc = location; + int grabbed = 0; + while(grabbed < count && final_loc < size) { + while(final_loc + 1 < size && (text[final_loc+1] & 0b11000000) == 0b10000000) { + ++final_loc; + } + ++final_loc; + ++grabbed; + } + return std::string(text+location, text+final_loc); +} + +std::string corpus::last(int count) { + if (location == 0 || count == 0) { + return ""; + } + int final_loc = location - 1; + int grabbed = 0; + while(grabbed < count && final_loc > 0) { + while((text[final_loc] & 0b11000000) == 0b10000000) { + --final_loc; + } + ++grabbed; + } + + return std::string(text+final_loc, text+location-1); +} + +std::string corpus::pop(int count) { + std::string ret = next(count); + location += ret.size(); + return ret; +} + +std::string corpus::after(int aftr, int count) { + size_t new_loc = location + aftr; + if (new_loc >= size || count == 0) { + return ""; + } + int final_loc = new_loc; + int grabbed = 0; + while(grabbed < count && final_loc < size) { + while(final_loc+1 < size && (text[final_loc+1] & 0b11000000) == 0b10000000) { + ++final_loc; + } + ++final_loc; + ++grabbed; + } + return std::string(text+new_loc, text+final_loc); +} + +std::string corpus::size_pop(size_t pop_size) { + size_t tsize = std::min(pop_size, size - location); + std::string ret = std::string(text+location, text+location+tsize); + location += tsize; + return ret; +} + +std::string corpus::next_in(std::string val, bool* has_accent) { + int n = 0; + int running = 0; + std::string nafter = next(); + while (nafter != "" && val.find(nafter) != std::string::npos) { + if (has_accent && !(*has_accent) && COMMON_ACCENTED_CHARACTERS.find(nafter) != std::string::npos) { + *has_accent = true; + } + ++n; + running += nafter.size(); + nafter = after(running); + } + return next(n); +} + +std::string corpus::pop_in(std::string val) { + int n = 0; + size_t running = 0; + std::string nafter = next(); + running += nafter.size(); + while (nafter != "" && val.find(nafter) != std::string::npos) { + ++n; + nafter = after(running); + running += nafter.size(); + } + return pop(n); +} + +std::string corpus::after_until(int aftr, std::string val) { + int n = 0; + std::string nafter = after(aftr); + while (nafter != "" && val.find(nafter) != std::string::npos) { + 
++n; + nafter = after(n); + } + return after(aftr, n); +} + +std::string phonemizer_rule::lookup_rule(std::vector<std::string> & keys, int index) { + if (index >= keys.size()) { + return value; + } + std::string found_key = keys[index]; + bool found_match = false; + for (const auto& pair : rules) { + if (pair.first == found_key) { + found_match = true; + break; + } else if (pair.first[0] == '*' && has_suffix(found_key, pair.first.substr(1))) { + found_match = true; + found_key = pair.first; + break; + } else if (pair.first.back() == '*' && has_prefix(found_key, pair.first.substr(0, pair.first.size()-1))) { + found_match = true; + found_key = pair.first; + break; + } + } + if (found_match) { + return rules.at(found_key)->lookup_rule(keys, index + 1); + } else { + return value; + } +} + +std::string word_phonemizer::lookup_rule(std::string word, std::string current, std::string before, std::string after) { + if (rules.find(current) == rules.end()) { + return ""; + } + std::vector<std::string> lookup_keys = {before, after, word}; + return rules[current]->lookup_rule(lookup_keys, 0); +} + +void word_phonemizer::add_rule(std::vector<std::string> keys, std::string phoneme) { + phonemizer_rule * current_rule = nullptr; + for (int i = 0; i < keys.size(); i++) { + if (current_rule) { + if (current_rule->rules.find(keys[i]) == current_rule->rules.end()) { + phonemizer_rule * nrule = new phonemizer_rule; + current_rule->rules[keys[i]] = nrule; + current_rule = nrule; + } else { + current_rule = current_rule->rules.at(keys[i]); + } + } else { + if (rules.find(keys[i]) == rules.end()) { + current_rule = new phonemizer_rule; + rules[keys[i]] = current_rule; + } else { + current_rule = rules.at(keys[i]); + } + } + } + if (current_rule) { + current_rule->value = phoneme; + } +} + +std::string word_phonemizer::phonemize(std::string word) { + std::vector<std::string> graphemes; + word = to_lower(word); + tokenizer->token_split(word, graphemes); + std::string phoneme = ""; + for (int i = 0; i < graphemes.size(); i++) { + std::string before = i > 0 ? graphemes[i-1] : "^"; + std::string after = i + 1 < graphemes.size() ? graphemes[i+1] : "$"; + std::string current = graphemes[i]; + phoneme += lookup_rule(word, current, before, after); + } + return phoneme; +} + +std::string build_subthousand_phoneme(int value) { + int hundreds = value / 100; + std::string phoneme = hundreds > 0 ? NUMBER_PHONEMES[hundreds] + " " + HUNDRED_PHONEME : ""; + value = value % 100; + if (value > 0 && value < 20) { + phoneme += NUMBER_PHONEMES[value]; + } else if (value > 0) { + phoneme += SUB_HUNDRED_NUMBERS[(value / 10) - 2]; + value = value % 10; + if (value > 0) { + phoneme += " " + NUMBER_PHONEMES[value]; + } + } + return phoneme; +} + +std::string build_number_phoneme(long long int remainder) { + std::string phoneme = ""; + bool started = false; + if (remainder >= TRILLION) { + long long int trillions = (long long int) remainder / TRILLION; + phoneme += build_subthousand_phoneme(trillions) + " " + TRILLION_PHONEME; + remainder = (long long int) remainder % TRILLION; + if (remainder > 0) { + phoneme += ","; + } + started = true; + } + if (remainder >= BILLION) { + long long int billions = (long long int) remainder / BILLION; + remainder = (long long int) remainder % BILLION; + std::string billion_part = build_subthousand_phoneme(billions) + " " + BILLION_PHONEME; + if (!started) { + phoneme += remainder > 0 ? 
billion_part + "," : billion_part; + + } else if (remainder == 0) { + phoneme += " " + billion_part; + } else { + phoneme += " " + billion_part + ","; + } + started = true; + } + if (remainder > MILLION) { + long long int millions = (long long int) remainder / MILLION; + remainder = (long long int) remainder % MILLION; + std::string million_part = build_subthousand_phoneme(millions) + " " + MILLION_PHONEME; + if (!started) { + phoneme += remainder > 0 ? million_part + "," : million_part; + } else if (remainder == 0) { + phoneme += " " + million_part; + } else { + phoneme += " " + million_part + ","; + } + started = true; + } + if (remainder > 1000) { + long long int thousands = (long long int) remainder / 1000; + remainder = (long long int) remainder % 1000; + std::string thousand_part = build_subthousand_phoneme(thousands) + " " + THOUSAND_PHONEME; + if (!started) { + phoneme += remainder > 0 ? thousand_part + "," : thousand_part; + } else if (remainder == 0) { + phoneme += " " + thousand_part; + } else { + phoneme += " " + thousand_part + ","; + } + started = true; + } + if (remainder > 0) { + if (started) { + phoneme += " " + build_subthousand_phoneme(remainder); + } else { + phoneme += build_subthousand_phoneme(remainder); + } + } + return phoneme; +} + +bool dictionary_response::is_successful() { + return code < 200; +} + +bool dictionary_response::is_match(corpus* text, conditions* flags) { + if (not_at_clause_end) { + std::string chunk = text->next_in(NON_CLAUSE_WORD_CHARACTERS); + std::string after = text->after(chunk.size()); + if (after == "!" || after == "." || after == "?") { + return false; + } + } + return text->next(after_match.size()) == after_match && (!expects_to_be_proceeded_by_number || flags->was_number) && (!not_at_clause_start || !flags->beginning_of_clause); +} + +dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string value, conditions* flags) { + if (lookup_map.find(value) == lookup_map.end()) { + return not_found_response; + } + std::vector possibilities = lookup_map.at(value); + for (auto possible : possibilities) { + if (possible->code == SUCCESS || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) { + return possible; + } + } + return phonetic_fallback_response; +} + +bool phonemizer::handle_space(corpus* text, std::string* output, conditions* flags) { + flags->reset_for_space(); + text->pop_in(" \n\f\t"); + if (output->back() != ' ') { + output->append(" "); + } + return true; +} + +void phonemizer::append_numeric_series(std::string series, std::string* output, conditions * flags) { + if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { + output->append(" "); + } + for (int i = 0; i < series.size(); i++) { + int numeral = series[i] - '0'; + output->append(NUMBER_PHONEMES[numeral]); + if (i + 1 < series.size()) { + output->append(" "); + } + } + if (series.size() > 0) { + flags->update_for_word(series); + flags->was_number = true; + } +} + +bool phonemizer::handle_numeric_series(corpus* text, std::string* output, conditions* flags) { + std::string series = text->pop_in(NUMBER_CHARACTERS); + append_numeric_series(series, output, flags); + return true; +} + +bool phonemizer::handle_numeric(corpus* text, std::string* output, conditions* flags) { + /* + * There are four recognized ways of separating large arabic numerals: + * 1. No breaks or seperations exception for the decimal (e.g. '32000.012' or '32000,012') + * 2. Space separated breaks between every three digits and comma separated decimals (e.g. 
'32 000,012') + * 3. Period separated breaks between every three digits and comma separated decimals (e.g. '32.000,012') + * 4. Comma separated breaks between every three digits and period separated decimals (e.g. '32,000.012') + * + * This implementation will support all three approaches up to the trillions, after which numbers will be represented as a series + * of distinct digits. Non conforming patterns, e.g. multiple commas, multiple periods, or multiple spaces that are not three + * digits apart, will not be treated as continuous numbers but rather separate numerical strings. + */ + std::string number = text->next_in(COMPATIBLE_NUMERICS); + number = strip(number, ",. "); + + // For numerics, we don't necessarily want to stop reading from the corpus at periods, commas, or spaces. + char large_number_separator = '\0'; + char decimal_separator = '\0'; + char last_break_char = '\0'; + bool invalid_format = false; + int count_since_break = 0; + std::string built = ""; + for (char & c : number) { + if (is_numeric(c)) { + built += c; + count_since_break += 1; + } else if (last_break_char =='\0') { + if (count_since_break > 3) { + decimal_separator = c; + } + last_break_char = c; + built += c; + count_since_break = 0; + } else if (c != last_break_char) { + if (c == ' ') { + break; + } else if (count_since_break == 3 && decimal_separator == '\0') { + if (large_number_separator == '\0') { + large_number_separator = last_break_char; + } + decimal_separator = c; + built += c; + count_since_break = 0; + last_break_char = c; + } else if (count_since_break != 3) { + if (large_number_separator != '\0') { + invalid_format = true; + } + break; + } else { + break; + } + } else if (c == last_break_char) { + if (decimal_separator != '\0') { + break; + } else if (count_since_break != 3) { + invalid_format = true; + break; + } else { + large_number_separator = c; + built += c; + count_since_break = 0; + } + } + } + + if (!invalid_format) { + if (large_number_separator != '\0' && decimal_separator == '\0' && count_since_break != 3) { + invalid_format = true; + } else if (count_since_break == 3 && last_break_char != '\0' && decimal_separator == '\0' && large_number_separator == '\0') { + large_number_separator = last_break_char; + } else if (count_since_break != 3 && last_break_char != '\0' && decimal_separator == '\0' && large_number_separator == '\0') { + decimal_separator = last_break_char; + } + } + + if (invalid_format) { + return handle_numeric_series(text, output, flags); + } + + if (large_number_separator != '\0') { + built.erase(std::remove(built.begin(), built.end(), large_number_separator), built.end()); + } + if (decimal_separator == ',') { + replace(built, decimal_separator, '.'); + } + long long int value = std::stoll(built); + + if (value >= LARGEST_PRONOUNCABLE_NUMBER) { + return handle_numeric_series(text, output, flags); + } + + text->size_pop(built.size()); + + std::string noutput = build_number_phoneme(value); + if (noutput.size() > 0) { + if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { + output->append(" "); + } + output->append(noutput); + flags->update_for_word(built); + flags->was_number = true; + } + if (decimal_separator != '\0') { + std::vector parts = split(built, decimal_separator); + if (parts[1].size() > 0) { + output->append(" " + POINT_PHONEME + " "); + append_numeric_series(parts[1], output, flags); + } + } + return true; +} + +bool phonemizer::is_acronym_like(corpus* text, std::string word, conditions* flags) { + if (word.find(".") != 
std::string::npos) { + for (std::string part : split(word, ".")) { + if (part.size() == 0) { + return false; + } + if (part.size() > 1) { + if (part.size() > 2 || !(isupper(part[0]) && islower(part[1]))) { + return false; + } + } + } + return true; + } else if (word.size() < 4) { + return small_english_words.find(to_lower(word)) == small_english_words.end(); + } else if (is_all_upper(word)) { + if (flags->was_all_capitalized || is_all_upper(text->after_until(word.size()+1, " "))) { + flags->was_all_capitalized = true; + return false; + } + return true; + } else if (!is_all_upper(word) && upper_count(word) > (int) word.length() / 2) { + return true; + } + return false; +} + +bool phonemizer::handle_roman_numeral(corpus* text, std::string* output, conditions * flags) { + auto next = text->next(); + next = to_lower(next); + int total = 0; + int last_value = 0; + std::string running = ""; + while (is_roman_numeral(next[0])) { + bool found = false; + for (int size = 4; size > 0; size--) { + std::string chunk = text->after(running.size(), size); + chunk = to_lower(chunk); + if (ROMAN_NUMERALS.find(chunk) != ROMAN_NUMERALS.end()) { + found = true; + int found_value = ROMAN_NUMERALS.at(chunk); + if (total == 0 || last_value > found_value) { + total += found_value; + last_value = found_value; + running += chunk; + } else { + return false; + } + } + } + if (found) { + next = text->after(running.size()); + to_lower(next); + continue; + } + return false; + } + + std::string noutput = build_number_phoneme(total); + if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { + output->append(" "); + } + output->append(noutput); + text->size_pop(running.size()); + flags->update_for_word(running, false); + flags->was_number = true; + + return true; +} + +bool phonemizer::handle_acronym(corpus* text, std::string word, std::string* output, conditions * flags) { + std::string out = ""; + for (int i = 0; i < word.size(); i++) { + try { + if (word[i] == '.') { + flags->was_punctuated_acronym = true; + continue; + } + char letter = std::tolower(word[i]); + out += LETTER_PHONEMES.at(letter); + } catch (const std::out_of_range& e) { + continue; + } + } + text->size_pop(word.size()); + if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { + output->append(" "); + } + output->append(out); + flags->update_for_word(word, false); + return true; +} + +bool phonemizer::handle_phonetic(corpus* text, std::string word, std::string* output, conditions* flags, size_t unaccented_size_difference) { + if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { + output->append(" "); + } + output->append(phonetic_phonemizer->phonemize(word)); + text->size_pop(word.size()+unaccented_size_difference); + flags->update_for_word(word); + return true; +} + +bool phonemizer::process_word(corpus* text, std::string* output, std::string word, conditions* flags, bool has_accent) { + dictionary_response* response; + size_t unaccented_size_difference = 0; + if (has_accent) { + response = dict->lookup(text, word, flags); + if (!response->is_successful()) { + unaccented_size_difference = word.size(); + word = replace_accents(word); + unaccented_size_difference -= word.size(); + response = dict->lookup(text, word, flags); + } + } else { + response = dict->lookup(text, word, flags); + } + + if (response->is_successful()) { + if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { + output->append(" "); + } + flags->update_for_word(word); + if (response->code != SUCCESS) { + word += 
response->after_match; + output->append(response->value); + text->size_pop(word.size()+unaccented_size_difference); + return true; + } else { + output->append(response->value); + text->size_pop(word.size()+unaccented_size_difference); + return true; + } + } else if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) { + return true; + } else if (is_acronym_like(text, word, flags)) { + return handle_acronym(text, word, output, flags); + } else if (word.find(".") < word.length()) { + bool part_has_accent = false; + std::string word_part = text->next_in(ALPHABET+COMMON_ACCENTED_CHARACTERS, &part_has_accent); + process_word(text, output, word_part, flags, part_has_accent); + handle_punctuation(text, ".", output, flags); + output->append(" "); + flags->reset_for_space(); + return true; + } else { + return handle_phonetic(text, word, output, flags, unaccented_size_difference); + } + return true; +} + +bool phonemizer::handle_word(corpus * text, std::string* output, conditions * flags) { + bool has_accent = false; + std::string word = text->next_in(WORD_CHARACTERS, &has_accent); + while (word.size() > 0 && word.back() == '.') { + word = word.substr(0,word.size()-1); + } + + return process_word(text, output, word, flags, has_accent); +} + +bool phonemizer::handle_replacement(corpus* text, std::string next, std::string* output, conditions * flags) { + if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { + output->append(" "); + } + output->append(REPLACEABLE.at(next)); + flags->update_for_word(next); + text->pop(); + return true; +} + +bool phonemizer::handle_possession_plural(corpus* text, std::string* output, conditions * flags) { + if (text->next(2) == "'s") { + std::string last = text->last(); + if (VOWELS.find(to_lower(last)[0]) != std::string::npos) { + output->append("z"); + } else if (last == "s" || last == "z") { + output->append("ᵻz"); + } else if (is_alphabetic(last[0])) { + output->append("s"); + } else { + output->append("ˈɛs"); + } + text->pop(2); + } else { + text->pop(); + } + return true; +} + +bool phonemizer::handle_contraction(corpus* text, std::string* output, conditions * flags) { + text->pop(); + std::string next = text->next_in(ALPHABET); + next = to_lower(next); + try { + output->append(CONTRACTION_PHONEMES.at(next)); + } catch (const std::out_of_range& e) { + // in the situation that we cannot find a contraction we just want to pop the ' character and continue; + // it could be the end of a single quote which is ignored by the espeak phonemizer. + return true; + } + // make sure to pop the contraction. 
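// As a concrete trace (illustrative only, assuming CONTRACTION_PHONEMES has an entry for "ll"):
// for the input "they'll", handle_word consumes "they", handle_punctuation routes the "'" to this
// handler, and the phoneme mapped to "ll" is appended before pop_in below consumes the letters.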
+ text->pop_in(ALPHABET); + return true; +} + +bool phonemizer::handle_punctuation(corpus* text, std::string next, std::string* output, conditions * flags) { + std::string last = text->last(); + std::string after = text->after(); + if (next[0] == '.') { + if (flags->was_punctuated_acronym) { + // we finished an acronym + flags->was_punctuated_acronym = false; + output->append(next); + text->pop(); + if (text->after(1, 2) == "'s") { + return handle_possession_plural(text, output, flags); + } + return true; + } + std::string chunk = text->next_in("."); + /*if (chunk.size() > 1) { + flags->pre_pause += 4; + }*/ + output->append(chunk); + text->size_pop(chunk.size()); + return true; + } else if (next == "'") { + if (flags->was_word && (after == "s" || !is_alphabetic(after[0]))) { + return handle_possession_plural(text, output, flags); + } else if (flags->was_word && (CONTRACTION_PHONEMES.find(after) != CONTRACTION_PHONEMES.end() || CONTRACTION_PHONEMES.find(text->after(next.size(), 2)) != CONTRACTION_PHONEMES.end())) { + return handle_contraction(text, output, flags); + } else { + // could be the end or start of a quote + text->pop(); + return true; + } + } else if (next[0] == '-') { + if (last == " " && after == " ") { + //flags->pre_pause += 4; + text->pop(2); + flags->reset_for_space(); + return true; + } else if (after[0] == '-') { + //flags->pre_pause += 4; + text->pop(2); + output->append(" "); + flags->reset_for_space(); + return true; + } else if (!flags->beginning_of_clause && flags->was_word && is_alphabetic(after[0])) { + flags->hyphenated = true; + text->pop(); + return true; + } else { + // ignore it + text->pop(); + return true; + } + } + else if (CLAUSE_BREAKS.find(next) != std::string::npos) { + output->append(next); + flags->reset_for_clause_end(); + text->pop(); + return true; + } else if (NOOP_BREAKS.find(next) != std::string::npos) { + output->append(next); + text->pop(); + return true; + } else if (REPLACEABLE.find(next) != REPLACEABLE.end()) { + return handle_replacement(text, next, output, flags); + } else { + // ignore it + text->pop(); + return true; + } +} + +bool phonemizer::route(corpus * text, std::string* output, conditions * flags) { + std::string next = text->next(); + if (next == "") { + // we finished lexing the corpus + return false; + } + if (SPACE_CHARACTERS.find(next) != std::string::npos) { + return handle_space(text, output, flags); + } else if (is_numeric(next[0])) { + return handle_numeric(text, output, flags); + } else if (is_alphabetic(next[0])) { + return handle_word(text, output, flags); + } else { + return handle_punctuation(text, next, output, flags); + } +} + +#ifdef ESPEAK_INSTALL +std::string phonemizer::espeak_text_to_phonemes(const char * text) { + int mode = phoneme_mode == IPA ? (0 << 8 | 0x02) : (0 << 8 | 0x01); + const void ** txt_ptr = (const void**)&text; + const char * resp = espeak_wrapper::get_instance()->text_to_phonemes(txt_ptr, espeakCHARS_UTF8, mode); + return strip(std::string(resp)); +} +#endif + +std::string phonemizer::text_to_phonemes(const char * text, size_t size) { + std::string output = ""; + if (mode == ESPEAK) { +#ifdef ESPEAK_INSTALL + auto parts = split(text, STOPPING_TOKENS, true); + std::string phonemes = ""; + for (int i = 0; i < parts.size(); i+=2) { + phonemes += espeak_text_to_phonemes(parts[i].c_str()); + if (preserve_punctuation && i + 1 < parts.size()) { + phonemes += parts[i+1]; + } + } + return phonemes; +#else + TTS_ABORT("%s attempted to run in espeak mode without espeak installed. 
\n", __func__); +#endif + } else { + text_to_phonemes(text, size, &output); + } + return output; +} + +std::string phonemizer::text_to_phonemes(std::string text) { + return text_to_phonemes(text.c_str(), text.size()); +} + +void phonemizer::text_to_phonemes(const char * text, size_t size, std::string* output) { + if (mode == ESPEAK) { +#ifdef ESPEAK_INSTALL + TTS_ABORT("%s attempted to run in espeak mode with output already defined. \n", __func__); +#else + TTS_ABORT("%s attempted to run in espeak mode without espeak installed. \n", __func__); +#endif + return; + } + corpus * corpus_text = new corpus(text, size); + conditions * flags = new conditions; + bool running = true; + while (running) { + running = route(corpus_text, output, flags); + } + delete corpus_text; + delete flags; +} + +void phonemizer::text_to_phonemes(std::string text, std::string* output) { + text_to_phonemes(text.c_str(), text.size(), output); +} + +struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta) { + struct single_pass_tokenizer * tokenizer = single_pass_tokenizer_from_gguf(meta); + word_phonemizer * wph = new word_phonemizer(tokenizer); + int rule_keys_key = gguf_find_key(meta, "phonemizer.rules.keys"); + int phoneme_key = gguf_find_key(meta, "phonemizer.rules.phonemes"); + if (rule_keys_key == -1 || phoneme_key == -1) { + TTS_ABORT("Both 'phonemizer.rules.keys' and 'phonemizer.rules.phonemes' keys must be set in order to support phonemization."); + } + int key_count = gguf_get_arr_n(meta, rule_keys_key); + assert(key_count == gguf_get_arr_n(meta, phoneme_key)); + for (int i = 0; i < key_count; i++) { + std::string rule_key = gguf_get_arr_str(meta, rule_keys_key, i); + std::string phoneme = gguf_get_arr_str(meta, phoneme_key, i); + wph->add_rule(split(rule_key, "."), phoneme); + } + return wph; +} + +dictionary_response * response_from_string(std::string value, std::string key) { + std::vector parts = split(value, ":"); + bool has_spacing = parts.size() > 1; + bool expects_to_be_proceeded_by_number = key[0] == '$'; + bool not_at_start = key[0] == '#'; + bool not_at_end = key.back() == '#'; + if (!has_spacing) { + dictionary_response * resp = new dictionary_response(SUCCESS, value); + resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number; + resp->not_at_clause_end = not_at_end; + resp->not_at_clause_start = not_at_start; + return resp; + } else { + dictionary_response * resp = new dictionary_response(SUCCESS_PARTIAL, parts[0]); + resp->after_match = parts[1]; + resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number; + resp->not_at_clause_end = not_at_end; + resp->not_at_clause_start = not_at_start; + return resp; + } +} + +struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta) { + struct phoneme_dictionary * dict = new phoneme_dictionary; + + int keys_key = gguf_find_key(meta, "phonemizer.dictionary.keys"); + int values_key = gguf_find_key(meta, "phonemizer.dictionary.values"); + if (keys_key == -1 || values_key == -1) { + TTS_ABORT("Both 'phonemizer.dictionary.keys' and 'phonemizer.dictionary.values' keys must be set in order to support phonemization."); + } + int key_count = gguf_get_arr_n(meta, keys_key); + assert(key_count == gguf_get_arr_n(meta, values_key)); + for (int i = 0; i < key_count; i++) { + std::string key = gguf_get_arr_str(meta, keys_key, i); + std::string values = gguf_get_arr_str(meta, values_key, i); + std::vector out; + for (std::string val : split(values, ",")) { + out.push_back(response_from_string(val, 
key)); + } + if (key[0] == '$' || key[0] == '#') { + key = key.substr(1); + } + if (key.back() == '#') { + key = key.substr(0, key.size() - 1); + } + dict->lookup_map[key] = out; + } + return dict; +} + +struct phonemizer * phonemizer_from_gguf(gguf_context * meta, const std::string espeak_voice_code) { + int mode_key = gguf_find_key(meta, "phonemizer.type"); + phonemizer * ph; + if (mode_key == -1) { + TTS_ABORT("Key 'phonemizer.type' must be specified in gguf file for all models using a phonemizer."); + } + uint32_t ph_type = gguf_get_val_u32(meta, mode_key); + + if ((phonemizer_type) ph_type == ESPEAK) { +#ifdef ESPEAK_INSTALL + espeak_wrapper::get_instance()->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, ESPEAK_DATA_PATH, 0); + + update_voice(espeak_voice_code); + + ph = new phonemizer(nullptr, nullptr); + ph->mode = ESPEAK; +#else + TTS_ABORT("%s attempted to load an espeak phonemizer without espeak installed. \n", __func__); +#endif + int phoneme_type_key = gguf_find_key(meta, "phonemizer.phoneme_type"); + if (phoneme_type_key != -1) { + uint32_t phoneme_typing = gguf_get_val_u32(meta, phoneme_type_key); + if ((phoneme_type)phoneme_typing == ESPEAK_PHONEMES) { + ph->phoneme_mode = ESPEAK_PHONEMES; + } + } + return ph; + } + struct word_phonemizer * phonetic_ph = word_phonemizer_from_gguf(meta); + struct phoneme_dictionary * dict = phoneme_dictionary_from_gguf(meta); + ph = new phonemizer(dict, phonetic_ph); + return ph; +} + +struct phonemizer * espeak_phonemizer(bool use_espeak_phonemes, std::string espeak_voice_code) { +#ifdef ESPEAK_INSTALL + espeak_wrapper::get_instance()->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, ESPEAK_DATA_PATH, 0); + + update_voice(espeak_voice_code); + + phonemizer * ph = new phonemizer(nullptr, nullptr); + ph->mode = ESPEAK; + if (use_espeak_phonemes) { + ph->phoneme_mode = ESPEAK_PHONEMES; + } + return ph; +#else + TTS_ABORT("%s attempted to load an espeak phonemizer without espeak installed. \n", __func__); +#endif +} + +struct phonemizer * phonemizer_from_file(const std::string fname, const std::string espeak_voice_code) { + ggml_context * weight_ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc =*/ false, + /*.ctx =*/ &weight_ctx, + }; + gguf_context * meta_ctx = gguf_init_from_file(fname.c_str(), params); + if (!meta_ctx) { + TTS_ABORT("%s failed for file %s\n", __func__, fname.c_str()); + } + return phonemizer_from_gguf(meta_ctx, espeak_voice_code); +} diff --git a/otherarch/ttscpp/src/sampler.cpp b/otherarch/ttscpp/src/sampler.cpp new file mode 100644 index 000000000..b2f2cc1b7 --- /dev/null +++ b/otherarch/ttscpp/src/sampler.cpp @@ -0,0 +1,204 @@ +#include "sampler.h" + +void sampler::sample(float * logits, std::vector & output_tokens) { + // assume that we are pointing to the start of the first token output; + if (!do_sample) { + return max(logits, output_tokens); + } + std::vector max_vals; + // the max_head_probs variable is only used when top-p is applied; it exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to be less than or + // equal to top_p; + std::vector max_head_probs; + + // This allows us to perform an effective softmax without logarithms or big number calculations. 
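// (i.e. we rely on the identity softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)),
// so every exponent is at most zero and expf cannot overflow; max_vals supplies the
// per-head max(x) that sampler::softmax subtracts.)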
+ // Additionally by avoiding large number division we drastically improve the stability of + // our softmax implementation; + max(logits, max_vals); + + std::vector> picks; + bool use_nucleus_sampling = false; + bool performed_softmax = false; + + if (top_p < 1.0) { + // if we are nucleus sampling via top-p then we need to perform softmax over the samples before getting top_k samples, so that we don't trim beyond top_p. + // Otherwise, if we are not performing top-p sampling then it is more efficient to perform softmax after getting the top_k nucleus. + softmax(logits, picks, max_vals); + performed_softmax = true; + } + if (top_k > 0 && top_k < vocab_size) { + picks = topk(logits, performed_softmax); + use_nucleus_sampling = true; + } + + if (top_p >= 1.0) { + softmax(logits, picks, max_vals); + performed_softmax = true; + } + + if (top_p < 1.0) { + topp(logits, picks, max_head_probs); + use_nucleus_sampling = true; + } + + bool has_repetition_penalty = repetition_penalty != 1.0; + if (has_repetition_penalty && (last_token_ids.size() == 0 || repetition_counts.size() == 0)) { + reset(); + } + std::minstd_rand gen(std::random_device{}()); + std::uniform_real_distribution dist(0.0f, 1.0f); + for (int i = 0; i < n_output_heads; i++) { + float assignment = top_p < 1.0 ? dist(gen) * max_head_probs[i] : dist(gen); + float cumulative = 0.0f; + for (uint32_t j = 0; j < (use_nucleus_sampling ? picks[i].size() : vocab_size); j++) { + int ii = use_nucleus_sampling ? (int) picks[i][j] : j; + cumulative += *(logits+(i*vocab_size+ii)); + // with top_k and top_p it is possible for the assignment to be greater than the cumulative value + if (assignment <= cumulative || ii >= vocab_size + 1 || j >= picks[i].size() - 1) { + if (has_repetition_penalty) { + if (last_token_ids[i] != ii) { + repetition_counts[i] = 0; + } + last_token_ids[i] = ii; + repetition_counts[i] += 1; + } + output_tokens.push_back(ii); + break; + } + } + } +} + +void sampler::reset() { + if (repetition_penalty != 1.0) { + last_token_ids.clear(); + repetition_counts.clear(); + for (int i = 0; i < n_output_heads; i++) { + last_token_ids.push_back(-1); + repetition_counts.push_back(0); + } + } +} + +void sampler::softmax(float * logits, std::vector> picks, std::vector max_indices) { + bool use_nucleus_sampling = picks.size() > 0; + bool has_repetition_penalty = repetition_penalty != 1.0f; + bool has_temperature = temperature != 1.0f; + for (int i = 0; i < n_output_heads; i++) { + float cumsum = 0.0; + float max_val = logits[i*vocab_size + max_indices[i]]; + if (has_repetition_penalty && last_token_ids[i] == max_indices[i]) { + max_val /= (pow(repetition_penalty, repetition_counts[i])); + } + if (has_temperature) { + max_val /= temperature; + } + for (int j = 0; j < (use_nucleus_sampling ? picks[i].size() : vocab_size); j++) { + int ii = use_nucleus_sampling ? (int) picks[i][j] : j; + int index = i * vocab_size + ii; + float v = *(logits + index); + if (has_repetition_penalty && last_token_ids[i] == ii) { + v /= (pow(repetition_penalty, repetition_counts[i])); + } + if (has_temperature) { + v /= temperature; + } + v = expf(v - max_val); + cumsum += v; + logits[index] = v; + } + for (int j = 0; j < (use_nucleus_sampling ? picks[i].size() : vocab_size); j++) { + int ii = use_nucleus_sampling ? 
picks[i][j] : j; + int index = i * vocab_size + ii; + float v = *(logits + index); + logits[index] = v / cumsum; + } + } +} + +void sampler::topp(float * logits, std::vector> & picks, std::vector & max_head_probs) { + if (picks.empty()) { + // we need to get the softmaxed logits ordered + for (int i = 0; i < n_output_heads; i++) { + std::vector head_picks(vocab_size); + iota(head_picks.begin(), head_picks.end(), 0); + // have to sort with repetition penalty applied so as not to inadvertently trim our nucleus size. + std::sort(head_picks.begin(), head_picks.end(), [&logits, &i, this](size_t s1, size_t s2) { + float v1 = logits[i*vocab_size+s1]; + float v2 = logits[i*vocab_size+s2]; + return v1 > v2; + }); + + picks.push_back(head_picks); + } + } + // if we didn't already perform topk or if the cumulative probability of the top-k logits is greater than top_p then we need to trim. + for (int i = 0; i < n_output_heads; i++) { + float prob_sum = 0.0f; + int trim_to = -1; + for (int ii = 0; ii < picks[i].size(); ii++) { + prob_sum += logits[i*vocab_size+picks[i][ii]]; + if (prob_sum >= top_p) { + trim_to = ii+1; + break; + } + } + max_head_probs.push_back(std::min(prob_sum, top_p)); + if (trim_to > 0) { + picks[i] = std::vector(picks[i].begin(), picks[i].begin()+trim_to); + } + } +} + +std::vector> sampler::topk(float * logits, bool performed_softmax) { + bool has_repetition_penalty = repetition_penalty != 1.0f; + std::vector> head_picks; + if (vocab_size < top_k) { + // technically we should never get here, but let's be protective. + for (int i = 0; i < n_output_heads; i++) { + std::vector picks(vocab_size); + iota(picks.begin(), picks.end(), 0); + head_picks.push_back(picks); + } + return head_picks; + } + for (int i = 0; i < n_output_heads; i++) { + std::vector picks(vocab_size); + iota(picks.begin(), picks.end(), 0); + // have to sort with repetition penalty applied so as not to inadvertently trim our nucleus size. + std::sort(picks.begin(), picks.end(), [&logits, &i, &has_repetition_penalty, &performed_softmax, this](size_t s1, size_t s2) { + float v1 = logits[i*vocab_size+s1]; + float v2 = logits[i*vocab_size+s2]; + if (!performed_softmax) { + if (has_repetition_penalty && last_token_ids[i] == s1) { + v1 /= (pow(repetition_penalty, repetition_counts[i])); + } else if (has_repetition_penalty && last_token_ids[i] == s2) { + v2 /= (pow(repetition_penalty, repetition_counts[i])); + } + } + return v1 > v2; + }); + head_picks.push_back(std::vector(picks.begin(), picks.begin() + top_k)); + } + return head_picks; +} + +void sampler::max(float * logits, std::vector & output_tokens) { + bool has_repetition_penalty = repetition_penalty != 1.0f; + for (int i = 0; i < n_output_heads; i++) { + float max = -INFINITY; + uint32_t token_id = 0; + for (uint32_t ii = 0; ii < vocab_size; ii++) { + float v = *(logits+i*vocab_size+ii); + // while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of + // the softmax function in which case it is possible for repetition counts to be set. 
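// For example, with repetition_penalty = 1.1 and repetition_counts[i] = 2, a logit of 3.0
// for the most recently sampled token becomes 3.0 / 1.1^2 ~= 2.48 before the comparison below.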
+ if (has_repetition_penalty && last_token_ids[i] == ii) { + v /= (pow(repetition_penalty, repetition_counts[i])); + } + if (v > max) { + max = v; + token_id = ii; + } + } + output_tokens.push_back(token_id); + } +} diff --git a/otherarch/ttscpp/src/sampler.h b/otherarch/ttscpp/src/sampler.h new file mode 100644 index 000000000..0b8941e4c --- /dev/null +++ b/otherarch/ttscpp/src/sampler.h @@ -0,0 +1,33 @@ +#ifndef sampler_h +#define sampler_h + +#include +#include +#include +#include +#include + +// currently this is only built to support single sequence output sampling without beam search. +struct sampler { + // These default configurations are based on the generation configuration for Parler TTS Mini (version 1.0) + uint32_t n_output_heads = 9; + uint32_t eos_token_id = 1024; + uint32_t vocab_size = 1088; + float temperature = 1.0f; + uint32_t top_k = 0; + float top_p = 1.0f; + float repetition_penalty = 1.0f; + std::vector last_token_ids; + std::vector repetition_counts; + bool do_sample = true; + bool apply_softmax = true; + + void sample(float * logits, std::vector & output_tokens); + void softmax(float * logits, std::vector> picks, std::vector max_indices); + void max(float * logits, std::vector & output_tokens); + std::vector> topk(float * logits, bool performed_softmax); + void topp(float * logits, std::vector> & picks, std::vector & max_head_probs); + void reset(); +}; + +#endif diff --git a/otherarch/ttscpp/src/snac_model.cpp b/otherarch/ttscpp/src/snac_model.cpp new file mode 100644 index 000000000..0f58c62e1 --- /dev/null +++ b/otherarch/ttscpp/src/snac_model.cpp @@ -0,0 +1,209 @@ +#include "snac_model.h" + +void snac_model::prep_constants(gguf_context * meta) { + int heads_key = gguf_find_key(meta, "snac.audio_token_channels"); + if (heads_key != -1) { + n_heads = gguf_get_val_u32(meta, heads_key); + } + + int sampling_factor_key = gguf_find_key(meta, "snac.up_sampling_factor"); + if (sampling_factor_key != -1) { + up_sampling_factor = gguf_get_val_u32(meta, sampling_factor_key); + } + + int max_gen_key = gguf_find_key(meta, "snac.max_generation_size"); + if (max_gen_key != -1) { + max_generation_size = gguf_get_val_u32(meta, max_gen_key); + } +} + +void snac_model::prep_layers(gguf_context * meta) { + for (int i = 0; i < n_heads; i++) { + quantizer_layers.push_back(general_neural_audio_codec::residual_vector_quantize_layer{}); + } + + for (int i = 0; i < n_layers; i++) { + std::string stride_key = "snac.snac_layer_stride_" + std::to_string(i); + std::string padding_key = "snac.snac_layer_padding_" + std::to_string(i); + std::string grouping_key = "snac.snac_layer_grouping_" + std::to_string(i); + int layer_stride_key = gguf_find_key(meta, stride_key.c_str()); + if (layer_stride_key == -1) { + TTS_ABORT("key %s must be specified in gguf file in order to initialize the SNAC audio decoder.", stride_key.c_str()); + } + int layer_padding_key = gguf_find_key(meta, padding_key.c_str()); + if (layer_padding_key == -1) { + TTS_ABORT("key %s must be specified in gguf file in order to initialize the SNAC audio decoder.", padding_key.c_str()); + } + int layer_grouping_key = gguf_find_key(meta, grouping_key.c_str()); + if (layer_grouping_key == -1) { + TTS_ABORT("key %s must be specified in gguf file in order to initialize the SNAC audio decoder.", grouping_key.c_str()); + } + layers.push_back( + general_neural_audio_codec::layer{ + gguf_get_val_u32(meta, layer_padding_key), + gguf_get_val_u32(meta, layer_stride_key), + gguf_get_val_u32(meta, layer_grouping_key) + } + ); + } +} + +void 
snac_model::assign_weight(std::string name, ggml_tensor * tensor) { + if (name == "alpha_out") { + snake_alpha = ggml_dup_tensor(ctx, tensor); + set_tensor(snake_alpha, tensor); + } else if (name == "in.weight") { + in_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(in_conv_kernel, tensor); + } else if (name == "in.bias") { + in_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(in_conv_bias, tensor); + } else if (name == "up.weight") { + up_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(up_conv_kernel, tensor); + } else if (name == "up.bias") { + up_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(up_conv_bias, tensor); + } else if (name == "final.weight") { + out_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(out_conv_kernel, tensor); + } else if (name == "final.bias") { + out_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(out_conv_bias, tensor); + } else if (has_prefix(name, "layers")) { + auto pair = parse_layer_count(name); + int l = pair.first; + std::string lt_name = pair.second; + general_neural_audio_codec::assign_to_layer((tts_model *) this, layers[l], lt_name, tensor); + } else if (has_prefix(name, "quantizers")) { + auto pair = parse_layer_count(name); + int l = pair.first; + std::string lt_name = pair.second; + general_neural_audio_codec::assign_to_quantize_layer((tts_model *) this, quantizer_layers[l], lt_name, tensor); + } +} + +static struct ggml_tensor * snac_build_audio_inputs(struct ggml_context * ctx, struct snac_context * sctx, size_t sequence_length, std::vector layers) { + struct ggml_tensor * embd; + // these divisors represent the discrete repeats performed against each of the three input heads. + sctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sequence_length / 4 + sequence_length / 2 + sequence_length); + ggml_set_input(sctx->inp_tokens); + size_t last_stride = 0; + for(int i = 0; i < sctx->model->n_heads; i++) { + auto quantize_layer = sctx->model->quantizer_layers[i]; + struct ggml_tensor * inp_head = ggml_cont(ctx, ggml_view_1d(ctx, sctx->inp_tokens, sequence_length / sctx->model->repeats[i], last_stride)); + last_stride += (sequence_length / sctx->model->repeats[i]) * ggml_element_size(sctx->inp_tokens); + struct ggml_tensor * code = general_neural_audio_codec::build_quantize_layer(ctx, inp_head, quantize_layer); + if (sctx->model->repeats[i] > 1) { + // this manipulation is equivalent to repeat_interleave against the first dimension of the tensor + code = ggml_repeat(ctx, ggml_cont_3d(ctx, code, 1, code->ne[0], code->ne[1]), ggml_new_tensor_3d(ctx, GGML_TYPE_F32, sctx->model->repeats[i], code->ne[0], sctx->model->embd)); + code = ggml_cont_2d(ctx, code, sequence_length, code->ne[2]); + } + if (i == 0) { + embd = code; + } else { + embd = ggml_add(ctx, embd, code); + } + } + return embd; +} + +snac_context * build_new_snac_context(struct snac_model * model, int n_threads, bool use_cpu) { + snac_context * sctx = new snac_context(model, n_threads); + if (!use_cpu) { +#ifdef GGML_USE_METAL + sctx->backend = ggml_backend_metal_init(); +#endif + } + sctx->backend_cpu = ggml_backend_cpu_init(); + sctx->set_threads(); + sctx->build_schedule(); + sctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false)); + return sctx; +} + +void snac_runner::prepare_post_load() { + ggml_cgraph * gf = build_snac_graph(model->max_generation_size); + sctx->prep_schedule(gf); +} + +struct 
ggml_cgraph * snac_runner::build_snac_graph(size_t sequence_length) { + init_build(); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + struct ggml_tensor * cur; + struct ggml_tensor * inputs; + + sctx->noise = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model->noise_steps_sum * sequence_length); + ggml_set_input(sctx->noise); + + inputs = snac_build_audio_inputs(ctx, sctx, sequence_length, model->quantizer_layers); + cur = ggml_conv_1d_dw_tts(ctx, model->in_conv_kernel, inputs, 1, 3, 1); + cur = ggml_add(ctx, cur, model->in_conv_bias); + cur = ggml_conv_1d_tts(ctx, model->up_conv_kernel, cur, 1, 0, 1); + cur = ggml_add(ctx, cur, model->up_conv_bias); + size_t noise_offset = 0; + for (int l = 0; l < model->layers.size(); l++) { + auto layer = model->layers[l]; + struct ggml_tensor * noise = ggml_cont(ctx, ggml_view_1d(ctx, sctx->noise, model->noise_steps[l] * sequence_length, noise_offset)); + noise_offset += model->noise_steps[l] * sequence_length * sizeof(float); + cur = general_neural_audio_codec::build_layer(ctx, cur, layer, noise); + } + cur = snake_1d(ctx, model->snake_alpha, cur); + cur = ggml_conv_1d_tts(ctx, model->out_conv_kernel, cur, 1, 3, 1); + cur = ggml_add(ctx, cur, model->out_conv_bias); + cur = ggml_tanh(ctx, cur); + ggml_build_forward_expand(gf, cur); + free_build(); + return gf; +} + +void snac_runner::set_inputs(std::vector> & tokens) { + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[0].data(), 0, + tokens[0].size()*ggml_element_size(sctx->inp_tokens) + ); + + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[1].data(), tokens[0].size() * ggml_element_size(sctx->inp_tokens), + tokens[1].size() * ggml_element_size(sctx->inp_tokens) + ); + + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[2].data(), + tokens[1].size()*ggml_element_size(sctx->inp_tokens)+tokens[0].size()*ggml_element_size(sctx->inp_tokens), + tokens[2].size()*ggml_element_size(sctx->inp_tokens) + ); + size_t sequence_length = tokens[2].size(); + random_normal_gen(model->noise_steps_sum * sequence_length, (float*) sctx->noise->data); +} + +void snac_runner::run(std::vector> & tokens, struct tts_response * outputs) { + size_t sequence_length = tokens[2].size(); + ggml_backend_sched_reset(sctx->sched); + + sctx->prep_output_buffer(model->max_generation_size * model->up_sampling_factor * sizeof(float)); + + outputs->data = sctx->logits; + ggml_backend_buffer_clear(sctx->buf_output, 0); + + struct ggml_cgraph * gf = NULL; + gf = build_snac_graph(sequence_length); + + // the output is always the last tensor in the graph + struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1]; + ggml_backend_sched_alloc_graph(sctx->sched, gf); + + set_inputs(tokens); + + ggml_backend_sched_graph_compute_async(sctx->sched, gf); + + sctx->get_ggml_node_data(result, outputs->data, sequence_length*sizeof(float)*model->up_sampling_factor); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sctx->sched); + outputs->n_outputs = sequence_length * model->up_sampling_factor; + return; +} + diff --git a/otherarch/ttscpp/src/snac_model.h b/otherarch/ttscpp/src/snac_model.h new file mode 100644 index 000000000..9450c1b75 --- /dev/null +++ b/otherarch/ttscpp/src/snac_model.h @@ -0,0 +1,86 @@ +#pragma once + +#include "general_neural_audio_codec.h" + +// SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC. 
+// The key differences are that it uses grouping in the residual units of its layers, +// performs a repeat_interleave over the second and third input channels, applies +// a noise convolutional layer after input encoding for each layer, and applies +// an extra convolutional layer before residual layers are applied. +struct snac_model : tts_model { + // general configuration from SNAC as used by Orpheus + uint32_t n_layers = 4; + uint32_t n_heads = 3; + uint32_t up_sampling_factor = 512; + uint32_t embd = 768; + size_t max_generation_size = 2580; + uint32_t repeats[3] = {4, 2, 1}; + // configuration for adding noise + uint32_t noise_steps[4] = {8, 64, 256, 512}; + uint32_t noise_steps_sum = 840; + bool use_noise = true; + + struct ggml_tensor * repeat_interleave_buffer; + + struct ggml_tensor * in_conv_kernel; + struct ggml_tensor * in_conv_bias; + struct ggml_tensor * up_conv_kernel; + struct ggml_tensor * up_conv_bias; + struct ggml_tensor * out_conv_kernel; + struct ggml_tensor * out_conv_bias; + struct ggml_tensor * snake_alpha; + std::vector layers; + std::vector quantizer_layers; + + void assign_weight(std::string name, ggml_tensor * weight); + void prep_constants(gguf_context * meta); + void prep_layers(gguf_context * meta); + void post_load_assign(); + void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) { + prep_layers(meta_ctx); + prep_constants(meta_ctx); + tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "snac"); + } +}; + +// the context used for running the snac model +struct snac_context : runner_context { + snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {}; + + struct snac_model * model; + + struct ggml_tensor * inp_tokens; + struct ggml_tensor * noise; + + void build_schedule() { + runner_context::build_schedule(model->max_nodes()); + } +}; + +snac_context * build_new_snac_context(struct snac_model * model, int n_threads, bool use_cpu = true); + +static struct ggml_tensor * snac_build_audio_inputs(struct ggml_context * ctx, struct snac_context * sctx, size_t sequence_length, std::vector layers); + +// This struct is intended to manage the snac model's graph compilation and compute function. 
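// A rough usage sketch (illustrative only: the model/context setup is condensed, and the
// token id type is assumed to be uint32_t from the I32 input tensor built above):
//   snac_context * sctx = build_new_snac_context(model, n_threads);
//   snac_runner runner(model, sctx);
//   runner.prepare_post_load();
//   std::vector<std::vector<uint32_t>> tokens(3); // one stream of codec ids per SNAC head
//   tts_response audio;
//   runner.run(tokens, &audio); // audio.data then holds audio.n_outputs upsampled samples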
+struct snac_runner : tts_runner { + snac_runner(snac_model * model, snac_context * context): model(model), sctx(context) {}; + ~snac_runner() { + if (ctx) { + ggml_free(ctx); + } + model->free(); + delete model; + delete sctx; + } + snac_model * model; + snac_context * sctx; + + void init_build() { + tts_runner::init_build(&sctx->buf_compute_meta); + } + + void set_inputs(std::vector> & tokens); + void prepare_post_load(); + struct ggml_cgraph * build_snac_graph(size_t sequence_length); + void run(std::vector> & tokens, struct tts_response * outputs); +}; diff --git a/otherarch/ttscpp/src/t5_encoder_model.cpp b/otherarch/ttscpp/src/t5_encoder_model.cpp new file mode 100644 index 000000000..2dbc7614d --- /dev/null +++ b/otherarch/ttscpp/src/t5_encoder_model.cpp @@ -0,0 +1,402 @@ +#include "t5_encoder_model.h" + +static const std::map T5_TENSOR_GGUF_LOOKUP = { + {"t5encoder.token_embd", T5_EMBD}, + {"t5encoder.enc.final_layer_norm", T5_NORM}, + {"t5encoder.down_proj", T5_DOWN_PROJ}, + {"t5encoder.down_proj_bias", T5_DOWN_PROJ_BIAS}, + {".attn_norm", T5_LAYER_ATTN_NORM}, + {".attn_q", T5_LAYER_ATTN_Q}, + {".attn_k", T5_LAYER_ATTN_K}, + {".attn_v", T5_LAYER_ATTN_V}, + {".attn_o", T5_LAYER_ATTN_O}, + {".attn_rel_b", T5_RELATIVE_BIAS}, + {".ffn_norm", T5_LAYER_OUT_NORM}, + {".ffn_gate", T5_LAYER_WI_1}, + {".ffn_down", T5_LAYER_WO}, + {".ffn_up", T5_LAYER_WI_0}, +}; + +void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name, ggml_tensor * tensor) { + try { + switch(T5_TENSOR_GGUF_LOOKUP.at(name)) { + case T5_LAYER_ATTN_NORM: + layer.attn_norm = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.attn_norm, tensor); + break; + case T5_LAYER_ATTN_Q: + layer.q = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.q, tensor); + break; + case T5_LAYER_ATTN_K: + layer.k = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.k, tensor); + break; + case T5_LAYER_ATTN_V: + layer.v = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.v, tensor); + break; + case T5_LAYER_ATTN_O: + layer.o = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.o, tensor); + break; + case T5_LAYER_OUT_NORM: + layer.mlp_norm = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.mlp_norm, tensor); + break; + case T5_LAYER_WI_1: + layer.wi_1 = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.wi_1, tensor); + break; + case T5_LAYER_WI_0: + layer.wi_0 = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.wi_0, tensor); + break; + case T5_LAYER_WO: + layer.wo = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(layer.wo, tensor); + break; + case T5_RELATIVE_BIAS: + model->relative_attn_bias = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->relative_attn_bias, tensor); + break; + default: + fprintf(stdout, "unassigned tensor %s\n", name.c_str()); + break; + } + } catch (const std::out_of_range& e) { + TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); + } +} + +void assign_to_t5_encoder(t5_encoder * model, const std::string name, ggml_tensor * tensor) { + if (tensor->data == NULL) { + return; + } + std::string::size_type pos = name.find(".", 0); + std::string top_level(name.substr(0, pos)); + if (T5_TENSOR_GGUF_LOOKUP.find(name) != T5_TENSOR_GGUF_LOOKUP.end()) { + switch (T5_TENSOR_GGUF_LOOKUP.at(name)) { + case T5_EMBD: + model->embd = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->embd, tensor); + break; + case T5_NORM: + model->out_norm = 
ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->out_norm, tensor); + break; + case T5_DOWN_PROJ: + model->down_proj = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->down_proj, tensor); + break; + case T5_DOWN_PROJ_BIAS: + model->down_proj_bias = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(model->down_proj_bias, tensor); + break; + default: + fprintf(stdout, "unassigned tensor %s\n", name.c_str()); + break; + } + } else if (top_level == "t5encoder") { + auto pair = parse_layer_count(name, 2); + int l = pair.first; + std::string lt_name = pair.second; + + assign_to_t5_layer(model, model->layers[l], lt_name, tensor); + } else { + return; + } +} + +void t5_encoder::prep_layers(gguf_context * meta) { + for (uint32_t i = 0; i < n_layers; i++) { + t5_layer l; + layers.push_back(l); + } +} + +void t5_encoder::prep_constants(gguf_context * meta) { + int n_layers_key = gguf_find_key(meta, "t5encoder.block_count"); + if (n_layers_key != -1) { + n_layers = gguf_get_val_u32(meta, n_layers_key); + } + + int hidden_size_key = gguf_find_key(meta, "t5encoder.embedding_length"); + if (hidden_size_key != -1) { + hidden_size = gguf_get_val_u32(meta, hidden_size_key); + } + + int attn_heads_key = gguf_find_key(meta, "t5encoder.attention.head_count"); + if (attn_heads_key != -1) { + n_attn_heads = gguf_get_val_u32(meta, attn_heads_key); + } + + int context_size_key = gguf_find_key(meta, "t5encoder.context_length"); + if (context_size_key != -1) { + max_context_length = gguf_get_val_u32(meta, context_size_key); + } + + int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id"); + if (bos_token_id_key != -1) { + bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); + } + + int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id"); + if (eos_token_id_key != -1) { + eos_token_id = gguf_get_val_u32(meta, eos_token_id_key); + } + + int vocab_size_key = gguf_find_key(meta, "t5encoder.vocab_size"); + if (vocab_size_key == -1) { + TTS_ABORT("key 't5encoder.vocab_size' must be specified in gguf file."); + } + vocab_size = gguf_get_val_u32(meta, vocab_size_key); + + int output_size_key = gguf_find_key(meta, "t5encoder.output_size"); + if (output_size_key != -1) { + output_size = gguf_get_val_u32(meta, output_size_key); + } +} + +void t5_encoder::assign_weight(std::string name, ggml_tensor * tensor) { + assign_to_t5_encoder(this, name, tensor); +} + +struct t5_context * build_new_t5_context(struct t5_encoder * model, int n_threads, bool use_cpu) { + t5_context * t5ctx = new t5_context(model, n_threads); + if (!use_cpu) { +#ifdef GGML_USE_METAL + t5ctx->backend = ggml_backend_metal_init(); +#endif + } + t5ctx->backend_cpu = ggml_backend_cpu_init(); + t5ctx->set_threads(); + t5ctx->build_schedule(); + t5ctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false)); + return t5ctx; +} + +static struct ggml_tensor * build_t5_norm(struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * weight) { + // this is static for all versions of t5 flan + float eps = 0.000001; + cur = ggml_rms_norm(ctx, cur, eps); + cur = ggml_mul(ctx, cur, weight); + return cur; +} + +static struct ggml_tensor * build_t5_attn_mask(ggml_context * ctx, struct t5_context *t5ctx, const t5_ubatch & batch) { + t5ctx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) batch.n_tokens, (int64_t) batch.n_tokens); + ggml_set_input(t5ctx->attn_mask); + + return t5ctx->attn_mask; +} + +static 
struct ggml_tensor * build_t5_pos_bias(ggml_context * ctx, struct ggml_tensor * pos_bucket, struct ggml_tensor * relative_attn_bias) { + struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); + struct ggml_tensor * pos_bias = ggml_get_rows(ctx, relative_attn_bias, pos_bucket_1d); + + pos_bias = ggml_view_3d(ctx, pos_bias, pos_bias->ne[0], pos_bucket->ne[0], pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * pos_bucket->ne[0], 0); + pos_bias = ggml_permute(ctx, pos_bias, 2, 1, 0, 3); + pos_bias = ggml_cont(ctx, pos_bias); + return pos_bias; +} + +t5_ubatch t5_runner::build_worst_case_batch() { + struct t5_ubatch batch; + batch.n_tokens = model->max_context_length; + return batch; +} + +void t5_runner::prepare_post_load() { + auto batch = build_worst_case_batch(); + auto gf = build_t5_graph(batch); + t5ctx->prep_schedule(gf); +} + +struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) { + init_build(); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + //ggml_set_input(t5ctx->positions); + + t5ctx->inp_pos_bucket = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, batch.n_tokens, batch.n_tokens); + ggml_set_input(t5ctx->inp_pos_bucket); + + t5ctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + ggml_set_input(t5ctx->inp_tokens); + + inpL = ggml_get_rows(ctx, model->embd, t5ctx->inp_tokens); + + struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch); + struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias); + + for (int l = 0; l < model->n_layers; l++) { + struct ggml_tensor * residual = inpL; + + cur = build_t5_norm(ctx, inpL, model->layers[l].attn_norm); + + struct ggml_tensor * attn_out; + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l].q, cur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l].k, cur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l].v, cur); + + Qcur = ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens); + Kcur = ggml_reshape_3d(ctx, Kcur, model->head_size, model->n_attn_heads, batch.n_tokens); + + struct ggml_tensor * q = ggml_permute(ctx, Qcur, 0, 2, 1, 3); + struct ggml_tensor * k = ggml_cont(ctx, ggml_permute(ctx, Kcur, 0, 2, 1, 3)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + kq = ggml_add(ctx, kq, pos_bias); + + kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f, 0.0f); + + struct ggml_tensor * v = ggml_cont_3d(ctx, ggml_transpose(ctx, Vcur), batch.n_tokens, model->head_size, model->n_attn_heads); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v); + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3); + attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens); + attn_out = ggml_mul_mat(ctx, model->layers[l].o, attn_out); + } + + cur = ggml_add(ctx, attn_out, residual); + struct ggml_tensor * residualmlp = cur; + + // mlp + { + cur = build_t5_norm(ctx, cur, model->layers[l].mlp_norm); + struct ggml_tensor * gate_proj = ggml_mul_mat(ctx, model->layers[l].wi_1, cur); + cur = ggml_mul(ctx, ggml_gelu(ctx, ggml_mul_mat(ctx, model->layers[l].wi_0, cur)), gate_proj); + cur = ggml_mul_mat(ctx, model->layers[l].wo, cur); + } + + cur = ggml_add(ctx, cur, 
residualmlp); + inpL = cur; + } + + cur = build_t5_norm(ctx, cur, model->out_norm); + + if (model->down_proj) { + cur = ggml_mul_mat(ctx, model->down_proj, cur); + } + + if (model->down_proj_bias) { + cur = ggml_add(ctx, cur, model->down_proj_bias); + } + + ggml_build_forward_expand(gf, cur); + + free_build(); + + return gf; +} + +void t5_runner::set_inputs(t5_ubatch & batch) { + ggml_backend_tensor_set(t5ctx->inp_tokens, batch.input_tokens, 0, batch.n_tokens*ggml_element_size(t5ctx->inp_tokens)); + float * attn_mask = nullptr; + uint32_t * positions = nullptr; + uint32_t * pos_bucket = nullptr; + attn_mask = (float *) t5ctx->attn_mask->data; + positions = (uint32_t *) t5ctx->positions->data; + pos_bucket = (uint32_t *) t5ctx->inp_pos_bucket->data; + int n_buckets = (int) model->relative_attn_buckets / 2; + int max_exact = (int) n_buckets / 2; + float logarithmic_denominator = log(128.0 / max_exact); + for (int i = 0; i < batch.n_tokens; i++) { + for (int ii = 0; ii < batch.n_tokens; ii++) { + int ab_rpos = abs(i - ii); + int rpos = i - ii; + attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f; + pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact)))); + } + } + +} + +void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs) { + t5_ubatch batch; + batch.input_tokens = input_tokens; + batch.n_tokens = sequence_length; + ggml_backend_sched_reset(t5ctx->sched); + + const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0; + const size_t new_size = model->max_context_length * model->output_size * sizeof(float); + + if (!t5ctx->buf_output || prev_size < new_size) { + if (t5ctx->buf_output) { + ggml_backend_buffer_free(t5ctx->buf_output); + t5ctx->buf_output = nullptr; + t5ctx->logits = nullptr; + } + + t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size); + } + + outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output); + ggml_backend_buffer_clear(t5ctx->buf_output, 0); + struct ggml_cgraph * gf = NULL; + gf = build_t5_graph(batch); + // the output is always the last tensor in the graph + struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1]; + ggml_backend_sched_alloc_graph(t5ctx->sched, gf); + set_inputs(batch); + + ggml_backend_sched_graph_compute_async(t5ctx->sched, gf); + + t5ctx->get_ggml_node_data(result, outputs->data, batch.n_tokens*sizeof(float)*model->output_size); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. 
+ ggml_backend_sched_reset(t5ctx->sched); + outputs->n_outputs = sequence_length; + outputs->hidden_size = model->output_size; + return; +} + +int t5_runner::generate(std::string prompt, tts_response *response) { + std::vector tokens; + tokenizer->tokenize(prompt, tokens); + tokens.push_back(model->eos_token_id); + run(tokens.data(), (uint32_t) tokens.size(), response); + return 0; +} + +struct t5_runner * text_encoder_from_file(std::string file_path, int n_threads, unigram_tokenizer * tokenizer, bool cpu_only) { + t5_encoder * model = new t5_encoder; + ggml_context * weight_ctx = NULL; + + struct gguf_init_params params = { + /*.no_alloc =*/ false, + /*.ctx =*/ &weight_ctx, + }; + gguf_context * meta_ctx = gguf_init_from_file(file_path.c_str(), params); + if (!meta_ctx) { + TTS_ABORT("%s failed for file %s\n", __func__, file_path.c_str()); + } + if (!tokenizer) { + tokenizer = unigram_tokenizer_from_gguf(meta_ctx); + } + if (!tokenizer->init) { + tokenizer->initialize_tokenizer(); + } + model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + + // TODO: change this weight assignment pattern to mirror llama.cpp + for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { + model->assign_weight(cur->name, cur); + } + + struct t5_context * t5ctx = build_new_t5_context(model, n_threads, cpu_only); + struct t5_runner * runner = new t5_runner(model, t5ctx, tokenizer); + runner->prepare_post_load(); + gguf_free(meta_ctx); + ggml_free(weight_ctx); + + return runner; +} diff --git a/otherarch/ttscpp/src/t5_encoder_model.h b/otherarch/ttscpp/src/t5_encoder_model.h new file mode 100644 index 000000000..9a801873d --- /dev/null +++ b/otherarch/ttscpp/src/t5_encoder_model.h @@ -0,0 +1,130 @@ +#ifndef t5_encoder_model_h +#define t5_encoder_model_h + +#include "tts_model.h" +#include "tokenizer.h" + + +enum t5_tensor { + T5_EMBD, + T5_NORM, + T5_DOWN_PROJ, + T5_DOWN_PROJ_BIAS, + T5_RELATIVE_BIAS, + T5_LAYER_ATTN_Q, + T5_LAYER_ATTN_K, + T5_LAYER_ATTN_V, + T5_LAYER_ATTN_O, + T5_LAYER_ATTN_NORM, + T5_LAYER_WI_0, + T5_LAYER_WI_1, + T5_LAYER_WO, + T5_LAYER_OUT_NORM, +}; + +struct t5_layer { + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; + struct ggml_tensor * o; + struct ggml_tensor * attn_norm; + struct ggml_tensor * wi_0; + struct ggml_tensor * wi_1; + struct ggml_tensor * wo; + struct ggml_tensor * mlp_norm; +}; + +// this struct maintains the static tensors for a t5_encoder model. +// the default configuration is copied from the standard configuration for +// flan-t5-xl. Note this model is slightly different from a standard t5 encoder. +// Specifically this model has a down projection which converts the text encoder's +// hidden size to the hidden size of the parler decoder. 
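// With the defaults below, each token representation leaves the encoder stack at
// hidden_size = 2048 and, when down_proj is present, is projected to output_size = 1536;
// i.e. an [n_tokens, 2048] activation becomes an [n_tokens, 1536] conditioning input
// (shapes written as [tokens, features] purely for illustration).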
+struct t5_encoder : tts_model {
+    // These configs are essentially built for the 44kHz 8kbps standard DAC model audio encoder and decoder
+    uint32_t n_layers = 24;
+    uint32_t n_attn_heads = 32;
+    uint32_t head_size = 64;
+    uint32_t hidden_size = 2048;
+    uint32_t relative_attn_buckets = 32;
+    uint32_t eos_token_id = 1;
+    uint32_t bos_token_id = 0;
+    uint32_t max_context_length = 512;
+    uint32_t output_size = 1536;
+    uint32_t vocab_size;
+
+    struct ggml_tensor * embd;
+    struct ggml_tensor * relative_attn_bias;
+    struct ggml_tensor * out_norm;
+    struct ggml_tensor * down_proj = nullptr;
+    struct ggml_tensor * down_proj_bias = nullptr;
+    std::vector<t5_layer> layers;
+
+    void assign_weight(std::string name, ggml_tensor * tensor);
+    void prep_layers(gguf_context * meta);
+    void prep_constants(gguf_context * meta);
+    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only = true) {
+        prep_constants(meta_ctx);
+        prep_layers(meta_ctx);
+        tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "t5encoder", 1.25);
+    }
+};
+
+// For assigning weights from gguf file to local model.
+void assign_to_t5_encoder(t5_encoder * model, const std::string name, ggml_tensor * tensor);
+void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name, ggml_tensor * tensor);
+
+struct t5_context : runner_context {
+    t5_context(t5_encoder * model, int n_threads): runner_context(n_threads), model(model) {};
+
+    struct t5_encoder * model;
+
+    struct ggml_tensor * inp_tokens;
+    struct ggml_tensor * positions;
+    struct ggml_tensor * attn_mask;
+    struct ggml_tensor * inp_pos_bucket;
+
+    void build_schedule() {
+        runner_context::build_schedule(model->max_nodes());
+    }
+};
+
+struct t5_context * build_new_t5_context(struct t5_encoder * model, int n_threads, bool use_cpu = true);
+
+struct t5_ubatch {
+    size_t n_tokens; // the number of tokens in our encoded sequence
+    uint32_t * input_tokens; // [n_tokens]
+};
+
+static struct ggml_tensor * build_t5_norm(struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * weight);
+static struct ggml_tensor * build_t5_attn_mask(ggml_context * ctx, struct t5_context *t5ctx, const t5_ubatch & batch);
+
+// This struct is intended to manage the t5 encoder model's graph compilation and compute function.
+struct t5_runner : tts_runner {
+    t5_runner(t5_encoder * model, t5_context * context, unigram_tokenizer * tokenizer): model(model), t5ctx(context), tokenizer(tokenizer) {};
+    ~t5_runner() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+        model->free();
+        delete model;
+        delete t5ctx;
+    }
+    struct unigram_tokenizer * tokenizer;
+    t5_encoder * model;
+    t5_context * t5ctx;
+
+    void init_build() {
+        tts_runner::init_build(&t5ctx->buf_compute_meta);
+    }
+
+    void prepare_post_load();
+    struct t5_ubatch build_worst_case_batch();
+    void set_inputs(t5_ubatch & batch);
+    struct ggml_cgraph * build_t5_graph(t5_ubatch & batch);
+    void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs);
+    int generate(std::string prompt, struct tts_response * response);
+};
+
+struct t5_runner * text_encoder_from_file(std::string file_path, int n_threads, unigram_tokenizer * tokenizer, bool cpu_only = true);
+
+#endif
diff --git a/otherarch/ttscpp/src/tokenizer.cpp b/otherarch/ttscpp/src/tokenizer.cpp
new file mode 100644
index 000000000..9b870d44a
--- /dev/null
+++ b/otherarch/ttscpp/src/tokenizer.cpp
@@ -0,0 +1,331 @@
+#include "tokenizer.h"
+
+void token_trie::add(const std::string & gram, uint32_t token) {
+    _add(gram, token, 0);
+}
+
+void token_trie::_add(const std::string & gram, uint32_t new_token, size_t index) {
+    if (index >= gram.size()) {
+        has_value = true;
+        token = new_token;
+        return;
+    }
+    const char c = gram[index];
+    auto res = children.find(c);
+    if (res != children.end()) {
+        res->second._add(gram, new_token, index + 1);
+    } else {
+        struct token_trie nt{};
+        nt._add(gram, new_token, index + 1);
+        children[c] = nt;
+    }
+}
+
+const struct token_trie * token_trie::traverse(const char c) const {
+    auto res = children.find(c);
+    if (res != children.end()) {
+        return &res->second;
+    }
+
+    return NULL;
+}
+
+size_t unicode_len_utf8_tts(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
+void unigram_tokenizer::initialize_tokenizer() {
+    for (const auto it : vocab) {
+        root_trie.add(it.first, it.second);
+    }
+    init = true;
+}
+
+// the general approach here is to find the character grams that sum to the max possible value over the entire text sequence.
+// The particular algorithm used here effectively works by walking the text and at each index storing the max value of all possible gram combinations;
+// we can then reverse that sequence to pick the best possible tokens.
+void unigram_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & tokens) {
+    if (!init) {
+        TTS_ABORT("Error: Tokenizer must be initialized before #tokenize is called.\n");
+    }
+    // the parler tokenizer's normalizer (i.e. the bert normalizer implemented by huggingface tokenizers libs) only deduplicates and strips extra spaces and
+    // optionally handles chinese characters and accents (neither of which are currently supported here).
+    std::string normalized = text;
+    if (dedupe_spaces) {
+        normalized = " " + std::regex_replace(text, duped_spaces, " ");
+    }
+
+    size_t text_length = normalized.size();
+
+    // initialize score_sum to neg infinity so it will always be lower than sums of token scores
+    std::vector<result> results(text_length + 1, {unk_token, 0, -INFINITY});
+    results[0] = { unk_token, 0, 0 };
+
+    size_t offset = 0;
+
+    while (offset < text_length) {
+        size_t current_offset = offset;
+        // pulled this directly from llama.cpp; I suspect that this is for handling of non-utf8 steps (to be marked as unknown tokens)
+        size_t n_utf8_code_units = std::min(unicode_len_utf8_tts(normalized[offset]), text_length - offset);
+
+        bool found_unknown = true;
+        const struct result & current_best = results[offset];
+
+        // find the current branch in the trie
+        const struct token_trie * node = root_trie.traverse(normalized[current_offset++]);
+        // search for the next token
+        while (current_offset <= text_length && node != NULL) {
+            // check if this is a complete token (it could just be an unknown step between two tokens).
+            if (node->has_value) {
+                // check if it corresponds to the whole utf8 step
+                if (current_offset - offset == n_utf8_code_units) {
+                    found_unknown = false;
+                }
+                float score = current_best.score + scores[node->token];
+                struct result & current_champ = results[current_offset];
+                if (score > current_champ.score) {
+                    struct result challenger = { node->token, offset, score };
+                    current_champ = challenger;
+                }
+            }
+            node = node->traverse(normalized[current_offset++]);
+        }
+
+        // if we found an unknown token, process it
+        if (found_unknown) {
+            current_offset = offset + n_utf8_code_units;
+            struct result & current_champ = results[current_offset];
+            float score = current_best.score + unk_token_score;
+            if (score > current_champ.score) {
+                struct result challenger = { unk_token, offset, score };
+                current_champ = challenger;
+            }
+        }
+
+        // move one utf8 step
+        offset += n_utf8_code_units;
+    }
+
+    // if we have more than one unknown token in a row, we can join them.
+    bool is_prev_unknown = false;
+    // iterate from the last result backwards and get the best performing tokens
+    for (struct result & result = results[text_length]; ; result = results[result.offset]) {
+        bool is_unknown = result.token == unk_token;
+        if (!(is_prev_unknown && is_unknown)) {
+            tokens.push_back(result.token);
+        }
+        if (result.offset == 0) {
+            break;
+        }
+        is_prev_unknown = is_unknown;
+    }
+
+    // reverse the tokens since we added tokens starting from the end of the input
+    std::reverse(tokens.begin(), tokens.end());
+}
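Before the GGUF loading path, a minimal usage sketch of the tokenizer above. The vocabulary, ids, and scores here are toy values, not from any real model:

    // Toy example: "ab" (score -0.5) should beat "a" + "b" (-1.0 + -1.0) in the DP.
    std::unordered_map<std::string, uint32_t> vocab = {{"a", 0}, {"b", 1}, {"ab", 2}};
    std::vector<float> scores = {-1.0f, -1.0f, -0.5f};
    unigram_tokenizer tok(vocab, /*unk_token=*/3, /*unk_token_score=*/-10.0f, scores);
    tok.dedupe_spaces = false; // skip the leading-space normalization for this toy input
    tok.initialize_tokenizer();
    std::vector<uint32_t> ids;
    tok.tokenize("ab", ids); // ids == {2}: the single "ab" token wins over "a" + "b"

+
+// loading the vocab to the tokenizer from gguf file.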
+unigram_tokenizer * unigram_tokenizer_from_gguf(gguf_context * meta) {
+    std::unordered_map<std::string, uint32_t> vocab;
+    std::vector<float> scores;
+    int vocab_key = gguf_find_key(meta, "tokenizer.ggml.tokens");
+    int vocab_size = gguf_get_arr_n(meta, vocab_key);
+    scores.reserve(vocab_size);
+    for (int i = 0; i < vocab_size; i++) {
+        std::string val = gguf_get_arr_str(meta, vocab_key, i);
+        vocab[val] = (uint32_t) i;
+    }
+    int scores_key = gguf_find_key(meta, "tokenizer.ggml.scores");
+    int scores_size = gguf_get_arr_n(meta, scores_key);
+    assert(scores_size == vocab_size);
+    float * data = (float*) gguf_get_arr_data(meta, scores_key);
+    for (int i = 0; i < scores_size; i++) {
+        scores.push_back(data[i]);
+    }
+    int unknown_token_key = gguf_find_key(meta, "tokenizer.ggml.unknown_token_id");
+    uint32_t token = gguf_get_val_u32(meta, unknown_token_key);
+
+    auto tokenizer = new unigram_tokenizer(vocab, token, scores[token], scores);
+
+    int eos_token_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
+    if (eos_token_key != -1) {
+        tokenizer->eos_token = gguf_get_val_u32(meta, eos_token_key);
+    }
+    return tokenizer;
+}
+
+void single_pass_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & token_ids) {
+    std::string remaining = text;
+    while (remaining.size() > 0) {
+        uint32_t token_id = unknown_id;
+        for (int i = 1; i < std::min(remaining.size()+1, max_size+1); i++) {
+            std::string part = remaining.substr(0, i);
+            ptrdiff_t pos = std::distance(tokens.begin(), std::find(tokens.begin(), tokens.end(), part));
+            if (pos < tokens.size()) {
+                token_id = (uint32_t) pos;
+                remaining = remaining.substr(part.size(), remaining.size() - part.size());
+                break;
+            }
+        }
+        if (token_id == unknown_id) {
+            remaining = remaining.substr(1, remaining.size() - 1);
+        }
+        token_ids.push_back(token_id);
+    }
+}
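A quick sketch of the scan above, with an illustrative grapheme vocabulary (ids are simply positions in the vector passed to the constructor):

    single_pass_tokenizer spt({"ch", "a", "t"});
    std::vector<uint32_t> ids;
    spt.tokenize("chat", ids); // {0, 1, 2}: "ch" matches at length 2, then "a", then "t"
    // Bytes with no matching prefix are consumed one at a time and emitted as unknown_id.

Note that the loop starts at length 1 and stops at the first hit, so tokenize prefers the shortest matching prefix, while token_split below keeps extending to the longest match in the vocabulary.

+
+void single_pass_tokenizer::token_split(const std::string & text, std::vector<std::string> & tokens) {
+    std::string remaining = text;
+    while (remaining.size() > 0) {
+        // String copying is much slower than using a std::string_view, but the former is simpler to implement for now.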
+        std::string token = remaining.substr(0, 1);
+        for (int i = 1; i < remaining.size(); i++) {
+            std::string part = remaining.substr(0, i+1);
+            if (token_vocab.find(part) == token_vocab.end()) {
+                break;
+            }
+            token = part;
+        }
+        tokens.push_back(token);
+        remaining = remaining.substr(token.size(), remaining.size() - token.size());
+    }
+}
+
+struct single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * meta, std::string key_name) {
+    int tokens_key = gguf_find_key(meta, key_name.c_str());
+    if (tokens_key == -1) {
+        TTS_ABORT("The '%s' key must be set in order to support single pass tokenization.", key_name.c_str());
+    }
+    std::vector<std::string> tokens;
+    int token_count = gguf_get_arr_n(meta, tokens_key);
+    for (int i = 0; i < token_count; i++) {
+        tokens.push_back(gguf_get_arr_str(meta, tokens_key, i));
+    }
+    return new single_pass_tokenizer(tokens);
+}
+
+void bpe_symbol::add_merges(std::priority_queue<bpe_merge, std::vector<bpe_merge>, bpe_merge_comp> & merges, std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map, bool only_forward) {
+    if (!only_forward && last) {
+        auto rid = std::make_pair(last->as_str(), as_str());
+        if (rank_map.find(rid) != rank_map.end()) {
+            bpe_merge m{last, this, rank_map[rid], last->size + size};
+            merges.push(m);
+        }
+    }
+
+    if (next) {
+        auto rid = std::make_pair(as_str(), next->as_str());
+        if (rank_map.find(rid) != rank_map.end()) {
+            bpe_merge m{this, next, rank_map[rid], size + next->size};
+            merges.push(m);
+        }
+    }
+}
+
+std::string bpe_symbol::as_str() {
+    return std::string(token, size);
+}
+
+bool bpe_merge_comp::operator() (const bpe_merge & a, const bpe_merge & b) {
+    return a.rank > b.rank || (a.rank == b.rank && a.a && b.a && a.a->pos > b.a->pos);
+}
+
+size_t pair_hash::operator() (const std::pair<std::string, std::string> & p) const {
+    return std::hash<std::string>{}(p.first) ^ (std::hash<std::string>{}(p.second) << 1);
+}
+
+bpe_symbol * bpe_merge::merge() {
+    a->size += b->size;
+    b->size = -1;
+    a->next = b->next;
+    if (a->next) {
+        a->next->last = a;
+    }
+    return a;
+}
+
+void pair_builder::join_pairs(std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map) {
+    std::priority_queue<bpe_merge, std::vector<bpe_merge>, bpe_merge_comp> merges;
+    for (auto part : parts) {
+        part->add_merges(merges, rank_map, true);
+    }
+    while (!merges.empty()) {
+        auto m = merges.top();
+        merges.pop();
+        if (m.a->size > 0 && m.b->size > 0 && m.new_size == m.a->size + m.b->size) {
+            m.merge();
+            m.a->add_merges(merges, rank_map);
+        }
+    }
+}
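A toy walk-through of the merge loop above. The ranks are made up for illustration; lower rank merges first, and the stale-entry guard (the new_size check) skips queue entries whose symbols were already consumed:

    #include <cstdio>

    // Merge "l"+"o" (rank 0) before "h"+"e" (rank 1); "el" has no rank, so it never merges.
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> ranks;
    ranks[{"l", "o"}] = 0;
    ranks[{"h", "e"}] = 1;
    pair_builder pb("helo");
    pb.join_pairs(ranks);
    for (bpe_symbol * s = pb.parts[0]; s; s = s->next) {
        printf("%s\n", s->as_str().c_str()); // prints "he" then "lo"
    }

+void bpe_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & token_ids) {
+    std::vector<std::string> chunks = split(text, " ", true);
+    bool space_prior = false;
+    for (auto chunk : chunks) {
+        if (chunk != " ") {
+            bpe_tokenize(space_prior ? "Ġ" + chunk : chunk, token_ids);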
"Ġ" + chunk : chunk, token_ids); + } else { + space_prior = true; + } + } +} + +void bpe_tokenizer::bpe_tokenize(std::string chunk, std::vector & token_ids) { + if (tokens_to_ids.find(chunk) != tokens_to_ids.end()) { + token_ids.push_back(tokens_to_ids[chunk]); + return; + } + auto pb = pair_builder{chunk}; + pb.join_pairs(ranks); + bpe_symbol * next = pb.parts[0]; + while (next) { + token_ids.push_back(tokens_to_ids[next->as_str()]); + next = next->next; + } +} + +bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name) { + int vocab_key = gguf_find_key(meta, (base_name + ".tokens").c_str()); + if (vocab_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".tokens").c_str()); + } + int merges_key = gguf_find_key(meta, (base_name + ".merges").c_str()); + if (merges_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".merges").c_str()); + } + int eos_token_id_key = gguf_find_key(meta, (base_name + ".eos_token_id").c_str()); + if (eos_token_id_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".eos_token_id").c_str()); + } + int bos_token_id_key = gguf_find_key(meta, (base_name + ".bos_token_id").c_str()); + if (bos_token_id_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".bos_token_id").c_str()); + } + + uint32_t bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); + uint32_t eos_token_id = gguf_get_val_u32(meta, eos_token_id_key); + + std::unordered_map vocab; + int token_count = gguf_get_arr_n(meta, vocab_key); + for (int i = 0; i < token_count; i++) { + vocab[gguf_get_arr_str(meta, vocab_key, i)] = (uint32_t) i; + } + + std::unordered_map, int, pair_hash> ranks; + int merge_count = gguf_get_arr_n(meta, merges_key); + + for (int i = 0; i < merge_count; i++) { + auto raw_merge = gguf_get_arr_str(meta, merges_key, i); + std::vector pair = split(raw_merge, " "); + if (pair.size() != 2) { + TTS_ABORT("Invalid pair, '%s', found in BPE merges, '%s', at index %d.", raw_merge, (base_name + ".merges").c_str(), i); + } + ranks[std::make_pair<>(pair[0], pair[1])] = i; + } + + return new bpe_tokenizer(vocab, ranks, bos_token_id, eos_token_id); +} diff --git a/otherarch/ttscpp/src/tokenizer.h b/otherarch/ttscpp/src/tokenizer.h new file mode 100644 index 000000000..f1bd01b11 --- /dev/null +++ b/otherarch/ttscpp/src/tokenizer.h @@ -0,0 +1,154 @@ +#ifndef tokenizer_h +#define tokenizer_h + +#include +#include +#include +#include +#include +#include +#include "ttsutil.h" + +struct token_trie { + bool has_value = false; + uint32_t token; + std::map children; + + void add(const std::string & gram, uint32_t token); + void _add(const std::string & gram, uint32_t new_token, size_t index); + const struct token_trie * traverse(const char c) const; +}; + +static std::regex duped_spaces("\\s{2,}"); +static std::regex spaces("\\s"); + +struct result { + uint32_t token; + size_t offset; + float score; +}; + +// much of this is implemented in llama.cpp, but in order to simplify this for my use case, I reimplementing here. +// There are several important simplifications here: +// 1. I only implement unigram tokenization +// 2. 
+struct unigram_tokenizer {
+    unigram_tokenizer(std::unordered_map<std::string, uint32_t> vocab, uint32_t unk_token, float unk_token_score, std::vector<float> scores): vocab(vocab), unk_token(unk_token), unk_token_score(unk_token_score), scores(scores) {};
+    ~unigram_tokenizer() = default;
+
+    std::unordered_map<std::string, uint32_t> vocab;
+    std::vector<float> scores;
+    struct token_trie root_trie;
+    uint32_t unk_token;
+    float unk_token_score;
+    uint32_t eos_token = 1;
+    bool dedupe_spaces = true;
+    bool init = false;
+
+    void initialize_tokenizer();
+    void tokenize(const std::string & text, std::vector<uint32_t> & tokens);
+};
+
+// For initializing a new tokenizer from a gguf file meta
+unigram_tokenizer * unigram_tokenizer_from_gguf(gguf_context * meta);
+
+// While this functions like a tokenizer, no token ids are assigned, as the ids never need to be used in the context in which this is
+// currently being used. This tokenizer pattern is currently used by the phonemizer to break up a word into its relevant graphemes.
+// As such, only the graphemes need to be returned.
+struct single_pass_tokenizer {
+    single_pass_tokenizer(std::vector<std::string> tkns): tokens(tkns) {
+        max_size = 0;
+        for (auto token : tkns) {
+            token_vocab.insert(token);
+            if (token.size() > max_size) {
+                max_size = token.size();
+            }
+        }
+    }
+    size_t max_size;
+    uint32_t unknown_id = 0;
+    std::vector<std::string> tokens;
+    std::unordered_set<std::string> token_vocab;
+    void tokenize(const std::string & text, std::vector<uint32_t> & token_ids);
+    void token_split(const std::string & text, std::vector<std::string> & tokens);
+};
+
+single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * meta, std::string key_name = "phonemizer.graphemes");
+
+struct bpe_symbol;
+
+struct bpe_merge {
+    bpe_symbol * a;
+    bpe_symbol * b;
+    int rank;
+    int new_size;
+
+    bpe_symbol * merge();
+};
+
+struct bpe_merge_comp {
+    bool operator() (const bpe_merge & a, const bpe_merge & b);
+};
+
+struct pair_hash {
+    size_t operator() (const std::pair<std::string, std::string> & p) const;
+};
+
+struct bpe_symbol {
+    bpe_symbol(const char * token): token(token) {};
+    const char* token;
+    int size = 1;
+    int pos;
+    bpe_symbol * next = nullptr;
+    bpe_symbol * last = nullptr;
+
+    void add_merges(std::priority_queue<bpe_merge, std::vector<bpe_merge>, bpe_merge_comp> & merges, std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map, bool only_forward = false);
+    std::string as_str();
+};
+
+struct pair_builder {
+    pair_builder(std::string word) {
+        bpe_symbol * last = nullptr;
+        for (int i = 0; i < word.size(); i++) {
+            int increment = 0;
+            // make sure we process each utf-8 character.
+            while (i + increment + 1 < word.size() && (word[i+increment+1] & 0b11000000) == 0b10000000) {
+                ++increment;
+            }
+            bpe_symbol * part = new bpe_symbol(word.data()+i);
+            part->pos = i;
+            part->size += increment;
+            i += increment;
+            if (last) {
+                last->next = part;
+                part->last = last;
+            }
+            last = part;
+            parts.push_back(part);
+        }
+    }
+
+    ~pair_builder() {
+        for (auto p : parts) {
+            delete p;
+        }
+    }
+
+    void join_pairs(std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map);
+    std::vector<bpe_symbol *> parts;
+};
+
+struct bpe_tokenizer {
+    bpe_tokenizer(std::unordered_map<std::string, uint32_t> & tokens_to_ids, std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & ranks, uint32_t bos, uint32_t eos): tokens_to_ids(tokens_to_ids), ranks(ranks), eos_token_id(eos), bos_token_id(bos) {};
+    std::unordered_map<std::string, uint32_t> tokens_to_ids;
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> ranks;
+    uint32_t eos_token_id;
+    uint32_t bos_token_id;
+
+    void tokenize(const std::string & text, std::vector<uint32_t> & token_ids);
+    void bpe_tokenize(std::string chunk, std::vector<uint32_t> & token_ids);
+};
+
+bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name = "tokenizer.ggml");
+
+#endif
diff --git a/otherarch/ttscpp/src/tts.cpp b/otherarch/ttscpp/src/tts.cpp
new file mode 100644
index 000000000..7cc0b6c0e
--- /dev/null
+++ b/otherarch/ttscpp/src/tts.cpp
@@ -0,0 +1,445 @@
+#include "tts.h"
+#include <array>
+#include <fstream>
+#include <mutex>
+#include <thread>
+
+// A list of all of the top level GGUF names under kokoro.duration_predictor that have quantization compatible tensors.
+static constexpr std::array<const char *, 5> DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS = {
+    "duration_proj",
+    "encode",
+    "shared_lstm",
+    "duration_lstm",
+    "layers"
+};
+
+struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
+    orpheus_model * model = new orpheus_model;
+    snac_model * audio_model = new snac_model;
+    bpe_tokenizer * bt = bpe_tokenizer_from_gguf(meta_ctx);
+    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    sampler * samp = new sampler;
+    snac_context * sctx = build_new_snac_context(audio_model, n_threads, cpu_only);
+    snac_runner * audio_decoder = new snac_runner(audio_model, sctx);
+    orpheus_context * octx = build_new_orpheus_context(model, n_threads, cpu_only);
+    orpheus_kv_cache * cache = new orpheus_kv_cache;
+    orpheus_runner * runner = new orpheus_runner(model, audio_decoder, octx, bt, samp, cache);
+
+    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
+        runner->assign_weight(cur->name, cur);
+    }
+
+    runner->prepare_post_load();
+
+    gguf_free(meta_ctx);
+    ggml_free(weight_ctx);
+    runner->arch = arch;
+
+    return (tts_runner*)runner;
+}
+
+struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
+    parler_tts_model * model = new parler_tts_model;
+    dac_model * audio_model = new dac_model;
+    unigram_tokenizer * ut = unigram_tokenizer_from_gguf(meta_ctx);
+    ut->initialize_tokenizer();
+    model->use_cross_attn = config->use_cross_attn;
+    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    struct sampler * samp = new sampler;
+    struct dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only);
+    struct dac_runner * audio_decoder = new dac_runner(audio_model, dctx);
+    struct parler_context * pctx = build_new_parler_context(model, n_threads, cpu_only);
+    struct parler_kv_cache * cache = new parler_kv_cache;
+    struct parler_tts_runner * runner = new parler_tts_runner(model, audio_decoder, pctx, ut, samp, cache);
+
+    // TODO: change this weight assignment pattern to mirror llama.cpp
+    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
+        runner->assign_weight(cur->name, cur);
+    }
+
+    if (config->use_cross_attn) {
+        runner->model->prep_cross_key_values(n_threads);
+    }
+
+    runner->prepare_post_load();
+
+    gguf_free(meta_ctx);
+    ggml_free(weight_ctx);
+    runner->arch = arch;
+
+    return (tts_runner*)runner;
+}
+
+struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
+    kokoro_model * model = new kokoro_model;
+    single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
+    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
+    struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
+    struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
+    // if an espeak voice id wasn't explicitly set, infer it from the first character of the kokoro voice, falling back to American English.
+    std::string espeak_voice_id = config->espeak_voice_id;
+    if (espeak_voice_id.empty()) {
+        espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
+    }
+    struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
+    struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);
+
+    // TODO: change this weight assignment pattern to mirror llama.cpp
+    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
+        runner->assign_weight(cur->name, cur);
+    }
+
+    runner->prepare_post_load();
+
+    gguf_free(meta_ctx);
+    ggml_free(weight_ctx);
+    runner->arch = arch;
+
+    return (tts_runner*)runner;
+}
+
+struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
+    dia_model * model = new dia_model;
+    dac_model * audio_model = new dac_model;
+    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    struct sampler * samp = new sampler;
+    struct dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only);
+    struct dac_runner * audio_decoder = new dac_runner(audio_model, dctx);
+    struct dia_context * diactx = build_new_dia_context(model, n_threads, cpu_only);
+    struct dia_kv_cache * cache = new dia_kv_cache;
+    struct dia_runner * runner = new dia_runner(model, audio_decoder, diactx, samp, cache);
+
+    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
+        runner->assign_weight(cur->name, cur);
+    }
+
+    runner->prepare_post_load();
+
+    gguf_free(meta_ctx);
+    ggml_free(weight_ctx);
+    runner->arch = arch;
+
+    return (tts_runner*)runner;
+}
+
+// currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
+struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only) {
+    ggml_context * weight_ctx = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc =*/ false,
+        /*.ctx =*/ &weight_ctx,
+    };
+    gguf_context * meta_ctx = gguf_init_from_file(fname.c_str(), params);
+    if (!meta_ctx) {
+        TTS_ABORT("%s failed for file %s\n", __func__, fname.c_str());
+    }
+    int arch_key = gguf_find_key(meta_ctx, "general.architecture");
+    if (arch_key == -1) {
+        TTS_ABORT("%s failed for file %s. No architecture is set.\n", __func__, fname.c_str());
+    }
+    std::string arch = std::string(gguf_get_val_str(meta_ctx, arch_key));
+    if (SUPPORTED_ARCHITECTURES.find(arch) == SUPPORTED_ARCHITECTURES.end()) {
+        TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str());
+    }
+    tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch);
+    switch(arch_type) {
+        case PARLER_TTS_ARCH:
+            return parler_tts_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
+        case KOKORO_ARCH:
+            return kokoro_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
+        case DIA_ARCH:
+            return dia_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
+        case ORPHEUS_ARCH:
+            return orpheus_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
+        default:
+            TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str());
+    }
+}
+
+int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {
+    switch(runner->arch) {
+        case PARLER_TTS_ARCH:
+            ((parler_tts_runner*)runner)->configure_generation(config);
+            return ((parler_tts_runner*)runner)->generate(sentence, response);
+        case KOKORO_ARCH:
+            return ((kokoro_runner*)runner)->generate(sentence, response, config->voice, config->espeak_voice_id);
+        case DIA_ARCH:
+            ((dia_runner*)runner)->configure_generation(config);
+            return ((dia_runner*)runner)->generate(sentence, response);
+        case ORPHEUS_ARCH:
+            ((orpheus_runner*)runner)->configure_generation(config);
+            return ((orpheus_runner*)runner)->generate(sentence, response);
+        default:
+            TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, runner->arch);
+    }
+}
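End to end, loading and generating looks roughly like the sketch below. It assumes generation_configuration is default-constructible and exposes the voice field used in kokoro_from_file above; the model path and voice id are illustrative:

    generation_configuration config;
    config.voice = "af_alloy"; // hypothetical kokoro voice id
    tts_response audio;
    tts_runner * runner = runner_from_file("kokoro.gguf", /*n_threads=*/8, &config, /*cpu_only=*/true);
    generate(runner, "Hello from TTS.cpp!", &audio, &config);
    // audio.data now holds audio.n_outputs float samples.

+std::vector<std::string> list_voices(tts_runner * runner) {
+    switch(runner->arch) {
+        case KOKORO_ARCH:
+            return ((kokoro_runner*)runner)->list_voices();
+        default:
+            TTS_ABORT("%s failed. The architecture '%d' does not support #list_voices.", __func__, runner->arch);
+    }
+}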
+void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
+    int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
+    ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
+}
+
+bool kokoro_is_f16_compatible(std::string name) {
+    return name.find("voice_tensors") == std::string::npos &&
+           name.find("bias") == std::string::npos &&
+           name.find("gamma") == std::string::npos &&
+           name.find("beta") == std::string::npos &&
+           name.find("alpha") == std::string::npos &&
+           !has_suffix(name, "embd") &&
+           !has_suffix(name, "norm");
+}
+
+bool kokoro_is_quantizable(std::string name, struct quantization_params * params) {
+    if (kokoro_is_f16_compatible(name)) {
+        if (has_prefix(name, "kokoro.albert") || has_prefix(name, "kokoro.text_encoder.lstm")) {
+            return true;
+        } else if (has_prefix(name, "kokoro.duration_predictor.")) {
+            std::vector<std::string> parts = split(name, ".");
+            for (std::string part : DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS) {
+                if (part == parts[2]) {
+                    return true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
+bool dia_is_quantizable(std::string name, struct quantization_params * params) {
+    // The DAC audio encoder / decoder is not compatible with quantization and normalization tensors should not be quantized.
+    bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm");
+    if (!params->quantize_output_heads) {
+        quantizable = quantizable && !has_prefix(name, "dia.decoder.heads");
+    }
+    return quantizable;
+}
+
+bool parler_is_quantizable(std::string name, struct quantization_params * params) {
+    // the DAC audio encoder / decoder is not compatible with quantization; normalization weights and the text encoding shouldn't be quantized either.
+    bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm.weight") && !has_suffix(name, "text_encoding") && !has_suffix(name, "positional_embed") && !has_suffix(name, "norm.bias");
+    if (!params->quantize_output_heads) {
+        quantizable = quantizable && !has_suffix(name, "weight.head");
+    }
+    if (!params->quantize_text_embeddings) {
+        quantizable = quantizable && !has_suffix(name, "embed_prompts");
+    }
+    if (!params->quantize_cross_attn_kv) {
+        quantizable = quantizable && !has_suffix(name, "encoder_attn.k_proj.weight") && !has_suffix(name, "encoder_attn.v_proj.weight");
+    }
+    return quantizable;
+}
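A few spot checks of the predicates above; the tensor names are illustrative, not taken from a real model file:

    #include <cassert>

    // Kokoro: albert weights are fair game; normalization parameters never are.
    assert(kokoro_is_quantizable("kokoro.albert.layers.0.ffn.weight", nullptr));
    assert(!kokoro_is_quantizable("kokoro.decoder.norm.gamma", nullptr));
    // Dia: the DAC audio codec is always left in f32.
    quantization_params p = {}; // assuming zero-init is valid for the params struct
    assert(!dia_is_quantizable("audio_encoder.block.0.weight", &p));

+bool is_quantizable(tts_arch arch, std::string name, struct quantization_params * params) {
+    switch(arch) {
+        case PARLER_TTS_ARCH:
+            return parler_is_quantizable(name, params);
+        case DIA_ARCH:
+            return dia_is_quantizable(name, params);
+        case KOKORO_ARCH:
+            return kokoro_is_quantizable(name, params);
+        default:
+            TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, arch);
+    }
+}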
+size_t quantize_tensor(void * new_data, struct ggml_tensor * tensor, const float * imatrix, enum ggml_type qtype, uint32_t n_threads) {
+    // much of this is copied from llama.cpp
+    int chunk_size_multiplier = 1;
+    if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8 || qtype == GGML_TYPE_Q4_0_8_8) {
+        if ((qtype == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) qtype = GGML_TYPE_Q4_0;
+        else if (tensor->ne[1] % 4 != 0) qtype = GGML_TYPE_Q4_0;
+        if (qtype == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
+        else if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
+    }
+    size_t out_size = 0;
+    const int32_t d3_step = tensor->ne[0] * tensor->ne[1];
+    const int32_t n_per_row = tensor->ne[0];
+    const int32_t nrows = tensor->ne[1];
+    static const int32_t min_chunk_size = 32 * 512;
+    const int32_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier;
+    uint32_t thread_count = std::max(1, std::min((int)n_threads, (int)(d3_step + chunk_size - 1) / chunk_size));
+    std::mutex mutex;
+
+    for (int32_t d3_index = 0; d3_index < tensor->ne[2]; d3_index++) {
+        const float * f32_data_d3 = ((float *) tensor->data) + d3_index * d3_step;
+        void * new_data_d3 = (char *)new_data + ggml_row_size(qtype, tensor->ne[0]) * d3_index * nrows;
+        const float * imatrix_03 = imatrix ? imatrix + d3_index * tensor->ne[0] : nullptr;
+        if (thread_count <= 1) {
+            // not threaded
+            out_size += ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, 0, nrows, n_per_row, imatrix_03);
+        } else {
+            std::vector<std::thread> threads;
+            int64_t counter = 0;
+            size_t new_size = 0;
+            bool valid = true;
+            for (uint32_t t = 0; t < thread_count; t++) {
+                auto func = [&mutex, &counter, &new_size, &valid, qtype, f32_data_d3, new_data_d3, chunk_size, nrows, n_per_row, imatrix_03]() {
+                    const int64_t nrows_per_chunk = chunk_size / n_per_row;
+                    size_t local_size = 0;
+                    while (true) {
+                        std::unique_lock lock(mutex);
+                        int64_t first_row = counter;
+                        counter += nrows_per_chunk;
+                        if (first_row >= nrows) {
+                            if (local_size > 0) {
+                                new_size += local_size;
+                            }
+                            break;
+                        }
+                        lock.unlock();
+                        const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+                        size_t this_size = ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, first_row * n_per_row, this_nrow, n_per_row, imatrix_03);
+                        local_size += this_size;
+
+                        // validate the quantized data; I am not sure how this would occur, but there is always the safe fallback of doing this single threaded.
+                        const size_t row_size = ggml_row_size(qtype, n_per_row);
+                        void * this_data = (char *) new_data_d3 + first_row * row_size;
+                        if (!ggml_validate_row_data(qtype, this_data, this_size)) {
+                            std::unique_lock lock(mutex);
+                            valid = false;
+                            break;
+                        }
+                    }
+                };
+                threads.push_back(std::thread(func));
+            }
+            for (auto & t : threads) t.join();
+
+            if (!valid) {
+                TTS_ABORT("Validation of quantized data failed. Please try again and/or switch to single thread quantization.\n");
+            }
+            out_size += new_size;
+        }
+    }
+    return out_size;
+}
+
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params) {
+    ggml_context * weight_ctx = NULL;
+    struct gguf_init_params gguf_params = {
+        /*.no_alloc =*/ false,
+        /*.ctx =*/ &weight_ctx,
+    };
+    gguf_context * meta_ctx = gguf_init_from_file(ifile.c_str(), gguf_params);
+    std::string arch = "parler-tts"; // only parler-tts gguf files should lack an explicit architecture.
+
+    int arch_key = gguf_find_key(meta_ctx, "general.architecture");
+    if (arch_key != -1) {
+        arch = std::string(gguf_get_val_str(meta_ctx, arch_key));
+    }
+    tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch);
+
+    if (params->quantize_type != GGML_TYPE_Q5_0 && params->quantize_type != GGML_TYPE_Q8_0 && params->quantize_type != GGML_TYPE_F16 && params->quantize_type != GGML_TYPE_Q4_0) {
+        fprintf(stdout, "Warning, %s is untested for quantization type '%d'. Use at your own risk.\n", arch.c_str(), params->quantize_type);
+    }
+
+    const size_t align = GGUF_DEFAULT_ALIGNMENT;
+    gguf_context_ptr ctx_out { gguf_init_empty() };
+
+    // copy the KV pairs from the input file
+    gguf_set_kv(ctx_out.get(), meta_ctx);
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_type", params->quantize_type);
+    for (ggml_tensor * tensor = ggml_get_first_tensor(weight_ctx); tensor; tensor = ggml_get_next_tensor(weight_ctx, tensor)) {
+        std::string name = ggml_get_name(tensor);
+        if (name.size() != 0) {
+            gguf_add_tensor(ctx_out.get(), tensor);
+        }
+    }
+
+    std::vector<no_init<float>> work;
+
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<char> data(gguf_get_meta_size(ctx_out.get()));
+            gguf_get_meta_data(ctx_out.get(), data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&]() {
+        std::string fname = ofile;
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_out.get());
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
+    new_ofstream();
+    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
+        enum ggml_type new_type;
+        void * new_data;
+        size_t new_size;
+        std::string name = ggml_get_name(cur);
+
+        if (name.size() == 0) {
+            continue;
+        }
+
+        if (is_quantizable(arch_type, name, params)) {
+            if ((cur->type) != GGML_TYPE_F32) {
+                TTS_ABORT("ERROR: All quantized tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type);
+            }
+            new_type = params->quantize_type;
+            if (new_type >= GGML_TYPE_IQ2_XXS && new_type <= GGML_TYPE_IQ4_XS) {
+                TTS_ABORT("ERROR: Quantization type '%d' requires an importance matrix.\n", new_type);
+            }
+            const int64_t nelement_size = ggml_nelements(cur) * 4;
+            if (work.size() < (size_t)nelement_size) {
+                work.resize(nelement_size); // upper bound on size
+            }
+            new_data = work.data();
+            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads);
+        } else if ((params->convert_non_quantizable_to_f16 && kokoro_is_f16_compatible(name)) || (params->convert_dac_to_f16 && has_prefix(name, "audio_encoder") && !has_suffix(name, "alpha"))) {
+            if ((cur->type) != GGML_TYPE_F32) {
+                TTS_ABORT("ERROR: All converted tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type);
+            }
+            new_type = GGML_TYPE_F16;
+            const int64_t nelement_size = ggml_nelements(cur) * 4;
+            if (work.size() < (size_t)nelement_size) {
+                work.resize(nelement_size); // upper bound on size
+            }
+            new_data = work.data();
+            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads);
+        } else {
+            new_type = cur->type;
+            new_data = cur->data;
+            new_size = ggml_nbytes(cur);
+        }
+
+        gguf_set_tensor_type(ctx_out.get(), name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out.get(), name.c_str(), new_data);
+        fprintf(stdout, "At tensor: '%s' with new size: %zu bytes\n", name.c_str(), new_size);
+        // write tensor data + padding
+        fout.write((const char *) new_data, new_size);
+        zeros(fout, GGML_PAD(new_size, align) - new_size);
+    }
+    close_ofstream();
+}
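Putting the quantization path together, a typical invocation might look like the sketch below. It assumes quantization_params zero-initializes cleanly; the file paths are illustrative:

    quantization_params qparams = {};
    qparams.quantize_type = GGML_TYPE_Q8_0; // one of the tested types per the warning above
    qparams.n_threads = 8;
    quantize_gguf("parler-f32.gguf", "parler-q8_0.gguf", &qparams);

diff --git a/otherarch/ttscpp/src/tts_model.cpp b/otherarch/ttscpp/src/tts_model.cpp
new file mode 100644
index 000000000..19754295a
--- /dev/null
+++ b/otherarch/ttscpp/src/tts_model.cpp
@@ -0,0 +1,157 @@
+#include "tts_model.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+
+void append_to_response(struct tts_response * response, struct tts_response * to_append) {
+    float * new_data = (float *) malloc((response->n_outputs + to_append->n_outputs) * sizeof(float));
+    if (response->n_outputs > 0) {
+        std::memcpy(new_data, response->data, response->n_outputs*sizeof(float));
+    }
+    if (to_append->n_outputs > 0) {
+        float * next_loc = new_data + response->n_outputs;
+        std::memcpy(next_loc, to_append->data, to_append->n_outputs*sizeof(float));
+    }
+    response->data = new_data;
+    response->n_outputs += to_append->n_outputs;
+}
+
+/*
+ * Pulls output_size bytes from the 'output_node' tensor into the prepped buffer 'output'. If no buffer is passed, this defaults
+ * to the existing output buffer present on runner_context.
+ */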
+void runner_context::get_ggml_node_data(struct ggml_tensor * output_node, float * output, size_t output_size, ggml_backend_buffer_t buffer) {
+    if (buffer == nullptr) {
+        buffer = buf_output;
+    }
+    if (ggml_backend_buffer_get_size(buffer) < output_size) {
+        TTS_ABORT("Output buffer overflow of %zu / %zu for output node '%s'\n", output_size, ggml_backend_buffer_get_size(buffer), ggml_get_name(output_node));
+    } else if (ggml_nbytes(output_node) < output_size) {
+        TTS_ABORT("Output node, '%s', with %zu bytes is too small for #ggml_backend_tensor_get_async with size of %zu.\n", ggml_get_name(output_node), ggml_nbytes(output_node), output_size);
+    }
+    ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched, output_node);
+    ggml_backend_tensor_get_async(backend_res, output_node, output, 0, output_size);
+}
+
+void runner_context::set_threads() {
+    if (backend != nullptr) {
+#ifdef GGML_USE_METAL
+        // this was copied from llama.cpp, but has since been removed there. I don't know if this should be tuned.
+        ggml_backend_metal_set_n_cb(backend, 1);
+#endif
+    }
+    if (backend_cpu != nullptr) {
+        ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
+        struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
+        threadpool = ggml_threadpool_new(&ttp);
+        ggml_backend_cpu_set_threadpool(backend_cpu, threadpool);
+    }
+}
+
+void runner_context::build_schedule(size_t max_nodes) {
+    backend_cpu_buffer = ggml_backend_cpu_buffer_type();
+    if (backend != nullptr) {
+#ifdef GGML_USE_METAL
+        backend_buffer = ggml_backend_metal_buffer_type();
+#endif
+        std::vector<ggml_backend_buffer_type_t> bufs = {backend_buffer, backend_cpu_buffer};
+        std::vector<ggml_backend_t> backs = {backend, backend_cpu};
+        sched = ggml_backend_sched_new(backs.data(), bufs.data(), 2, max_nodes, false, false);
+    } else {
+        std::vector<ggml_backend_buffer_type_t> bufs = {backend_cpu_buffer};
+        std::vector<ggml_backend_t> backs = {backend_cpu};
+        sched = ggml_backend_sched_new(backs.data(), bufs.data(), 1, max_nodes, false, false);
+    }
+}
+
+bool runner_context::prep_schedule(struct ggml_cgraph * gf) {
+    return ggml_backend_sched_reserve(sched, gf);
+}
+
+void runner_context::prep_output_buffer(size_t new_size) {
+    const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output) : 0;
+    if (!buf_output || prev_size < new_size) {
+        if (buf_output) {
+            ggml_backend_buffer_free(buf_output);
+            buf_output = nullptr;
+            logits = nullptr;
+        }
+        buf_output = ggml_backend_buft_alloc_buffer(backend_cpu_buffer, new_size);
+    }
+    logits = (float *) ggml_backend_buffer_get_base(buf_output);
+}
+
+void tts_runner::init_build(std::vector<uint8_t> * buf_compute_meta) {
+    struct ggml_init_params params = {
+        /*.mem_size =*/ buf_compute_meta->size(),
+        /*.mem_buffer =*/ buf_compute_meta->data(),
+        /*.no_alloc =*/ true,
+    };
+
+    ctx = ggml_init(params);
+}
+
+void tts_runner::free_build() {
+    if (ctx) {
+        ggml_free(ctx);
+        ctx = nullptr;
+    }
+}
+
+void tts_model::prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size) {
+    // currently DAC is only supported on cpu because the ops are not implemented on other devices.
+    if (cpu_only) {
+        backend = ggml_backend_cpu_init();
+        buffer = ggml_backend_cpu_buffer_type();
+    } else {
+#ifdef GGML_USE_METAL
+        backend = ggml_backend_metal_init();
+        buffer = ggml_backend_metal_buffer_type();
+#endif
+        // if ggml was built without metal support, we have to abort here.
+        if (!backend || !buffer) {
+            TTS_ABORT("'GGML_USE_METAL' is not defined; either set the model to use CPU only or install ggml with metal support.");
+        }
+    }
+    size_t ctx_size = ggml_tensor_overhead() * (tensor_meta.n_tensors * size_offset);
+    struct ggml_init_params params = {
+        /*.mem_size =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc =*/ true,
+    };
+    ctx = ggml_init(params);
+    buf = ggml_backend_buft_alloc_buffer(buffer, tensor_meta.n_bytes + dedicated_add_on_size);
+}
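For orientation, the allocation split above separates tensor metadata from weight storage; the numbers below are purely illustrative:

    // With e.g. 200 tensors and the default size_offset of 1.4:
    //   ctx_size = ggml_tensor_overhead() * (200 * 1.4)        // headers only, since no_alloc = true
    //   buf      = tensor_meta.n_bytes + dedicated_add_on_size // one flat backend weight buffer
    // The ggml_context never owns weight memory; set_tensor below copies each
    // weight into buf at a running offset.

+void tts_model::assign_weight(std::string name, ggml_tensor * tensor) {
+    TTS_ABORT("%s received name, %s, tensor without being defined. %s must be defined for all implementations of tts_model.\n", __func__, name.c_str(), __func__);
+}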
+void tts_model::set_tensor(struct ggml_tensor * tensor, struct ggml_tensor * target) {
+    tensor->buffer = buf;
+    tensor->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
+    size_t size = ggml_nbytes(target);
+    ggml_backend_tensor_set(tensor, target->data, 0, size);
+    ggml_set_name(tensor, target->name);
+    offset += size;
+}
+
+void tts_model::setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only, std::string model_prefix, float size_offset, uint32_t dedicated_add_on_size) {
+    tensor_meta = compute_tensor_meta(model_prefix, load_context, compute_tensor_meta_cb);
+    prep_buffers_and_context(cpu_only, size_offset, dedicated_add_on_size);
+}
+
+size_t tts_model::max_nodes() {
+    return std::max<size_t>(8192, tensor_meta.n_tensors*5);
+}
+
+void tts_model::free() {
+    if (ctx) {
+        ggml_free(ctx);
+    }
+    if (buf) {
+        ggml_backend_buffer_free(buf);
+    }
+    if (backend) {
+        ggml_backend_free(backend);
+    }
+}
diff --git a/otherarch/ttscpp/src/tts_model.h b/otherarch/ttscpp/src/tts_model.h
new file mode 100644
index 000000000..31f83fdfd
--- /dev/null
+++ b/otherarch/ttscpp/src/tts_model.h
@@ -0,0 +1,69 @@
+#ifndef tts_model_h
+#define tts_model_h
+
+#include <functional>
+#include <vector>
+#include "ttsutil.h"
+#include "ttscommon.h"
+
+void append_to_response(struct tts_response * response, struct tts_response * to_append);
+
+using tensor_meta_callback = std::function<void(ggml_tensor *)> *;
+
+struct runner_context {
+    runner_context(int n_threads): n_threads(n_threads) {};
+    virtual ~runner_context() {
+        ggml_backend_sched_free(sched);
+        ggml_threadpool_free(threadpool);
+        ggml_backend_free(backend_cpu);
+        ggml_backend_free(backend);
+        ggml_backend_buffer_free(buf_output);
+    }
+    // TODO: extend the backend and buffer support out to all devices
+    ggml_backend_t backend = nullptr;
+    ggml_backend_buffer_type_t backend_buffer = nullptr;
+
+    ggml_backend_t backend_cpu = nullptr;
+    ggml_backend_buffer_type_t backend_cpu_buffer = nullptr;
+
+    std::vector<uint8_t> buf_compute_meta;
+    ggml_backend_buffer_t buf_output = nullptr;
+    ggml_backend_sched_t sched = nullptr;
+    ggml_threadpool_t threadpool = nullptr;
+    float * logits = nullptr;
+    int n_threads;
+
+    void get_ggml_node_data(struct ggml_tensor * output_tensor, float * output, size_t output_size, ggml_backend_buffer_t buffer = nullptr);
+    void set_threads();
+    void build_schedule(size_t max_nodes);
+    bool prep_schedule(ggml_cgraph * gf);
+    void prep_output_buffer(size_t new_size);
+};
+
+struct tts_model {
+    struct model_tensor_meta tensor_meta;
+
+    // this is the current byte offset into the model's buffer.
+    size_t offset = 0;
+
+    bool use_cross_attn = true;
+
+    ggml_backend_buffer_type_t buffer = nullptr;
+    ggml_backend_t backend = nullptr;
+    ggml_backend_buffer_t buf = nullptr;
+
+    // it is quite common for implementations of tts_model to need to update attributes or perform distinct operations
+    // when computing the tensor meta of the loaded model. This callback supports that, as it will receive each processed tensor.
+    tensor_meta_callback compute_tensor_meta_cb = nullptr;
+
+    struct ggml_context * ctx;
+
+    void prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size);
+    void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only, std::string model_prefix, float size_offset = 1.4, uint32_t dedicated_add_on_size = 0);
+    void set_tensor(struct ggml_tensor * tensor, struct ggml_tensor * target);
+    size_t max_nodes();
+    void assign_weight(std::string name, ggml_tensor * tensor);
+    void free();
+};
+
+#endif
diff --git a/otherarch/ttscpp/src/ttsutil.cpp b/otherarch/ttscpp/src/ttsutil.cpp
new file mode 100644
index 000000000..de1227257
--- /dev/null
+++ b/otherarch/ttscpp/src/ttsutil.cpp
@@ -0,0 +1,308 @@
+#include "ttsutil.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstdarg>
+#include <cstring>
+#include <random>
+#ifdef __APPLE__
+#include
+#elif __linux__
+#include
+#else
+// windows stuff
+#endif
+
+void tts_abort(const char * file, int line, const char * fmt, ...) {
+    fflush(stdout);
+    fprintf(stderr, "%s:%d: ", file, line);
+    va_list args;
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+    abort();
+}
+
+// Simple helper function for getting layer count from tensor name
+std::pair<int, std::string> parse_layer_count(std::string name, int skip) {
+    bool found = false;
+    bool after_layer = false;
+    std::string digit_chars = "";
+    std::string after_layer_name = "";
+    int count = 0;
+    for (char& c : name) {
+        if (count < skip) {
+            count += 1;
+            continue;
+        }
+        count += 1;
+        if (after_layer) {
+            after_layer_name += c;
+        } else if (std::isdigit(c)) {
+            found = true;
+            digit_chars += c;
+        } else if (!found) {
+            // keep scanning until the first digit of the layer index
+        } else {
+            after_layer = true;
+            after_layer_name += c;
+        }
+    }
+    if (digit_chars.size() == 0) {
+        return std::make_pair(-1, name);
+    }
+    return std::make_pair(std::stoi(digit_chars), after_layer_name);
+}
+
+int search_for_gguf_keys(gguf_context * meta, std::vector<std::string> possible_keys) {
+    int gguf_key = -1;
+    for (auto key : possible_keys) {
+        gguf_key = gguf_find_key(meta, key.c_str());
+        if (gguf_key != -1) {
+            return gguf_key;
+        }
+    }
+    return gguf_key;
+}
+
+void random_uniform_gen(int count, float * tgt, float min, float max) {
+    static std::default_random_engine e;
+    // the distribution must not be static: callers pass different ranges.
+    std::uniform_real_distribution<float> dis(min, max);
+    for (int i = 0; i < count; i++) {
+        tgt[i] = dis(e);
+    }
+}
+
+void random_normal_gen(int count, float * tgt, float mean, float std) {
+    static std::default_random_engine e;
+    std::normal_distribution<float> dis(mean, std);
+    for (int i = 0; i < count; i++) {
+        tgt[i] = dis(e);
+    }
+}
+
+float round_to_float(double v) {
+    return roundf(v * powl(10, 6)) / powl(10, 6);
+}
+
+struct ggml_tensor * reciprocal(ggml_context * ctx, struct ggml_tensor * x) {
+    TTS_ASSERT(x->ne[0] == 1);
+    static constexpr float one = 1.0f;
+    ggml_tensor * numerator = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, x->ne[1]);
+    // stride trick so that the scalar numerator can be divided by x.
+    numerator->nb[1] = 0;
+    numerator->data = const_cast<float *>(&one);
+    return ggml_div(ctx, numerator, x);
+}
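For reference, the graph built by snake_1d below computes, per element, x + sin²(αx)/α, where α is the learned alpha tensor. A scalar sketch of the same formula:

    #include <cmath>

    // scalar reference of the snake activation assembled from ggml ops below
    static float snake_ref(float x, float alpha) {
        const float s = sinf(alpha * x);
        return x + (s * s) / alpha;
    }

+// Described in https://arxiv.org/abs/2006.08195
+// Snake1d is a common tunable activation function used in the DAC model.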
+struct ggml_tensor * snake_1d(ggml_context * ctx, struct ggml_tensor * alpha, struct ggml_tensor * a) {
+    assert(a->ne[2] == 1 && a->ne[3] == 1);
+    return ggml_add(ctx, a, ggml_mul(ctx, ggml_sqr(ctx, ggml_sin(ctx, ggml_mul(ctx, a, alpha))), reciprocal(ctx, alpha)));
+}
+
+bool has_suffix(std::string value, std::string suffix) {
+    return value.size() >= suffix.size() && value.compare(value.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+
+bool has_prefix(std::string value, std::string prefix) {
+    return value.size() >= prefix.size() && value.compare(0, prefix.size(), prefix) == 0;
+}
+
+struct ggml_tensor * stft(ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * window, size_t n_fft, size_t hop, bool abs_and_angle, bool one_sided) {
+    if (window->ne[0] != n_fft) {
+        TTS_ABORT("For #stft the window_size, %d, must be either equal to n_fft, %d, or, when one sided, n_fft / 2 + 1, %d.\n", (int) window->ne[0], (int) n_fft, (int) (n_fft/2+1));
+    }
+    struct ggml_tensor * cur = ggml_stft(ctx, a, window, n_fft, hop, abs_and_angle);
+    if (one_sided) {
+        cur = ggml_cont(ctx, ggml_view_4d(ctx, cur, ((int64_t) n_fft / 2) + 1, cur->ne[1], cur->ne[2], cur->ne[3], cur->nb[1], cur->nb[2], cur->nb[3], 0));
+    }
+
+    return cur;
+}
+
+struct ggml_tensor * istft(ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * window_squared_sum, struct ggml_tensor * window, size_t n_fft, size_t hop, bool abs_and_angle, bool one_sided) {
+    if ((!one_sided && a->ne[0] != n_fft) || (one_sided && a->ne[0] != n_fft / 2 + 1)) {
+        TTS_ABORT("For #istft the input size, %d, must be either equal to n_fft, %d, or, when one sided, n_fft / 2 + 1, %d.\n", (int) a->ne[0], (int) n_fft, (int) (n_fft/2+1));
+    }
+    struct ggml_tensor * cur = ggml_istft(ctx, a, window, n_fft, hop, abs_and_angle);
+    cur = ggml_div(ctx, cur, window_squared_sum);
+    return cur;
+}
+
+void hann_window(size_t n_fft, std::vector<float> & tgt) {
+    for (int i = 0; i < n_fft; i++) {
+        float v = pow(sin(M_PI * (double)i / (double) n_fft), 2.0);
+        tgt.push_back(v);
+    }
+}
+
+// This is a custom map op for computing noise and relevant voiced sections.
+void uv_noise_compute(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata) {
+    float voice_threshold = ((float *) c->data)[0];
+    float noise_std = ((float *) c->data)[1];
+    float sin_amp = ((float *) c->data)[2];
+    float sin_amp_div = ((float *) c->data)[3];
+    float * rand_init = ((float *) c->data) + 4;
+
+    const int rpt = (b->ne[0] + nth - 1)/nth;
+    const int start = ith * rpt;
+    const int end = MIN((ith + 1) * rpt, b->ne[0]);
+
+    float * uv_dst = (float *) dst->data;
+    float * noise_dst = (float *)((char*)dst->data + dst->nb[2]);
+    float * tgt = (float *) b->data;
+
+    for(int bt = 0; bt < b->ne[2]; bt++) {
+        for(int r = start; r < end; r++) {
+            if (tgt[r] > voice_threshold) {
+                for (int h = 0; h < a->ne[1]; h++) {
+                    int index = h*dst->ne[0]+r;
+                    uv_dst[index] = sin_amp;
+                    noise_dst[index] = noise_std * rand_init[index];
+                }
+            } else {
+                for (int h = 0; h < a->ne[1]; h++) {
+                    int index = h*dst->ne[0]+r;
+                    uv_dst[index] = 0.0f;
+                    noise_dst[index] = sin_amp_div * rand_init[index];
+                }
+            }
+        }
+    }
+}
+
+// This is a custom map op for applying cfg scale. It is used at the terminus of logit generation in Dia.
+void cfg_scale(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata) {
+    const float scale = ((float *) userdata)[0];
+    const float max_output = ((float*) userdata)[1];
+    const int rpt = (b->ne[0] + nth - 1)/nth;
+    const int start = ith * rpt;
+    const int end = MIN((ith + 1) * rpt, b->ne[0]);
+
+    float * output = (float *) dst->data;
+    float * cond = (float *) a->data;
+    float * uncond = (float *) b->data;
+
+    for(int bt = 0; bt < b->ne[2]; bt++) {
+        for (int h = 0; h < b->ne[1]; h++) {
+            int i = (h * b->ne[0]) + (bt * b->ne[0] * b->ne[1]);
+            for(int r = start; r < end; r++) {
+                // only let the output heads yield tokens up to EOS;
+                // without the continue the -INFINITY would be immediately overwritten below.
+                if (r > max_output) {
+                    output[i+r] = -INFINITY;
+                    continue;
+                }
+                const float cr = cond[i+r];
+                const float ur = uncond[i+r];
+                output[i+r] = cr + scale * (cr - ur);
+            }
+        }
+    }
+}
+
+// currently this assumes a center view in which the output vector is reflectively padded by n_fft / 2 on each side.
+void compute_window_squared_sum(size_t n_fft, size_t hop, size_t n_frames, float * tgt, float * window) {
+    size_t cutoff = n_frames * hop;
+    size_t half = n_fft / 2;
+    std::memset(tgt, 0, cutoff*sizeof(float));
+    // istft applies half / hop steps before the beginning of the sequence. We need to account for these accumulated windows.
+    for (int i = 0; i < n_frames + (half / hop); i++) {
+        for (int ii = 0; ii < n_fft; ii++) {
+            int index = (int)(ii + i*hop) - (int)half;
+            if (index < 0 || index >= (int) cutoff) {
+                continue;
+            }
+            tgt[index] += powf(window[ii], 2);
+        }
+    }
+}
+
+std::vector<std::string> split(std::string target, std::string split_on, bool include_split_characters) {
+    std::vector<std::string> output;
+    size_t last = 0;
+
+    for (int i = 0; i < target.size(); i++) {
+        if (i > last && split_on.find(target[i]) != std::string::npos) {
+            std::string part(target.substr(last, i - last));
+            output.push_back(part);
+            if (include_split_characters) {
+                output.push_back(target.substr(i, 1));
+            }
+            last = i+1;
+        } else if (i == last && split_on.find(target[i]) != std::string::npos) {
+            if (include_split_characters) {
+                output.push_back(target.substr(i, 1));
+            }
+            last = i+1;
+        }
+    }
+    if (last < target.size()) {
+        std::string part(target.substr(last));
+        output.push_back(part);
+    }
+
+    return output;
+}
+
+std::vector<std::string> split(std::string target, const char split_on, bool include_split_characters) {
+    std::vector<std::string> output;
+    size_t last = 0;
+
+    for (int i = 0; i < target.size(); i++) {
+        if (i > last && split_on == target[i]) {
+            std::string part(target.substr(last, i - last));
+            output.push_back(part);
+            if (include_split_characters) {
+                output.push_back(target.substr(i, 1));
+            }
+            last = i+1;
+        } else if (i == last && split_on == target[i]) {
+            if (include_split_characters) {
+                output.push_back(target.substr(i, 1));
+            }
+            last = i+1;
+        }
+    }
+    if (last < target.size()) {
+        std::string part(target.substr(last));
+        output.push_back(part);
+    }
+
+    return output;
+}
+
+std::string strip(std::string target, std::string vals) {
+    target.erase(target.begin(), std::find_if(target.begin(), target.end(), [&vals](unsigned char ch) {
+        return vals.find(ch) == std::string::npos;
+    }));
+    target.erase(std::find_if(target.rbegin(), target.rend(), [&vals](unsigned char ch) {
+        return vals.find(ch) == std::string::npos;
+    }).base(), target.end());
+    return target;
+}
+
+std::string replace_any(std::string target, std::string to_replace, std::string replacement) {
+    for (int i = 0; i < to_replace.size(); i++) {
+        size_t position = target.find(to_replace[i]);
+        while (position != std::string::npos) {
+
+struct model_tensor_meta compute_tensor_meta(std::string name_prefix, ggml_context * weight_ctx, std::function<void(ggml_tensor *)> * callback) {
+    model_tensor_meta meta;
+    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
+        if (callback) {
+            (*callback)(cur);
+        }
+        std::string::size_type pos = std::string(cur->name).find(".", 0);
+        std::string top_level(std::string(cur->name).substr(0, pos));
+        if (top_level == name_prefix) {
+            meta.n_tensors += 1;
+            meta.n_bytes += ggml_nbytes_pad(cur);
+        }
+    }
+    return meta;
+}
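compute_tensor_meta above sizes one sub-model inside a shared weight context by matching the top-level segment of each tensor name, i.e. everything before the first '.'. A minimal sketch of that prefix rule on plain strings; the tensor names here are hypothetical:

```cpp
// Mirror of the top-level-prefix match in compute_tensor_meta:
// "dac.layers.0.alpha" has top level "dac"; a name with no '.' is its
// own top level (find returns npos, so substr takes the whole string).
#include <cstdio>
#include <string>

int main() {
    const char * names[] = { "dac.layers.0.alpha", "dac.final.weight", "t5.embed.weight" };
    int n_dac = 0;
    for (const char * name : names) {
        std::string s(name);
        if (s.substr(0, s.find('.')) == "dac") n_dac++;
    }
    std::printf("dac tensors: %d\n", n_dac); // prints 2
    return 0;
}
```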
diff --git a/otherarch/ttscpp/src/ttsutil.h b/otherarch/ttscpp/src/ttsutil.h
new file mode 100644
index 000000000..42b2164eb
--- /dev/null
+++ b/otherarch/ttscpp/src/ttsutil.h
@@ -0,0 +1,71 @@
+#ifndef util_h
+#define util_h
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+#include <stdio.h>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <functional>
+#include <utility>
+#include <algorithm>
+#include <random>
+#include "ggml-metal.h"
+#include "ggml-backend.h"
+#include "ggml-alloc.h"
+#include "ggml-cpu.h"
+#include "ggml.h"
+#include "ggml-impl.h"
+#include "ggml-cpp.h"
+
+#define TTS_ABORT(...) tts_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define TTS_ASSERT(x) if (!(x)) TTS_ABORT("TTS_ASSERT(%s) failed", #x)
+
+struct model_tensor_meta {
+    uint32_t n_tensors = 0;
+    size_t n_bytes = 0;
+};
+
+/**
+ * Both of these functions fill the tgt array with count random floating point values.
+ * The default parameter values are consistent with PyTorch's random function defaults.
+ */
+void random_uniform_gen(int count, float * tgt, float min = 0.0f, float max = 1.0f);
+void random_normal_gen(int count, float * tgt, float mean = 0.0f, float std = 1.0f);
+
+std::pair<int, std::string> parse_layer_count(std::string name, int skip = 0);
+
+struct model_tensor_meta compute_tensor_meta(std::string name_prefix, ggml_context * weight_ctx, std::function<void(ggml_tensor *)> * callback = nullptr);
+struct ggml_tensor * snake_1d(ggml_context * ctx, struct ggml_tensor * alpha, struct ggml_tensor * a);
+int search_for_gguf_keys(gguf_context * meta, std::vector<std::string> possible_keys);
+
+// a simple window function for stft
+void hann_window(size_t n_fft, std::vector<float> & tgt);
+
+// currently this assumes a center view in which the output vector is reflectively padded by n_fft / 2 on each side.
+void compute_window_squared_sum(size_t n_fft, size_t hop, size_t n_frames, float * tgt, float * window);
+
+// these functions wrap the stft and istft ggml ops and compute the necessary view and division ops for their independent settings.
+struct ggml_tensor * stft(ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * window, size_t n_fft, size_t hop, bool abs_and_angle, bool one_sided);
+struct ggml_tensor * istft(ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * window_squared_sum, struct ggml_tensor * window, size_t n_fft, size_t hop, bool abs_and_angle, bool one_sided);
+
+// This is a custom op for sine generation in the Kokoro model.
+void uv_noise_compute(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+// This is a custom op for logit correction in the Dia model.
+void cfg_scale(struct ggml_tensor * dst, const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+
+struct ggml_tensor * reciprocal(ggml_context * ctx, struct ggml_tensor * x);
+
+bool has_suffix(std::string value, std::string suffix);
+bool has_prefix(std::string value, std::string prefix);
+
+std::vector<std::string> split(std::string target, std::string split_on, bool include_split_characters = false);
+std::vector<std::string> split(std::string target, const char split_on, bool include_split_characters = false);
+std::string strip(std::string target, std::string vals = " ");
+std::string replace_any(std::string target, std::string to_replace, std::string replacement);
+
+[[noreturn]] void tts_abort(const char * file, int line, const char * fmt, ...);
+
+#endif
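To make the istft normalization concrete: hann_window builds the periodic Hann window sin²(πi/n_fft), and compute_window_squared_sum overlap-adds its square at every hop, including the frames that begin before sample 0 under center padding, to produce the envelope that istft divides by. A standalone sketch of both steps, with illustrative parameter values:

```cpp
// Periodic Hann window plus the overlap-added squared-window envelope,
// mirroring hann_window() and compute_window_squared_sum(); the sizes
// below are illustrative only.
#define _USE_MATH_DEFINES
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const size_t n_fft = 8, hop = 2, n_frames = 4;
    std::vector<float> window;
    for (size_t i = 0; i < n_fft; i++) {
        window.push_back((float) std::pow(std::sin(M_PI * (double) i / (double) n_fft), 2.0));
    }
    const size_t half = n_fft / 2;
    std::vector<float> sq(n_frames * hop, 0.0f); // cutoff = n_frames * hop samples
    // frames start at -half (center padding) and advance by hop
    for (size_t f = 0; f < n_frames + half / hop; f++) {
        for (size_t i = 0; i < n_fft; i++) {
            const long idx = (long) (i + f * hop) - (long) half;
            if (idx < 0 || idx >= (long) sq.size()) continue;
            sq[idx] += window[i] * window[i];
        }
    }
    for (float v : sq) std::printf("%.3f ", v);
    std::printf("\n");
    return 0;
}
```

Where consecutive frames fully overlap the accumulated value is constant, so the division in istft reduces to a uniform amplitude normalization.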