diff --git a/CMakeLists.txt b/CMakeLists.txt index 50ff864f0..9aec2b4e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -474,7 +474,7 @@ set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(tts_adapter otherarch/tts_adapter.cpp) -target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./tools ./common) +target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/ttscpp/include ./otherarch/ttscpp/src ./tools ./common) target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS}) set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/Makefile b/Makefile index 2400dffff..0edece97d 100644 --- a/Makefile +++ b/Makefile @@ -729,7 +729,7 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp $(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS) embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) +ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan-shaders.cpp: diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 86caf67e5..81fb47b70 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -25,6 +25,22 @@ #define M_PI 3.14159265358979323846 #endif +//imports required for tts.cpp to work +#include "tts.cpp" +#include "ttstokenizer.cpp" +#include "ttssampler.cpp" +#include "parler_model.cpp" +#include "dac_model.cpp" +#include "ttsutil.cpp" +#include "ttst5_encoder_model.cpp" +#include "phonemizer.cpp" +#include "tts_model.cpp" +#include "kokoro_model.cpp" +#include "dia_model.cpp" +#include "orpheus_model.cpp" +#include "snac_model.cpp" +#include "general_neural_audio_codec.cpp" + enum TTS_VER { TTS_VER_2, diff --git a/otherarch/ttscpp/cli/vad.cpp b/otherarch/ttscpp/cli/vad.cpp index 9468ef2fa..ab966c3be 100644 --- a/otherarch/ttscpp/cli/vad.cpp +++ b/otherarch/ttscpp/cli/vad.cpp @@ -9,8 +9,8 @@ float energy(float * chunk, int count) { } void apply_energy_voice_inactivity_detection( - tts_response & data, - float sample_rate, + tts_response & data, + float sample_rate, int ms_per_frame, int frame_threshold, float normalized_energy_threshold, diff --git a/otherarch/ttscpp/include/audio_file.h b/otherarch/ttscpp/include/audio_file.h index dd1f50fb3..cf79de446 100644 --- a/otherarch/ttscpp/include/audio_file.h +++ b/otherarch/ttscpp/include/audio_file.h @@ -81,23 +81,23 @@ template class AudioFile { public: - + //============================================================= typedef std::vector > AudioBuffer; - + //============================================================= /** Constructor */ AudioFile(); - + /** Constructor, using a given file path to load a file */ AudioFile (std::string filePath); - + //============================================================= /** Loads an audio file from a given file path. * @Returns true if the file was successfully loaded */ bool load (std::string filePath); - + /** Saves an audio file to a given file path. * @Returns true if the file was successfully saved */ @@ -107,66 +107,66 @@ public: * @Returns true if the write was successful */ bool writeData (std::vector& fileData, AudioFileFormat format = AudioFileFormat::Wave); - + //============================================================= /** Loads an audio file from data in memory */ bool loadFromMemory (std::vector& fileData); - + //============================================================= /** @Returns the sample rate */ uint32_t getSampleRate() const; - + /** @Returns the number of audio channels in the buffer */ int getNumChannels() const; /** @Returns true if the audio file is mono */ bool isMono() const; - + /** @Returns true if the audio file is stereo */ bool isStereo() const; - + /** @Returns the bit depth of each sample */ int getBitDepth() const; - + /** @Returns the number of samples per channel */ int getNumSamplesPerChannel() const; - + /** @Returns the length in seconds of the audio file based on the number of samples and sample rate */ double getLengthInSeconds() const; - + /** Prints a summary of the audio file to the console */ void printSummary() const; - + //============================================================= - + /** Set the audio buffer for this AudioFile by copying samples from another buffer. * @Returns true if the buffer was copied successfully. */ bool setAudioBuffer (AudioBuffer& newBuffer); - + /** Sets the audio buffer to a given number of channels and number of samples per channel. This will try to preserve * the existing audio, adding zeros to any new channels or new samples in a given channel. */ void setAudioBufferSize (int numChannels, int numSamples); - + /** Sets the number of samples per channel in the audio buffer. This will try to preserve * the existing audio, adding zeros to new samples in a given channel if the number of samples is increased. */ void setNumSamplesPerChannel (int numSamples); - + /** Sets the number of channels. New channels will have the correct number of samples and be initialised to zero */ void setNumChannels (int numChannels); - + /** Sets the bit depth for the audio file. If you use the save() function, this bit depth rate will be used */ void setBitDepth (int numBitsPerSample); - + /** Sets the sample rate for the audio file. If you use the save() function, this sample rate will be used */ void setSampleRate (uint32_t newSampleRate); - + //============================================================= /** Sets whether the library should log error messages to the console. By default this is true */ void shouldLogErrorsToConsole (bool logErrors); - + //============================================================= /** A vector of vectors holding the audio samples for the AudioFile. You can * access the samples by channel and then by sample index, i.e: @@ -174,21 +174,21 @@ public: * samples[channel][sampleIndex] */ AudioBuffer samples; - + //============================================================= /** An optional iXML chunk that can be added to the AudioFile. */ std::string iXMLChunk; - + private: - + //============================================================= enum class Endianness { LittleEndian, BigEndian }; - + //============================================================= AudioFileFormat determineAudioFileFormat (std::vector& fileData); bool decodeWaveFile (std::vector& fileData); @@ -202,10 +202,10 @@ private: //============================================================= bool saveToWaveFile (std::string filePath); bool saveToAiffFile (std::string filePath); - + //============================================================= void clearAudioBuffer(); - + //============================================================= int32_t fourBytesToInt (std::vector& source, int startIndex, Endianness endianness = Endianness::LittleEndian); int16_t twoBytesToInt (std::vector& source, int startIndex, Endianness endianness = Endianness::LittleEndian); @@ -216,18 +216,18 @@ private: uint32_t getAiffSampleRate (std::vector& fileData, int sampleRateStartIndex); bool tenByteMatch (std::vector& v1, int startIndex1, std::vector& v2, int startIndex2); void addSampleRateToAiffData (std::vector& fileData, uint32_t sampleRate); - + //============================================================= void addStringToFileData (std::vector& fileData, std::string s); void addInt32ToFileData (std::vector& fileData, int32_t i, Endianness endianness = Endianness::LittleEndian); void addInt16ToFileData (std::vector& fileData, int16_t i, Endianness endianness = Endianness::LittleEndian); - + //============================================================= bool writeDataToFile (std::vector& fileData, std::string filePath); - + //============================================================= void reportError (std::string errorMessage); - + //============================================================= AudioFileFormat audioFileFormat; uint32_t sampleRate; @@ -242,38 +242,38 @@ struct AudioSampleConverter //============================================================= /** Convert a signed 8-bit integer to an audio sample */ static T signedByteToSample (int8_t sample); - + /** Convert an audio sample to an signed 8-bit representation */ static int8_t sampleToSignedByte (T sample); - + //============================================================= /** Convert an unsigned 8-bit integer to an audio sample */ static T unsignedByteToSample (uint8_t sample); - + /** Convert an audio sample to an unsigned 8-bit representation */ static uint8_t sampleToUnsignedByte (T sample); - + //============================================================= /** Convert a 16-bit integer to an audio sample */ static T sixteenBitIntToSample (int16_t sample); - + /** Convert a an audio sample to a 16-bit integer */ static int16_t sampleToSixteenBitInt (T sample); - + //============================================================= /** Convert a 24-bit value (int a 32-bit int) to an audio sample */ static T twentyFourBitIntToSample (int32_t sample); - + /** Convert a an audio sample to a 24-bit value (in a 32-bit integer) */ static int32_t sampleToTwentyFourBitInt (T sample); - + //============================================================= /** Convert a 32-bit signed integer to an audio sample */ static T thirtyTwoBitIntToSample (int32_t sample); - + /** Convert a an audio sample to a 32-bit signed integer */ static int32_t sampleToThirtyTwoBitInt (T sample); - + //============================================================= /** Helper clamp function to enforce ranges */ static T clamp (T v1, T minValue, T maxValue); @@ -414,30 +414,30 @@ template bool AudioFile::setAudioBuffer (AudioBuffer& newBuffer) { int numChannels = (int)newBuffer.size(); - + if (numChannels <= 0) { assert (false && "The buffer you are trying to use has no channels"); return false; } - + size_t numSamples = newBuffer[0].size(); - + // set the number of channels samples.resize (newBuffer.size()); - + for (int k = 0; k < getNumChannels(); k++) { assert (newBuffer[k].size() == numSamples); - + samples[k].resize (numSamples); - + for (size_t i = 0; i < numSamples; i++) { samples[k][i] = newBuffer[k][i]; } } - + return true; } @@ -454,11 +454,11 @@ template void AudioFile::setNumSamplesPerChannel (int numSamples) { int originalSize = getNumSamplesPerChannel(); - + for (int i = 0; i < getNumChannels();i++) { samples[i].resize (numSamples); - + // set any new samples to zero if (numSamples > originalSize) std::fill (samples[i].begin() + originalSize, samples[i].end(), (T)0.); @@ -471,9 +471,9 @@ void AudioFile::setNumChannels (int numChannels) { int originalNumChannels = getNumChannels(); int originalNumSamplesPerChannel = getNumSamplesPerChannel(); - + samples.resize (numChannels); - + // make sure any new channels are set to the right size // and filled with zeros if (numChannels > originalNumChannels) @@ -512,14 +512,14 @@ template bool AudioFile::load (std::string filePath) { std::ifstream file (filePath, std::ios::binary); - + // check the file exists if (! file.good()) { reportError ("ERROR: File doesn't exist or otherwise can't load file\n" + filePath); return false; } - + std::vector fileData; file.unsetf (std::ios::skipws); @@ -539,7 +539,7 @@ bool AudioFile::load (std::string filePath) reportError ("ERROR: Couldn't read entire file\n" + filePath); return false; } - + // Handle very small files that will break our attempt to read the // first header info from them if (fileData.size() < 12) @@ -559,7 +559,7 @@ bool AudioFile::loadFromMemory (std::vector& fileData) { // get audio file format audioFileFormat = determineAudioFileFormat (fileData); - + if (audioFileFormat == AudioFileFormat::Wave) { return decodeWaveFile (fileData); @@ -584,13 +584,13 @@ bool AudioFile::decodeWaveFile (std::vector& fileData) std::string headerChunkID (fileData.begin(), fileData.begin() + 4); //int32_t fileSizeInBytes = fourBytesToInt (fileData, 4) + 8; std::string format (fileData.begin() + 8, fileData.begin() + 12); - + // ----------------------------------------------------------- // try and find the start points of key chunks int indexOfDataChunk = getIndexOfChunk (fileData, "data", 12); int indexOfFormatChunk = getIndexOfChunk (fileData, "fmt ", 12); int indexOfXMLChunk = getIndexOfChunk (fileData, "iXML", 12); - + // if we can't find the data or format chunks, or the IDs/formats don't seem to be as expected // then it is unlikely we'll able to read this file, so abort if (indexOfDataChunk == -1 || indexOfFormatChunk == -1 || headerChunkID != "RIFF" || format != "WAVE") @@ -598,7 +598,7 @@ bool AudioFile::decodeWaveFile (std::vector& fileData) reportError ("ERROR: this doesn't seem to be a valid .WAV file"); return false; } - + // ----------------------------------------------------------- // FORMAT CHUNK int f = indexOfFormatChunk; @@ -610,7 +610,7 @@ bool AudioFile::decodeWaveFile (std::vector& fileData) uint32_t numBytesPerSecond = fourBytesToInt (fileData, f + 16); uint16_t numBytesPerBlock = twoBytesToInt (fileData, f + 20); bitDepth = (int) twoBytesToInt (fileData, f + 22); - + if (bitDepth > sizeof (T) * 8) { std::string message = "ERROR: you are trying to read a "; @@ -621,61 +621,61 @@ bool AudioFile::decodeWaveFile (std::vector& fileData) reportError (message); return false; } - + uint16_t numBytesPerSample = static_cast (bitDepth) / 8; - + // check that the audio format is PCM or Float or extensible if (audioFormat != WavAudioFormat::PCM && audioFormat != WavAudioFormat::IEEEFloat && audioFormat != WavAudioFormat::Extensible) { reportError ("ERROR: this .WAV file is encoded in a format that this library does not support at present"); return false; } - + // check the number of channels is mono or stereo if (numChannels < 1 || numChannels > 128) { reportError ("ERROR: this WAV file seems to be an invalid number of channels (or corrupted?)"); return false; } - + // check header data is consistent if (numBytesPerSecond != static_cast ((numChannels * sampleRate * bitDepth) / 8) || numBytesPerBlock != (numChannels * numBytesPerSample)) { reportError ("ERROR: the header data in this WAV file seems to be inconsistent"); return false; } - + // check bit depth is either 8, 16, 24 or 32 bit if (bitDepth != 8 && bitDepth != 16 && bitDepth != 24 && bitDepth != 32) { reportError ("ERROR: this file has a bit depth that is not 8, 16, 24 or 32 bits"); return false; } - + // ----------------------------------------------------------- // DATA CHUNK int d = indexOfDataChunk; std::string dataChunkID (fileData.begin() + d, fileData.begin() + d + 4); int32_t dataChunkSize = fourBytesToInt (fileData, d + 4); - + int numSamples = dataChunkSize / (numChannels * bitDepth / 8); int samplesStartIndex = indexOfDataChunk + 8; - + clearAudioBuffer(); samples.resize (numChannels); - + for (int i = 0; i < numSamples; i++) { for (int channel = 0; channel < numChannels; channel++) { int sampleIndex = samplesStartIndex + (numBytesPerBlock * i) + channel * numBytesPerSample; - + if ((sampleIndex + (bitDepth / 8) - 1) >= fileData.size()) { reportError ("ERROR: read file error as the metadata indicates more samples than there are in the file data"); return false; } - + if (bitDepth == 8) { T sample = AudioSampleConverter::unsignedByteToSample (fileData[sampleIndex]); @@ -691,7 +691,7 @@ bool AudioFile::decodeWaveFile (std::vector& fileData) { int32_t sampleAsInt = 0; sampleAsInt = (fileData[sampleIndex + 2] << 16) | (fileData[sampleIndex + 1] << 8) | fileData[sampleIndex]; - + if (sampleAsInt & 0x800000) // if the 24th bit is set, this is a negative number in 24-bit world sampleAsInt = sampleAsInt | ~0xFFFFFF; // so make sure sign is extended to the 32 bit float @@ -702,7 +702,7 @@ bool AudioFile::decodeWaveFile (std::vector& fileData) { int32_t sampleAsInt = fourBytesToInt (fileData, sampleIndex); T sample; - + if (audioFormat == WavAudioFormat::IEEEFloat && std::is_floating_point_v) { float f; @@ -713,7 +713,7 @@ bool AudioFile::decodeWaveFile (std::vector& fileData) { sample = AudioSampleConverter::thirtyTwoBitIntToSample (sampleAsInt); } - + samples[channel].push_back (sample); } else @@ -743,15 +743,15 @@ bool AudioFile::decodeAiffFile (std::vector& fileData) std::string headerChunkID (fileData.begin(), fileData.begin() + 4); //int32_t fileSizeInBytes = fourBytesToInt (fileData, 4, Endianness::BigEndian) + 8; std::string format (fileData.begin() + 8, fileData.begin() + 12); - + int audioFormat = format == "AIFF" ? AIFFAudioFormat::Uncompressed : format == "AIFC" ? AIFFAudioFormat::Compressed : AIFFAudioFormat::Error; - + // ----------------------------------------------------------- // try and find the start points of key chunks int indexOfCommChunk = getIndexOfChunk (fileData, "COMM", 12, Endianness::BigEndian); int indexOfSoundDataChunk = getIndexOfChunk (fileData, "SSND", 12, Endianness::BigEndian); int indexOfXMLChunk = getIndexOfChunk (fileData, "iXML", 12, Endianness::BigEndian); - + // if we can't find the data or format chunks, or the IDs/formats don't seem to be as expected // then it is unlikely we'll able to read this file, so abort if (indexOfSoundDataChunk == -1 || indexOfCommChunk == -1 || headerChunkID != "FORM" || audioFormat == AIFFAudioFormat::Error) @@ -769,7 +769,7 @@ bool AudioFile::decodeAiffFile (std::vector& fileData) int32_t numSamplesPerChannel = fourBytesToInt (fileData, p + 10, Endianness::BigEndian); bitDepth = (int) twoBytesToInt (fileData, p + 14, Endianness::BigEndian); sampleRate = getAiffSampleRate (fileData, p + 16); - + if (bitDepth > sizeof (T) * 8) { std::string message = "ERROR: you are trying to read a "; @@ -780,28 +780,28 @@ bool AudioFile::decodeAiffFile (std::vector& fileData) reportError (message); return false; } - + // check the sample rate was properly decoded if (sampleRate == 0) { reportError ("ERROR: this AIFF file has an unsupported sample rate"); return false; } - + // check the number of channels is mono or stereo if (numChannels < 1 ||numChannels > 2) { reportError ("ERROR: this AIFF file seems to be neither mono nor stereo (perhaps multi-track, or corrupted?)"); return false; } - + // check bit depth is either 8, 16, 24 or 32-bit if (bitDepth != 8 && bitDepth != 16 && bitDepth != 24 && bitDepth != 32) { reportError ("ERROR: this file has a bit depth that is not 8, 16, 24 or 32 bits"); return false; } - + // ----------------------------------------------------------- // SSND CHUNK int s = indexOfSoundDataChunk; @@ -809,34 +809,34 @@ bool AudioFile::decodeAiffFile (std::vector& fileData) int32_t soundDataChunkSize = fourBytesToInt (fileData, s + 4, Endianness::BigEndian); int32_t offset = fourBytesToInt (fileData, s + 8, Endianness::BigEndian); //int32_t blockSize = fourBytesToInt (fileData, s + 12, Endianness::BigEndian); - + int numBytesPerSample = bitDepth / 8; int numBytesPerFrame = numBytesPerSample * numChannels; int totalNumAudioSampleBytes = numSamplesPerChannel * numBytesPerFrame; int samplesStartIndex = s + 16 + (int)offset; - + // sanity check the data if ((soundDataChunkSize - 8) != totalNumAudioSampleBytes || totalNumAudioSampleBytes > static_cast(fileData.size() - samplesStartIndex)) { reportError ("ERROR: the metadatafor this file doesn't seem right"); return false; } - + clearAudioBuffer(); samples.resize (numChannels); - + for (int i = 0; i < numSamplesPerChannel; i++) { for (int channel = 0; channel < numChannels; channel++) { int sampleIndex = samplesStartIndex + (numBytesPerFrame * i) + channel * numBytesPerSample; - + if ((sampleIndex + (bitDepth / 8) - 1) >= fileData.size()) { reportError ("ERROR: read file error as the metadata indicates more samples than there are in the file data"); return false; } - + if (bitDepth == 8) { T sample = AudioSampleConverter::signedByteToSample (static_cast (fileData[sampleIndex])); @@ -852,7 +852,7 @@ bool AudioFile::decodeAiffFile (std::vector& fileData) { int32_t sampleAsInt = 0; sampleAsInt = (fileData[sampleIndex] << 16) | (fileData[sampleIndex + 1] << 8) | fileData[sampleIndex + 2]; - + if (sampleAsInt & 0x800000) // if the 24th bit is set, this is a negative number in 24-bit world sampleAsInt = sampleAsInt | ~0xFFFFFF; // so make sure sign is extended to the 32 bit float @@ -863,12 +863,12 @@ bool AudioFile::decodeAiffFile (std::vector& fileData) { int32_t sampleAsInt = fourBytesToInt (fileData, sampleIndex, Endianness::BigEndian); T sample; - + if (audioFormat == AIFFAudioFormat::Compressed) sample = (T)reinterpret_cast (sampleAsInt); else // assume PCM sample = AudioSampleConverter::thirtyTwoBitIntToSample (sampleAsInt); - + samples[channel].push_back (sample); } else @@ -885,7 +885,7 @@ bool AudioFile::decodeAiffFile (std::vector& fileData) int32_t chunkSize = fourBytesToInt (fileData, indexOfXMLChunk + 4); iXMLChunk = std::string ((const char*) &fileData[indexOfXMLChunk + 8], chunkSize); } - + return true; } @@ -898,7 +898,7 @@ uint32_t AudioFile::getAiffSampleRate (std::vector& fileData, int sa if (tenByteMatch (fileData, sampleRateStartIndex, it.second, 0)) return it.first; } - + return 0; } @@ -911,7 +911,7 @@ bool AudioFile::tenByteMatch (std::vector& v1, int startIndex1, std: if (v1[startIndex1 + i] != v2[startIndex2 + i]) return false; } - + return true; } @@ -938,7 +938,7 @@ bool AudioFile::save (std::string filePath, AudioFileFormat format) { return saveToAiffFile (filePath); } - + return false; } @@ -954,7 +954,7 @@ bool AudioFile::writeData (std::vector & fileData, AudioFileFormat f { return writeToAiffData (fileData); } - + return false; } @@ -962,16 +962,16 @@ bool AudioFile::writeData (std::vector & fileData, AudioFileFormat f //============================================================= template bool AudioFile::writeToWaveData (std::vector & fileData) -{ +{ int32_t dataChunkSize = getNumSamplesPerChannel() * (getNumChannels() * bitDepth / 8); int16_t audioFormat = bitDepth == 32 && std::is_floating_point_v ? WavAudioFormat::IEEEFloat : WavAudioFormat::PCM; int32_t formatChunkSize = audioFormat == WavAudioFormat::PCM ? 16 : 18; int32_t iXMLChunkSize = static_cast (iXMLChunk.size()); - + // ----------------------------------------------------------- // HEADER CHUNK addStringToFileData (fileData, "RIFF"); - + // The file size in bytes is the header chunk size (4, not counting RIFF and WAVE) + the format // chunk size (24) + the metadata part of the data chunk plus the actual data chunk size int32_t fileSizeInBytes = 4 + formatChunkSize + 8 + 8 + dataChunkSize; @@ -981,9 +981,9 @@ bool AudioFile::writeToWaveData (std::vector & fileData) } addInt32ToFileData (fileData, fileSizeInBytes); - + addStringToFileData (fileData, "WAVE"); - + // ----------------------------------------------------------- // FORMAT CHUNK addStringToFileData (fileData, "fmt "); @@ -991,23 +991,23 @@ bool AudioFile::writeToWaveData (std::vector & fileData) addInt16ToFileData (fileData, audioFormat); // audio format addInt16ToFileData (fileData, (int16_t)getNumChannels()); // num channels addInt32ToFileData (fileData, (int32_t)sampleRate); // sample rate - + int32_t numBytesPerSecond = (int32_t) ((getNumChannels() * sampleRate * bitDepth) / 8); addInt32ToFileData (fileData, numBytesPerSecond); - + int16_t numBytesPerBlock = getNumChannels() * (bitDepth / 8); addInt16ToFileData (fileData, numBytesPerBlock); - + addInt16ToFileData (fileData, (int16_t)bitDepth); - + if (audioFormat == WavAudioFormat::IEEEFloat) addInt16ToFileData (fileData, 0); // extension size - + // ----------------------------------------------------------- // DATA CHUNK addStringToFileData (fileData, "data"); addInt32ToFileData (fileData, dataChunkSize); - + for (int i = 0; i < getNumSamplesPerChannel(); i++) { for (int channel = 0; channel < getNumChannels(); channel++) @@ -1025,12 +1025,12 @@ bool AudioFile::writeToWaveData (std::vector & fileData) else if (bitDepth == 24) { int32_t sampleAsIntAgain = AudioSampleConverter::sampleToTwentyFourBitInt (samples[channel][i]); - + uint8_t bytes[3]; bytes[2] = (uint8_t) (sampleAsIntAgain >> 16) & 0xFF; bytes[1] = (uint8_t) (sampleAsIntAgain >> 8) & 0xFF; bytes[0] = (uint8_t) sampleAsIntAgain & 0xFF; - + fileData.push_back (bytes[0]); fileData.push_back (bytes[1]); fileData.push_back (bytes[2]); @@ -1038,12 +1038,12 @@ bool AudioFile::writeToWaveData (std::vector & fileData) else if (bitDepth == 32) { int32_t sampleAsInt; - + if (audioFormat == WavAudioFormat::IEEEFloat) sampleAsInt = (int32_t) reinterpret_cast (samples[channel][i]); else // assume PCM sampleAsInt = AudioSampleConverter::sampleToThirtyTwoBitInt (samples[channel][i]); - + addInt32ToFileData (fileData, sampleAsInt, Endianness::LittleEndian); } else @@ -1053,7 +1053,7 @@ bool AudioFile::writeToWaveData (std::vector & fileData) } } } - + // ----------------------------------------------------------- // iXML CHUNK if (iXMLChunkSize > 0) @@ -1062,24 +1062,24 @@ bool AudioFile::writeToWaveData (std::vector & fileData) addInt32ToFileData (fileData, iXMLChunkSize); addStringToFileData (fileData, iXMLChunk); } - + return true; } //============================================================= template bool AudioFile::writeToAiffData (std::vector & fileData) -{ +{ int32_t numBytesPerSample = bitDepth / 8; int32_t numBytesPerFrame = numBytesPerSample * getNumChannels(); int32_t totalNumAudioSampleBytes = getNumSamplesPerChannel() * numBytesPerFrame; int32_t soundDataChunkSize = totalNumAudioSampleBytes + 8; int32_t iXMLChunkSize = static_cast (iXMLChunk.size()); - + // ----------------------------------------------------------- // HEADER CHUNK addStringToFileData (fileData, "FORM"); - + // The file size in bytes is the header chunk size (4, not counting FORM and AIFF) + the COMM // chunk size (26) + the metadata part of the SSND chunk plus the actual data chunk size int32_t fileSizeInBytes = 4 + 26 + 16 + totalNumAudioSampleBytes; @@ -1089,9 +1089,9 @@ bool AudioFile::writeToAiffData (std::vector & fileData) } addInt32ToFileData (fileData, fileSizeInBytes, Endianness::BigEndian); - + addStringToFileData (fileData, "AIFF"); - + // ----------------------------------------------------------- // COMM CHUNK addStringToFileData (fileData, "COMM"); @@ -1100,14 +1100,14 @@ bool AudioFile::writeToAiffData (std::vector & fileData) addInt32ToFileData (fileData, getNumSamplesPerChannel(), Endianness::BigEndian); // num samples per channel addInt16ToFileData (fileData, bitDepth, Endianness::BigEndian); // bit depth addSampleRateToAiffData (fileData, sampleRate); - + // ----------------------------------------------------------- // SSND CHUNK addStringToFileData (fileData, "SSND"); addInt32ToFileData (fileData, soundDataChunkSize, Endianness::BigEndian); addInt32ToFileData (fileData, 0, Endianness::BigEndian); // offset addInt32ToFileData (fileData, 0, Endianness::BigEndian); // block size - + for (int i = 0; i < getNumSamplesPerChannel(); i++) { for (int channel = 0; channel < getNumChannels(); channel++) @@ -1125,12 +1125,12 @@ bool AudioFile::writeToAiffData (std::vector & fileData) else if (bitDepth == 24) { int32_t sampleAsIntAgain = AudioSampleConverter::sampleToTwentyFourBitInt (samples[channel][i]); - + uint8_t bytes[3]; bytes[0] = (uint8_t) (sampleAsIntAgain >> 16) & 0xFF; bytes[1] = (uint8_t) (sampleAsIntAgain >> 8) & 0xFF; bytes[2] = (uint8_t) sampleAsIntAgain & 0xFF; - + fileData.push_back (bytes[0]); fileData.push_back (bytes[1]); fileData.push_back (bytes[2]); @@ -1165,16 +1165,16 @@ template bool AudioFile::saveToWaveFile (std::string filePath) { std::vector fileData; - + int32_t dataChunkSize = getNumSamplesPerChannel() * (getNumChannels() * bitDepth / 8); int16_t audioFormat = bitDepth == 32 && std::is_floating_point_v ? WavAudioFormat::IEEEFloat : WavAudioFormat::PCM; int32_t formatChunkSize = audioFormat == WavAudioFormat::PCM ? 16 : 18; int32_t iXMLChunkSize = static_cast (iXMLChunk.size()); - + // ----------------------------------------------------------- // HEADER CHUNK addStringToFileData (fileData, "RIFF"); - + // The file size in bytes is the header chunk size (4, not counting RIFF and WAVE) + the format // chunk size (24) + the metadata part of the data chunk plus the actual data chunk size int32_t fileSizeInBytes = 4 + formatChunkSize + 8 + 8 + dataChunkSize; @@ -1184,9 +1184,9 @@ bool AudioFile::saveToWaveFile (std::string filePath) } addInt32ToFileData (fileData, fileSizeInBytes); - + addStringToFileData (fileData, "WAVE"); - + // ----------------------------------------------------------- // FORMAT CHUNK addStringToFileData (fileData, "fmt "); @@ -1194,23 +1194,23 @@ bool AudioFile::saveToWaveFile (std::string filePath) addInt16ToFileData (fileData, audioFormat); // audio format addInt16ToFileData (fileData, (int16_t)getNumChannels()); // num channels addInt32ToFileData (fileData, (int32_t)sampleRate); // sample rate - + int32_t numBytesPerSecond = (int32_t) ((getNumChannels() * sampleRate * bitDepth) / 8); addInt32ToFileData (fileData, numBytesPerSecond); - + int16_t numBytesPerBlock = getNumChannels() * (bitDepth / 8); addInt16ToFileData (fileData, numBytesPerBlock); - + addInt16ToFileData (fileData, (int16_t)bitDepth); - + if (audioFormat == WavAudioFormat::IEEEFloat) addInt16ToFileData (fileData, 0); // extension size - + // ----------------------------------------------------------- // DATA CHUNK addStringToFileData (fileData, "data"); addInt32ToFileData (fileData, dataChunkSize); - + for (int i = 0; i < getNumSamplesPerChannel(); i++) { for (int channel = 0; channel < getNumChannels(); channel++) @@ -1228,12 +1228,12 @@ bool AudioFile::saveToWaveFile (std::string filePath) else if (bitDepth == 24) { int32_t sampleAsIntAgain = AudioSampleConverter::sampleToTwentyFourBitInt (samples[channel][i]); - + uint8_t bytes[3]; bytes[2] = (uint8_t) (sampleAsIntAgain >> 16) & 0xFF; bytes[1] = (uint8_t) (sampleAsIntAgain >> 8) & 0xFF; bytes[0] = (uint8_t) sampleAsIntAgain & 0xFF; - + fileData.push_back (bytes[0]); fileData.push_back (bytes[1]); fileData.push_back (bytes[2]); @@ -1241,12 +1241,12 @@ bool AudioFile::saveToWaveFile (std::string filePath) else if (bitDepth == 32) { int32_t sampleAsInt; - + if (audioFormat == WavAudioFormat::IEEEFloat) sampleAsInt = (int32_t) reinterpret_cast (samples[channel][i]); else // assume PCM sampleAsInt = AudioSampleConverter::sampleToThirtyTwoBitInt (samples[channel][i]); - + addInt32ToFileData (fileData, sampleAsInt, Endianness::LittleEndian); } else @@ -1256,7 +1256,7 @@ bool AudioFile::saveToWaveFile (std::string filePath) } } } - + // ----------------------------------------------------------- // iXML CHUNK if (iXMLChunkSize > 0) @@ -1265,14 +1265,14 @@ bool AudioFile::saveToWaveFile (std::string filePath) addInt32ToFileData (fileData, iXMLChunkSize); addStringToFileData (fileData, iXMLChunk); } - + // check that the various sizes we put in the metadata are correct if (fileSizeInBytes != static_cast (fileData.size() - 8) || dataChunkSize != (getNumSamplesPerChannel() * getNumChannels() * (bitDepth / 8))) { reportError ("ERROR: couldn't save file to " + filePath); return false; } - + // try to write the file return writeDataToFile (fileData, filePath); } @@ -1282,17 +1282,17 @@ template bool AudioFile::saveToAiffFile (std::string filePath) { std::vector fileData; - + int32_t numBytesPerSample = bitDepth / 8; int32_t numBytesPerFrame = numBytesPerSample * getNumChannels(); int32_t totalNumAudioSampleBytes = getNumSamplesPerChannel() * numBytesPerFrame; int32_t soundDataChunkSize = totalNumAudioSampleBytes + 8; int32_t iXMLChunkSize = static_cast (iXMLChunk.size()); - + // ----------------------------------------------------------- // HEADER CHUNK addStringToFileData (fileData, "FORM"); - + // The file size in bytes is the header chunk size (4, not counting FORM and AIFF) + the COMM // chunk size (26) + the metadata part of the SSND chunk plus the actual data chunk size int32_t fileSizeInBytes = 4 + 26 + 16 + totalNumAudioSampleBytes; @@ -1302,9 +1302,9 @@ bool AudioFile::saveToAiffFile (std::string filePath) } addInt32ToFileData (fileData, fileSizeInBytes, Endianness::BigEndian); - + addStringToFileData (fileData, "AIFF"); - + // ----------------------------------------------------------- // COMM CHUNK addStringToFileData (fileData, "COMM"); @@ -1313,14 +1313,14 @@ bool AudioFile::saveToAiffFile (std::string filePath) addInt32ToFileData (fileData, getNumSamplesPerChannel(), Endianness::BigEndian); // num samples per channel addInt16ToFileData (fileData, bitDepth, Endianness::BigEndian); // bit depth addSampleRateToAiffData (fileData, sampleRate); - + // ----------------------------------------------------------- // SSND CHUNK addStringToFileData (fileData, "SSND"); addInt32ToFileData (fileData, soundDataChunkSize, Endianness::BigEndian); addInt32ToFileData (fileData, 0, Endianness::BigEndian); // offset addInt32ToFileData (fileData, 0, Endianness::BigEndian); // block size - + for (int i = 0; i < getNumSamplesPerChannel(); i++) { for (int channel = 0; channel < getNumChannels(); channel++) @@ -1338,12 +1338,12 @@ bool AudioFile::saveToAiffFile (std::string filePath) else if (bitDepth == 24) { int32_t sampleAsIntAgain = AudioSampleConverter::sampleToTwentyFourBitInt (samples[channel][i]); - + uint8_t bytes[3]; bytes[0] = (uint8_t) (sampleAsIntAgain >> 16) & 0xFF; bytes[1] = (uint8_t) (sampleAsIntAgain >> 8) & 0xFF; bytes[2] = (uint8_t) sampleAsIntAgain & 0xFF; - + fileData.push_back (bytes[0]); fileData.push_back (bytes[1]); fileData.push_back (bytes[2]); @@ -1370,14 +1370,14 @@ bool AudioFile::saveToAiffFile (std::string filePath) addInt32ToFileData (fileData, iXMLChunkSize, Endianness::BigEndian); addStringToFileData (fileData, iXMLChunk); } - + // check that the various sizes we put in the metadata are correct if (fileSizeInBytes != static_cast (fileData.size() - 8) || soundDataChunkSize != getNumSamplesPerChannel() * numBytesPerFrame + 8) { reportError ("ERROR: couldn't save file to " + filePath); return false; } - + // try to write the file return writeDataToFile (fileData, filePath); } @@ -1387,7 +1387,7 @@ template bool AudioFile::writeDataToFile (std::vector& fileData, std::string filePath) { std::ofstream outputFile (filePath, std::ios::binary); - + if (outputFile.is_open()) { for (size_t i = 0; i < fileData.size(); i++) @@ -1395,12 +1395,12 @@ bool AudioFile::writeDataToFile (std::vector& fileData, std::string char value = (char) fileData[i]; outputFile.write (&value, sizeof (char)); } - + outputFile.close(); - + return true; } - + return false; } @@ -1417,7 +1417,7 @@ template void AudioFile::addInt32ToFileData (std::vector& fileData, int32_t i, Endianness endianness) { uint8_t bytes[4]; - + if (endianness == Endianness::LittleEndian) { bytes[3] = (i >> 24) & 0xFF; @@ -1432,7 +1432,7 @@ void AudioFile::addInt32ToFileData (std::vector& fileData, int32_t i bytes[2] = (i >> 8) & 0xFF; bytes[3] = i & 0xFF; } - + for (int i = 0; i < 4; i++) fileData.push_back (bytes[i]); } @@ -1442,7 +1442,7 @@ template void AudioFile::addInt16ToFileData (std::vector& fileData, int16_t i, Endianness endianness) { uint8_t bytes[2]; - + if (endianness == Endianness::LittleEndian) { bytes[1] = (i >> 8) & 0xFF; @@ -1453,7 +1453,7 @@ void AudioFile::addInt16ToFileData (std::vector& fileData, int16_t i bytes[0] = (i >> 8) & 0xFF; bytes[1] = i & 0xFF; } - + fileData.push_back (bytes[0]); fileData.push_back (bytes[1]); } @@ -1466,7 +1466,7 @@ void AudioFile::clearAudioBuffer() { samples[i].clear(); } - + samples.clear(); } @@ -1475,7 +1475,7 @@ template AudioFileFormat AudioFile::determineAudioFileFormat (std::vector& fileData) { std::string header (fileData.begin(), fileData.begin() + 4); - + if (header == "RIFF") return AudioFileFormat::Wave; else if (header == "FORM") @@ -1491,12 +1491,12 @@ int32_t AudioFile::fourBytesToInt (std::vector& source, int startInd if (source.size() >= (startIndex + 4)) { int32_t result; - + if (endianness == Endianness::LittleEndian) result = (source[startIndex + 3] << 24) | (source[startIndex + 2] << 16) | (source[startIndex + 1] << 8) | source[startIndex]; else result = (source[startIndex] << 24) | (source[startIndex + 1] << 16) | (source[startIndex + 2] << 8) | source[startIndex + 3]; - + return result; } else @@ -1511,12 +1511,12 @@ template int16_t AudioFile::twoBytesToInt (std::vector& source, int startIndex, Endianness endianness) { int16_t result; - + if (endianness == Endianness::LittleEndian) result = (source[startIndex + 1] << 8) | source[startIndex]; else result = (source[startIndex] << 8) | source[startIndex + 1]; - + return result; } @@ -1526,18 +1526,18 @@ int AudioFile::getIndexOfString (std::vector& source, std::string st { int index = -1; int stringLength = (int)stringToSearchFor.length(); - + for (size_t i = 0; i < source.size() - stringLength;i++) { std::string section (source.begin() + i, source.begin() + i + stringLength); - + if (section == stringToSearchFor) { index = static_cast (i); break; } } - + return index; } @@ -1546,7 +1546,7 @@ template int AudioFile::getIndexOfChunk (std::vector& source, const std::string& chunkHeaderID, int startIndex, Endianness endianness) { constexpr int dataLen = 4; - + if (chunkHeaderID.size() != dataLen) { assert (false && "Invalid chunk header ID string"); @@ -1562,11 +1562,11 @@ int AudioFile::getIndexOfChunk (std::vector& source, const std::stri } i += dataLen; - + // If somehow we don't have 4 bytes left to read, then exit with -1 if ((i + 4) >= source.size()) return -1; - + auto chunkSize = fourBytesToInt (source, i, endianness); i += (dataLen + chunkSize); } @@ -1587,9 +1587,9 @@ template typename std::make_unsigned::type convertSignedToUnsigned (SignedType signedValue) { static_assert (std::is_signed::value, "The input value must be signed"); - + typename std::make_unsigned::type unsignedValue = static_cast::type> (1) + std::numeric_limits::max(); - + unsignedValue += signedValue; return unsignedValue; } diff --git a/otherarch/ttscpp/include/phonemizer.h b/otherarch/ttscpp/include/phonemizer.h index 6167a6818..0e401de74 100644 --- a/otherarch/ttscpp/include/phonemizer.h +++ b/otherarch/ttscpp/include/phonemizer.h @@ -12,7 +12,7 @@ #include #include #include -#include "tokenizer.h" +#include "ttstokenizer.h" #include #include @@ -33,16 +33,16 @@ static const std::unordered_set ONE_LETTER_WORDS = { "i", }; /* - * The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words + * The two letter and three letter words listed below have been filtered down from the complete list of english two and three letter words * via several criteria: * 1. All non-EN-US words have been removed * 2. All three letter acronyms have been removed (as these lists are used to identify acronyms) - * 3. All archaic, deprecated, or poetic words have been removed. - * 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the - * last 10 years). - * - * After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US - * vernacular but was not identified as of American origin was reintroduced into the sets below. + * 3. All archaic, deprecated, or poetic words have been removed. + * 4. All literary, abbreviative, and slang words have been removed if they see no more than a mean of 30 daily searches via google (over the + * last 10 years). + * + * After the lists were filtered by the criteria described above, removed items were reviewed. Any item which had entered the common EN-US + * vernacular but was not identified as of American origin was reintroduced into the sets below. */ static const std::unordered_set TWO_LETTER_WORDS = { "ab", "ah", "am", "an", "as", "at", "aw", "ax", "ay", "be", "bo", "br", @@ -50,7 +50,7 @@ static const std::unordered_set TWO_LETTER_WORDS = { "id", "if", "in", "is", "it", "la", "lo", "ma", "me", "mm", "my", "na", "no", "of", "oh", "oi", "on", "oo", "or", "ow", "ox", "oy", "pa", "qi", "re", "sh", "so", "to", "uh", "um", "un", "up", "us", "we", "wo", "ya", - "ye", "yo", + "ye", "yo", }; static const std::unordered_set THREE_LETTER_WORDS = { "aah", "abs", "aby", "ace", "ach", "ack", "act", "add", "ado", "ads", "aft", "age", @@ -292,7 +292,7 @@ static std::string STOPPING_TOKENS = ".,:;!?"; #ifdef ESPEAK_INSTALL /** - * espeak-ng uses globals to persist and manage its state so it is not compatible with + * espeak-ng uses globals to persist and manage its state so it is not compatible with * threaded parallelism (https://github.com/espeak-ng/espeak-ng/issues/1527). * This singleton acts as a mutex wrapped provider for all espeak phonemization methods such * that multiple instances of the kokoro_runner can be initialized and called in parallel. @@ -323,7 +323,7 @@ public: #endif enum lookup_code { - SUCCESS = 100, + SUCCESS_TOTAL = 100, SUCCESS_PARTIAL = 101, FAILURE_UNFOUND = 200, FAILURE_PHONETIC = 201, @@ -368,7 +368,7 @@ struct conditions { void update_for_word(std::string word,bool allow_for_upper_check = true); }; -/* +/* * The corpus struct is simply a small wrapper class that is used to perform simple look forward and backwards in the text * which is being phonemized. This can be used to discern how to convert chunks of text in a consistent and protective fashion * in order to accurately phonemize complicated text. @@ -376,7 +376,7 @@ struct conditions { struct corpus { corpus(const char * text, size_t size): size(size), text(text) {}; size_t location = 0; - size_t size; + size_t size; const char * text; /* @@ -397,9 +397,9 @@ struct corpus { std::string after_until(int after, std::string val); }; -/* +/* * The TTS phonemizer works by splitting each word into distinct graphemes, and for each grapheme the phonemizer will look at the grapheme that came - * before, after, and for any word specific exceptions in order to compile a + * before, after, and for any word specific exceptions in order to compile a */ struct phonemizer_rule { ~phonemizer_rule() { @@ -436,10 +436,10 @@ private: struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta); -/* +/* * The general translation approach that espeak uses is to lookup words in the dictionary and return a list of possible matches per lookup. * Each match contains flags which describe the match's conditions and limitations and optionally a pronunciation. When a pronunciation is not returned, - * it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a + * it usually means that the word needs to be pronounced phonetically, the word belongs to another language, or that the original content is a * token representation of a different word (e.g. with numbers). * * Since it does not make sense to have the core lexer reperform this lookup operation with represented words or via distinct languages, those behaviors @@ -470,7 +470,7 @@ struct phoneme_dictionary { struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta); -/* +/* * In general, I would like to avoid requiring the installation of otherwise broad and technically complicated libraries, * like espeak, especially when they are only being used for a small portion of their overall functionality. While avoiding these * requirements will keep the default installation cost of TTS.cpp down, it is also unlikely that TTS.cpp will support @@ -478,8 +478,8 @@ struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta); * espeak. As such, the phonemizer struct described below will support simple text to IPA phoneme functionality out of the box, * while also optionally acting as an interface for espeak phonemization. * - * Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context - * views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves + * Phonemization seems to use a pattern close to the common lexer, such that at each index or chunk of text forward and backward context + * views are used to support single pass translation. As such, the TTS.cpp phonemization pattern I've decided to implement behaves * effecively like a simple router lexer. It will only support utf-8 encoded text and english IPA conversion. */ struct phonemizer { diff --git a/otherarch/ttscpp/include/ttscommon.h b/otherarch/ttscpp/include/ttscommon.h index c3a1a1c80..df380704e 100644 --- a/otherarch/ttscpp/include/ttscommon.h +++ b/otherarch/ttscpp/include/ttscommon.h @@ -28,7 +28,7 @@ const std::map SUPPORTED_ARCHITECTURES = { { "orpheus", ORPHEUS_ARCH } }; -/// Given a map from keys to values, creates a new map from values to keys +/// Given a map from keys to values, creates a new map from values to keys template static std::map reverse_map(const std::map& m) { std::map r; @@ -43,10 +43,10 @@ const std::map ARCHITECTURE_NAMES = reverse_map(SUPPORTED struct generation_configuration { generation_configuration( std::string voice = "", - int top_k = 50, - float temperature = 1.0, - float repetition_penalty = 1.0, - bool use_cross_attn = true, + int top_k = 50, + float temperature = 1.0, + float repetition_penalty = 1.0, + bool use_cross_attn = true, std::string espeak_voice_id = "", int max_tokens = 0, float top_p = 1.0, diff --git a/otherarch/ttscpp/src/dac_model.h b/otherarch/ttscpp/src/dac_model.h index be43ad02d..f0ae96d03 100644 --- a/otherarch/ttscpp/src/dac_model.h +++ b/otherarch/ttscpp/src/dac_model.h @@ -22,13 +22,13 @@ struct dac_quantize_layer { // this struct maintains the static tensors for the dac audio decoder graph. // As such, this is designed to contain basic configuration and ggml tensor support for DAC. // The dac_runner describes how the graph is built and run. -struct dac_model : tts_model { +struct dac_model : tts_model { // These configs are essentially built for the 44khZ 8kbps standard DAC model audio encoder and decoder uint32_t n_layers = 4; uint32_t n_heads = 9; uint32_t up_sampling_factor = 512; uint32_t max_generation_size = 2580; - + struct ggml_tensor * in_conv_kernel; struct ggml_tensor * in_conv_bias; struct ggml_tensor * out_conv_kernel; @@ -53,11 +53,11 @@ void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * // the context used for running the dac model struct dac_context : runner_context { dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {}; - + struct dac_model * model; - + struct ggml_tensor * inp_tokens; - + void build_schedule() { runner_context::build_schedule(model->max_nodes()); } @@ -85,11 +85,11 @@ struct dac_runner : tts_runner { } dac_model * model; dac_context * dctx; - + void init_build() { tts_runner::init_build(&dctx->buf_compute_meta); } - + void prepare_post_load(); struct ggml_cgraph * build_dac_graph(dac_ubatch & batch); void run(uint32_t * input_tokens, uint32_t sequence_length, struct tts_response * outputs); diff --git a/otherarch/ttscpp/src/dia_model.cpp b/otherarch/ttscpp/src/dia_model.cpp index bd6dfd43a..d7ec2685f 100644 --- a/otherarch/ttscpp/src/dia_model.cpp +++ b/otherarch/ttscpp/src/dia_model.cpp @@ -119,7 +119,7 @@ void dia_model::assign_to_decoder_layer(std::string part, dia_decoder_layer * la set_tensor(layer->self_attn_norm, tensor); } else if (part == "pre_mlp_norm") { layer->mlp_norm = ggml_dup_tensor(ctx, tensor); - set_tensor(layer->mlp_norm, tensor); + set_tensor(layer->mlp_norm, tensor); } else if (part == "pre_ca_norm") { layer->cross_attn_norm = ggml_dup_tensor(ctx, tensor); set_tensor(layer->cross_attn_norm, tensor); @@ -151,7 +151,7 @@ void dia_model::prep_layers() { dia_decoder_layer * l = new dia_decoder_layer; decoder->layers.push_back(l); } - + decoder->embds.reserve((size_t) n_output_heads); decoder->heads.reserve((size_t) n_output_heads); for (int i = 0; i < n_output_heads; i++) { @@ -196,7 +196,7 @@ void dia_model::prep_constants(gguf_context * meta) { int encoder_attn_heads_key = gguf_find_key(meta, "dia.encoder.attn_heads"); if (encoder_attn_heads_key != -1) { encoder_attn_heads = gguf_get_val_u32(meta, encoder_attn_heads_key); - } + } int head_size_key = gguf_find_key(meta, "dia.attn_head_size"); if (head_size_key != -1) { @@ -271,7 +271,7 @@ struct dia_context * build_new_dia_context(struct dia_model * model, int n_threa return dctx; } -static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) { +static bool dia_kv_cache_init(struct dia_kv_cache * cache, dia_model * model, dia_context * dctx) { ggml_backend_buffer_type_t buft = nullptr; // this will only really support cpu or metal for the time being; if (dctx->backend != nullptr) { @@ -382,7 +382,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo struct ggml_tensor * cur = ggml_reshape_3d(ctx, ggml_get_rows(ctx, model->encoder->embedding, dctx->inp_tokens), model->encoder_hidden_size, model->max_encoder_context_length, 2); for (auto layer : model->encoder->layers) { struct ggml_tensor * residual = cur; - + cur = dia_layer_norm(ctx, cur, layer->self_attn_norm); // self-attention { @@ -402,7 +402,7 @@ static struct ggml_tensor * build_dia_encoder(ggml_context * ctx, dia_model * mo struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3); // It is unclear why the attention ops in Dia's encoder don't project to the embedding dimension size as is standard. Instead they up project to the decoder's embedding dimension - // then down project back the the encoder embedding dimension. + // then down project back the the encoder embedding dimension. cur = ggml_cont_3d(ctx, kqv_merged, model->decoder_hidden_size, model->max_encoder_context_length, 2); cur = ggml_mul_mat(ctx, layer->o, cur); } @@ -443,10 +443,10 @@ static struct ggml_tensor * repeat_interleave_dim1(ggml_context * ctx, struct gg static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * k, struct ggml_tensor * v, dia_ubatch & batch, int layer_index) { int64_t attn_size = model->head_size * model->decoder_attn_heads; - struct ggml_tensor * k_cache_view = + struct ggml_tensor * k_cache_view = ggml_view_2d( - ctx, kv->k_l[layer_index], attn_size, 2, - attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]), + ctx, kv->k_l[layer_index], attn_size, 2, + attn_size * model->max_generation_size * ggml_element_size(kv->k_l[layer_index]), attn_size*dctx->current_position*ggml_element_size(kv->k_l[layer_index])); k = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, k, model->head_size, model->decoder_attn_heads / model->decoder_query_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2); @@ -461,8 +461,8 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_ struct ggml_tensor * v_cache_view = nullptr; v_cache_view = ggml_view_2d( - ctx, kv->v_l[layer_index], attn_size, 2, - attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]), + ctx, kv->v_l[layer_index], attn_size, 2, + attn_size * model->max_generation_size * ggml_element_size(kv->v_l[layer_index]), attn_size*dctx->current_position*ggml_element_size(kv->v_l[layer_index])); // Since the sequence length should always be 1 here this is the most pertinent time to repeat the heads for grouped query attention. @@ -476,11 +476,11 @@ static void build_dia_self_kv_store(ggml_context * ctx, dia_context * dctx, dia_ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia_model * model, dia_kv_cache * kv, ggml_cgraph * gf, struct ggml_tensor * encoder_hidden_states, int layer_index) { dia_decoder_layer * layer = model->decoder->layers[layer_index]; struct ggml_tensor * encoder_states_key_view = ggml_cont(ctx, ggml_view_3d( - ctx, - encoder_hidden_states, - model->encoder_hidden_size, - dctx->prompt_size, - 2, + ctx, + encoder_hidden_states, + model->encoder_hidden_size, + dctx->prompt_size, + 2, model->encoder_hidden_size * ggml_element_size(encoder_hidden_states), model->encoder_hidden_size * model->max_encoder_context_length * ggml_element_size(encoder_hidden_states), 0)); struct ggml_tensor * k = ggml_mul_mat(ctx, layer->cross_attn_k, encoder_states_key_view); @@ -491,8 +491,8 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia struct ggml_tensor * k_cache_view = ggml_view_4d( - ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size, - model->head_size*ggml_element_size(kv->cross_k_l[layer_index]), + ctx, kv->cross_k_l[layer_index], model->head_size, model->decoder_attn_heads, 2, dctx->prompt_size, + model->head_size*ggml_element_size(kv->cross_k_l[layer_index]), model->head_size*model->decoder_attn_heads*ggml_element_size(kv->cross_k_l[layer_index]), model->head_size*model->decoder_attn_heads*2*ggml_element_size(kv->cross_k_l[layer_index]), 0); @@ -504,10 +504,10 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia struct ggml_tensor * v_cache_view = ggml_view_4d( - ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2, - model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]), - model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]), - model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]), + ctx, kv->cross_v_l[layer_index], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2, + model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]), + model->head_size*model->max_encoder_context_length*ggml_element_size(kv->cross_v_l[layer_index]), + model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(kv->cross_v_l[layer_index]), 0); ggml_build_forward_expand(gf, ggml_cpy(ctx, v, v_cache_view)); @@ -515,11 +515,11 @@ static void build_dia_cross_kv_store(ggml_context * ctx, dia_context * dctx, dia static struct ggml_tensor * build_dia_decoder( ggml_cgraph * gf, - ggml_context * ctx, - dia_model * model, - dia_context * dctx, - dia_kv_cache * cache, - dia_ubatch & batch, + ggml_context * ctx, + dia_model * model, + dia_context * dctx, + dia_kv_cache * cache, + dia_ubatch & batch, struct ggml_tensor * encoder_hidden_states) { dctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length); ggml_set_input(dctx->positions); @@ -528,7 +528,7 @@ static struct ggml_tensor * build_dia_decoder( for (int l = 0; l < model->decoder->layers.size(); l++){ dia_decoder_layer * layer = model->decoder->layers[l]; struct ggml_tensor * residual = cur; - + cur = dia_layer_norm(ctx, cur, layer->self_attn_norm); // self-attention { @@ -546,13 +546,13 @@ static struct ggml_tensor * build_dia_decoder( 0); k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); - struct ggml_tensor * v = + struct ggml_tensor * v = ggml_view_3d(ctx, cache->v_l[l], model->head_size * model->decoder_attn_heads, dctx->current_position + 1, 2, ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size, ggml_element_size(cache->v_l[l]) * model->decoder_attn_heads * model->head_size * model->max_generation_size, 0); - v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2); + v = ggml_cont_4d(ctx, ggml_transpose(ctx, v), dctx->current_position + 1, model->head_size, model->decoder_attn_heads, 2); // As noted in the encoder Dia uses the Neo-X protocol for RoPE. Qcur = ggml_rope(ctx, ggml_cont(ctx, ggml_reshape_4d(ctx, Qcur, model->head_size, model->decoder_attn_heads, batch.sequence_length, 2)), dctx->positions, model->head_size, 2); @@ -583,22 +583,22 @@ static struct ggml_tensor * build_dia_decoder( build_dia_cross_kv_store(ctx, dctx, model, cache, gf, encoder_hidden_states, l); } - struct ggml_tensor * cross_k = + struct ggml_tensor * cross_k = ggml_view_4d( ctx, cache->cross_k_l[l], model->head_size, model->decoder_attn_heads, 2, - model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]), - model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]), - model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]), + model->max_encoder_context_length, model->head_size*ggml_element_size(cache->cross_k_l[l]), + model->head_size*model->decoder_attn_heads*ggml_element_size(cache->cross_k_l[l]), + model->head_size*model->decoder_attn_heads*2*ggml_element_size(cache->cross_k_l[l]), 0); // the double permute operation shouldn't be necessary here, but it seems that currently ggml permute only currently alows for a single // axis pair to be transposed. cross_k = ggml_cont(ctx, ggml_permute(ctx, ggml_permute(ctx, cross_k, 0, 1, 3, 2), 0, 2, 1, 3)); - struct ggml_tensor * cross_v = + struct ggml_tensor * cross_v = ggml_cont(ctx, ggml_view_4d( ctx, cache->cross_v_l[l], model->max_encoder_context_length, model->head_size, model->decoder_attn_heads, 2, - model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]), - model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]), + model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]), + model->head_size*model->max_encoder_context_length*ggml_element_size(cache->cross_v_l[l]), model->head_size*model->max_encoder_context_length*model->decoder_attn_heads*ggml_element_size(cache->cross_v_l[l]), 0)); @@ -637,10 +637,10 @@ static struct ggml_tensor * build_dia_decoder( } void dia_runner::tokenize_sentence(std::string sentence, dia_ubatch & batch) { - // Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as - // a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to + // Dia's tokenization process is unusual. Essentially Dia takes the byte value for each character and uses that as + // a token array. Additionally, because Dia performs a cfg-scale adjustment before sampling tokens, it is necessary to // generate with a conditioned context (i.e. with the text) and an unconditioned context (i.e. without any text) so that - // proper adjustments can be perfored at each generation step. This means that we need to pad the end of our tokens to the + // proper adjustments can be perfored at each generation step. This means that we need to pad the end of our tokens to the // max context size for both the conditional and unconditional sequence. // if the sentence isn't prepended by dialogue start tokens, [S1] or [S2], then append one. @@ -699,7 +699,7 @@ dia_ubatch dia_runner::batch_from_sentence(std::string sentence) { * 1. Dia cleans its output generation by adding the difference between its text based output (its conditional output) and its unconditional output * to the conditional ouput before sampling. This is why the batch is set to two throughout the graph. * - * 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the + * 2. Dia's decoder attends across the entire encoded space including the pad buffer which receives a unique attention mask. This is why the * encoder sequence is always max length. */ struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) { @@ -716,7 +716,7 @@ struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) { ggml_set_name(cur, "decoder_output"); ggml_build_forward_expand(gf, cur); free_build(); - + return gf; } @@ -758,11 +758,11 @@ int dia_runner::decode(dia_ubatch & batch) { dctx->output_tokens.reserve(dctx->max_generation_size * model->n_output_heads); } ggml_backend_sched_reset(dctx->sched); - + const size_t logits_size = model->output_vocab_size * dctx->max_generation_size * model->n_output_heads; const size_t prev_size = dctx->buf_output ? ggml_backend_buffer_get_size(dctx->buf_output) : 0; const size_t new_size = logits_size * sizeof(float); - + if (!dctx->buf_output || prev_size < new_size) { if (dctx->buf_output) { ggml_backend_buffer_free(dctx->buf_output); @@ -772,7 +772,7 @@ int dia_runner::decode(dia_ubatch & batch) { dctx->buf_output = ggml_backend_buft_alloc_buffer(dctx->backend_cpu_buffer, new_size); } - + dctx->logits = (float *) ggml_backend_buffer_get_base(dctx->buf_output); ggml_cgraph * gf = build_dia_graph(batch); @@ -817,7 +817,7 @@ bool dia_runner::check_stopping(dia_ubatch & batch) { if (dctx->delay_steps == -1 && (batch.audio_tokens[0] == model->eos_token_id || dctx->current_position >= dctx->max_generation_size - model->max_delay)) { dctx->delay_steps = model->max_delay; } - + if (dctx->delay_steps > 0) { int step_after_eos = model->max_delay - dctx->delay_steps; for (int i = 0; i < model->delay_pattern.size(); i++) { @@ -907,5 +907,5 @@ void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) { dac_runner->model->assign_weight(name.substr(14), tensor); } else { model->assign_weight(name, tensor); - } + } } diff --git a/otherarch/ttscpp/src/dia_model.h b/otherarch/ttscpp/src/dia_model.h index bdca91d8c..6936b3945 100644 --- a/otherarch/ttscpp/src/dia_model.h +++ b/otherarch/ttscpp/src/dia_model.h @@ -1,7 +1,7 @@ #pragma once #include "dac_model.h" -#include "sampler.h" +#include "ttssampler.h" struct dia_encoder_layer { struct ggml_tensor * k; @@ -22,7 +22,7 @@ struct dia_decoder_layer { struct ggml_tensor * self_attn_v; struct ggml_tensor * self_attn_o; struct ggml_tensor * self_attn_norm; - + struct ggml_tensor * cross_attn_k; struct ggml_tensor * cross_attn_q; struct ggml_tensor * cross_attn_v; @@ -76,7 +76,7 @@ struct dia_model : tts_model { dia_encoder * encoder; dia_decoder * decoder; - + void assign_weight(std::string name, ggml_tensor * tensor); void assign_to_encoder(std::vector parts, struct ggml_tensor * tensor, std::string name); void assign_to_decoder(std::vector parts, struct ggml_tensor * tensor, std::string name); @@ -103,15 +103,15 @@ struct dia_context : runner_context { uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model. std::vector output_tokens; - struct dia_model * model; - + struct dia_model * model; + struct ggml_tensor * inp_tokens; struct ggml_tensor * audio_inp_tokens; struct ggml_tensor * positions; struct ggml_tensor * encode_positions; struct ggml_tensor * encode_attn_mask; struct ggml_tensor * cross_attn_mask; - + void build_schedule() { runner_context::build_schedule(model->max_nodes()); } @@ -126,11 +126,11 @@ struct dia_kv_cache { std::vector k_l; std::vector v_l; - + struct ggml_context * ctx; ggml_backend_buffer_type_t buft; ggml_backend_buffer_t buf; - + void free() { ggml_free(ctx); ggml_backend_buffer_free(buf); diff --git a/otherarch/ttscpp/src/general_neural_audio_codec.h b/otherarch/ttscpp/src/general_neural_audio_codec.h index 1ec0a42b7..97180e49a 100644 --- a/otherarch/ttscpp/src/general_neural_audio_codec.h +++ b/otherarch/ttscpp/src/general_neural_audio_codec.h @@ -53,7 +53,7 @@ namespace general_neural_audio_codec { uint32_t padding; uint32_t stride; - + std::vector residual_blocks; }; diff --git a/otherarch/ttscpp/src/kokoro_model.h b/otherarch/ttscpp/src/kokoro_model.h index b4f4f9671..7ffa9eba6 100644 --- a/otherarch/ttscpp/src/kokoro_model.h +++ b/otherarch/ttscpp/src/kokoro_model.h @@ -3,11 +3,11 @@ #include #include "tts_model.h" -#include "tokenizer.h" +#include "ttstokenizer.h" #include "phonemizer.h" // Rather than using ISO 639-2 language codes, Kokoro voice pack specify their corresponding language via their first letter. -// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the +// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the // appropriate phonemization protocol can inferred from the Kokoro voice. static std::map KOKORO_LANG_TO_ESPEAK_ID = { {'a', "gmw/en-US"}, @@ -22,7 +22,7 @@ static std::map KOKORO_LANG_TO_ESPEAK_ID = { }; struct lstm_cell { - std::vector weights; + std::vector weights; std::vector biases; std::vector reverse_weights; std::vector reverse_biases; @@ -197,8 +197,8 @@ struct kokoro_model : tts_model { // standard configuration for duration prediction uint32_t f0_n_blocks = 3; uint32_t n_duration_prediction_layers = 3; - // while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to - // allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each + // while it is technically possible for the duration predictor to assign 50 values per token there is no practical need to + // allocate that many items to the sequence as it is impossible for all tokens to require such long durations and each // allocation increases node allocation size by O(N) uint32_t max_duration_per_token = 20; uint32_t style_half_size = 128; @@ -221,7 +221,7 @@ struct kokoro_model : tts_model { float noise_std = 0.003f; float voice_threshold = 10.0f; float sample_rate = 24000.0f; - std::string window = "hann"; + std::string window = "hann"; // It is really annoying that ggml doesn't allow using non ggml tensors as the operator for simple math ops. // This is just the constant defined above as a tensor. @@ -259,7 +259,7 @@ struct kokoro_model : tts_model { // Decoding and Generation portion of the model struct kokoro_decoder * decoder; - // the default hidden states need to be initialized + // the default hidden states need to be initialized std::vector lstms; size_t duration_node_counter = 0; @@ -317,15 +317,15 @@ struct kokoro_duration_context : runner_context { ~kokoro_duration_context() { ggml_backend_buffer_free(buf_len_output); } - + std::string voice = "af_alloy"; struct kokoro_model * model; ggml_backend_buffer_t buf_len_output = nullptr; - + size_t logits_size = 0; // capacity (of floats) for logits float * lens = nullptr; - + struct ggml_tensor * inp_tokens; struct ggml_tensor * positions; struct ggml_tensor * attn_mask; @@ -356,7 +356,7 @@ struct kokoro_duration_response { }; // This struct is intended to manage graph and compute for the duration prediction portion of the kokoro model. -// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't +// Duration computation and speech generation are separated into distinct graphs because the precomputed graph structure of ggml doesn't // support the tensor dependent views that would otherwise be necessary. struct kokoro_duration_runner : tts_runner { kokoro_duration_runner(kokoro_model * model, kokoro_duration_context * context, single_pass_tokenizer * tokenizer): model(model), kctx(context), tokenizer(tokenizer) {}; @@ -375,7 +375,7 @@ struct kokoro_duration_runner : tts_runner { void init_build() { tts_runner::init_build(&kctx->buf_compute_meta); } - + void prepare_post_load(); struct kokoro_ubatch build_worst_case_batch(); void set_inputs(kokoro_ubatch & batch); @@ -397,7 +397,7 @@ struct kokoro_context : runner_context { } std::string voice = "af_alloy"; - + struct kokoro_model * model; uint32_t total_duration; @@ -408,7 +408,7 @@ struct kokoro_context : runner_context { struct ggml_tensor * duration_mask; struct ggml_tensor * window_sq_sum; // needs to be calculatd from the generator window. struct ggml_tensor * uv_noise_data; - + void build_schedule() { runner_context::build_schedule(model->max_gen_nodes()*30); } diff --git a/otherarch/ttscpp/src/orpheus_model.cpp b/otherarch/ttscpp/src/orpheus_model.cpp index 4866af208..1fba00ee1 100644 --- a/otherarch/ttscpp/src/orpheus_model.cpp +++ b/otherarch/ttscpp/src/orpheus_model.cpp @@ -150,7 +150,7 @@ orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads return octx; } -void orpheus_runner::orpheus_kv_cache_init() { +void orpheus_runner::orpheus_kv_cache_init() { ggml_backend_buffer_type_t buft = nullptr; if (octx->backend != nullptr) { #ifdef GGML_USE_METAL @@ -192,21 +192,21 @@ void orpheus_runner::orpheus_kv_cache_init() { } void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) { - k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies, + k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies, model->head_size, 2,0, 500000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave, // and performing the repeat operation upfront before performign a single copy needs to be performed in order to better optimize this function. - // Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us + // Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us // from incrementally larger transpositions with generation. for (int i = 0; i < repeat; i++) { struct ggml_tensor * k_cache_view = ggml_view_3d( - ctx, - kv_self->k_l[index], + ctx, + kv_self->k_l[index], model->head_size, model->n_kv_attn_heads, - n_tokens, + n_tokens, ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat, ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size, ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size @@ -230,19 +230,19 @@ void orpheus_runner::orpheus_kv_cache_init() { struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) { init_build(); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); - + struct ggml_tensor * cur; struct ggml_tensor * inpL; - + const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens; octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); ggml_set_input(octx->positions); octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); ggml_set_input(octx->inp_tokens); inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens); - + struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch); - + for (int l = 0; l < model->n_layers; l++) { struct ggml_tensor * residual = inpL; cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm); @@ -261,8 +261,8 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) model->head_size, full_sequence_length, model->n_attn_heads, ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size, ggml_element_size(kv_self->k_l[l]) * model->head_size, - 0)); - + 0)); + struct ggml_tensor * v = ggml_view_2d(ctx, kv_self->v_l[l], model->hidden_size, full_sequence_length, @@ -272,7 +272,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads); Qcur = ggml_rope_ext( - ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)), + ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)), octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); @@ -286,7 +286,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) } cur = ggml_add(ctx, attn_out, residual); - + struct ggml_tensor * residualffn = cur; // mlp @@ -298,7 +298,7 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) cur = ggml_add(ctx, cur, residualffn); inpL = cur; } - + cur = orpheus_build_layer_norm(ctx, cur, model->output_norm); // only about 40k of the output head is actually uses for generation purposes. Ideally the head tensor should be shrunk and sampled tokens should be incremented. cur = ggml_mul_mat(ctx, model->head, cur); @@ -307,15 +307,15 @@ struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) } ggml_build_forward_expand(gf, cur); free_build(); - + return gf; } void orpheus_runner::decode(orpheus_ubatch & batch) { ggml_backend_sched_reset(octx->sched); - + octx->output_tokens.reserve(model->max_generation_size); - + const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float); octx->prep_output_buffer(new_size); @@ -324,10 +324,10 @@ void orpheus_runner::decode(orpheus_ubatch & batch) { // the output is always the last tensor in the graph struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; ggml_backend_sched_alloc_graph(octx->sched, gf); - + set_inputs(batch); ggml_backend_sched_graph_compute_async(octx->sched, gf); - + float * logits_out = octx->logits + octx->n_outputs * model->vocab_size; octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float)); diff --git a/otherarch/ttscpp/src/orpheus_model.h b/otherarch/ttscpp/src/orpheus_model.h index 9f02d7697..7be342d29 100644 --- a/otherarch/ttscpp/src/orpheus_model.h +++ b/otherarch/ttscpp/src/orpheus_model.h @@ -1,7 +1,7 @@ #pragma once -#include "sampler.h" -#include "tokenizer.h" +#include "ttssampler.h" +#include "ttstokenizer.h" #include "snac_model.h" // Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads. @@ -73,7 +73,7 @@ struct orpheus_context : runner_context { struct ggml_tensor * positions; }; -struct orpheus_kv_cache { +struct orpheus_kv_cache { ggml_type cache_type = GGML_TYPE_F32; std::vector k_l; @@ -104,11 +104,11 @@ struct orpheus_ubatch { struct orpheus_runner : tts_runner { orpheus_runner( - orpheus_model * model, - snac_runner * audio_decoder, - orpheus_context * octx, - bpe_tokenizer * bt, - sampler * samp, + orpheus_model * model, + snac_runner * audio_decoder, + orpheus_context * octx, + bpe_tokenizer * bt, + sampler * samp, orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) { tts_runner::sampling_rate = 24000.0f; generation_sampler->n_output_heads = 1; diff --git a/otherarch/ttscpp/src/parler_model.h b/otherarch/ttscpp/src/parler_model.h index 463910f49..e6c3ba41f 100644 --- a/otherarch/ttscpp/src/parler_model.h +++ b/otherarch/ttscpp/src/parler_model.h @@ -2,8 +2,8 @@ #define parler_model_h #include "dac_model.h" -#include "t5_encoder_model.h" -#include "sampler.h" +#include "ttst5_encoder_model.h" +#include "ttssampler.h" enum parler_tensor { PARLER_EMBD, @@ -38,17 +38,17 @@ struct parler_layer { struct ggml_tensor * self_attn_o_proj; struct ggml_tensor * self_attn_norm; struct ggml_tensor * self_attn_norm_bias; - + struct ggml_tensor * attn_k_proj; struct ggml_tensor * attn_q_proj; struct ggml_tensor * attn_v_proj; struct ggml_tensor * attn_o_proj; struct ggml_tensor * attn_norm; struct ggml_tensor * attn_norm_bias; - + struct ggml_tensor * cross_k; struct ggml_tensor * cross_v; - + struct ggml_tensor * fc1; struct ggml_tensor * fc2; struct ggml_tensor * final_norm; @@ -74,18 +74,18 @@ struct parler_tts_model : tts_model { uint32_t prompt_vocab_size; bool use_cross_attn = true; - + std::vector embds; std::vector layers; std::vector heads; - + struct ggml_tensor * precomputed_input_emb; struct ggml_tensor * precomputed_positional_embds; - + struct ggml_tensor * layer_norm; struct ggml_tensor * layer_norm_bias; struct ggml_tensor * prompt_embd; - + void assign_weight(std::string name, ggml_tensor * tensor); void prep_constants(gguf_context * meta); void prep_layers(gguf_context * meta); @@ -107,21 +107,21 @@ struct parler_context : runner_context { std::vector eos_seen; bool use_cache = true; - + size_t output_size = 0; // capacity (of tokens positions) for the output buffers int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch uint32_t current_position = 0; // current position in the active sequence uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating) int32_t seq_id; // a unique identifier associated with the active sequence. - + std::vector output_tokens; - + struct ggml_tensor * inp_tokens; struct ggml_tensor * audio_inp_tokens; struct ggml_tensor * positions; struct ggml_tensor * attn_mask; struct ggml_tensor * attn_mask_cross; - + void build_schedule() { runner_context::build_schedule(model->max_nodes()); } @@ -130,17 +130,17 @@ struct parler_context : runner_context { struct parler_kv_cache { int32_t seq_id; - + ggml_type type_k = GGML_TYPE_F32; ggml_type type_v = GGML_TYPE_F32; std::vector k_l; std::vector v_l; - + struct ggml_context * ctx; ggml_backend_buffer_type_t buft; ggml_backend_buffer_t buf; - + void free() { ggml_free(ctx); ggml_backend_buffer_free(buf); @@ -152,8 +152,8 @@ struct parler_kv_cache { }; struct parler_ubatch { - parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length, - uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order, + parler_ubatch(bool audio_generation, size_t n_tokens, size_t n_audio_tokens, size_t sequence_length, + uint32_t * tokens, uint32_t * audio_tokens, uint32_t * positions, uint32_t * true_order, int current_step): audio_generation(audio_generation), n_tokens(n_tokens), n_audio_tokens(n_audio_tokens), sequence_length(sequence_length), tokens(tokens), audio_tokens(audio_tokens), positions(positions), true_order(true_order), current_step(current_step) {}; parler_ubatch() {}; bool audio_generation; // whether we are receiving codebook decoded tokens or text tokens diff --git a/otherarch/ttscpp/src/phonemizer.cpp b/otherarch/ttscpp/src/phonemizer.cpp index 36da56723..9fc58133c 100644 --- a/otherarch/ttscpp/src/phonemizer.cpp +++ b/otherarch/ttscpp/src/phonemizer.cpp @@ -543,7 +543,7 @@ dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string valu } std::vector possibilities = lookup_map.at(value); for (auto possible : possibilities) { - if (possible->code == SUCCESS || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) { + if (possible->code == SUCCESS_TOTAL || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) { return possible; } } @@ -818,7 +818,7 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor output->append(" "); } flags->update_for_word(word); - if (response->code != SUCCESS) { + if (response->code != SUCCESS_TOTAL) { word += response->after_match; output->append(response->value); text->size_pop(word.size()+unaccented_size_difference); @@ -1072,7 +1072,7 @@ dictionary_response * response_from_string(std::string value, std::string key) { bool not_at_start = key[0] == '#'; bool not_at_end = key.back() == '#'; if (!has_spacing) { - dictionary_response * resp = new dictionary_response(SUCCESS, value); + dictionary_response * resp = new dictionary_response(SUCCESS_TOTAL, value); resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number; resp->not_at_clause_end = not_at_end; resp->not_at_clause_start = not_at_start; diff --git a/otherarch/ttscpp/src/snac_model.h b/otherarch/ttscpp/src/snac_model.h index 9450c1b75..8b546dc12 100644 --- a/otherarch/ttscpp/src/snac_model.h +++ b/otherarch/ttscpp/src/snac_model.h @@ -4,7 +4,7 @@ // SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC. // The key differences are that it uses grouping in the residual units of its layers, -// performs a repeat_interleave over the second and third input channels, applies +// performs a repeat_interleave over the second and third input channels, applies // a noise convolutional layer after input encoding for each layer, and applies // an extra convolutional layer before residual layers are applied. struct snac_model : tts_model { @@ -19,7 +19,7 @@ struct snac_model : tts_model { uint32_t noise_steps[4] = {8, 64, 256, 512}; uint32_t noise_steps_sum = 840; bool use_noise = true; - + struct ggml_tensor * repeat_interleave_buffer; struct ggml_tensor * in_conv_kernel; @@ -46,12 +46,12 @@ struct snac_model : tts_model { // the context used for running the snac model struct snac_context : runner_context { snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {}; - + struct snac_model * model; - + struct ggml_tensor * inp_tokens; struct ggml_tensor * noise; - + void build_schedule() { runner_context::build_schedule(model->max_nodes()); } @@ -74,11 +74,11 @@ struct snac_runner : tts_runner { } snac_model * model; snac_context * sctx; - + void init_build() { tts_runner::init_build(&sctx->buf_compute_meta); } - + void set_inputs(std::vector> & tokens); void prepare_post_load(); struct ggml_cgraph * build_snac_graph(size_t sequence_length); diff --git a/otherarch/ttscpp/src/args.cpp b/otherarch/ttscpp/src/ttsargs.cpp similarity index 100% rename from otherarch/ttscpp/src/args.cpp rename to otherarch/ttscpp/src/ttsargs.cpp diff --git a/otherarch/ttscpp/src/sampler.cpp b/otherarch/ttscpp/src/ttssampler.cpp similarity index 98% rename from otherarch/ttscpp/src/sampler.cpp rename to otherarch/ttscpp/src/ttssampler.cpp index b2f2cc1b7..26340db82 100644 --- a/otherarch/ttscpp/src/sampler.cpp +++ b/otherarch/ttscpp/src/ttssampler.cpp @@ -1,4 +1,4 @@ -#include "sampler.h" +#include "ttssampler.h" void sampler::sample(float * logits, std::vector & output_tokens) { // assume that we are pointing to the start of the first token output; @@ -6,7 +6,7 @@ void sampler::sample(float * logits, std::vector & output_tokens) { return max(logits, output_tokens); } std::vector max_vals; - // the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or + // the max_head_probs variable is used when top-p is applied but exists to address the case in which top-k and top-p cause the cumulative probability of the nucleus to beless than or // equal to top_p; std::vector max_head_probs; @@ -189,7 +189,7 @@ void sampler::max(float * logits, std::vector & output_tokens) { uint32_t token_id = 0; for (uint32_t ii = 0; ii < vocab_size; ii++) { float v = *(logits+i*vocab_size+ii); - // while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of + // while repetition penalty will never be used for maximum token selection, it is used for the logarithmic stabilization of // the softmax function in which case it is possible for repetition counts to be set. if (has_repetition_penalty && last_token_ids[i] == ii) { v /= (pow(repetition_penalty, repetition_counts[i])); diff --git a/otherarch/ttscpp/src/sampler.h b/otherarch/ttscpp/src/ttssampler.h similarity index 99% rename from otherarch/ttscpp/src/sampler.h rename to otherarch/ttscpp/src/ttssampler.h index 0b8941e4c..58dae542f 100644 --- a/otherarch/ttscpp/src/sampler.h +++ b/otherarch/ttscpp/src/ttssampler.h @@ -21,7 +21,7 @@ struct sampler { std::vector repetition_counts; bool do_sample = true; bool apply_softmax = true; - + void sample(float * logits, std::vector & output_tokens); void softmax(float * logits, std::vector> picks, std::vector max_indices); void max(float * logits, std::vector & output_tokens); diff --git a/otherarch/ttscpp/src/t5_encoder_model.cpp b/otherarch/ttscpp/src/ttst5_encoder_model.cpp similarity index 99% rename from otherarch/ttscpp/src/t5_encoder_model.cpp rename to otherarch/ttscpp/src/ttst5_encoder_model.cpp index 2dbc7614d..cce9afdf7 100644 --- a/otherarch/ttscpp/src/t5_encoder_model.cpp +++ b/otherarch/ttscpp/src/ttst5_encoder_model.cpp @@ -1,4 +1,4 @@ -#include "t5_encoder_model.h" +#include "ttst5_encoder_model.h" static const std::map T5_TENSOR_GGUF_LOOKUP = { {"t5encoder.token_embd", T5_EMBD}, @@ -139,7 +139,7 @@ void t5_encoder::prep_constants(gguf_context * meta) { int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id"); if (bos_token_id_key != -1) { bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); - } + } int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id"); if (eos_token_id_key != -1) { @@ -219,7 +219,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) { struct ggml_tensor * cur; struct ggml_tensor * inpL; - + //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); //ggml_set_input(t5ctx->positions); @@ -233,7 +233,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) { struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch); struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias); - + for (int l = 0; l < model->n_layers; l++) { struct ggml_tensor * residual = inpL; @@ -293,7 +293,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) { ggml_build_forward_expand(gf, cur); free_build(); - + return gf; } @@ -312,7 +312,7 @@ void t5_runner::set_inputs(t5_ubatch & batch) { for (int ii = 0; ii < batch.n_tokens; ii++) { int ab_rpos = abs(i - ii); int rpos = i - ii; - attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f; + attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f; pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact)))); } } @@ -324,10 +324,10 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt batch.input_tokens = input_tokens; batch.n_tokens = sequence_length; ggml_backend_sched_reset(t5ctx->sched); - + const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0; const size_t new_size = model->max_context_length * model->output_size * sizeof(float); - + if (!t5ctx->buf_output || prev_size < new_size) { if (t5ctx->buf_output) { ggml_backend_buffer_free(t5ctx->buf_output); @@ -337,7 +337,7 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size); } - + outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output); ggml_backend_buffer_clear(t5ctx->buf_output, 0); struct ggml_cgraph * gf = NULL; diff --git a/otherarch/ttscpp/src/t5_encoder_model.h b/otherarch/ttscpp/src/ttst5_encoder_model.h similarity index 99% rename from otherarch/ttscpp/src/t5_encoder_model.h rename to otherarch/ttscpp/src/ttst5_encoder_model.h index 9a801873d..eadbf4d55 100644 --- a/otherarch/ttscpp/src/t5_encoder_model.h +++ b/otherarch/ttscpp/src/ttst5_encoder_model.h @@ -2,7 +2,7 @@ #define t5_encoder_model_h #include "tts_model.h" -#include "tokenizer.h" +#include "ttstokenizer.h" enum t5_tensor { @@ -75,14 +75,14 @@ void assign_to_t5_layer(t5_encoder * model, t5_layer & layer, std::string name, struct t5_context : runner_context { t5_context(t5_encoder * model, int n_threads): runner_context(n_threads), model(model) {}; - + struct t5_encoder * model; - + struct ggml_tensor * inp_tokens; struct ggml_tensor * positions; struct ggml_tensor * attn_mask; struct ggml_tensor * inp_pos_bucket; - + void build_schedule() { runner_context::build_schedule(model->max_nodes()); } @@ -116,7 +116,7 @@ struct t5_runner : tts_runner { void init_build() { tts_runner::init_build(&t5ctx->buf_compute_meta); } - + void prepare_post_load(); struct t5_ubatch build_worst_case_batch(); void set_inputs(t5_ubatch & batch); diff --git a/otherarch/ttscpp/src/tokenizer.cpp b/otherarch/ttscpp/src/ttstokenizer.cpp similarity index 99% rename from otherarch/ttscpp/src/tokenizer.cpp rename to otherarch/ttscpp/src/ttstokenizer.cpp index 9b870d44a..cabac7089 100644 --- a/otherarch/ttscpp/src/tokenizer.cpp +++ b/otherarch/ttscpp/src/ttstokenizer.cpp @@ -1,4 +1,4 @@ -#include "tokenizer.h" +#include "ttstokenizer.h" void token_trie::add(const std::string & gram, uint32_t token) { _add(gram, token, 0); diff --git a/otherarch/ttscpp/src/tokenizer.h b/otherarch/ttscpp/src/ttstokenizer.h similarity index 100% rename from otherarch/ttscpp/src/tokenizer.h rename to otherarch/ttscpp/src/ttstokenizer.h