#include "phonemizer.h" #ifdef ESPEAK_INSTALL /** * espeak_wrapper functions and assignments * * The espeak_wrapper is a singleton which wraps threaded calls to espeak-ng with a shared mutex */ // non-const static members must be initialized out of line espeak_wrapper* espeak_wrapper::instance{nullptr}; std::mutex espeak_wrapper::mutex; espeak_wrapper * espeak_wrapper::get_instance() { if (!instance) { instance = new espeak_wrapper; } return instance; } const espeak_VOICE ** espeak_wrapper::list_voices() { std::lock_guard lock(mutex); return espeak_ListVoices(nullptr); } espeak_ERROR espeak_wrapper::set_voice(const char * voice_code) { std::lock_guard lock(mutex); return espeak_SetVoiceByName(voice_code); } const char * espeak_wrapper::text_to_phonemes(const void ** textptr, int textmode, int phonememode) { std::lock_guard lock(mutex); return espeak_TextToPhonemes(textptr, textmode, phonememode); } void espeak_wrapper::initialize(espeak_AUDIO_OUTPUT output, int buflength, const char * path, int options) { std::lock_guard lock(mutex); if (!espeak_initialized) { espeak_initialized = true; espeak_Initialize(output, buflength, path, options); } } #endif /** * Helper functions for string parsing */ const std::unordered_set inline_combine_sets(const std::vector> sets) { std::unordered_set combined; for (auto set : sets) { combined.insert(set.begin(), set.end()); } return combined; } std::string replace(std::string target, char to_replace, char replacement) { for (int i = 0; i < target.size(); i++) { if (target[i] == to_replace) { target[i] = replacement; } } return target; } std::string to_lower(std::string word) { std::transform(word.begin(), word.end(), word.begin(), [](unsigned char c){ return std::tolower(c); }); return word; } std::string to_upper(std::string word) { std::transform(word.begin(), word.end(), word.begin(), [](unsigned char c){ return std::toupper(c); }); return word; } std::string replace_accents(std::string word) { std::string new_word; for (int i = 0; i < word.size();) { int grab = 0; while(i+grab+1 < word.size() && (word[i+grab + 1] & 0b11000000) == 0b10000000) { ++grab; } ++grab; if (grab > 1) { std::string accent = word.substr(i, grab); if (ACCENTED_A.find(accent) != std::string::npos) { new_word.push_back('a'); } else if (ACCENTED_C.find(accent) != std::string::npos) { new_word.push_back('c'); } else if (ACCENTED_E.find(accent) != std::string::npos) { new_word.push_back('e'); } else if (ACCENTED_I.find(accent) != std::string::npos) { new_word.push_back('i'); } else if (ACCENTED_N.find(accent) != std::string::npos) { new_word.push_back('n'); } else if (ACCENTED_O.find(accent) != std::string::npos) { new_word.push_back('o'); } else if (ACCENTED_U.find(accent) != std::string::npos) { new_word.push_back('u'); } else { // non accented charactes in a word string should really be possible but for the sake of keeping this function pure // just put the multibyte character back; new_word.append(accent); } } else { new_word.push_back(word[i]); } i += grab; } return new_word; } int upper_count(std::string word) { int count = 0; for (char letter : word) { if (isupper(letter)) { count += 1; } } return count; } bool is_all_upper(std::string word) { for (char letter : word) { if (!isupper(letter)) { return false; } } return true; } /* * Text condition checks */ bool is_roman_numeral(char letter) { return ROMAN_NUMERAL_CHARACTERS.find(letter) != std::string::npos; } bool can_be_roman_numeral(std::string word) { for (int i = 0; i < word.size(); i++) { if (!is_roman_numeral(word[i])) { return false; } } return true; } bool is_alphabetic(char letter) { return ALPHABET.find(letter) != std::string::npos; } bool is_numeric(char letter) { int val = (int) letter; return val >= 48 && val <= 57; } std::string parse_voice_code(std::string voice_code) { #ifdef ESPEAK_INSTALL voice_code = to_lower(voice_code); const espeak_VOICE * primary_match = nullptr; const espeak_VOICE * secondary_match = nullptr; bool search_by_lc = voice_code.size() == 2; bool search_by_lfc = !search_by_lc && voice_code.size() == 3; bool search_by_id = !search_by_lfc && voice_code.find("/") != std::string::npos; // It is common for locale's to be '_' separated rather than '-' separated. Check for both. bool search_by_lcc = !search_by_id && (voice_code.find("-") != std::string::npos || voice_code.find("_") != std::string::npos); if (search_by_id || search_by_lcc) { voice_code = replace(voice_code, '_', '-'); } const espeak_VOICE** espeak_voices = espeak_wrapper::get_instance()->list_voices(); // ideally we'd use the espeak voice scores which order voices by preference, but they are only returned when a voice_spec is passed to the list api and // the voice spec isn't compatible with partials (e.g. country codes, language family code, etc) int i = 0; while (espeak_voices[i] != nullptr) { auto identifier_parts = split(espeak_voices[i]->identifier, "/"); // it is possible to add languages to espeak-ng without following their identifier pattern, if we run into such a language just try to match against // the identifier and otherwise continue; if (identifier_parts.size() == 1) { if (voice_code == identifier_parts[0] || voice_code == espeak_voices[i]->name) { primary_match = espeak_voices[i]; } else { continue; } } if (search_by_lc) { std::string language_part = identifier_parts[1]; if (language_part == voice_code) { primary_match = espeak_voices[i]; break; // if we have an exact match then we can exit } else if (has_prefix(language_part, voice_code) && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { // prefer the smaller codes as longer codes typically refer to more specific locales primary_match = espeak_voices[i] ; } else { auto subparts = split(language_part, "-"); if (subparts.size() > 1 && to_lower(subparts[1]) == voice_code && (!secondary_match || strlen(secondary_match->identifier) > strlen(espeak_voices[i]->identifier))) { // country codes are typically capitalized in espeak-ng secondary_match = espeak_voices[i]; } } } else if (search_by_lfc) { // espeak-ng uses language family codes in their identifiers, but also uses ISO 639-3 language codes for some languages. // Since language codes are more specific attempt to match against the language code as the primary and match against the language family // code as the secondary. if (has_prefix(identifier_parts[1], voice_code) && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { primary_match = espeak_voices[i]; } else if (identifier_parts[0] == voice_code && (!secondary_match || strlen(secondary_match->identifier) > strlen(espeak_voices[i]->identifier))) { secondary_match = espeak_voices[i]; } } else if (search_by_id && has_prefix(to_lower(espeak_voices[i]->identifier), voice_code) && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { primary_match = espeak_voices[i]; } else if (search_by_lcc && has_prefix(to_lower(identifier_parts[1]), voice_code) && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { primary_match = espeak_voices[i]; } else if (to_lower(espeak_voices[i]->name).find(voice_code) != std::string::npos && (!primary_match || strlen(primary_match->identifier) > strlen(espeak_voices[i]->identifier))) { primary_match = espeak_voices[i]; } i++; } if (!primary_match && !secondary_match) { TTS_ABORT("Failed to match espeak voice code '%s' to known espeak voices.\n", voice_code.c_str()); } if (!primary_match) { primary_match = secondary_match; } fprintf(stdout, "Passed Espeak Voice Code '%s' doesn't directly match any known Espeak Voice IDs. Nearest match with name '%s' and id '%s' will be used instead.\n", voice_code.c_str(), primary_match->name, primary_match->identifier); return std::string(primary_match->identifier); #else TTS_ABORT("Attempted to list voices without espeak-ng installed."); #endif } void update_voice(std::string voice_code) { #ifdef ESPEAK_INSTALL espeak_ERROR e = espeak_wrapper::get_instance()->set_voice(voice_code.c_str()); if (e != EE_OK) { voice_code = parse_voice_code(voice_code); espeak_wrapper::get_instance()->set_voice(voice_code.c_str()); } #else TTS_ABORT("Attempted to set voice without espeak-ng installed."); #endif } void conditions::reset_for_clause_end() { hyphenated = false; was_punctuated_acronym = false; beginning_of_clause = true; was_number = false; } void conditions::reset_for_space() { hyphenated = false; was_punctuated_acronym = false; was_word = false; } void conditions::update_for_word(std::string word, bool allow_for_upper_check) { if (allow_for_upper_check && !is_all_upper(word)) { was_all_capitalized = false; } was_word = true; beginning_of_clause = false; hyphenated = false; was_number = false; } std::string corpus::next(int count) { if (location == size || count == 0) { return ""; } int final_loc = location; int grabbed = 0; while(grabbed < count && final_loc < size) { while(final_loc + 1 < size && (text[final_loc+1] & 0b11000000) == 0b10000000) { ++final_loc; } ++final_loc; ++grabbed; } return std::string(text+location, text+final_loc); } std::string corpus::last(int count) { if (location == 0 || count == 0) { return ""; } int final_loc = location - 1; int grabbed = 0; while(grabbed < count && final_loc > 0) { while((text[final_loc] & 0b11000000) == 0b10000000) { --final_loc; } ++grabbed; } return std::string(text+final_loc, text+location-1); } std::string corpus::pop(int count) { std::string ret = next(count); location += ret.size(); return ret; } std::string corpus::after(int aftr, int count) { size_t new_loc = location + aftr; if (new_loc >= size || count == 0) { return ""; } int final_loc = new_loc; int grabbed = 0; while(grabbed < count && final_loc < size) { while(final_loc+1 < size && (text[final_loc+1] & 0b11000000) == 0b10000000) { ++final_loc; } ++final_loc; ++grabbed; } return std::string(text+new_loc, text+final_loc); } std::string corpus::size_pop(size_t pop_size) { size_t tsize = std::min(pop_size, size - location); std::string ret = std::string(text+location, text+location+tsize); location += tsize; return ret; } std::string corpus::next_in(std::string val, bool* has_accent) { int n = 0; int running = 0; std::string nafter = next(); while (nafter != "" && val.find(nafter) != std::string::npos) { if (has_accent && !(*has_accent) && COMMON_ACCENTED_CHARACTERS.find(nafter) != std::string::npos) { *has_accent = true; } ++n; running += nafter.size(); nafter = after(running); } return next(n); } std::string corpus::pop_in(std::string val) { int n = 0; size_t running = 0; std::string nafter = next(); running += nafter.size(); while (nafter != "" && val.find(nafter) != std::string::npos) { ++n; nafter = after(running); running += nafter.size(); } return pop(n); } std::string corpus::after_until(int aftr, std::string val) { int n = 0; std::string nafter = after(aftr); while (nafter != "" && val.find(nafter) != std::string::npos) { ++n; nafter = after(n); } return after(aftr, n); } std::string phonemizer_rule::lookup_rule(std::vector & keys, int index) { if (index >= keys.size()) { return value; } std::string found_key = keys[index]; bool found_match = false; for (const auto& pair : rules) { if (pair.first == found_key) { found_match = true; break; } else if (pair.first[0] == '*' && has_suffix(found_key, pair.first.substr(1))) { found_match = true; found_key = pair.first; break; } else if (pair.first.back() == '*' && has_prefix(found_key, pair.first.substr(0, pair.first.size()-1))) { found_match = true; found_key = pair.first; break; } } if (found_match) { return rules.at(found_key)->lookup_rule(keys, index + 1); } else { return value; } } std::string word_phonemizer::lookup_rule(std::string word, std::string current, std::string before, std::string after) { if (rules.find(current) == rules.end()) { return ""; } std::vector lookup_keys = {before, after, word}; return rules[current]->lookup_rule(lookup_keys, 0); } void word_phonemizer::add_rule(std::vector keys, std::string phoneme) { phonemizer_rule * current_rule = nullptr; for (int i = 0; i < keys.size(); i++) { if (current_rule) { if (current_rule->rules.find(keys[i]) == current_rule->rules.end()) { phonemizer_rule * nrule = new phonemizer_rule; current_rule->rules[keys[i]] = nrule; current_rule = nrule; } else { current_rule = current_rule->rules.at(keys[i]); } } else { if (rules.find(keys[i]) == rules.end()) { current_rule = new phonemizer_rule; rules[keys[i]] = current_rule; } else { current_rule = rules.at(keys[i]); } } } if (current_rule) { current_rule->value = phoneme; } } std::string word_phonemizer::phonemize(std::string word) { std::vector graphemes; word = to_lower(word); tokenizer->token_split(word, graphemes); std::string phoneme = ""; for (int i = 0; i < graphemes.size(); i++) { std::string before = i > 0 ? graphemes[i-1] : "^"; std::string after = i + 1 < graphemes.size() ? graphemes[i+1] : "$"; std::string current = graphemes[i]; phoneme += lookup_rule(word, current, before, after); } return phoneme; } std::string build_subthousand_phoneme(int value) { int hundreds = value / 100; std::string phoneme = hundreds > 0 ? NUMBER_PHONEMES[hundreds] + " " + HUNDRED_PHONEME : ""; value = value % 100; if (value > 0 && value < 20) { phoneme += NUMBER_PHONEMES[value]; } else if (value > 0) { phoneme += SUB_HUNDRED_NUMBERS[(value / 10) - 2]; value = value % 10; if (value > 0) { phoneme += " " + NUMBER_PHONEMES[value]; } } return phoneme; } std::string build_number_phoneme(long long int remainder) { std::string phoneme = ""; bool started = false; if (remainder > TRILLION) { long long int trillions = (long long int) remainder / TRILLION; phoneme += build_subthousand_phoneme(trillions) + " " + TRILLION_PHONEME; remainder = (long long int) remainder % TRILLION; if (remainder > 0) { phoneme += ","; } started = true; } if (remainder > BILLION) { long long int billions = (long long int) remainder / BILLION; remainder = (long long int) remainder % BILLION; std::string billion_part = build_subthousand_phoneme(billions) + " " + BILLION_PHONEME; if (!started) { phoneme += remainder > 0 ? billion_part + "," : billion_part; } else if (remainder == 0) { phoneme += " " + billion_part; } else { phoneme += " " + billion_part + ","; } started = true; } if (remainder > MILLION) { long long int millions = (long long int) remainder / MILLION; remainder = (long long int) remainder % MILLION; std::string million_part = build_subthousand_phoneme(millions) + " " + MILLION_PHONEME; if (!started) { phoneme += remainder > 0 ? million_part + "," : million_part; } else if (remainder == 0) { phoneme += " " + million_part; } else { phoneme += " " + million_part + ","; } started = true; } if (remainder > 1000) { long long int thousands = (long long int) remainder / 1000; remainder = (long long int) remainder % 1000; std::string thousand_part = build_subthousand_phoneme(thousands) + " " + THOUSAND_PHONEME; if (!started) { phoneme += remainder > 0 ? thousand_part + "," : thousand_part; } else if (remainder == 0) { phoneme += " " + thousand_part; } else { phoneme += " " + thousand_part + ","; } started = true; } if (remainder > 0) { if (started) { phoneme += " " + build_subthousand_phoneme(remainder); } else { phoneme += build_subthousand_phoneme(remainder); } } return phoneme; } bool dictionary_response::is_successful() { return code < 200; } bool dictionary_response::is_match(corpus* text, conditions* flags) { if (not_at_clause_end) { std::string chunk = text->next_in(NON_CLAUSE_WORD_CHARACTERS); std::string after = text->after(chunk.size()); if (after == "!" || after == "." || after == "?") { return false; } } return text->next(after_match.size()) == after_match && (!expects_to_be_proceeded_by_number || flags->was_number) && (!not_at_clause_start || !flags->beginning_of_clause); } dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string value, conditions* flags) { if (lookup_map.find(value) == lookup_map.end()) { return not_found_response; } std::vector possibilities = lookup_map.at(value); for (auto possible : possibilities) { if (possible->code == SUCCESS_TOTAL || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) { return possible; } } return phonetic_fallback_response; } bool phonemizer::handle_space(corpus* text, std::string* output, conditions* flags) { flags->reset_for_space(); text->pop_in(" \n\f\t"); if (output->back() != ' ') { output->append(" "); } return true; } void phonemizer::append_numeric_series(std::string series, std::string* output, conditions * flags) { if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { output->append(" "); } for (int i = 0; i < series.size(); i++) { int numeral = series[i] - '0'; output->append(NUMBER_PHONEMES[numeral]); if (i + 1 < series.size()) { output->append(" "); } } if (series.size() > 0) { flags->update_for_word(series); flags->was_number = true; } } bool phonemizer::handle_numeric_series(corpus* text, std::string* output, conditions* flags) { std::string series = text->pop_in(NUMBER_CHARACTERS); append_numeric_series(series, output, flags); return true; } bool phonemizer::handle_numeric(corpus* text, std::string* output, conditions* flags) { /* * There are four recognized ways of separating large arabic numerals: * 1. No breaks or seperations exception for the decimal (e.g. '32000.012' or '32000,012') * 2. Space separated breaks between every three digits and comma separated decimals (e.g. '32 000,012') * 3. Period separated breaks between every three digits and comma separated decimals (e.g. '32.000,012') * 4. Comma separated breaks between every three digits and period separated decimals (e.g. '32,000.012') * * This implementation will support all three approaches up to the trillions, after which numbers will be represented as a series * of distinct digits. Non conforming patterns, e.g. multiple commas, multiple periods, or multiple spaces that are not three * digits apart, will not be treated as continuous numbers but rather separate numerical strings. */ std::string number = text->next_in(COMPATIBLE_NUMERICS); number = strip(number, ",. "); // For numerics, we don't necessarily want to stop reading from the corpus at periods, commas, or spaces. char large_number_separator = '\0'; char decimal_separator = '\0'; char last_break_char = '\0'; bool invalid_format = false; int count_since_break = 0; std::string built = ""; for (char & c : number) { if (is_numeric(c)) { built += c; count_since_break += 1; } else if (last_break_char =='\0') { if (count_since_break > 3) { decimal_separator = c; } last_break_char = c; built += c; count_since_break = 0; } else if (c != last_break_char) { if (c == ' ') { break; } else if (count_since_break == 3 && decimal_separator == '\0') { if (large_number_separator == '\0') { large_number_separator = last_break_char; } decimal_separator = c; built += c; count_since_break = 0; last_break_char = c; } else if (count_since_break != 3) { if (large_number_separator != '\0') { invalid_format = true; } break; } else { break; } } else if (c == last_break_char) { if (decimal_separator != '\0') { break; } else if (count_since_break != 3) { invalid_format = true; break; } else { large_number_separator = c; built += c; count_since_break = 0; } } } if (!invalid_format) { if (large_number_separator != '\0' && decimal_separator == '\0' && count_since_break != 3) { invalid_format = true; } else if (count_since_break == 3 && last_break_char != '\0' && decimal_separator == '\0' && large_number_separator == '\0') { large_number_separator = last_break_char; } else if (count_since_break != 3 && last_break_char != '\0' && decimal_separator == '\0' && large_number_separator == '\0') { decimal_separator = last_break_char; } } if (invalid_format) { return handle_numeric_series(text, output, flags); } if (large_number_separator != '\0') { built.erase(std::remove(built.begin(), built.end(), large_number_separator), built.end()); } if (decimal_separator == ',') { replace(built, decimal_separator, '.'); } long long int value = std::stoll(built); if (value >= LARGEST_PRONOUNCABLE_NUMBER) { return handle_numeric_series(text, output, flags); } text->size_pop(built.size()); std::string noutput = build_number_phoneme(value); if (noutput.size() > 0) { if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { output->append(" "); } output->append(noutput); flags->update_for_word(built); flags->was_number = true; } if (decimal_separator != '\0') { std::vector parts = split(built, decimal_separator); if (parts[1].size() > 0) { output->append(" " + POINT_PHONEME + " "); append_numeric_series(parts[1], output, flags); } } return true; } bool phonemizer::is_acronym_like(corpus* text, std::string word, conditions* flags) { if (word.find(".") != std::string::npos) { for (std::string part : split(word, ".")) { if (part.size() == 0) { return false; } if (part.size() > 1) { if (part.size() > 2 || !(isupper(part[0]) && islower(part[1]))) { return false; } } } return true; } else if (word.size() < 4) { return small_english_words.find(to_lower(word)) == small_english_words.end(); } else if (is_all_upper(word)) { if (flags->was_all_capitalized || is_all_upper(text->after_until(word.size()+1, " "))) { flags->was_all_capitalized = true; return false; } return true; } else if (!is_all_upper(word) && upper_count(word) > (int) word.length() / 2) { return true; } return false; } bool phonemizer::handle_roman_numeral(corpus* text, std::string* output, conditions * flags) { auto next = text->next(); next = to_lower(next); int total = 0; int last_value = 0; std::string running = ""; while (is_roman_numeral(next[0])) { bool found = false; for (int size = 4; size > 0; size--) { std::string chunk = text->after(running.size(), size); chunk = to_lower(chunk); if (ROMAN_NUMERALS.find(chunk) != ROMAN_NUMERALS.end()) { found = true; int found_value = ROMAN_NUMERALS.at(chunk); if (total == 0 || last_value > found_value) { total += found_value; last_value = found_value; running += chunk; } else { return false; } } } if (found) { next = text->after(running.size()); to_lower(next); continue; } return false; } std::string noutput = build_number_phoneme(total); if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { output->append(" "); } output->append(noutput); text->size_pop(running.size()); flags->update_for_word(running, false); flags->was_number = true; return true; } bool phonemizer::handle_acronym(corpus* text, std::string word, std::string* output, conditions * flags) { std::string out = ""; for (int i = 0; i < word.size(); i++) { try { if (word[i] == '.') { flags->was_punctuated_acronym = true; continue; } char letter = std::tolower(word[i]); out += LETTER_PHONEMES.at(letter); } catch (const std::out_of_range& e) { continue; } } text->size_pop(word.size()); if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { output->append(" "); } output->append(out); flags->update_for_word(word, false); return true; } bool phonemizer::handle_phonetic(corpus* text, std::string word, std::string* output, conditions* flags, size_t unaccented_size_difference) { if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { output->append(" "); } output->append(phonetic_phonemizer->phonemize(word)); text->size_pop(word.size()+unaccented_size_difference); flags->update_for_word(word); return true; } static std::unordered_map kokoro_ipa_map; void populate_kokoro_ipa_map(std::string executable_path) { std::string line; auto filepath = executable_path + "kokoro_ipa.embd"; printf("\nReading Kokoro IPA from %s",filepath.c_str()); std::ifstream myfile(filepath); if (myfile.is_open()) { while (myfile.good()) { getline(myfile, line); auto parts = split(line, ","); if(parts.size()==2) { kokoro_ipa_map[parts[0]] = parts[1]; } else { printf("\nError reading line in Kokoro IPA!"); } } myfile.close(); printf("\nPopulated Kokoro IPA: %d entries", kokoro_ipa_map.size()); } else { printf("\nUnable to open Kokoro IPA file"); } } std::string found_word_to_ipa(std::string input) { bool is_acronym = !input.empty() && std::all_of(input.begin(), input.end(), [](unsigned char c) { return std::isupper(c); }); if (is_acronym) { return ""; // Return empty for acronyms } // Convert input to lowercase std::transform(input.begin(), input.end(), input.begin(), [](unsigned char c) { return std::tolower(c); }); auto it = kokoro_ipa_map.find(input); if (it != kokoro_ipa_map.end()) { return it->second; // found } return ""; } bool phonemizer::process_word(corpus* text, std::string* output, std::string word, conditions* flags, bool has_accent) { dictionary_response* response; size_t unaccented_size_difference = 0; std::string foundstr = found_word_to_ipa(word); if(foundstr!="") { output->append(foundstr); text->size_pop(word.size()); return true; } if (has_accent) { response = dict->lookup(text, word, flags); if (!response->is_successful()) { unaccented_size_difference = word.size(); word = replace_accents(word); unaccented_size_difference -= word.size(); response = dict->lookup(text, word, flags); } } else { response = dict->lookup(text, word, flags); } //printf("\nSUCCESS: %d, word:%s, result:%s\n",response->is_successful(),word.c_str(),response->value.c_str()); if (response->is_successful()) { if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { output->append(" "); } flags->update_for_word(word); if (response->code != SUCCESS_TOTAL) { word += response->after_match; output->append(response->value); text->size_pop(word.size()+unaccented_size_difference); return true; } else { output->append(response->value); text->size_pop(word.size()+unaccented_size_difference); return true; } } else if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) { return true; } else if (is_acronym_like(text, word, flags)) { return handle_acronym(text, word, output, flags); } else if (word.find(".") < word.length()) { bool part_has_accent = false; std::string word_part = text->next_in(ALPHABET+COMMON_ACCENTED_CHARACTERS, &part_has_accent); process_word(text, output, word_part, flags, part_has_accent); handle_punctuation(text, ".", output, flags); output->append(" "); flags->reset_for_space(); return true; } else { return handle_phonetic(text, word, output, flags, unaccented_size_difference); } return true; } bool phonemizer::handle_word(corpus * text, std::string* output, conditions * flags) { bool has_accent = false; std::string word = text->next_in(WORD_CHARACTERS, &has_accent); while (word.size() > 0 && word.back() == '.') { word = word.substr(0,word.size()-1); } return process_word(text, output, word, flags, has_accent); } bool phonemizer::handle_replacement(corpus* text, std::string next, std::string* output, conditions * flags) { if (flags->was_word && output->back() != ' ' && !flags->hyphenated) { output->append(" "); } output->append(REPLACEABLE.at(next)); flags->update_for_word(next); text->pop(); return true; } bool phonemizer::handle_possession_plural(corpus* text, std::string* output, conditions * flags) { if (text->next(2) == "'s") { std::string last = text->last(); if (VOWELS.find(to_lower(last)[0]) != std::string::npos) { output->append("z"); } else if (last == "s" || last == "z") { output->append("ᵻz"); } else if (is_alphabetic(last[0])) { output->append("s"); } else { output->append("ˈɛs"); } text->pop(2); } else { text->pop(); } return true; } bool phonemizer::handle_contraction(corpus* text, std::string* output, conditions * flags) { text->pop(); std::string next = text->next_in(ALPHABET); next = to_lower(next); try { output->append(CONTRACTION_PHONEMES.at(next)); } catch (const std::out_of_range& e) { // in the situation that we cannt find a contraction then we just want to pop the ' character and continue // it could be the end of a single quote which is ignored by the espeak phonemizer. return true; } // make sure to pop the contraction. text->pop_in(ALPHABET); return true; } bool phonemizer::handle_punctuation(corpus* text, std::string next, std::string* output, conditions * flags) { std::string last = text->last(); std::string after = text->after(); if (next[0] == '.') { if (flags->was_punctuated_acronym) { // we finished an acronym flags->was_punctuated_acronym = false; output->append(next); text->pop(); if (text->after(1, 2) == "'s") { return handle_possession_plural(text, output, flags); } return true; } std::string chunk = text->next_in("."); /*if (chunk.size() > 1) { flags->pre_pause += 4; }*/ output->append(chunk); text->size_pop(chunk.size()); return true; } else if (next == "'") { if (flags->was_word && (after == "s" || !is_alphabetic(after[0]))) { return handle_possession_plural(text, output, flags); } else if (flags->was_word && (CONTRACTION_PHONEMES.find(after) != CONTRACTION_PHONEMES.end() || CONTRACTION_PHONEMES.find(text->after(next.size(), 2)) != CONTRACTION_PHONEMES.end())) { return handle_contraction(text, output, flags); } else { // could be the end or start of a quote text->pop(); return true; } } else if (next[0] == '-') { if (last == " " && after == " ") { //flags->pre_pause += 4; text->pop(2); flags->reset_for_space(); return true; } else if (after[0] == '-') { //flags->pre_pause += 4; text->pop(2); output->append(" "); flags->reset_for_space(); return true; } else if (!flags->beginning_of_clause && flags->was_word && is_alphabetic(after[0])) { flags->hyphenated = true; text->pop(); return true; } else { // ignore it text->pop(); return true; } } else if (CLAUSE_BREAKS.find(next) != std::string::npos) { output->append(next); flags->reset_for_clause_end(); text->pop(); return true; } else if (NOOP_BREAKS.find(next) != std::string::npos) { output->append(next); text->pop(); return true; } else if (REPLACEABLE.find(next) != REPLACEABLE.end()) { return handle_replacement(text, next, output, flags); } else { // ignore it text->pop(); return true; } } bool phonemizer::route(corpus * text, std::string* output, conditions * flags) { std::string next = text->next(); if (next == "") { // we finished lexing the corpus return false; } if (SPACE_CHARACTERS.find(next) != std::string::npos) { return handle_space(text, output, flags); } else if (is_numeric(next[0])) { return handle_numeric(text, output, flags); } else if (is_alphabetic(next[0])) { return handle_word(text, output, flags); } else { return handle_punctuation(text, next, output, flags); } } #ifdef ESPEAK_INSTALL std::string phonemizer::espeak_text_to_phonemes(const char * text) { int mode = phoneme_mode == IPA ? (0 << 8 | 0x02) : (0 << 8 | 0x01); const void ** txt_ptr = (const void**)&text; const char * resp = espeak_wrapper::get_instance()->text_to_phonemes(txt_ptr, espeakCHARS_UTF8, mode); return strip(std::string(resp)); } #endif std::string phonemizer::text_to_phonemes(const char * text, size_t size) { std::string output = ""; if (mode == ESPEAK) { #ifdef ESPEAK_INSTALL auto parts = split(text, STOPPING_TOKENS, true); std::string phonemes = ""; for (int i = 0; i < parts.size(); i+=2) { phonemes += espeak_text_to_phonemes(parts[i].c_str()); if (preserve_punctuation && i + 1 < parts.size()) { phonemes += parts[i+1]; } } return phonemes; #else TTS_ABORT("%s attempted to run in espeak mode without espeak installed. \n", __func__); #endif } else { text_to_phonemes(text, size, &output); } return output; } std::string phonemizer::text_to_phonemes(std::string text) { return text_to_phonemes(text.c_str(), text.size()); } void phonemizer::text_to_phonemes(const char * text, size_t size, std::string* output) { if (mode == ESPEAK) { #ifdef ESPEAK_INSTALL TTS_ABORT("%s attempted to run in espeak mode with output already defined. \n", __func__); #else TTS_ABORT("%s attempted to run in espeak mode without espeak installed. \n", __func__); #endif return; } corpus * corpus_text = new corpus(text, size); conditions * flags = new conditions; bool running = true; while (running) { running = route(corpus_text, output, flags); } delete corpus_text; delete flags; } void phonemizer::text_to_phonemes(std::string text, std::string* output) { text_to_phonemes(text.c_str(), text.size(), output); } struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta) { struct single_pass_tokenizer * tokenizer = single_pass_tokenizer_from_gguf(meta); word_phonemizer * wph = new word_phonemizer(tokenizer); int rule_keys_key = gguf_find_key(meta, "phonemizer.rules.keys"); int phoneme_key = gguf_find_key(meta, "phonemizer.rules.phonemes"); if (rule_keys_key == -1 || phoneme_key == -1) { TTS_ABORT("Both 'phonemizer.rules.keys' and 'phonemizer.rules.phonemes' keys must be set in order to support phonemization."); } int key_count = gguf_get_arr_n(meta, rule_keys_key); assert(key_count == gguf_get_arr_n(meta, phoneme_key)); for (int i = 0; i < key_count; i++) { std::string rule_key = gguf_get_arr_str(meta, rule_keys_key, i); std::string phoneme = gguf_get_arr_str(meta, phoneme_key, i); wph->add_rule(split(rule_key, "."), phoneme); } return wph; } dictionary_response * response_from_string(std::string value, std::string key) { std::vector parts = split(value, ":"); bool has_spacing = parts.size() > 1; bool expects_to_be_proceeded_by_number = key[0] == '$'; bool not_at_start = key[0] == '#'; bool not_at_end = key.back() == '#'; if (!has_spacing) { dictionary_response * resp = new dictionary_response(SUCCESS_TOTAL, value); resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number; resp->not_at_clause_end = not_at_end; resp->not_at_clause_start = not_at_start; return resp; } else { dictionary_response * resp = new dictionary_response(SUCCESS_PARTIAL, parts[0]); resp->after_match = parts[1]; resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number; resp->not_at_clause_end = not_at_end; resp->not_at_clause_start = not_at_start; return resp; } } struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta) { struct phoneme_dictionary * dict = new phoneme_dictionary; int keys_key = gguf_find_key(meta, "phonemizer.dictionary.keys"); int values_key = gguf_find_key(meta, "phonemizer.dictionary.values"); if (keys_key == -1 || values_key == -1) { TTS_ABORT("Both 'phonemizer.dictionary.keys' and 'phonemizer.dictionary.values' keys must be set in order to support phonemization."); } int key_count = gguf_get_arr_n(meta, keys_key); assert(key_count == gguf_get_arr_n(meta, values_key)); for (int i = 0; i < key_count; i++) { std::string key = gguf_get_arr_str(meta, keys_key, i); std::string values = gguf_get_arr_str(meta, values_key, i); std::vector out; for (std::string val : split(values, ",")) { out.push_back(response_from_string(val, key)); } if (key[0] == '$' || key[0] == '#') { key = key.substr(1); } if (key.back() == '#') { key = key.substr(0, key.size() - 1); } dict->lookup_map[key] = out; } return dict; } struct phonemizer * phonemizer_from_gguf(gguf_context * meta, const std::string espeak_voice_code) { int mode_key = gguf_find_key(meta, "phonemizer.type"); phonemizer * ph; if (mode_key == -1) { TTS_ABORT("Key 'phonemizer.type' must be specified in gguf file for all models using a phonemizer."); } uint32_t ph_type = gguf_get_val_u32(meta, mode_key); if ((phonemizer_type) ph_type == ESPEAK) { #ifdef ESPEAK_INSTALL espeak_wrapper::get_instance()->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, ESPEAK_DATA_PATH, 0); update_voice(espeak_voice_code); ph = new phonemizer(nullptr, nullptr); ph->mode = ESPEAK; #else TTS_ABORT("%s attempted to load an espeak phonemizer without espeak installed. \n", __func__); #endif int phoneme_type_key = gguf_find_key(meta, "phonemizer.phoneme_type"); if (phoneme_type_key != -1) { uint32_t phoneme_typing = gguf_get_val_u32(meta, mode_key); if ((phoneme_type)phoneme_typing == ESPEAK_PHONEMES) { ph->phoneme_mode = ESPEAK_PHONEMES; } } return ph; } struct word_phonemizer * phonetic_ph = word_phonemizer_from_gguf(meta); struct phoneme_dictionary * dict = phoneme_dictionary_from_gguf(meta); ph = new phonemizer(dict, phonetic_ph); return ph; } struct phonemizer * espeak_phonemizer(bool use_espeak_phonemes, std::string espeak_voice_code) { #ifdef ESPEAK_INSTALL espeak_wrapper::get_instance()->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, ESPEAK_DATA_PATH, 0); update_voice(espeak_voice_code); phonemizer * ph = new phonemizer(nullptr, nullptr); ph->mode = ESPEAK; if (use_espeak_phonemes) { ph->phoneme_mode = ESPEAK_PHONEMES; } return ph; #else TTS_ABORT("%s attempted to load an espeak phonemizer without espeak installed. \n", __func__); #endif } struct phonemizer * phonemizer_from_file(const std::string fname, const std::string espeak_voice_code) { ggml_context * weight_ctx = NULL; struct gguf_init_params params = { /*.no_alloc =*/ false, /*.ctx =*/ &weight_ctx, }; gguf_context * meta_ctx = gguf_init_from_file(fname.c_str(), params); if (!meta_ctx) { TTS_ABORT("%s failed for file %s\n", __func__, fname.c_str()); } return phonemizer_from_gguf(meta_ctx, espeak_voice_code); }