koboldcpp/otherarch/ttscpp/src/phonemizer.cpp

1239 lines
39 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "phonemizer.h"
#ifdef ESPEAK_INSTALL
/**
* espeak_wrapper functions and assignments
*
* The espeak_wrapper is a singleton which wraps threaded calls to espeak-ng with a shared mutex
*/
// non-const static members must be initialized out of line
// `instance` starts out null; get_instance() allocates it on first use.
espeak_wrapper* espeak_wrapper::instance{nullptr};
// `mutex` is the lock taken by the espeak_wrapper members that call into espeak-ng.
std::mutex espeak_wrapper::mutex;
/**
 * Returns the shared espeak_wrapper, allocating it the first time it is requested.
 *
 * NOTE: the allocation itself is not guarded by `mutex`; only the wrapped espeak calls are.
 */
espeak_wrapper * espeak_wrapper::get_instance() {
    if (instance != nullptr) {
        return instance;
    }
    instance = new espeak_wrapper;
    return instance;
}
/**
 * Calls espeak_ListVoices with no voice spec while holding the shared mutex.
 *
 * @return the null-terminated voice array returned by espeak-ng.
 */
const espeak_VOICE ** espeak_wrapper::list_voices() {
    std::lock_guard<std::mutex> guard(mutex);
    const espeak_VOICE ** voices = espeak_ListVoices(nullptr);
    return voices;
}
/**
 * Calls espeak_SetVoiceByName while holding the shared mutex.
 *
 * @param voice_code the voice name handed to espeak-ng.
 * @return the status code reported by espeak-ng.
 */
espeak_ERROR espeak_wrapper::set_voice(const char * voice_code) {
    std::lock_guard<std::mutex> guard(mutex);
    const espeak_ERROR status = espeak_SetVoiceByName(voice_code);
    return status;
}
/**
 * Calls espeak_TextToPhonemes while holding the shared mutex.
 *
 * All three arguments are forwarded untouched; the returned pointer is whatever espeak-ng returns.
 */
const char * espeak_wrapper::text_to_phonemes(const void ** textptr, int textmode, int phonememode) {
    std::lock_guard<std::mutex> guard(mutex);
    const char * phonemes = espeak_TextToPhonemes(textptr, textmode, phonememode);
    return phonemes;
}
/**
 * Initializes espeak-ng at most once per wrapper.
 *
 * The `espeak_initialized` flag is checked and set under the shared mutex, so later calls
 * return without touching espeak-ng. The arguments are forwarded to espeak_Initialize.
 */
void espeak_wrapper::initialize(espeak_AUDIO_OUTPUT output, int buflength, const char * path, int options) {
    std::lock_guard<std::mutex> guard(mutex);
    if (espeak_initialized) {
        return;
    }
    // the flag is raised before the call, exactly once
    espeak_initialized = true;
    espeak_Initialize(output, buflength, path, options);
}
#endif
/**
* Helper functions for string parsing
*/
/**
 * Builds the union of several string sets.
 *
 * @param sets the sets to merge; may be empty.
 * @return a set holding every string that appears in at least one input set.
 */
const std::unordered_set<std::string> inline_combine_sets(const std::vector<std::unordered_set<std::string>> sets) {
    std::unordered_set<std::string> combined;
    // iterate by reference: copying each set just to read it is wasted work
    for (const auto & set : sets) {
        combined.insert(set.begin(), set.end());
    }
    return combined;
}
/**
 * Returns a copy of `target` in which every occurrence of `to_replace` is swapped for `replacement`.
 *
 * The argument is taken by value, so the caller's string is never modified; the result must be
 * taken from the return value.
 */
std::string replace(std::string target, char to_replace, char replacement) {
    // range-for avoids comparing a signed index against the unsigned size()
    for (char & current : target) {
        if (current == to_replace) {
            current = replacement;
        }
    }
    return target;
}
/**
 * Returns `word` with every byte passed through std::tolower.
 * Each byte is converted to unsigned char before the call.
 */
std::string to_lower(std::string word) {
    for (char & letter : word) {
        const unsigned char raw = letter;
        letter = static_cast<char>(std::tolower(raw));
    }
    return word;
}
/**
 * Returns `word` with every byte passed through std::toupper.
 * Each byte is converted to unsigned char before the call.
 */
std::string to_upper(std::string word) {
    for (char & letter : word) {
        const unsigned char raw = letter;
        letter = static_cast<char>(std::toupper(raw));
    }
    return word;
}
/**
 * Replaces known accented characters with their plain ASCII base letter.
 *
 * The word is walked one UTF-8 character at a time (a lead byte plus the continuation bytes
 * that follow it). A multi-byte character found in one of the ACCENTED_* tables becomes the
 * matching base letter; any other multi-byte character is copied back unchanged, as are
 * single-byte characters.
 */
std::string replace_accents(std::string word) {
    // maps a multi-byte character to its base letter, or '\0' when it is not in any table
    auto base_letter = [](const std::string & accent) -> char {
        if (ACCENTED_A.find(accent) != std::string::npos) { return 'a'; }
        if (ACCENTED_C.find(accent) != std::string::npos) { return 'c'; }
        if (ACCENTED_E.find(accent) != std::string::npos) { return 'e'; }
        if (ACCENTED_I.find(accent) != std::string::npos) { return 'i'; }
        if (ACCENTED_N.find(accent) != std::string::npos) { return 'n'; }
        if (ACCENTED_O.find(accent) != std::string::npos) { return 'o'; }
        if (ACCENTED_U.find(accent) != std::string::npos) { return 'u'; }
        return '\0';
    };

    std::string unaccented;
    size_t pos = 0;
    while (pos < word.size()) {
        // one lead byte plus every continuation byte (10xxxxxx) that follows it
        size_t length = 1;
        while (pos + length < word.size() && (word[pos + length] & 0b11000000) == 0b10000000) {
            ++length;
        }
        if (length == 1) {
            unaccented.push_back(word[pos]);
        } else {
            const std::string accent = word.substr(pos, length);
            const char plain = base_letter(accent);
            if (plain != '\0') {
                unaccented.push_back(plain);
            } else {
                // multi-byte characters outside the accent tables are not expected inside a word,
                // but to keep this function pure they are simply put back as-is.
                unaccented.append(accent);
            }
        }
        pos += length;
    }
    return unaccented;
}
/**
 * Counts the bytes of `word` that std::isupper classifies as uppercase.
 *
 * @return the number of uppercase bytes; 0 for an empty string.
 */
int upper_count(std::string word) {
    int count = 0;
    for (char letter : word) {
        // <cctype> functions are undefined for negative values, which plain `char` can hold
        // for UTF-8 bytes, so convert to unsigned char first.
        if (std::isupper(static_cast<unsigned char>(letter))) {
            count += 1;
        }
    }
    return count;
}
/**
 * Reports whether every byte of `word` is classified as uppercase by std::isupper.
 *
 * @return true when no byte fails the check; this means an empty string yields true.
 */
bool is_all_upper(std::string word) {
    for (char letter : word) {
        // <cctype> functions are undefined for negative values, which plain `char` can hold
        // for UTF-8 bytes, so convert to unsigned char first.
        if (!std::isupper(static_cast<unsigned char>(letter))) {
            return false;
        }
    }
    return true;
}
/*
* Text condition checks
*/
/** Reports whether `letter` occurs in ROMAN_NUMERAL_CHARACTERS. */
bool is_roman_numeral(char letter) {
    const auto position = ROMAN_NUMERAL_CHARACTERS.find(letter);
    return position != std::string::npos;
}
/**
 * Reports whether every character of `word` passes is_roman_numeral.
 * An empty word yields true because no character fails the check.
 */
bool can_be_roman_numeral(std::string word) {
    for (char letter : word) {
        if (!is_roman_numeral(letter)) {
            return false;
        }
    }
    return true;
}
/** Reports whether `letter` occurs in ALPHABET. */
bool is_alphabetic(char letter) {
    const auto position = ALPHABET.find(letter);
    return position != std::string::npos;
}
/** Reports whether `letter` is one of the ASCII digits '0' through '9' (codes 48 to 57). */
bool is_numeric(char letter) {
    return letter >= '0' && letter <= '9';
}
/**
 * Resolves a loosely specified voice code to the identifier of the closest espeak-ng voice.
 *
 * The lowercased code is classified by its shape:
 *   - 2 characters          -> language code (falling back to a country-code match)
 *   - 3 characters          -> language code (falling back to the language-family part)
 *   - contains '/'          -> prefix of a full espeak identifier
 *   - contains '-' or '_'   -> locale, matched against the second identifier part
 *   - anything else         -> substring of the voice name
 * Among several candidates the voice with the shortest identifier wins.
 *
 * Aborts via TTS_ABORT when nothing matches, or when built without espeak-ng.
 *
 * @param voice_code the code supplied by the caller.
 * @return the identifier of the chosen espeak voice.
 */
std::string parse_voice_code(std::string voice_code) {
#ifdef ESPEAK_INSTALL
    voice_code = to_lower(voice_code);
    const espeak_VOICE * primary_match = nullptr;
    const espeak_VOICE * secondary_match = nullptr;
    bool search_by_lc = voice_code.size() == 2;
    bool search_by_lfc = !search_by_lc && voice_code.size() == 3;
    bool search_by_id = !search_by_lfc && voice_code.find("/") != std::string::npos;
    // It is common for locales to be '_' separated rather than '-' separated. Check for both.
    bool search_by_lcc = !search_by_id && (voice_code.find("-") != std::string::npos || voice_code.find("_") != std::string::npos);
    if (search_by_id || search_by_lcc) {
        voice_code = replace(voice_code, '_', '-');
    }

    const espeak_VOICE** espeak_voices = espeak_wrapper::get_instance()->list_voices();
    // ideally we'd use the espeak voice scores which order voices by preference, but they are only returned when a voice_spec is passed to the list api and
    // the voice spec isn't compatible with partials (e.g. country codes, language family code, etc)
    //
    // A for loop is used so that every `continue` below still advances to the next voice.
    for (int i = 0; espeak_voices[i] != nullptr; i++) {
        const espeak_VOICE * voice = espeak_voices[i];
        auto identifier_parts = split(voice->identifier, "/");

        // true when `current` is unset or has a longer identifier than the voice being examined
        auto improves_on = [voice](const espeak_VOICE * current) {
            return !current || strlen(current->identifier) > strlen(voice->identifier);
        };

        // it is possible to add languages to espeak-ng without following their identifier pattern, if we run into such a language just try to match against
        // the identifier and otherwise continue. Such voices have no second identifier part, so they must never reach the checks below.
        if (identifier_parts.size() < 2) {
            const bool identifier_matches = !identifier_parts.empty() && voice_code == identifier_parts[0];
            if (identifier_matches || voice_code == voice->name) {
                primary_match = voice;
                break; // exact match, nothing better can be found
            }
            continue;
        }

        if (search_by_lc) {
            std::string language_part = identifier_parts[1];
            if (language_part == voice_code) {
                primary_match = voice;
                break; // if we have an exact match then we can exit
            } else if (has_prefix(language_part, voice_code) && improves_on(primary_match)) {
                // prefer the smaller codes as longer codes typically refer to more specific locales
                primary_match = voice;
            } else {
                auto subparts = split(language_part, "-");
                if (subparts.size() > 1 && to_lower(subparts[1]) == voice_code && improves_on(secondary_match)) {
                    // country codes are typically capitalized in espeak-ng
                    secondary_match = voice;
                }
            }
        } else if (search_by_lfc) {
            // espeak-ng uses language family codes in their identifiers, but also uses ISO 639-3 language codes for some languages.
            // Since language codes are more specific attempt to match against the language code as the primary and match against the language family
            // code as the secondary.
            if (has_prefix(identifier_parts[1], voice_code) && improves_on(primary_match)) {
                primary_match = voice;
            } else if (identifier_parts[0] == voice_code && improves_on(secondary_match)) {
                secondary_match = voice;
            }
        } else if (search_by_id && has_prefix(to_lower(voice->identifier), voice_code) && improves_on(primary_match)) {
            primary_match = voice;
        } else if (search_by_lcc && has_prefix(to_lower(identifier_parts[1]), voice_code) && improves_on(primary_match)) {
            primary_match = voice;
        } else if (to_lower(voice->name).find(voice_code) != std::string::npos && improves_on(primary_match)) {
            primary_match = voice;
        }
    }

    if (!primary_match && !secondary_match) {
        TTS_ABORT("Failed to match espeak voice code '%s' to known espeak voices.\n", voice_code.c_str());
    }
    if (!primary_match) {
        primary_match = secondary_match;
    }
    fprintf(stdout, "Passed Espeak Voice Code '%s' doesn't directly match any known Espeak Voice IDs. Nearest match with name '%s' and id '%s' will be used instead.\n", voice_code.c_str(), primary_match->name, primary_match->identifier);
    return std::string(primary_match->identifier);
#else
    TTS_ABORT("Attempted to list voices without espeak-ng installed.");
#endif
}
/**
 * Selects the espeak voice named by `voice_code`.
 *
 * The code is first handed to espeak as-is. If espeak does not report EE_OK, the code is
 * resolved through parse_voice_code and the result is tried instead; the status of that
 * second attempt is not inspected. Aborts via TTS_ABORT when built without espeak-ng.
 */
void update_voice(std::string voice_code) {
#ifdef ESPEAK_INSTALL
    const espeak_ERROR first_attempt = espeak_wrapper::get_instance()->set_voice(voice_code.c_str());
    if (first_attempt == EE_OK) {
        return;
    }
    voice_code = parse_voice_code(voice_code);
    espeak_wrapper::get_instance()->set_voice(voice_code.c_str());
#else
    TTS_ABORT("Attempted to set voice without espeak-ng installed.");
#endif
}
/**
 * Resets the flags when a clause ends: marks the beginning of a new clause and clears the
 * number, hyphenation and punctuated-acronym flags.
 */
void conditions::reset_for_clause_end() {
    beginning_of_clause = true;
    was_number = false;
    was_punctuated_acronym = false;
    hyphenated = false;
}
/** Resets the flags after whitespace: clears the word, hyphenation and punctuated-acronym flags. */
void conditions::reset_for_space() {
    was_word = false;
    was_punctuated_acronym = false;
    hyphenated = false;
}
/**
 * Updates the flags after a word has been emitted.
 *
 * @param word the word that was just handled.
 * @param allow_for_upper_check when true, a word that is not entirely uppercase clears
 *        `was_all_capitalized`; when false that flag is left alone.
 */
void conditions::update_for_word(std::string word, bool allow_for_upper_check) {
    if (allow_for_upper_check) {
        if (!is_all_upper(word)) {
            was_all_capitalized = false;
        }
    }
    beginning_of_clause = false;
    hyphenated = false;
    was_number = false;
    was_word = true;
}
/**
 * Peeks at the next `count` UTF-8 characters starting at the cursor, without consuming them.
 *
 * @return the bytes of up to `count` characters (fewer when the text ends first), or an empty
 *         string when the cursor is at the end or `count` is 0.
 */
std::string corpus::next(int count) {
    if (count == 0 || location == size) {
        return "";
    }
    int end = location;
    for (int taken = 0; taken < count && end < size; ++taken) {
        // swallow the continuation bytes (10xxxxxx) that belong to the current character
        while (end + 1 < size && (text[end + 1] & 0b11000000) == 0b10000000) {
            ++end;
        }
        ++end;
    }
    return std::string(text + location, text + end);
}
/**
 * Returns the `count` UTF-8 characters that immediately precede the cursor, without moving it.
 *
 * The previous implementation ended the returned range one byte short of the cursor, so the
 * character directly before the cursor was dropped (a single ASCII character always produced
 * an empty string). It also never stepped back for `count` > 1 and could scan below the start
 * of the text.
 *
 * @param count how many characters to look back.
 * @return the bytes of up to `count` characters ending at the cursor (fewer when the start of
 *         the text is reached), or an empty string when the cursor is at the start or `count`
 *         is not positive.
 */
std::string corpus::last(int count) {
    if (location == 0 || count <= 0) {
        return "";
    }
    size_t start = location;
    int grabbed = 0;
    while (grabbed < count && start > 0) {
        // step onto the final byte of the previous character...
        --start;
        // ...then back up over continuation bytes (10xxxxxx) to reach its lead byte
        while (start > 0 && (text[start] & 0b11000000) == 0b10000000) {
            --start;
        }
        ++grabbed;
    }
    return std::string(text + start, text + location);
}
/**
 * Consumes the next `count` UTF-8 characters: the cursor advances by the byte length of
 * what next(count) returns, and that string is returned.
 */
std::string corpus::pop(int count) {
    const std::string popped = next(count);
    location += popped.size();
    return popped;
}
/**
 * Peeks at `count` UTF-8 characters starting `aftr` bytes past the cursor.
 *
 * @param aftr byte offset from the cursor at which to start reading.
 * @param count number of characters to read.
 * @return the bytes read, or an empty string when the start offset is at or past the end of
 *         the text or `count` is 0.
 */
std::string corpus::after(int aftr, int count) {
    const size_t start = location + aftr;
    if (count == 0 || start >= size) {
        return "";
    }
    int end = start;
    for (int taken = 0; taken < count && end < size; ++taken) {
        // swallow the continuation bytes (10xxxxxx) that belong to the current character
        while (end + 1 < size && (text[end + 1] & 0b11000000) == 0b10000000) {
            ++end;
        }
        ++end;
    }
    return std::string(text + start, text + end);
}
/**
 * Consumes up to `pop_size` bytes (not characters) from the cursor, clamped to what remains.
 *
 * @return the bytes that were consumed.
 */
std::string corpus::size_pop(size_t pop_size) {
    const size_t taken = std::min(pop_size, size - location);
    const auto begin = text + location;
    std::string popped(begin, begin + taken);
    location += taken;
    return popped;
}
std::string corpus::next_in(std::string val, bool* has_accent) {
int n = 0;
int running = 0;
std::string nafter = next();
while (nafter != "" && val.find(nafter) != std::string::npos) {
if (has_accent && !(*has_accent) && COMMON_ACCENTED_CHARACTERS.find(nafter) != std::string::npos) {
*has_accent = true;
}
++n;
running += nafter.size();
nafter = after(running);
}
return next(n);
}
std::string corpus::pop_in(std::string val) {
int n = 0;
size_t running = 0;
std::string nafter = next();
running += nafter.size();
while (nafter != "" && val.find(nafter) != std::string::npos) {
++n;
nafter = after(running);
running += nafter.size();
}
return pop(n);
}
std::string corpus::after_until(int aftr, std::string val) {
int n = 0;
std::string nafter = after(aftr);
while (nafter != "" && val.find(nafter) != std::string::npos) {
++n;
nafter = after(n);
}
return after(aftr, n);
}
/**
 * Walks the rule tree along `keys`, starting at position `index`, and returns the value of the
 * deepest rule reached.
 *
 * At each level the first child (in the container's iteration order) whose key equals the
 * lookup key, is "*suffix" matching the end of the lookup key, or is "prefix*" matching its
 * start, is followed. When no child matches, or the keys are exhausted, this rule's own
 * `value` is returned.
 */
std::string phonemizer_rule::lookup_rule(std::vector<std::string> & keys, int index) {
    if (index >= keys.size()) {
        return value;
    }
    std::string wanted = keys[index];
    for (const auto& entry : rules) {
        const std::string & pattern = entry.first;
        const bool matches =
            pattern == wanted
            || (pattern[0] == '*' && has_suffix(wanted, pattern.substr(1)))
            || (pattern.back() == '*' && has_prefix(wanted, pattern.substr(0, pattern.size() - 1)));
        if (matches) {
            // descend into the child registered under the pattern that matched
            return rules.at(pattern)->lookup_rule(keys, index + 1);
        }
    }
    return value;
}
/**
 * Looks up the phoneme for the grapheme `current` given its context.
 *
 * @return an empty string when no rule exists for `current`; otherwise the result of walking
 *         that rule with the keys `before`, `after`, `word`, in that order.
 */
std::string word_phonemizer::lookup_rule(std::string word, std::string current, std::string before, std::string after) {
    const auto root = rules.find(current);
    if (root == rules.end()) {
        return "";
    }
    std::vector<std::string> context_keys = {before, after, word};
    return root->second->lookup_rule(context_keys, 0);
}
void word_phonemizer::add_rule(std::vector<std::string> keys, std::string phoneme) {
phonemizer_rule * current_rule = nullptr;
for (int i = 0; i < keys.size(); i++) {
if (current_rule) {
if (current_rule->rules.find(keys[i]) == current_rule->rules.end()) {
phonemizer_rule * nrule = new phonemizer_rule;
current_rule->rules[keys[i]] = nrule;
current_rule = nrule;
} else {
current_rule = current_rule->rules.at(keys[i]);
}
} else {
if (rules.find(keys[i]) == rules.end()) {
current_rule = new phonemizer_rule;
rules[keys[i]] = current_rule;
} else {
current_rule = rules.at(keys[i]);
}
}
}
if (current_rule) {
current_rule->value = phoneme;
}
}
/**
 * Converts a word to phonemes using the rule tree.
 *
 * The lowercased word is split into graphemes by the tokenizer; each grapheme is looked up with
 * its neighbours as context ("^" stands in before the first grapheme, "$" after the last) and
 * the results are concatenated.
 */
std::string word_phonemizer::phonemize(std::string word) {
    word = to_lower(word);
    std::vector<std::string> graphemes;
    tokenizer->token_split(word, graphemes);

    std::string phonemes = "";
    const size_t total = graphemes.size();
    for (size_t idx = 0; idx < total; ++idx) {
        const std::string previous = idx == 0 ? "^" : graphemes[idx - 1];
        const std::string following = idx + 1 < total ? graphemes[idx + 1] : "$";
        phonemes += lookup_rule(word, graphemes[idx], previous, following);
    }
    return phonemes;
}
/**
 * Builds the phoneme string for `value`, which callers are expected to keep below one thousand.
 *
 * The hundreds digit (when non-zero) is rendered as NUMBER_PHONEMES[digit] + " " + HUNDRED_PHONEME.
 * A remainder of 1-19 is read straight from NUMBER_PHONEMES; 20-99 uses SUB_HUNDRED_NUMBERS for
 * the tens followed by an optional units phoneme. The remainder is appended directly after the
 * hundreds part without an extra separator.
 */
std::string build_subthousand_phoneme(int value) {
    const int hundreds = value / 100;
    const int below_hundred = value % 100;

    std::string phoneme = hundreds > 0 ? NUMBER_PHONEMES[hundreds] + " " + HUNDRED_PHONEME : "";
    if (below_hundred <= 0) {
        return phoneme;
    }
    if (below_hundred < 20) {
        phoneme += NUMBER_PHONEMES[below_hundred];
        return phoneme;
    }
    // SUB_HUNDRED_NUMBERS starts at twenty, hence the offset of two
    phoneme += SUB_HUNDRED_NUMBERS[(below_hundred / 10) - 2];
    const int units = below_hundred % 10;
    if (units > 0) {
        phoneme += " " + NUMBER_PHONEMES[units];
    }
    return phoneme;
}
std::string build_number_phoneme(long long int remainder) {
std::string phoneme = "";
bool started = false;
if (remainder > TRILLION) {
long long int trillions = (long long int) remainder / TRILLION;
phoneme += build_subthousand_phoneme(trillions) + " " + TRILLION_PHONEME;
remainder = (long long int) remainder % TRILLION;
if (remainder > 0) {
phoneme += ",";
}
started = true;
}
if (remainder > BILLION) {
long long int billions = (long long int) remainder / BILLION;
remainder = (long long int) remainder % BILLION;
std::string billion_part = build_subthousand_phoneme(billions) + " " + BILLION_PHONEME;
if (!started) {
phoneme += remainder > 0 ? billion_part + "," : billion_part;
} else if (remainder == 0) {
phoneme += " " + billion_part;
} else {
phoneme += " " + billion_part + ",";
}
started = true;
}
if (remainder > MILLION) {
long long int millions = (long long int) remainder / MILLION;
remainder = (long long int) remainder % MILLION;
std::string million_part = build_subthousand_phoneme(millions) + " " + MILLION_PHONEME;
if (!started) {
phoneme += remainder > 0 ? million_part + "," : million_part;
} else if (remainder == 0) {
phoneme += " " + million_part;
} else {
phoneme += " " + million_part + ",";
}
started = true;
}
if (remainder > 1000) {
long long int thousands = (long long int) remainder / 1000;
remainder = (long long int) remainder % 1000;
std::string thousand_part = build_subthousand_phoneme(thousands) + " " + THOUSAND_PHONEME;
if (!started) {
phoneme += remainder > 0 ? thousand_part + "," : thousand_part;
} else if (remainder == 0) {
phoneme += " " + thousand_part;
} else {
phoneme += " " + thousand_part + ",";
}
started = true;
}
if (remainder > 0) {
if (started) {
phoneme += " " + build_subthousand_phoneme(remainder);
} else {
phoneme += build_subthousand_phoneme(remainder);
}
}
return phoneme;
}
/** Reports whether this response carries a success code, i.e. a code below 200. */
bool dictionary_response::is_successful() {
    const bool below_failure_range = code < 200;
    return below_failure_range;
}
/**
 * Checks whether this partial dictionary entry applies at the current position of `text`.
 *
 * Every condition must hold:
 *   - when `not_at_clause_end` is set, the run of NON_CLAUSE_WORD_CHARACTERS at the cursor
 *     must not be followed by "!", "." or "?";
 *   - the upcoming text must equal `after_match`;
 *   - when `expects_to_be_proceeded_by_number` is set, the previous token must have been a number;
 *   - when `not_at_clause_start` is set, the cursor must not be at the beginning of a clause.
 */
bool dictionary_response::is_match(corpus* text, conditions* flags) {
    if (not_at_clause_end) {
        const std::string chunk = text->next_in(NON_CLAUSE_WORD_CHARACTERS);
        const std::string follower = text->after(chunk.size());
        if (follower == "!" || follower == "." || follower == "?") {
            return false;
        }
    }
    if (text->next(after_match.size()) != after_match) {
        return false;
    }
    if (expects_to_be_proceeded_by_number && !flags->was_number) {
        return false;
    }
    if (not_at_clause_start && flags->beginning_of_clause) {
        return false;
    }
    return true;
}
/**
 * Looks `value` up in the dictionary.
 *
 * @return `not_found_response` when the dictionary has no entry for `value`; otherwise the
 *         first candidate that is either a total success or a partial success whose
 *         is_match() holds at the current text position; `phonetic_fallback_response` when
 *         no candidate qualifies.
 */
dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string value, conditions* flags) {
    const auto entry = lookup_map.find(value);
    if (entry == lookup_map.end()) {
        return not_found_response;
    }
    const std::vector<dictionary_response*> & candidates = entry->second;
    for (dictionary_response * candidate : candidates) {
        if (candidate->code == SUCCESS_TOTAL) {
            return candidate;
        }
        if (candidate->code == SUCCESS_PARTIAL && candidate->is_match(text, flags)) {
            return candidate;
        }
    }
    return phonetic_fallback_response;
}
/**
 * Consumes a run of whitespace (space, newline, form feed, tab) and makes sure the output ends
 * with exactly one separating space.
 *
 * @return always true.
 */
bool phonemizer::handle_space(corpus* text, std::string* output, conditions* flags) {
    flags->reset_for_space();
    text->pop_in(" \n\f\t");
    // back() must not be called on an empty string; an empty output still receives the space,
    // which matches what the unguarded check did in practice.
    if (output->empty() || output->back() != ' ') {
        output->append(" ");
    }
    return true;
}
/**
 * Appends the digits of `series` to the output one by one, separated by single spaces.
 *
 * A leading space is added first when the previous token was a word that is neither followed
 * by a space nor hyphenated. When at least one digit was written the flags are updated to
 * record that a number was just emitted.
 *
 * @param series a string of ASCII digits; each character is used as an index into NUMBER_PHONEMES.
 */
void phonemizer::append_numeric_series(std::string series, std::string* output, conditions * flags) {
    const bool needs_gap = flags->was_word && output->back() != ' ' && !flags->hyphenated;
    if (needs_gap) {
        output->append(" ");
    }
    const size_t digit_count = series.size();
    for (size_t idx = 0; idx < digit_count; ++idx) {
        if (idx > 0) {
            output->append(" ");
        }
        const int numeral = series[idx] - '0';
        output->append(NUMBER_PHONEMES[numeral]);
    }
    if (digit_count == 0) {
        return;
    }
    flags->update_for_word(series);
    flags->was_number = true;
}
/**
 * Consumes the run of NUMBER_CHARACTERS at the cursor and emits it digit by digit.
 *
 * @return always true.
 */
bool phonemizer::handle_numeric_series(corpus* text, std::string* output, conditions* flags) {
    const std::string digits = text->pop_in(NUMBER_CHARACTERS);
    append_numeric_series(digits, output, flags);
    return true;
}
/**
 * Consumes a number at the cursor and appends its spoken form to the output.
 *
 * Numbers that do not follow a recognised grouping pattern, or that are too large to pronounce,
 * are handed to handle_numeric_series and read digit by digit.
 *
 * Fixes relative to the previous version:
 *   - the cursor advances by the length of the text that was actually parsed; previously the
 *     length was measured after the grouping separators had been erased, leaving trailing
 *     digits behind to be read a second time.
 *   - a digit string too long for std::stoll falls back to digit-by-digit output rather than
 *     letting std::out_of_range escape.
 *   - the fractional part is only read when the split really produced one.
 *   - a call to replace() whose result was discarded (and therefore did nothing) was removed.
 *
 * @return always true.
 */
bool phonemizer::handle_numeric(corpus* text, std::string* output, conditions* flags) {
    /*
     * There are four recognized ways of separating large arabic numerals:
     *   1. No breaks or separations except for the decimal (e.g. '32000.012' or '32000,012')
     *   2. Space separated breaks between every three digits and comma separated decimals (e.g. '32 000,012')
     *   3. Period separated breaks between every three digits and comma separated decimals (e.g. '32.000,012')
     *   4. Comma separated breaks between every three digits and period separated decimals (e.g. '32,000.012')
     *
     * This implementation supports all four approaches up to the trillions, after which numbers are represented as a series
     * of distinct digits. Non conforming patterns, e.g. multiple commas, multiple periods, or multiple spaces that are not three
     * digits apart, are not treated as continuous numbers but rather as separate numerical strings.
     */
    std::string number = text->next_in(COMPATIBLE_NUMERICS);
    number = strip(number, ",. ");
    // For numerics, we don't necessarily want to stop reading from the corpus at periods, commas, or spaces.
    char large_number_separator = '\0';
    char decimal_separator = '\0';
    char last_break_char = '\0';
    bool invalid_format = false;
    int count_since_break = 0;
    std::string built = "";
    for (char & c : number) {
        if (is_numeric(c)) {
            built += c;
            count_since_break += 1;
        } else if (last_break_char == '\0') {
            // first separator seen: more than three leading digits means it can only be a decimal mark
            if (count_since_break > 3) {
                decimal_separator = c;
            }
            last_break_char = c;
            built += c;
            count_since_break = 0;
        } else if (c != last_break_char) {
            if (c == ' ') {
                break;
            } else if (count_since_break == 3 && decimal_separator == '\0') {
                // a different separator after a full group of three: the earlier one grouped thousands
                if (large_number_separator == '\0') {
                    large_number_separator = last_break_char;
                }
                decimal_separator = c;
                built += c;
                count_since_break = 0;
                last_break_char = c;
            } else if (count_since_break != 3) {
                if (large_number_separator != '\0') {
                    invalid_format = true;
                }
                break;
            } else {
                break;
            }
        } else if (c == last_break_char) {
            if (decimal_separator != '\0') {
                break;
            } else if (count_since_break != 3) {
                invalid_format = true;
                break;
            } else {
                large_number_separator = c;
                built += c;
                count_since_break = 0;
            }
        }
    }

    // classify a lone separator by the size of the group that followed it
    if (!invalid_format) {
        if (large_number_separator != '\0' && decimal_separator == '\0' && count_since_break != 3) {
            invalid_format = true;
        } else if (count_since_break == 3 && last_break_char != '\0' && decimal_separator == '\0' && large_number_separator == '\0') {
            large_number_separator = last_break_char;
        } else if (count_since_break != 3 && last_break_char != '\0' && decimal_separator == '\0' && large_number_separator == '\0') {
            // NOTE(review): a single space between two short digit groups ends up here too and is
            // then read as a decimal point; confirm whether that is intended.
            decimal_separator = last_break_char;
        }
    }

    if (invalid_format) {
        return handle_numeric_series(text, output, flags);
    }

    // `built` is a prefix of the text at the cursor at this point; remember its length before
    // the grouping separators are removed so that the right number of bytes is consumed below.
    const size_t consumed = built.size();

    if (large_number_separator != '\0') {
        built.erase(std::remove(built.begin(), built.end(), large_number_separator), built.end());
    }

    // std::stoll stops at the first non-digit, so the decimal separator (whichever character it
    // is) ends the integer part; the fractional part is split off separately below.
    long long int value = 0;
    try {
        value = std::stoll(built);
    } catch (const std::out_of_range &) {
        // more digits than a long long can hold: read them out one by one instead
        return handle_numeric_series(text, output, flags);
    }
    if (value >= LARGEST_PRONOUNCABLE_NUMBER) {
        return handle_numeric_series(text, output, flags);
    }

    text->size_pop(consumed);
    std::string noutput = build_number_phoneme(value);
    if (noutput.size() > 0) {
        if (flags->was_word && output->back() != ' ' && !flags->hyphenated) {
            output->append(" ");
        }
        output->append(noutput);
        flags->update_for_word(built);
        flags->was_number = true;
    }

    if (decimal_separator != '\0') {
        std::vector<std::string> parts = split(built, decimal_separator);
        if (parts.size() > 1 && parts[1].size() > 0) {
            output->append(" " + POINT_PHONEME + " ");
            append_numeric_series(parts[1], output, flags);
        }
    }
    return true;
}
/**
 * Decides whether `word` should be spelled out letter by letter.
 *
 *   - A word containing '.' qualifies when every '.'-separated part is non-empty and is either a
 *     single character or an uppercase letter followed by a lowercase one.
 *   - A word shorter than four characters qualifies when it is not in `small_english_words`.
 *   - A fully uppercase word qualifies unless the text is in an all-capitals run, judged from
 *     `flags->was_all_capitalized` or from the text that follows the word; in that case the
 *     flag is set and the word does not qualify.
 *   - Otherwise the word qualifies when more than half of its characters are uppercase.
 *
 * Character classification goes through unsigned char because the <cctype> functions are
 * undefined for negative `char` values.
 */
bool phonemizer::is_acronym_like(corpus* text, std::string word, conditions* flags) {
    if (word.find(".") != std::string::npos) {
        for (const auto & part : split(word, ".")) {
            if (part.size() == 0 || part.size() > 2) {
                return false;
            }
            if (part.size() == 2) {
                const bool upper_then_lower = isupper(static_cast<unsigned char>(part[0]))
                    && islower(static_cast<unsigned char>(part[1]));
                if (!upper_then_lower) {
                    return false;
                }
            }
        }
        return true;
    }
    if (word.size() < 4) {
        return small_english_words.find(to_lower(word)) == small_english_words.end();
    }
    if (is_all_upper(word)) {
        if (flags->was_all_capitalized || is_all_upper(text->after_until(word.size()+1, " "))) {
            flags->was_all_capitalized = true;
            return false;
        }
        return true;
    }
    // the word is known to be mixed case here
    return upper_count(word) > (int) word.length() / 2;
}
/**
 * Tries to read a roman numeral at the cursor and, on success, emits it as a number.
 *
 * The text is scanned in chunks of four down to one character; every chunk found in
 * ROMAN_NUMERALS adds its value to the total, provided it is smaller than the previously
 * accepted chunk. A chunk that is not smaller, or a position where no chunk matches, makes
 * the whole attempt fail.
 *
 * The lowercased look-ahead character is now stored; previously the result of to_lower() was
 * discarded after the first pass.
 *
 * @return true when a numeral was consumed and written to the output; false when the text is
 *         not a valid numeral, in which case nothing is consumed or written.
 */
bool phonemizer::handle_roman_numeral(corpus* text, std::string* output, conditions * flags) {
    auto next = text->next();
    next = to_lower(next);
    int total = 0;
    int last_value = 0;
    std::string running = "";
    while (is_roman_numeral(next[0])) {
        bool found = false;
        // longest chunks first; `running` grows as chunks are accepted, so later (shorter)
        // probes in the same pass start after what has already been matched
        for (int size = 4; size > 0; size--) {
            std::string chunk = text->after(running.size(), size);
            chunk = to_lower(chunk);
            if (ROMAN_NUMERALS.find(chunk) != ROMAN_NUMERALS.end()) {
                found = true;
                int found_value = ROMAN_NUMERALS.at(chunk);
                if (total == 0 || last_value > found_value) {
                    total += found_value;
                    last_value = found_value;
                    running += chunk;
                } else {
                    return false;
                }
            }
        }
        if (found) {
            next = text->after(running.size());
            next = to_lower(next);
            continue;
        }
        return false;
    }
    std::string noutput = build_number_phoneme(total);
    if (flags->was_word && output->back() != ' ' && !flags->hyphenated) {
        output->append(" ");
    }
    output->append(noutput);
    text->size_pop(running.size());
    flags->update_for_word(running, false);
    flags->was_number = true;
    return true;
}
/**
 * Spells `word` out letter by letter and consumes it from the text.
 *
 * Each character is lowercased and looked up in LETTER_PHONEMES; characters without an entry
 * are skipped. A '.' is not spoken but records that the acronym was punctuated.
 *
 * @return always true.
 */
bool phonemizer::handle_acronym(corpus* text, std::string word, std::string* output, conditions * flags) {
    std::string spelled = "";
    for (char symbol : word) {
        if (symbol == '.') {
            flags->was_punctuated_acronym = true;
            continue;
        }
        try {
            char letter = std::tolower(symbol);
            spelled += LETTER_PHONEMES.at(letter);
        } catch (const std::out_of_range& e) {
            // no phoneme for this character; leave it out
        }
    }
    text->size_pop(word.size());
    const bool needs_gap = flags->was_word && output->back() != ' ' && !flags->hyphenated;
    if (needs_gap) {
        output->append(" ");
    }
    output->append(spelled);
    flags->update_for_word(word, false);
    return true;
}
/**
 * Phonemizes `word` with the rule-based phonetic phonemizer and consumes it from the text.
 *
 * @param unaccented_size_difference extra bytes to consume on top of `word.size()`; the caller
 *        passes the number of bytes by which accent replacement shortened the word.
 * @return always true.
 */
bool phonemizer::handle_phonetic(corpus* text, std::string word, std::string* output, conditions* flags, size_t unaccented_size_difference) {
    const bool needs_gap = flags->was_word && output->back() != ' ' && !flags->hyphenated;
    if (needs_gap) {
        output->append(" ");
    }
    const std::string phonemes = phonetic_phonemizer->phonemize(word);
    output->append(phonemes);
    const size_t consumed = word.size() + unaccented_size_difference;
    text->size_pop(consumed);
    flags->update_for_word(word);
    return true;
}
// Word -> IPA lookup table filled by populate_kokoro_ipa_map() and read by found_word_to_ipa().
static std::unordered_map<std::string, std::string> kokoro_ipa_map;
/**
 * Loads "kokoro_ipa.embd" from `executable_path` into the Kokoro IPA table.
 *
 * Each line is expected to hold exactly two comma separated fields, the word and its IPA
 * string. Lines with any other shape are reported and skipped. Failure to open the file is
 * reported and leaves the table untouched.
 *
 * @param executable_path directory prefix; the file name is appended directly, so the prefix
 *        must already end with a path separator.
 */
void populate_kokoro_ipa_map(std::string executable_path)
{
    std::string line;
    auto filepath = executable_path + "kokoro_ipa.embd";
    printf("\nReading Kokoro IPA from %s",filepath.c_str());
    std::ifstream myfile(filepath);
    if (myfile.is_open())
    {
        // Looping on getline itself stops as soon as no further line can be read; checking good()
        // before reading processed one phantom empty line at the end of the file.
        while (std::getline(myfile, line))
        {
            auto parts = split(line, ",");
            if(parts.size()==2)
            {
                kokoro_ipa_map[parts[0]] = parts[1];
            } else {
                printf("\nError reading line in Kokoro IPA!");
            }
        }
        myfile.close();
        // size() is a size_t, which needs %zu rather than %d
        printf("\nPopulated Kokoro IPA: %zu entries", kokoro_ipa_map.size());
    }
    else
    {
        printf("\nUnable to open Kokoro IPA file");
    }
}
/**
 * Looks a word up in the Kokoro IPA table.
 *
 * @param input the word as it appears in the text.
 * @return an empty string when the word is non-empty and entirely uppercase (treated as an
 *         acronym) or when its lowercased form is not in the table; otherwise the stored IPA string.
 */
std::string found_word_to_ipa(std::string input)
{
    // a non-empty word made only of uppercase letters is left to the acronym handling
    bool all_upper = !input.empty();
    for (size_t idx = 0; all_upper && idx < input.size(); ++idx) {
        const unsigned char c = input[idx];
        all_upper = std::isupper(c) != 0;
    }
    if (all_upper) {
        return "";
    }

    // the table is consulted with the lowercased word
    for (char & letter : input) {
        const unsigned char c = letter;
        letter = static_cast<char>(std::tolower(c));
    }

    const auto entry = kokoro_ipa_map.find(input);
    if (entry == kokoro_ipa_map.end()) {
        return "";
    }
    return entry->second;
}
/**
 * Converts one word to phonemes, trying the available strategies in order:
 *   1. the Kokoro IPA table (the entry is appended as-is and the word consumed; flags are not touched),
 *   2. the phoneme dictionary, retried with accents replaced when `has_accent` is set and the
 *      first lookup fails,
 *   3. roman numeral handling for uppercase words made of numeral letters that are not small
 *      english words,
 *   4. acronym spelling,
 *   5. for words containing '.', processing the leading alphabetic part followed by the period,
 *   6. the rule-based phonetic phonemizer.
 *
 * @return always true.
 */
bool phonemizer::process_word(corpus* text, std::string* output, std::string word, conditions* flags, bool has_accent) {
    const std::string kokoro_ipa = found_word_to_ipa(word);
    if (kokoro_ipa != "") {
        output->append(kokoro_ipa);
        text->size_pop(word.size());
        return true;
    }

    // bytes removed from `word` by accent replacement; they still have to be consumed from the text
    size_t unaccented_size_difference = 0;
    dictionary_response* response = dict->lookup(text, word, flags);
    if (has_accent && !response->is_successful()) {
        unaccented_size_difference = word.size();
        word = replace_accents(word);
        unaccented_size_difference -= word.size();
        response = dict->lookup(text, word, flags);
    }

    if (response->is_successful()) {
        if (flags->was_word && output->back() != ' ' && !flags->hyphenated) {
            output->append(" ");
        }
        flags->update_for_word(word);
        if (response->code != SUCCESS_TOTAL) {
            // a partial entry also covers the text that follows the word
            word += response->after_match;
        }
        output->append(response->value);
        text->size_pop(word.size()+unaccented_size_difference);
        return true;
    }

    const bool roman_candidate = can_be_roman_numeral(word)
        && is_all_upper(word)
        && small_english_words.find(to_lower(word)) == small_english_words.end();
    if (roman_candidate && handle_roman_numeral(text, output, flags)) {
        return true;
    }

    if (is_acronym_like(text, word, flags)) {
        return handle_acronym(text, word, output, flags);
    }

    if (word.find(".") < word.length()) {
        bool part_has_accent = false;
        std::string word_part = text->next_in(ALPHABET+COMMON_ACCENTED_CHARACTERS, &part_has_accent);
        process_word(text, output, word_part, flags, part_has_accent);
        handle_punctuation(text, ".", output, flags);
        output->append(" ");
        flags->reset_for_space();
        return true;
    }

    return handle_phonetic(text, word, output, flags, unaccented_size_difference);
}
/**
 * Reads the run of WORD_CHARACTERS at the cursor, drops any trailing periods from it, and hands
 * the result to process_word.
 */
bool phonemizer::handle_word(corpus * text, std::string* output, conditions * flags) {
    bool has_accent = false;
    std::string word = text->next_in(WORD_CHARACTERS, &has_accent);
    while (!word.empty() && word.back() == '.') {
        word.pop_back();
    }
    return process_word(text, output, word, flags, has_accent);
}
/**
 * Emits the REPLACEABLE entry registered for the symbol `next` and consumes one character.
 *
 * @return always true.
 */
bool phonemizer::handle_replacement(corpus* text, std::string next, std::string* output, conditions * flags) {
    const bool needs_gap = flags->was_word && output->back() != ' ' && !flags->hyphenated;
    if (needs_gap) {
        output->append(" ");
    }
    const auto & spoken = REPLACEABLE.at(next);
    output->append(spoken);
    flags->update_for_word(next);
    text->pop();
    return true;
}
/**
 * Handles an apostrophe that may introduce a possessive "'s".
 *
 * When the next two characters are "'s", the phoneme for the suffix is chosen from the
 * character preceding the apostrophe (vowel, "s"/"z", other letter, anything else) and both
 * characters are consumed. Otherwise only one character is consumed.
 *
 * @return always true.
 */
bool phonemizer::handle_possession_plural(corpus* text, std::string* output, conditions * flags) {
    if (text->next(2) != "'s") {
        text->pop();
        return true;
    }

    std::string previous = text->last();
    const bool after_vowel = VOWELS.find(to_lower(previous)[0]) != std::string::npos;
    if (after_vowel) {
        output->append("z");
    } else if (previous == "s" || previous == "z") {
        output->append("ᵻz");
    } else if (is_alphabetic(previous[0])) {
        output->append("s");
    } else {
        output->append("ˈɛs");
    }
    text->pop(2);
    return true;
}
/**
 * Handles an apostrophe that may introduce a contraction.
 *
 * The apostrophe is consumed, then the letters that follow are lowercased and looked up in
 * CONTRACTION_PHONEMES. On a hit the phoneme is emitted and the letters are consumed; on a miss
 * only the apostrophe is gone.
 *
 * @return always true.
 */
bool phonemizer::handle_contraction(corpus* text, std::string* output, conditions * flags) {
    text->pop();
    const std::string suffix = to_lower(text->next_in(ALPHABET));
    const auto entry = CONTRACTION_PHONEMES.find(suffix);
    if (entry == CONTRACTION_PHONEMES.end()) {
        // no such contraction: dropping the ' character is all that is wanted, since it could be
        // the end of a single quote, which is ignored by the espeak phonemizer.
        return true;
    }
    output->append(entry->second);
    // make sure to pop the contraction.
    text->pop_in(ALPHABET);
    return true;
}
/**
 * Handles the non-alphanumeric, non-space character `next` found at the cursor.
 *
 *   - '.'  : closes a punctuated acronym (optionally followed by a possessive), otherwise
 *            copies the whole run of periods to the output.
 *   - "'"  : after a word, dispatches to possessive or contraction handling; otherwise the
 *            quote is dropped.
 *   - '-'  : a spaced or doubled dash acts as a break, a dash between letters marks the word
 *            as hyphenated, any other dash is dropped.
 *   - clause breaks are copied and reset the clause flags; no-op breaks are copied as-is;
 *     replaceable symbols are spoken; anything else is dropped.
 *
 * @return always true.
 */
bool phonemizer::handle_punctuation(corpus* text, std::string next, std::string* output, conditions * flags) {
    std::string previous = text->last();
    std::string following = text->after();

    if (next[0] == '.') {
        if (!flags->was_punctuated_acronym) {
            // copy the full run of periods (a pre-pause adjustment for longer runs is disabled upstream)
            const std::string dots = text->next_in(".");
            output->append(dots);
            text->size_pop(dots.size());
            return true;
        }
        // we finished an acronym
        flags->was_punctuated_acronym = false;
        output->append(next);
        text->pop();
        if (text->after(1, 2) == "'s") {
            return handle_possession_plural(text, output, flags);
        }
        return true;
    }

    if (next == "'") {
        if (flags->was_word) {
            if (following == "s" || !is_alphabetic(following[0])) {
                return handle_possession_plural(text, output, flags);
            }
            const bool known_contraction =
                CONTRACTION_PHONEMES.find(following) != CONTRACTION_PHONEMES.end()
                || CONTRACTION_PHONEMES.find(text->after(next.size(), 2)) != CONTRACTION_PHONEMES.end();
            if (known_contraction) {
                return handle_contraction(text, output, flags);
            }
        }
        // could be the end or start of a quote
        text->pop();
        return true;
    }

    if (next[0] == '-') {
        if (previous == " " && following == " ") {
            text->pop(2);
            flags->reset_for_space();
            return true;
        }
        if (following[0] == '-') {
            text->pop(2);
            output->append(" ");
            flags->reset_for_space();
            return true;
        }
        if (!flags->beginning_of_clause && flags->was_word && is_alphabetic(following[0])) {
            flags->hyphenated = true;
        }
        // a hyphen inside a word and a stray dash are both simply consumed
        text->pop();
        return true;
    }

    if (CLAUSE_BREAKS.find(next) != std::string::npos) {
        output->append(next);
        flags->reset_for_clause_end();
        text->pop();
        return true;
    }
    if (NOOP_BREAKS.find(next) != std::string::npos) {
        output->append(next);
        text->pop();
        return true;
    }
    if (REPLACEABLE.find(next) != REPLACEABLE.end()) {
        return handle_replacement(text, next, output, flags);
    }

    // ignore it
    text->pop();
    return true;
}
/**
 * Dispatches on the next character of the text to the matching handler (space, number, word or
 * punctuation).
 *
 * @return false once the text is exhausted; otherwise whatever the chosen handler returns.
 */
bool phonemizer::route(corpus * text, std::string* output, conditions * flags) {
    const std::string upcoming = text->next();
    if (upcoming == "") {
        // we finished lexing the corpus
        return false;
    }
    if (SPACE_CHARACTERS.find(upcoming) != std::string::npos) {
        return handle_space(text, output, flags);
    }
    const char first = upcoming[0];
    if (is_numeric(first)) {
        return handle_numeric(text, output, flags);
    }
    if (is_alphabetic(first)) {
        return handle_word(text, output, flags);
    }
    return handle_punctuation(text, upcoming, output, flags);
}
#ifdef ESPEAK_INSTALL
/**
 * Runs `text` through espeak-ng and returns the stripped phoneme string.
 *
 * The phoneme mode passed to espeak is 0x02 when `phoneme_mode` is IPA and 0x01 otherwise; the
 * text is declared as UTF-8.
 */
std::string phonemizer::espeak_text_to_phonemes(const char * text) {
    // bits 8 and up are left at zero, exactly as in `0 << 8 | flag`
    const int espeak_mode = (phoneme_mode == IPA) ? (0 << 8 | 0x02) : (0 << 8 | 0x01);
    const void ** text_cursor = (const void**)&text;
    const char * phonemes = espeak_wrapper::get_instance()->text_to_phonemes(text_cursor, espeakCHARS_UTF8, espeak_mode);
    return strip(std::string(phonemes));
}
#endif
/**
 * Converts `text` to phonemes and returns them.
 *
 * In ESPEAK mode the text is split on STOPPING_TOKENS, every even-indexed part is sent to
 * espeak, and the odd-indexed parts (the tokens themselves) are re-inserted when
 * `preserve_punctuation` is set. Without espeak support that mode aborts. In any other mode the
 * built-in phonemizer is used.
 */
std::string phonemizer::text_to_phonemes(const char * text, size_t size) {
    if (mode != ESPEAK) {
        std::string output = "";
        text_to_phonemes(text, size, &output);
        return output;
    }
#ifdef ESPEAK_INSTALL
    auto parts = split(text, STOPPING_TOKENS, true);
    std::string phonemes = "";
    for (int i = 0; i < parts.size(); i+=2) {
        phonemes += espeak_text_to_phonemes(parts[i].c_str());
        const bool has_token = i + 1 < parts.size();
        if (preserve_punctuation && has_token) {
            phonemes += parts[i+1];
        }
    }
    return phonemes;
#else
    TTS_ABORT("%s attempted to run in espeak mode without espeak installed. \n", __func__);
    return "";
#endif
}
/** Convenience overload: phonemizes the contents of `text` and returns the result. */
std::string phonemizer::text_to_phonemes(std::string text) {
    const char * raw = text.c_str();
    const size_t length = text.size();
    return text_to_phonemes(raw, length);
}
/**
 * Rule-based phonemization: lexes `size` bytes of `text` and appends the phonemes to `*output`.
 *
 * This overload is not valid in ESPEAK mode and aborts when called that way.
 */
void phonemizer::text_to_phonemes(const char * text, size_t size, std::string* output) {
    if (mode == ESPEAK) {
#ifdef ESPEAK_INSTALL
        TTS_ABORT("%s attempted to run in espeak mode with output already defined. \n", __func__);
#else
        TTS_ABORT("%s attempted to run in espeak mode without espeak installed. \n", __func__);
#endif
        return;
    }
    // Automatic storage replaces the previous new/delete pair so that both objects are
    // released even if a handler throws part-way through the loop.
    corpus corpus_text(text, size);
    conditions flags;
    // route() returns false once the corpus has been fully consumed.
    bool running = true;
    while (running) {
        running = route(&corpus_text, output, &flags);
    }
}
void phonemizer::text_to_phonemes(std::string text, std::string* output) {
text_to_phonemes(text.c_str(), text.size(), output);
}
/**
 * Builds a word_phonemizer from the rule arrays stored in a gguf file.
 *
 * Reads the parallel arrays 'phonemizer.rules.keys' and 'phonemizer.rules.phonemes'; each key is
 * split on "." and registered together with the phoneme at the same index. Aborts when either
 * array is missing or when their lengths differ.
 */
struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta) {
    struct single_pass_tokenizer * tokenizer = single_pass_tokenizer_from_gguf(meta);
    word_phonemizer * wph = new word_phonemizer(tokenizer);
    int rule_keys_key = gguf_find_key(meta, "phonemizer.rules.keys");
    int phoneme_key = gguf_find_key(meta, "phonemizer.rules.phonemes");
    if (rule_keys_key == -1 || phoneme_key == -1) {
        TTS_ABORT("Both 'phonemizer.rules.keys' and 'phonemizer.rules.phonemes' keys must be set in order to support phonemization.");
    }
    int key_count = gguf_get_arr_n(meta, rule_keys_key);
    int phoneme_count = gguf_get_arr_n(meta, phoneme_key);
    // Checked at runtime rather than with assert(): assert is compiled out under NDEBUG, and a
    // length mismatch would otherwise read past the end of the shorter array below.
    if (key_count != phoneme_count) {
        TTS_ABORT("%s found %d entries in 'phonemizer.rules.keys' but %d entries in 'phonemizer.rules.phonemes'; both arrays must have the same length. \n", __func__, key_count, phoneme_count);
    }
    for (int i = 0; i < key_count; i++) {
        std::string rule_key = gguf_get_arr_str(meta, rule_keys_key, i);
        std::string phoneme = gguf_get_arr_str(meta, phoneme_key, i);
        wph->add_rule(split(rule_key, "."), phoneme);
    }
    return wph;
}
/**
 * Creates a heap-allocated dictionary_response from one dictionary value and its raw key.
 *
 * A value containing ":" is split into the matched part and `after_match` and is marked
 * SUCCESS_PARTIAL; otherwise the whole value is used and marked SUCCESS_TOTAL.
 * Markers on the raw key set the flags: a leading '$' sets expects_to_be_proceeded_by_number,
 * a leading '#' sets not_at_clause_start and a trailing '#' sets not_at_clause_end.
 * The caller owns the returned pointer.
 */
dictionary_response * response_from_string(std::string value, std::string key) {
    std::vector<std::string> parts = split(value, ":");
    const bool has_spacing = parts.size() > 1;

    // An empty key carries no markers; checking first avoids calling back() on an empty string,
    // which is undefined behaviour.
    const bool has_key = !key.empty();
    const bool expects_to_be_proceeded_by_number = has_key && key.front() == '$';
    const bool not_at_start = has_key && key.front() == '#';
    const bool not_at_end = has_key && key.back() == '#';

    dictionary_response * resp = nullptr;
    if (has_spacing) {
        resp = new dictionary_response(SUCCESS_PARTIAL, parts[0]);
        resp->after_match = parts[1];
    } else {
        resp = new dictionary_response(SUCCESS_TOTAL, value);
    }
    resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number;
    resp->not_at_clause_end = not_at_end;
    resp->not_at_clause_start = not_at_start;
    return resp;
}
/**
 * Builds a phoneme_dictionary from the parallel arrays 'phonemizer.dictionary.keys' and
 * 'phonemizer.dictionary.values' of a gguf file.
 *
 * Each value is a comma separated list of responses. The responses are built from the raw key
 * (including its '$' / '#' markers); the markers are then stripped before the key is stored in
 * the lookup map. Aborts when either array is missing or when their lengths differ.
 */
struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta) {
    struct phoneme_dictionary * dict = new phoneme_dictionary;
    int keys_key = gguf_find_key(meta, "phonemizer.dictionary.keys");
    int values_key = gguf_find_key(meta, "phonemizer.dictionary.values");
    if (keys_key == -1 || values_key == -1) {
        TTS_ABORT("Both 'phonemizer.dictionary.keys' and 'phonemizer.dictionary.values' keys must be set in order to support phonemization.");
    }
    int key_count = gguf_get_arr_n(meta, keys_key);
    int value_count = gguf_get_arr_n(meta, values_key);
    // Checked at runtime rather than with assert(): assert is compiled out under NDEBUG, and a
    // length mismatch would otherwise read past the end of the shorter array below.
    if (key_count != value_count) {
        TTS_ABORT("%s found %d entries in 'phonemizer.dictionary.keys' but %d entries in 'phonemizer.dictionary.values'; both arrays must have the same length. \n", __func__, key_count, value_count);
    }
    for (int i = 0; i < key_count; i++) {
        std::string key = gguf_get_arr_str(meta, keys_key, i);
        std::string values = gguf_get_arr_str(meta, values_key, i);
        std::vector<dictionary_response*> out;
        // The responses read their flags from the raw key, so build them before stripping markers.
        for (const std::string & val : split(values, ",")) {
            out.push_back(response_from_string(val, key));
        }
        // Strip a single leading '$' or '#' marker.
        if (!key.empty() && (key.front() == '$' || key.front() == '#')) {
            key = key.substr(1);
        }
        // Strip a trailing '#' marker. The key may now be empty (e.g. a raw key of just "$"),
        // and back() on an empty string is undefined behaviour, so check first.
        if (!key.empty() && key.back() == '#') {
            key = key.substr(0, key.size() - 1);
        }
        dict->lookup_map[key] = out;
    }
    return dict;
}
/**
 * Creates a phonemizer from the metadata of a gguf file.
 *
 * 'phonemizer.type' selects the implementation. For ESPEAK an espeak-backed phonemizer is
 * created using `espeak_voice_code`, and the optional 'phonemizer.phoneme_type' key can switch it
 * to espeak phonemes. Otherwise a rule-based phonemizer is assembled from the dictionary and
 * rule arrays in the file. Aborts if 'phonemizer.type' is missing, or if ESPEAK is requested in
 * a build without ESPEAK_INSTALL.
 */
struct phonemizer * phonemizer_from_gguf(gguf_context * meta, const std::string espeak_voice_code) {
    int mode_key = gguf_find_key(meta, "phonemizer.type");
    phonemizer * ph = nullptr;
    if (mode_key == -1) {
        TTS_ABORT("Key 'phonemizer.type' must be specified in gguf file for all models using a phonemizer.");
    }
    uint32_t ph_type = gguf_get_val_u32(meta, mode_key);
    if ((phonemizer_type) ph_type == ESPEAK) {
#ifdef ESPEAK_INSTALL
        espeak_wrapper::get_instance()->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, ESPEAK_DATA_PATH, 0);
        update_voice(espeak_voice_code);
        ph = new phonemizer(nullptr, nullptr);
        ph->mode = ESPEAK;
#else
        TTS_ABORT("%s attempted to load an espeak phonemizer without espeak installed. \n", __func__);
#endif
        int phoneme_type_key = gguf_find_key(meta, "phonemizer.phoneme_type");
        if (phoneme_type_key != -1) {
            // Read the value stored under 'phonemizer.phoneme_type'. Reading mode_key here would
            // return the phonemizer type again and the configured phoneme type would be ignored.
            uint32_t phoneme_typing = gguf_get_val_u32(meta, phoneme_type_key);
            if ((phoneme_type)phoneme_typing == ESPEAK_PHONEMES) {
                ph->phoneme_mode = ESPEAK_PHONEMES;
            }
        }
        return ph;
    }
    struct word_phonemizer * phonetic_ph = word_phonemizer_from_gguf(meta);
    struct phoneme_dictionary * dict = phoneme_dictionary_from_gguf(meta);
    ph = new phonemizer(dict, phonetic_ph);
    return ph;
}
/**
 * Creates an espeak-backed phonemizer without reading a gguf file.
 *
 * Initializes the shared espeak wrapper, selects `espeak_voice_code`, and switches the phonemizer
 * to espeak phonemes when `use_espeak_phonemes` is set. Aborts in builds without ESPEAK_INSTALL.
 */
struct phonemizer * espeak_phonemizer(bool use_espeak_phonemes, std::string espeak_voice_code) {
#ifdef ESPEAK_INSTALL
    espeak_wrapper * wrapper = espeak_wrapper::get_instance();
    wrapper->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, ESPEAK_DATA_PATH, 0);
    update_voice(espeak_voice_code);

    // no dictionary or word phonemizer is supplied in espeak mode
    phonemizer * espeak_ph = new phonemizer(nullptr, nullptr);
    espeak_ph->mode = ESPEAK;
    if (use_espeak_phonemes) {
        espeak_ph->phoneme_mode = ESPEAK_PHONEMES;
    }
    return espeak_ph;
#else
    TTS_ABORT("%s attempted to load an espeak phonemizer without espeak installed. \n", __func__);
#endif
}
struct phonemizer * phonemizer_from_file(const std::string fname, const std::string espeak_voice_code) {
ggml_context * weight_ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc =*/ false,
/*.ctx =*/ &weight_ctx,
};
gguf_context * meta_ctx = gguf_init_from_file(fname.c_str(), params);
if (!meta_ctx) {
TTS_ABORT("%s failed for file %s\n", __func__, fname.c_str());
}
return phonemizer_from_gguf(meta_ctx, espeak_voice_code);
}