koboldcpp/otherarch/ttscpp/src/phonemizer.cpp

#include "phonemizer.h"

#ifdef ESPEAK_INSTALL
/**
 * espeak_wrapper functions and assignments
 *
 * The espeak_wrapper is a singleton which wraps threaded calls to espeak-ng with a shared mutex
 */

// Out-of-line definitions for espeak_wrapper's non-const static data members.
espeak_wrapper * espeak_wrapper::instance = nullptr;
std::mutex espeak_wrapper::mutex{};

/**
 * Returns the shared espeak_wrapper, creating it on first use.
 *
 * Creation happens while holding the wrapper mutex so that two threads racing on the
 * first call cannot both see a null instance and allocate twice. The lock is released
 * before returning, so callers may immediately invoke the (locking) member functions.
 */
espeak_wrapper * espeak_wrapper::get_instance() {
	std::lock_guard<std::mutex> lock(mutex);
	if (!instance) {
		instance = new espeak_wrapper;
	}
	return instance;
}

// Calls espeak_ListVoices(nullptr) while holding the wrapper mutex and returns its result.
const espeak_VOICE ** espeak_wrapper::list_voices() {
	const std::lock_guard<std::mutex> guard(mutex);
	const espeak_VOICE ** voices = espeak_ListVoices(nullptr);
	return voices;
}

// Calls espeak_SetVoiceByName(voice_code) while holding the wrapper mutex and returns its status.
espeak_ERROR espeak_wrapper::set_voice(const char * voice_code) {
	const std::lock_guard<std::mutex> guard(mutex);
	const espeak_ERROR status = espeak_SetVoiceByName(voice_code);
	return status;
}

// Forwards to espeak_TextToPhonemes while holding the wrapper mutex and returns the pointer it yields.
const char * espeak_wrapper::text_to_phonemes(const void ** textptr, int textmode, int phonememode) {
	const std::lock_guard<std::mutex> guard(mutex);
	const char * phonemes = espeak_TextToPhonemes(textptr, textmode, phonememode);
	return phonemes;
}

// Calls espeak_Initialize at most once per wrapper; later calls are no-ops.
// The espeak_initialized flag is checked and set under the wrapper mutex.
void espeak_wrapper::initialize(espeak_AUDIO_OUTPUT output, int buflength, const char * path, int options) {
	const std::lock_guard<std::mutex> guard(mutex);
	if (espeak_initialized) {
		return;
	}
	// The flag is raised before the call, exactly as the check-and-set has always been ordered.
	espeak_initialized = true;
	espeak_Initialize(output, buflength, path, options);
}
#endif

/**
 * Helper functions for string parsing
 */
// Returns the union of all the given string sets.
// Each input set is visited by const reference so that it is not copied a second time
// just to be iterated.
const std::unordered_set<std::string> inline_combine_sets(const std::vector<std::unordered_set<std::string>> sets) {
	std::unordered_set<std::string> combined;
	for (const auto & set : sets) {
		combined.insert(set.begin(), set.end());
	}
	return combined;
}

// Returns a copy of target in which every occurrence of to_replace is swapped for replacement.
// The argument is taken by value, so the caller's string is never modified.
std::string replace(std::string target, char to_replace, char replacement) {
	for (char & current : target) {
		if (current == to_replace) {
			current = replacement;
		}
	}
	return target;
}

// Returns a copy of word with every byte passed through std::tolower.
std::string to_lower(std::string word) {
	for (char & ch : word) {
		// Go through unsigned char so bytes >= 0x80 are valid arguments to std::tolower.
		ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
	}
	return word;
}

// Returns a copy of word with every byte passed through std::toupper.
std::string to_upper(std::string word) {
	for (char & ch : word) {
		// Go through unsigned char so bytes >= 0x80 are valid arguments to std::toupper.
		ch = static_cast<char>(std::toupper(static_cast<unsigned char>(ch)));
	}
	return word;
}

// Returns a copy of word in which each multibyte UTF-8 character found in one of the
// ACCENTED_* tables is replaced by the matching plain letter (a, c, e, i, n, o, u).
// Single-byte characters and multibyte characters found in none of the tables are copied unchanged.
std::string replace_accents(std::string word) {
	std::string result;
	size_t pos = 0;
	while (pos < word.size()) {
		// One lead byte plus every continuation byte (10xxxxxx) that follows it.
		size_t length = 1;
		while (pos + length < word.size() && (word[pos + length] & 0b11000000) == 0b10000000) {
			++length;
		}

		if (length == 1) {
			result.push_back(word[pos]);
			pos += length;
			continue;
		}

		const std::string accent = word.substr(pos, length);
		const auto listed_in = [&accent](const auto & table) {
			return table.find(accent) != std::string::npos;
		};
		// The tables are consulted in this fixed order; the first hit wins.
		if (listed_in(ACCENTED_A)) {
			result.push_back('a');
		} else if (listed_in(ACCENTED_C)) {
			result.push_back('c');
		} else if (listed_in(ACCENTED_E)) {
			result.push_back('e');
		} else if (listed_in(ACCENTED_I)) {
			result.push_back('i');
		} else if (listed_in(ACCENTED_N)) {
			result.push_back('n');
		} else if (listed_in(ACCENTED_O)) {
			result.push_back('o');
		} else if (listed_in(ACCENTED_U)) {
			result.push_back('u');
		} else {
			// Unrecognised multibyte characters should not really appear in a word string, but to
			// keep this function pure they are simply put back untouched.
			result.append(accent);
		}
		pos += length;
	}
	return result;
}

// Counts the bytes of word that std::isupper classifies as uppercase.
// Each byte is converted to unsigned char first: passing a negative char (any UTF-8 byte
// >= 0x80 on platforms where char is signed) to std::isupper is undefined behaviour.
int upper_count(std::string word) {
	int count = 0;
	for (char letter : word) {
		if (std::isupper(static_cast<unsigned char>(letter))) {
			count += 1;
		}
	}
	return count;
}

// Returns true when every byte of word is classified as uppercase by std::isupper.
// An empty word yields true. Each byte is converted to unsigned char first: passing a
// negative char (any UTF-8 byte >= 0x80 on platforms where char is signed) to std::isupper
// is undefined behaviour.
bool is_all_upper(std::string word) {
	for (char letter : word) {
		if (!std::isupper(static_cast<unsigned char>(letter))) {
			return false;
		}
	}
	return true;
}

/*
 * Text condition checks
 */
// True when letter occurs in ROMAN_NUMERAL_CHARACTERS.
bool is_roman_numeral(char letter) {
	const auto position = ROMAN_NUMERAL_CHARACTERS.find(letter);
	return position != std::string::npos;
}

// True when every character of word passes is_roman_numeral (an empty word yields true).
bool can_be_roman_numeral(std::string word) {
	for (const char letter : word) {
		if (!is_roman_numeral(letter)) {
			return false;
		}
	}
	return true;
}

// True when letter occurs in ALPHABET.
bool is_alphabetic(char letter) {
	const auto position = ALPHABET.find(letter);
	return position != std::string::npos;
}

// True when letter is one of the ASCII digits '0' through '9'.
bool is_numeric(char letter) {
	return letter >= '0' && letter <= '9';
}


/**
 * Maps a loosely specified voice code onto the identifier of the closest espeak-ng voice.
 *
 * The lower-cased code is interpreted by its shape:
 *   - two characters: matched against the second '/'-separated identifier part (language code),
 *     with the part after '-' in that segment (country code) as a secondary match;
 *   - three characters: matched as a prefix of the second identifier part, with the first
 *     identifier part (language family) as a secondary match;
 *   - contains '/': matched as a prefix of the whole lower-cased identifier;
 *   - contains '-' or '_': matched as a prefix of the lower-cased second identifier part
 *     ('_' is normalised to '-');
 *   - anything else: matched as a substring of the lower-cased voice name.
 * Among several candidates the voice with the shortest identifier wins.
 *
 * Aborts via TTS_ABORT when nothing matches, or when built without espeak-ng.
 *
 * @param voice_code the user supplied voice code
 * @return the identifier of the selected espeak voice
 */
std::string parse_voice_code(std::string voice_code) {
#ifdef ESPEAK_INSTALL
	voice_code = to_lower(voice_code);
	const espeak_VOICE * primary_match = nullptr;
	const espeak_VOICE * secondary_match = nullptr;
	bool search_by_lc = voice_code.size() == 2;
	bool search_by_lfc = !search_by_lc && voice_code.size() == 3;
	bool search_by_id = !search_by_lfc && voice_code.find("/") != std::string::npos;
	// It is common for locales to be '_' separated rather than '-' separated. Check for both.
	bool search_by_lcc = !search_by_id && (voice_code.find("-") != std::string::npos || voice_code.find("_") != std::string::npos);
	if (search_by_id || search_by_lcc) {
		voice_code = replace(voice_code, '_', '-');
	}
	// A candidate replaces the current match when there is none yet or when its identifier is shorter;
	// longer identifiers typically refer to more specific locales.
	auto is_better_match = [](const espeak_VOICE * current, const espeak_VOICE * candidate) {
		return !current || strlen(current->identifier) > strlen(candidate->identifier);
	};
	const espeak_VOICE** espeak_voices = espeak_wrapper::get_instance()->list_voices();
	// ideally we'd use the espeak voice scores which order voices by preference, but they are only returned when a voice_spec is passed to the list api and
	// the voice spec isn't compatible with partials (e.g. country codes, language family code, etc)
	//
	// A for loop is used so that every `continue` still advances to the next voice; with the previous
	// while loop a `continue` skipped the increment and spun forever on the same entry.
	for (int i = 0; espeak_voices[i] != nullptr; i++) {
		const espeak_VOICE * voice = espeak_voices[i];
		auto identifier_parts = split(voice->identifier, "/");
		// it is possible to add languages to espeak-ng without following their identifier pattern, if we run into such a language just try to match against
		// the identifier and otherwise continue. Such identifiers have no second part, so they must never reach the checks below that index it.
		if (identifier_parts.size() < 2) {
			if (identifier_parts.size() == 1 && (voice_code == identifier_parts[0] || voice_code == voice->name)) {
				primary_match = voice;
				break; // exact match, nothing better can follow
			}
			continue;
		}
		if (search_by_lc) {
			std::string language_part = identifier_parts[1];
			if (language_part == voice_code) {
				primary_match = voice;
				break; // if we have an exact match then we can exit
			} else if (has_prefix(language_part, voice_code) && is_better_match(primary_match, voice)) {
				primary_match = voice;
			} else {
				auto subparts = split(language_part, "-");
				// country codes are typically capitalized in espeak-ng
				if (subparts.size() > 1 && to_lower(subparts[1]) == voice_code && is_better_match(secondary_match, voice)) {
					secondary_match = voice;
				}
			}
		} else if (search_by_lfc) {
			// espeak-ng uses language family codes in their identifiers, but also uses ISO 639-3 language codes for some languages.
			// Since language codes are more specific attempt to match against the language code as the primary and match against the language family
			// code as the secondary.
			if (has_prefix(identifier_parts[1], voice_code) && is_better_match(primary_match, voice)) {
				primary_match = voice;
			} else if (identifier_parts[0] == voice_code && is_better_match(secondary_match, voice)) {
				secondary_match = voice;
			}
		} else if (search_by_id && has_prefix(to_lower(voice->identifier), voice_code) && is_better_match(primary_match, voice)) {
			primary_match = voice;
		} else if (search_by_lcc && has_prefix(to_lower(identifier_parts[1]), voice_code) && is_better_match(primary_match, voice)) {
			primary_match = voice;
		} else if (to_lower(voice->name).find(voice_code) != std::string::npos && is_better_match(primary_match, voice)) {
			primary_match = voice;
		}
	}
	if (!primary_match && !secondary_match) {
		TTS_ABORT("Failed to match espeak voice code '%s' to known espeak voices.\n", voice_code.c_str());
	}
	if (!primary_match) {
		primary_match = secondary_match;
	}
	fprintf(stdout, "Passed Espeak Voice Code '%s' doesn't directly match any known Espeak Voice IDs. Nearest match with name '%s' and id '%s' will be used instead.\n", voice_code.c_str(), primary_match->name, primary_match->identifier);
	return std::string(primary_match->identifier);
#else
	TTS_ABORT("Attempted to list voices without espeak-ng installed.");
#endif
}

// Selects the espeak voice named by voice_code. When espeak rejects the code as given,
// it is resolved through parse_voice_code and the resulting identifier is set instead.
// Aborts via TTS_ABORT when built without espeak-ng.
void update_voice(std::string voice_code) {
#ifdef ESPEAK_INSTALL
	espeak_wrapper * wrapper = espeak_wrapper::get_instance();
	if (wrapper->set_voice(voice_code.c_str()) == EE_OK) {
		return;
	}
	// The status of this second attempt is not inspected.
	voice_code = parse_voice_code(voice_code);
	wrapper->set_voice(voice_code.c_str());
#else
    TTS_ABORT("Attempted to set voice without espeak-ng installed.");
#endif
}


// Resets the flags at the end of a clause: marks the start of a new clause and clears the
// hyphenation, punctuated-acronym and number markers.
void conditions::reset_for_clause_end() {
	beginning_of_clause = true;
	was_number = false;
	was_punctuated_acronym = false;
	hyphenated = false;
}

// Resets the flags after whitespace: the previous token no longer counts as a word and the
// hyphenation and punctuated-acronym markers are cleared.
void conditions::reset_for_space() {
	was_word = false;
	was_punctuated_acronym = false;
	hyphenated = false;
}

// Updates the flags after a word has been emitted.
// When allow_for_upper_check is set and word is not entirely uppercase, the running
// was_all_capitalized marker is cleared; it is never set here.
void conditions::update_for_word(std::string word, bool allow_for_upper_check) {
	was_word = true;
	was_number = false;
	hyphenated = false;
	beginning_of_clause = false;
	if (allow_for_upper_check && !is_all_upper(word)) {
		was_all_capitalized = false;
	}
}

// Returns up to count UTF-8 characters starting at the current location without consuming them.
// Returns "" at the end of the text or when count is 0.
std::string corpus::next(int count) {
	if (count == 0 || location == size) {
		return "";
	}
	int end = location;
	for (int taken = 0; taken < count && end < size; ++taken) {
		// Step over the lead byte, then over any continuation bytes (10xxxxxx) that follow it.
		do {
			++end;
		} while (end < size && (text[end] & 0b11000000) == 0b10000000);
	}
	return std::string(text + location, text + end);
}

// Returns up to count UTF-8 characters that immediately precede the current location.
// Returns "" at the start of the text or when count is 0.
//
// The range previously ended at location-1 and the cursor was never moved between characters,
// so a preceding single-byte character always produced "" and a multibyte one lost its final
// byte. The backwards scan over continuation bytes is now bounded by the start of the text.
std::string corpus::last(int count) {
	if (location == 0 || count == 0) {
		return "";
	}
	size_t start = location;
	int grabbed = 0;
	while (grabbed < count && start > 0) {
		// Step onto the final byte of the previous character, then back over any
		// continuation bytes (10xxxxxx) until its lead byte is reached.
		--start;
		while (start > 0 && (text[start] & 0b11000000) == 0b10000000) {
			--start;
		}
		++grabbed;
	}

	return std::string(text + start, text + location);
}

// Returns the next count characters (see next) and advances the location past them.
std::string corpus::pop(int count) {
	std::string taken = next(count);
	location += taken.size();
	return taken;
}

// Returns up to count UTF-8 characters starting aftr bytes past the current location,
// without consuming anything. Returns "" when that offset is at or beyond the end of the
// text or when count is 0.
std::string corpus::after(int aftr, int count) {
	const size_t start = location + aftr;
	if (count == 0 || start >= size) {
		return "";
	}
	int end = start;
	for (int taken = 0; taken < count && end < size; ++taken) {
		// Step over the lead byte, then over any continuation bytes (10xxxxxx) that follow it.
		do {
			++end;
		} while (end < size && (text[end] & 0b11000000) == 0b10000000);
	}
	return std::string(text + start, text + end);
}

// Consumes and returns pop_size bytes from the current location, clamped to the bytes remaining.
std::string corpus::size_pop(size_t pop_size) {
	const size_t remaining = size - location;
	const size_t tsize = pop_size < remaining ? pop_size : remaining;
	const char * begin = text + location;
	std::string ret(begin, begin + tsize);
	location += tsize;
	return ret;
}

std::string corpus::next_in(std::string val, bool* has_accent) {
	int n = 0;
	int running = 0;
	std::string nafter = next();
	while (nafter != "" && val.find(nafter) != std::string::npos) {
		if (has_accent && !(*has_accent) && COMMON_ACCENTED_CHARACTERS.find(nafter) != std::string::npos) {
			*has_accent = true;
		}
		++n;
		running += nafter.size();
		nafter = after(running);
	}
	return next(n);
}

std::string corpus::pop_in(std::string val) {
	int n = 0;
	size_t running = 0;
	std::string nafter = next();
	running += nafter.size();
	while (nafter != "" && val.find(nafter) != std::string::npos) {
		++n;
		nafter = after(running);
		running += nafter.size();
	}
	return pop(n);
}

// Starting aftr bytes past the current location, returns (without consuming) the run of
// consecutive characters that can each be found in val.
//
// The scan now continues from aftr and advances by the byte length of each matched character.
// Previously the follow-up reads used after(n), i.e. an offset relative to the current location
// and counted in characters, so every character after the first was read from the wrong place.
//
// NOTE(review): despite the name, the loop continues while characters ARE in val rather than
// until one is met; that pre-existing behaviour is kept here — confirm the intent against callers.
std::string corpus::after_until(int aftr, std::string val) {
	int n = 0;
	int offset = aftr;
	std::string nafter = after(offset);
	while (nafter != "" && val.find(nafter) != std::string::npos) {
		++n;
		offset += nafter.size();
		nafter = after(offset);
	}
	return after(aftr, n);
}

// Walks the rule tree along keys starting at position index and returns the value of the
// deepest rule reached.
// A child rule matches the current key when its own key is equal to it, is "*<suffix>" with the
// current key ending in <suffix>, or is "<prefix>*" with the current key starting with <prefix>.
// The first matching child in the container's iteration order is followed; when no child matches
// (or the keys are exhausted) this rule's own value is returned.
std::string phonemizer_rule::lookup_rule(std::vector<std::string> & keys, int index) {
	if (index >= keys.size()) {
		return value;
	}
	std::string wanted = keys[index];
	for (const auto & entry : rules) {
		const std::string & pattern = entry.first;
		const bool matches =
			pattern == wanted
			|| (pattern[0] == '*' && has_suffix(wanted, pattern.substr(1)))
			|| (pattern.back() == '*' && has_prefix(wanted, pattern.substr(0, pattern.size()-1)));
		if (matches) {
			return entry.second->lookup_rule(keys, index + 1);
		}
	}
	return value;
}

// Looks up the phoneme for the grapheme `current`, refined by its context.
// Returns "" when there is no rule for `current`; otherwise the rule tree is walked with the
// keys in the order {before, after, word}.
std::string word_phonemizer::lookup_rule(std::string word, std::string current, std::string before, std::string after) {
	auto entry = rules.find(current);
	if (entry == rules.end()) {
		return "";
	}
	std::vector<std::string> lookup_keys{before, after, word};
	return entry->second->lookup_rule(lookup_keys, 0);
}

void word_phonemizer::add_rule(std::vector<std::string> keys, std::string phoneme) {
	phonemizer_rule * current_rule = nullptr;
	for (int i = 0; i < keys.size(); i++) {
		if (current_rule) {
			if (current_rule->rules.find(keys[i]) == current_rule->rules.end()) {
				phonemizer_rule * nrule = new phonemizer_rule;
				current_rule->rules[keys[i]] = nrule;
				current_rule = nrule;
			} else {
				current_rule = current_rule->rules.at(keys[i]);
			}
		} else {
			if (rules.find(keys[i]) == rules.end()) {
				current_rule = new phonemizer_rule;
				rules[keys[i]] = current_rule;
			} else {
				current_rule = rules.at(keys[i]);
			}
		}
	}
	if (current_rule) {
		current_rule->value = phoneme;
	}
}

// Phonemizes a single word: the lower-cased word is split into graphemes by the tokenizer and
// the rule result for each grapheme is concatenated. The neighbouring graphemes are passed as
// context, with "^" standing in before the first grapheme and "$" after the last one.
std::string word_phonemizer::phonemize(std::string word) {
	word = to_lower(word);
	std::vector<std::string> graphemes;
	tokenizer->token_split(word, graphemes);

	std::string result;
	const size_t total = graphemes.size();
	for (size_t idx = 0; idx < total; ++idx) {
		const std::string previous = idx > 0 ? graphemes[idx - 1] : std::string("^");
		const std::string following = idx + 1 < total ? graphemes[idx + 1] : std::string("$");
		result += lookup_rule(word, graphemes[idx], previous, following);
	}
	return result;
}

// Builds the phoneme string for value from NUMBER_PHONEMES, HUNDRED_PHONEME and SUB_HUNDRED_NUMBERS:
// an optional "<n> <hundred>" part followed by the phonemes for the remainder below one hundred.
// NOTE(review): nothing is inserted here between the hundreds part and the remainder — confirm
// whether HUNDRED_PHONEME is expected to supply that separation.
std::string build_subthousand_phoneme(int value) {
	std::string phoneme;
	const int hundreds = value / 100;
	if (hundreds > 0) {
		phoneme = NUMBER_PHONEMES[hundreds] + " " + HUNDRED_PHONEME;
	}

	const int below_hundred = value % 100;
	if (below_hundred <= 0) {
		return phoneme;
	}
	if (below_hundred < 20) {
		// 1-19 are read straight from the number table.
		phoneme += NUMBER_PHONEMES[below_hundred];
		return phoneme;
	}
	// 20-99: the tens entry (table starts at twenty) followed by an optional unit.
	phoneme += SUB_HUNDRED_NUMBERS[(below_hundred / 10) - 2];
	const int units = below_hundred % 10;
	if (units > 0) {
		phoneme += " " + NUMBER_PHONEMES[units];
	}
	return phoneme;
}

/**
 * Builds the phoneme string for a non-negative integer by splitting it into trillion, billion,
 * million and thousand groups plus a final sub-thousand remainder.
 *
 * Groups are separated by a space, and a group is followed by "," when a non-zero remainder
 * follows it. Returns "" for 0.
 *
 * Each scale is compared with >= so that exact multiples (1000, one million, ...) are emitted
 * with their own scale word; with the former strict > comparison exactly 1000 fell through to
 * build_subthousand_phoneme(1000) and exactly one million was split as 1000 thousands.
 */
std::string build_number_phoneme(long long int remainder) {
	std::string phoneme = "";
	bool started = false;

	// Emits "<count> <scale word>" for one scale and keeps what is left in `remainder`.
	auto append_scale = [&](long long int scale, const std::string & scale_phoneme) {
		if (remainder < scale) {
			return;
		}
		const long long int quantity = remainder / scale;
		remainder = remainder % scale;
		if (started) {
			phoneme += " ";
		}
		phoneme += build_subthousand_phoneme(quantity) + " " + scale_phoneme;
		if (remainder > 0) {
			phoneme += ",";
		}
		started = true;
	};

	append_scale(TRILLION, TRILLION_PHONEME);
	append_scale(BILLION, BILLION_PHONEME);
	append_scale(MILLION, MILLION_PHONEME);
	append_scale(1000, THOUSAND_PHONEME);

	if (remainder > 0) {
		if (started) {
			phoneme += " ";
		}
		phoneme += build_subthousand_phoneme(remainder);
	}
	return phoneme;
}

// A response counts as successful when its code is below 200.
bool dictionary_response::is_successful() {
	const bool below_failure_range = code < 200;
	return below_failure_range;
}

// Checks whether this (partial) dictionary entry applies at the current corpus position.
// All of the following must hold:
//   - when not_at_clause_end is set, the run of NON_CLAUSE_WORD_CHARACTERS ahead is not
//     directly followed by '!', '.' or '?';
//   - the upcoming text equals after_match;
//   - when expects_to_be_proceeded_by_number is set, the previous token was a number;
//   - when not_at_clause_start is set, we are not at the beginning of a clause.
bool dictionary_response::is_match(corpus* text, conditions* flags) {
	if (not_at_clause_end) {
		const std::string chunk = text->next_in(NON_CLAUSE_WORD_CHARACTERS);
		const std::string following = text->after(chunk.size());
		if (following == "!" || following == "." || following == "?") {
			return false;
		}
	}
	if (text->next(after_match.size()) != after_match) {
		return false;
	}
	if (expects_to_be_proceeded_by_number && !flags->was_number) {
		return false;
	}
	if (not_at_clause_start && flags->beginning_of_clause) {
		return false;
	}
	return true;
}

// Looks value up in the dictionary and returns the first applicable response.
//   - not_found_response when value has no entry at all;
//   - the first candidate that is a total success, or a partial success whose is_match() holds;
//   - phonetic_fallback_response when value has an entry but no candidate applies.
// The candidate list is iterated in place through a single find(), rather than being looked up
// twice and copied on every call.
dictionary_response * phoneme_dictionary::lookup(corpus * text, std::string value, conditions* flags) {
	auto entry = lookup_map.find(value);
	if (entry == lookup_map.end()) {
		return not_found_response;
	}
	for (auto possible : entry->second) {
		if (possible->code == SUCCESS_TOTAL || (possible->code == SUCCESS_PARTIAL && possible->is_match(text, flags))) {
			return possible;
		}
	}
	return phonetic_fallback_response;
}

// Consumes a run of whitespace (' ', '\n', '\f', '\t') and makes sure the output ends with
// exactly one space for it.
// The output is checked for emptiness first: calling back() on an empty string (text that
// starts with whitespace) is undefined behaviour.
bool phonemizer::handle_space(corpus* text, std::string* output, conditions* flags) {
	flags->reset_for_space();
	text->pop_in(" \n\f\t");
	if (output->empty() || output->back() != ' ') {
		output->append(" ");
	}
	return true;
}

// Appends the digits of series to output one by one, separated by single spaces.
// A leading space is added when the previous token was a word that is neither hyphenated nor
// already followed by a space. For a non-empty series the flags are updated as for a word and
// was_number is set.
void phonemizer::append_numeric_series(std::string series, std::string* output, conditions * flags) {
	const bool needs_separator = flags->was_word && output->back() != ' ' && !flags->hyphenated;
	if (needs_separator) {
		output->append(" ");
	}

	const size_t digits = series.size();
	for (size_t idx = 0; idx < digits; ++idx) {
		if (idx > 0) {
			output->append(" ");
		}
		output->append(NUMBER_PHONEMES[series[idx] - '0']);
	}

	if (digits > 0) {
		flags->update_for_word(series);
		flags->was_number = true;
	}
}

// Consumes the leading run of NUMBER_CHARACTERS and emits it digit by digit. Always returns true.
bool phonemizer::handle_numeric_series(corpus* text, std::string* output, conditions* flags) {
	append_numeric_series(text->pop_in(NUMBER_CHARACTERS), output, flags);
	return true;
}

bool phonemizer::handle_numeric(corpus* text, std::string* output, conditions* flags) {
	/*
	 * There are four recognized ways of separating large arabic numerals:
	 *   1. No breaks or separations except for the decimal (e.g. '32000.012' or '32000,012')
	 *   2. Space separated breaks between every three digits and comma separated decimals (e.g. '32 000,012')
	 *   3. Period separated breaks between every three digits and comma separated decimals (e.g. '32.000,012')
	 *   4. Comma separated breaks between every three digits and period separated decimals (e.g. '32,000.012')
	 *
	 * This implementation will support all four approaches up to the trillions, after which numbers will be represented as a series
	 * of distinct digits. Non conforming patterns, e.g. multiple commas, multiple periods, or multiple spaces that are not three
	 * digits apart, will not be treated as continuous numbers but rather separate numerical strings.
	 *
	 * Always returns true.
	 */
	std::string number = text->next_in(COMPATIBLE_NUMERICS);
	number = strip(number, ",. ");

	// For numerics, we don't necessarily want to stop reading from the corpus at periods, commas, or spaces.
	char large_number_separator = '\0';
	char decimal_separator = '\0';
	char last_break_char = '\0';
	bool invalid_format = false;
	int count_since_break = 0;
	std::string built = "";
	for (char & c : number) {
		if (is_numeric(c)) {
			built += c;
			count_since_break += 1;
		} else if (last_break_char =='\0') {
			// first separator seen: more than three leading digits means it can only be a decimal separator
			if (count_since_break > 3) {
				decimal_separator = c;
			}
			last_break_char = c;
			built += c;
			count_since_break = 0;
		} else if (c != last_break_char) {
			if (c == ' ') {
				break;
			} else if (count_since_break == 3 && decimal_separator == '\0') {
				// a different separator after a full group of three: the earlier one grouped thousands, this one is the decimal
				if (large_number_separator == '\0') {
					large_number_separator = last_break_char;
				}
				decimal_separator = c;
				built += c;
				count_since_break = 0;
				last_break_char = c;
			} else if (count_since_break != 3) {
				if (large_number_separator != '\0') {
					invalid_format = true;
				}
				break;
			} else {
				break;
			}
		} else if (c == last_break_char) {
			if (decimal_separator != '\0') {
				break;
			} else if (count_since_break != 3) {
				invalid_format = true;
				break;
			} else {
				large_number_separator = c;
				built += c;
				count_since_break = 0;
			}
		}
	}

	if (!invalid_format) {
		if (large_number_separator != '\0' && decimal_separator == '\0' && count_since_break != 3) {
			invalid_format = true;
		} else if (count_since_break == 3 && last_break_char != '\0' && decimal_separator == '\0' && large_number_separator == '\0') {
			large_number_separator = last_break_char;
		} else if (count_since_break != 3 && last_break_char != '\0' && decimal_separator == '\0' && large_number_separator == '\0') {
			decimal_separator = last_break_char;
		}
	}

	if (invalid_format) {
		return handle_numeric_series(text, output, flags);
	}

	// `built` is still an exact prefix of the corpus here. Remember how many bytes it covers before the
	// thousands separators are stripped, otherwise too few bytes would be popped and the tail of the
	// number would be parsed again as a new token.
	const size_t consumed_size = built.size();
	if (large_number_separator != '\0') {
		built.erase(std::remove(built.begin(), built.end(), large_number_separator), built.end());
	}
	// std::stoll stops at the first non digit, so the integer part parses whichever decimal separator is
	// in use; `built` keeps that separator so the fractional part can be split off below.
	long long int value = 0;
	try {
		value = std::stoll(built);
	} catch (const std::out_of_range &) {
		// too many digits for a long long: read it out digit by digit like any other oversized number
		return handle_numeric_series(text, output, flags);
	}

	if (value >= LARGEST_PRONOUNCABLE_NUMBER) {
		return handle_numeric_series(text, output, flags);
	}

	text->size_pop(consumed_size);

	std::string noutput = build_number_phoneme(value);
	if (noutput.size() > 0) {
		if (flags->was_word && output->back() != ' ' && !flags->hyphenated) {
			output->append(" ");
		}
		output->append(noutput);
		flags->update_for_word(built);
		flags->was_number = true;
	}
	if (decimal_separator != '\0') {
		std::vector<std::string> parts = split(built, decimal_separator);
		// a trailing separator may leave no fractional part at all
		if (parts.size() > 1 && parts[1].size() > 0) {
			output->append(" " + POINT_PHONEME + " ");
			append_numeric_series(parts[1], output, flags);
		}
	}
	return true;
}

// Heuristically decides whether word should be spelled out letter by letter.
//   - Words containing '.': every '.'-separated part must be non-empty and either a single
//     character or an uppercase character followed by a lowercase one.
//   - Words shorter than four characters: acronym-like unless listed in small_english_words.
//   - Entirely uppercase words: acronym-like unless the surrounding text is being treated as all
//     capitalized (was_all_capitalized, which this function also sets when the text returned by
//     after_until is entirely uppercase).
//   - Mixed case words: acronym-like when more than half of the characters are uppercase.
// Character classification goes through unsigned char because passing a negative char (a UTF-8
// byte >= 0x80) to isupper/islower is undefined behaviour.
bool phonemizer::is_acronym_like(corpus* text, std::string word, conditions* flags) {
	if (word.find(".") != std::string::npos) {
		for (const std::string & part : split(word, ".")) {
			if (part.size() == 0) {
				return false;
			}
			if (part.size() > 1) {
				const bool capitalized_pair = std::isupper(static_cast<unsigned char>(part[0])) && std::islower(static_cast<unsigned char>(part[1]));
				if (part.size() > 2 || !capitalized_pair) {
					return false;
				}
			}
		}
		return true;
	} else if (word.size() < 4) {
		return small_english_words.find(to_lower(word)) == small_english_words.end();
	} else if (is_all_upper(word)) {
		if (flags->was_all_capitalized || is_all_upper(text->after_until(word.size()+1, " "))) {
			flags->was_all_capitalized = true;
			return false;
		}
		return true;
	} else if (upper_count(word) > (int) word.length() / 2) {
		// the word is known not to be entirely uppercase here, so only the ratio needs checking
		return true;
	}
	return false;
}

// Tries to read a roman numeral at the current corpus position and emit it as a number.
// Chunks of up to four characters are looked up in ROMAN_NUMERALS (lower-cased); every chunk
// after the first must have a strictly smaller value than the one before it. Returns false
// without consuming anything when the text cannot be parsed this way; otherwise appends the
// number phonemes, consumes the numeral, updates the flags and returns true.
bool phonemizer::handle_roman_numeral(corpus* text, std::string* output, conditions * flags) {
	auto next = text->next();
	next = to_lower(next);
	int total = 0;
	int last_value = 0;
	std::string running = "";
	while (is_roman_numeral(next[0])) {
		bool found = false;
		// try the longest chunk first; note that the scan keeps going after a hit, now from the advanced offset
		for (int size = 4; size > 0; size--) {
			std::string chunk = text->after(running.size(), size);
			chunk = to_lower(chunk);
			if (ROMAN_NUMERALS.find(chunk) != ROMAN_NUMERALS.end()) {
				found = true;
				int found_value = ROMAN_NUMERALS.at(chunk);
				if (total == 0 || last_value > found_value) {
					total += found_value;
					last_value = found_value;
					running += chunk;
				} else {
					return false;
				}
			}
		}
		if (found) {
			// to_lower returns the converted copy; the result used to be discarded here, leaving
			// `next` in its original case for every character after the first chunk
			next = to_lower(text->after(running.size()));
			continue;
		}
		return false;
	}

	std::string noutput = build_number_phoneme(total);
	if (flags->was_word && output->back() != ' ' && !flags->hyphenated) {
		output->append(" ");
	}
	output->append(noutput);
	text->size_pop(running.size());
	flags->update_for_word(running, false);
	flags->was_number = true;

	return true;
}

// Spells word out letter by letter using LETTER_PHONEMES and consumes it from the corpus.
// A '.' marks the acronym as punctuated and is not emitted; characters without an entry in
// LETTER_PHONEMES are skipped. Always returns true.
// Missing letters are detected with find() instead of catching std::out_of_range from at(),
// and the character is converted through unsigned char before std::tolower (a negative char is
// not a valid argument for it).
bool phonemizer::handle_acronym(corpus* text, std::string word, std::string* output, conditions * flags) {
	std::string out = "";
	for (const char raw : word) {
		if (raw == '.') {
			flags->was_punctuated_acronym = true;
			continue;
		}
		const char letter = static_cast<char>(std::tolower(static_cast<unsigned char>(raw)));
		const auto found = LETTER_PHONEMES.find(letter);
		if (found != LETTER_PHONEMES.end()) {
			out += found->second;
		}
	}
	text->size_pop(word.size());
	if (flags->was_word && output->back() != ' ' && !flags->hyphenated) {
		output->append(" ");
	}
	output->append(out);
	flags->update_for_word(word, false);
	return true;
}

// Falls back to the rule based phonetic phonemizer for word.
// unaccented_size_difference is added to the number of bytes consumed from the corpus, on top
// of word.size(). Always returns true.
bool phonemizer::handle_phonetic(corpus* text, std::string word, std::string* output, conditions* flags, size_t unaccented_size_difference) {
	const bool needs_separator = flags->was_word && output->back() != ' ' && !flags->hyphenated;
	if (needs_separator) {
		output->append(" ");
	}
	const std::string phonemes = phonetic_phonemizer->phonemize(word);
	output->append(phonemes);
	const size_t consumed = word.size() + unaccented_size_difference;
	text->size_pop(consumed);
	flags->update_for_word(word);
	return true;
}

// Word -> IPA lookup table filled by populate_kokoro_ipa_map and read by found_word_to_ipa.
static std::unordered_map<std::string, std::string> kokoro_ipa_map;

// Loads "<executable_path>kokoro_ipa.embd" into kokoro_ipa_map.
// Every line is expected to split into exactly two parts on ","; the first becomes the key and
// the second the value. Other lines are reported and skipped. Progress and errors are printed to stdout.
void populate_kokoro_ipa_map(std::string executable_path)
{
	std::string line;
    auto filepath = executable_path + "kokoro_ipa.embd";
    printf("\nReading Kokoro IPA from %s",filepath.c_str());
    std::ifstream myfile(filepath);
    if (myfile.is_open())
    {
        // Loop on the result of getline itself: testing good() before reading processed one extra,
        // failed read at end of file and reported it as a broken line.
        while (std::getline(myfile, line))
        {
			auto parts = split(line, ",");
			if(parts.size()==2)
			{
                kokoro_ipa_map[parts[0]] = parts[1];
            } else {
                printf("\nError reading line in Kokoro IPA!");
            }
        }
        myfile.close();
		// size() is a size_t, which must be printed with %zu rather than %d
		printf("\nPopulated Kokoro IPA: %zu entries", kokoro_ipa_map.size());
    }
    else
    {
		printf("\nUnable to open Kokoro IPA file");
    }
}
// Returns the IPA string stored in kokoro_ipa_map for input (looked up lower-cased), or ""
// when there is none. A non-empty input made up solely of uppercase characters is treated as
// an acronym and always yields "".
std::string found_word_to_ipa(std::string input)
{
	if (!input.empty()) {
		bool every_char_upper = true;
		for (const unsigned char c : input) {
			if (!std::isupper(c)) {
				every_char_upper = false;
				break;
			}
		}
		if (every_char_upper) {
			return ""; // Return empty for acronyms
		}
	}

	// The table is queried with the lower-cased word.
	for (char & c : input) {
		c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
	}

	const auto entry = kokoro_ipa_map.find(input);
	if (entry == kokoro_ipa_map.end()) {
		return "";
	}
	return entry->second;
}
// Phonemizes one word taken from the corpus, trying in order:
//   1. the Kokoro IPA table (appended as is; flags are not updated on this path);
//   2. the phoneme dictionary, retried with accents replaced when has_accent is set;
//   3. roman numerals (all uppercase, not a small english word);
//   4. acronyms;
//   5. words containing '.', handled as a leading word part followed by the period;
//   6. the rule based phonetic fallback.
// Always returns true.
bool phonemizer::process_word(corpus* text, std::string* output, std::string word, conditions* flags, bool has_accent) {
	const std::string kokoro_ipa = found_word_to_ipa(word);
	if (kokoro_ipa != "") {
		output->append(kokoro_ipa);
		text->size_pop(word.size());
		return true;
	}

	// Number of bytes the word shrank by when its accents were replaced; the corpus still holds
	// the accented form, so this is added back whenever the word is consumed.
	size_t unaccented_size_difference = 0;
	dictionary_response* response = dict->lookup(text, word, flags);
	if (has_accent && !response->is_successful()) {
		const size_t accented_size = word.size();
		word = replace_accents(word);
		unaccented_size_difference = accented_size - word.size();
		response = dict->lookup(text, word, flags);
	}

	if (response->is_successful()) {
		if (flags->was_word && output->back() != ' ' && !flags->hyphenated) {
			output->append(" ");
		}
		flags->update_for_word(word);
		if (response->code != SUCCESS_TOTAL) {
			// a partial match also covers the text that follows the word
			word += response->after_match;
		}
		output->append(response->value);
		text->size_pop(word.size()+unaccented_size_difference);
		return true;
	}

	if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) {
		return true;
	}

	if (is_acronym_like(text, word, flags)) {
		return handle_acronym(text, word, output, flags);
	}

	if (word.find(".") < word.length()) {
		bool part_has_accent = false;
		std::string word_part = text->next_in(ALPHABET+COMMON_ACCENTED_CHARACTERS, &part_has_accent);
		process_word(text, output, word_part, flags, part_has_accent);
		handle_punctuation(text, ".", output, flags);
		output->append(" ");
		flags->reset_for_space();
		return true;
	}

	return handle_phonetic(text, word, output, flags, unaccented_size_difference);
}

// Reads the next run of WORD_CHARACTERS, drops any trailing periods from it and hands the
// result to process_word.
bool phonemizer::handle_word(corpus * text, std::string* output, conditions * flags) {
	bool has_accent = false;
	std::string word = text->next_in(WORD_CHARACTERS, &has_accent);
	while (!word.empty() && word.back() == '.') {
		word.pop_back();
	}
	return process_word(text, output, word, flags, has_accent);
}

// Emits the REPLACEABLE entry for next in place of the character itself and consumes one
// character from the corpus. Always returns true.
bool phonemizer::handle_replacement(corpus* text, std::string next, std::string* output, conditions * flags) {
	const bool needs_separator = flags->was_word && output->back() != ' ' && !flags->hyphenated;
	if (needs_separator) {
		output->append(" ");
	}
	output->append(REPLACEABLE.at(next));
	flags->update_for_word(next);
	text->pop();
	return true;
}

// Handles an apostrophe at the current position.
// When the upcoming text is "'s", a suffix chosen from the preceding character (as reported by
// text->last()) is appended and both characters are consumed; otherwise only the apostrophe is
// consumed. Always returns true.
bool phonemizer::handle_possession_plural(corpus* text, std::string* output, conditions * flags) {
	if (text->next(2) != "'s") {
		text->pop();
		return true;
	}

	const std::string previous = text->last();
	std::string suffix;
	if (VOWELS.find(to_lower(previous)[0]) != std::string::npos) {
		suffix = "z";
	} else if (previous == "s" || previous == "z") {
		suffix = "ᵻz";
	} else if (is_alphabetic(previous[0])) {
		suffix = "s";
	} else {
		suffix = "ˈɛs";
	}
	output->append(suffix);
	text->pop(2);
	return true;
}

// Consumes an apostrophe and, when the letters that follow form a known contraction, appends
// its phonemes and consumes those letters too. Always returns true.
bool phonemizer::handle_contraction(corpus* text, std::string* output, conditions * flags) {
	text->pop();
	const std::string letters = to_lower(text->next_in(ALPHABET));
	const auto contraction = CONTRACTION_PHONEMES.find(letters);
	if (contraction == CONTRACTION_PHONEMES.end()) {
		// When no contraction is found only the ' character is popped and we continue;
		// it could be the end of a single quote which is ignored by the espeak phonemizer.
		return true;
	}
	output->append(contraction->second);
	// make sure to pop the contraction.
	text->pop_in(ALPHABET);
	return true;
}

// Handles the punctuation character next at the current corpus position.
// Periods, apostrophes and hyphens get dedicated treatment; clause breaks and no-op breaks are
// copied to the output; replaceable characters are swapped for their phonemes; anything else is
// consumed and ignored. Every path consumes at least the character itself.
bool phonemizer::handle_punctuation(corpus* text, std::string next, std::string* output, conditions * flags) {
	const std::string previous = text->last();
	const std::string following = text->after();

	if (next[0] == '.') {
		if (!flags->was_punctuated_acronym) {
			// copy the whole run of periods (the pre_pause adjustment for longer runs is disabled)
			const std::string chunk = text->next_in(".");
			output->append(chunk);
			text->size_pop(chunk.size());
			return true;
		}
		// we finished an acronym
		flags->was_punctuated_acronym = false;
		output->append(next);
		text->pop();
		if (text->after(1, 2) == "'s") {
			return handle_possession_plural(text, output, flags);
		}
		return true;
	}

	if (next == "'") {
		if (flags->was_word && (following == "s" || !is_alphabetic(following[0]))) {
			return handle_possession_plural(text, output, flags);
		}
		if (flags->was_word && (CONTRACTION_PHONEMES.find(following) != CONTRACTION_PHONEMES.end() || CONTRACTION_PHONEMES.find(text->after(next.size(), 2)) != CONTRACTION_PHONEMES.end())) {
			return handle_contraction(text, output, flags);
		}
		// could be the end or start of a quote
		text->pop();
		return true;
	}

	if (next[0] == '-') {
		if (previous == " " && following == " ") {
			// free standing dash (the pre_pause adjustment is disabled)
			text->pop(2);
			flags->reset_for_space();
			return true;
		}
		if (following[0] == '-') {
			// double dash (the pre_pause adjustment is disabled)
			text->pop(2);
			output->append(" ");
			flags->reset_for_space();
			return true;
		}
		if (!flags->beginning_of_clause && flags->was_word && is_alphabetic(following[0])) {
			flags->hyphenated = true;
		}
		// in every remaining case the hyphen itself is simply dropped
		text->pop();
		return true;
	}

	if (CLAUSE_BREAKS.find(next) != std::string::npos) {
		output->append(next);
		flags->reset_for_clause_end();
		text->pop();
		return true;
	}

	if (NOOP_BREAKS.find(next) != std::string::npos) {
		output->append(next);
		text->pop();
		return true;
	}

	if (REPLACEABLE.find(next) != REPLACEABLE.end()) {
		return handle_replacement(text, next, output, flags);
	}

	// ignore it
	text->pop();
	return true;
}

/**
 * Performs one lexing step: inspects the next token of the corpus and forwards it to the
 * matching handler (space, numeric, word or punctuation).
 *
 * @param text   the corpus being lexed
 * @param output the string that receives emitted text
 * @param flags  lexer state shared between handlers
 * @return false when the corpus yields an empty token (lexing is finished); otherwise the
 *         result of the handler that was invoked
 */
bool phonemizer::route(corpus * text, std::string* output, conditions * flags) {
	std::string token = text->next();
	if (token.empty()) {
		// Nothing left to lex.
		return false;
	}
	if (SPACE_CHARACTERS.find(token) != std::string::npos) {
		return handle_space(text, output, flags);
	}
	if (is_numeric(token[0])) {
		return handle_numeric(text, output, flags);
	}
	if (is_alphabetic(token[0])) {
		return handle_word(text, output, flags);
	}
	return handle_punctuation(text, token, output, flags);
}

#ifdef ESPEAK_INSTALL
std::string phonemizer::espeak_text_to_phonemes(const char * text) {
	int mode = phoneme_mode == IPA ? (0 << 8 | 0x02) : (0 << 8 | 0x01);
	const void ** txt_ptr = (const void**)&text;
	const char * resp = espeak_wrapper::get_instance()->text_to_phonemes(txt_ptr, espeakCHARS_UTF8, mode);
	return strip(std::string(resp));
}
#endif

/**
 * Converts text to phonemes and returns the result as a new string.
 *
 * In ESPEAK mode the text is split on STOPPING_TOKENS and the parts are visited two at a
 * time: the even-indexed part is phonemized through espeak, and the part that follows it is
 * appended verbatim when preserve_punctuation is set. In any other mode the work is
 * delegated to the output-parameter overload.
 *
 * @param text the text to phonemize
 * @param size length of text; only forwarded on the non-espeak path
 * @return the phonemized text
 */
std::string phonemizer::text_to_phonemes(const char * text, size_t size) {
	if (mode != ESPEAK) {
		std::string output = "";
		text_to_phonemes(text, size, &output);
		return output;
	}
#ifdef ESPEAK_INSTALL
	auto parts = split(text, STOPPING_TOKENS, true);
	std::string phonemes = "";
	for (size_t idx = 0; idx < parts.size(); idx += 2) {
		phonemes += espeak_text_to_phonemes(parts[idx].c_str());
		const bool has_following_part = idx + 1 < parts.size();
		if (preserve_punctuation && has_following_part) {
			phonemes += parts[idx + 1];
		}
	}
	return phonemes;
#else
	TTS_ABORT("%s attempted to run in espeak mode without espeak installed. \n", __func__);
	return "";
#endif
}

/**
 * Convenience overload: phonemizes a std::string by forwarding its buffer and length to the
 * (const char *, size_t) overload.
 *
 * @param text the text to phonemize
 * @return the phonemized text
 */
std::string phonemizer::text_to_phonemes(std::string text) {
	const char * raw_text = text.c_str();
	const size_t raw_size = text.size();
	return text_to_phonemes(raw_text, raw_size);
}

/**
 * Phonemizes text with the built-in (non-espeak) lexer, appending the result to *output.
 *
 * Aborts when the phonemizer is in ESPEAK mode, since that mode does not write through an
 * output parameter.
 *
 * @param text   the text to phonemize
 * @param size   number of characters of text to process
 * @param output string that the handlers append to
 */
void phonemizer::text_to_phonemes(const char * text, size_t size, std::string* output) {
	if (mode == ESPEAK) {
#ifdef ESPEAK_INSTALL
		TTS_ABORT("%s attempted to run in espeak mode with output already defined. \n", __func__);
#else
		TTS_ABORT("%s attempted to run in espeak mode without espeak installed. \n", __func__);
#endif
		return;
	}
	// Automatic storage instead of new/delete: both objects are released on every exit path,
	// including an exception thrown from one of the handlers.
	corpus corpus_text(text, size);
	conditions flags;
	// route() returns false once the corpus has been fully lexed.
	while (route(&corpus_text, output, &flags)) {
	}
}

void phonemizer::text_to_phonemes(std::string text, std::string* output) {
	text_to_phonemes(text.c_str(), text.size(), output);
}

/**
 * Builds a word_phonemizer from the rule arrays stored in a gguf file.
 *
 * Reads the parallel string arrays 'phonemizer.rules.keys' and 'phonemizer.rules.phonemes'
 * and registers one rule per index; each rule key is split on "." before being added.
 *
 * Aborts if either key is missing or if the two arrays differ in length.
 *
 * @param meta the gguf context to read from
 * @return a newly allocated word_phonemizer
 */
struct word_phonemizer * word_phonemizer_from_gguf(gguf_context * meta) {
	struct single_pass_tokenizer * tokenizer = single_pass_tokenizer_from_gguf(meta);
	word_phonemizer * wph = new word_phonemizer(tokenizer);
    int rule_keys_key = gguf_find_key(meta, "phonemizer.rules.keys");
    int phoneme_key = gguf_find_key(meta, "phonemizer.rules.phonemes");
    if (rule_keys_key == -1 || phoneme_key == -1) {
    	TTS_ABORT("Both 'phonemizer.rules.keys' and 'phonemizer.rules.phonemes' keys must be set in order to support phonemization.");
    }
    int key_count = gguf_get_arr_n(meta, rule_keys_key);
    int phoneme_count = gguf_get_arr_n(meta, phoneme_key);
    // Checked at runtime rather than with assert(): assert is compiled out under NDEBUG, which
    // would let a malformed file index past the end of the shorter array.
    if (key_count != phoneme_count) {
    	TTS_ABORT("'phonemizer.rules.keys' and 'phonemizer.rules.phonemes' must have the same length (got %d and %d).\n", key_count, phoneme_count);
    }
    for (int i = 0; i < key_count; i++) {
    	std::string rule_key = gguf_get_arr_str(meta, rule_keys_key, i);
    	std::string phoneme = gguf_get_arr_str(meta, phoneme_key, i);
    	wph->add_rule(split(rule_key, "."), phoneme);
    }
    return wph;
}

/**
 * Parses one dictionary value into a dictionary_response.
 *
 * The value is split on ":". With a single part, the response is SUCCESS_TOTAL and carries
 * the whole value. With more than one part, the response is SUCCESS_PARTIAL, carries the
 * first part, and stores the second part in after_match.
 *
 * Markers on the key set the positional constraints: a leading '$' sets
 * expects_to_be_proceeded_by_number, a leading '#' sets not_at_clause_start and a trailing
 * '#' sets not_at_clause_end.
 *
 * @param value the raw dictionary value
 * @param key   the raw dictionary key, including any marker characters
 * @return a newly allocated dictionary_response
 */
dictionary_response * response_from_string(std::string value, std::string key) {
	std::vector<std::string> parts = split(value, ":");
	const bool has_spacing = parts.size() > 1;
	// An empty key carries no markers. The explicit check matters for back(), which is
	// undefined on an empty string.
	const bool has_key = !key.empty();
	const bool expects_to_be_proceeded_by_number = has_key && key[0] == '$';
	const bool not_at_start = has_key && key[0] == '#';
	const bool not_at_end = has_key && key.back() == '#';

	dictionary_response * resp = nullptr;
	if (has_spacing) {
		resp = new dictionary_response(SUCCESS_PARTIAL, parts[0]);
		resp->after_match = parts[1];
	} else {
		resp = new dictionary_response(SUCCESS_TOTAL, value);
	}
	resp->expects_to_be_proceeded_by_number = expects_to_be_proceeded_by_number;
	resp->not_at_clause_end = not_at_end;
	resp->not_at_clause_start = not_at_start;
	return resp;
}

/**
 * Builds a phoneme_dictionary from the dictionary arrays stored in a gguf file.
 *
 * Reads the parallel string arrays 'phonemizer.dictionary.keys' and
 * 'phonemizer.dictionary.values'. Each value is a comma separated list of responses; each
 * response is parsed by response_from_string using the raw key. Before the key is stored in
 * lookup_map, one leading '$' or '#' and one trailing '#' are removed from it.
 *
 * Aborts if either key is missing or if the two arrays differ in length.
 *
 * @param meta the gguf context to read from
 * @return a newly allocated phoneme_dictionary
 */
struct phoneme_dictionary * phoneme_dictionary_from_gguf(gguf_context * meta) {
	struct phoneme_dictionary * dict = new phoneme_dictionary;

    int keys_key = gguf_find_key(meta, "phonemizer.dictionary.keys");
    int values_key = gguf_find_key(meta, "phonemizer.dictionary.values");
    if (keys_key == -1 || values_key == -1) {
    	TTS_ABORT("Both 'phonemizer.dictionary.keys' and 'phonemizer.dictionary.values' keys must be set in order to support phonemization.");
    }
    int key_count = gguf_get_arr_n(meta, keys_key);
    int value_count = gguf_get_arr_n(meta, values_key);
    // Checked at runtime rather than with assert(): assert is compiled out under NDEBUG, which
    // would let a malformed file index past the end of the shorter array.
    if (key_count != value_count) {
    	TTS_ABORT("'phonemizer.dictionary.keys' and 'phonemizer.dictionary.values' must have the same length (got %d and %d).\n", key_count, value_count);
    }
    for (int i = 0; i < key_count; i++) {
    	std::string key = gguf_get_arr_str(meta, keys_key, i);
    	std::string values = gguf_get_arr_str(meta, values_key, i);
    	std::vector<dictionary_response*> out;
    	for (const std::string & val : split(values, ",")) {
    		// The raw key is passed on purpose: its marker characters configure the response.
    		out.push_back(response_from_string(val, key));
    	}
    	if (!key.empty() && (key[0] == '$' || key[0] == '#')) {
    		key = key.substr(1);
    	}
    	// Re-check emptiness: a key that was only a marker is empty by now, and back() on an
    	// empty string is undefined.
    	if (!key.empty() && key.back() == '#') {
    		key.pop_back();
    	}
    	dict->lookup_map[key] = out;
    }
    return dict;
}

/**
 * Builds a phonemizer from the metadata of a gguf file.
 *
 * The required 'phonemizer.type' key selects the implementation:
 *  - ESPEAK: initializes the shared espeak wrapper, applies the requested voice and returns
 *    an espeak backed phonemizer. If the optional 'phonemizer.phoneme_type' key is present
 *    and equals ESPEAK_PHONEMES, the phonemizer is switched to that phoneme mode. Aborts when
 *    built without ESPEAK_INSTALL.
 *  - anything else: loads the word phonemizer rules and the phoneme dictionary from the file.
 *
 * @param meta              the gguf context to read from
 * @param espeak_voice_code voice passed to update_voice in espeak mode
 * @return a newly allocated phonemizer
 */
struct phonemizer * phonemizer_from_gguf(gguf_context * meta, const std::string espeak_voice_code) {
	int mode_key = gguf_find_key(meta, "phonemizer.type");
	phonemizer * ph = nullptr;
    if (mode_key == -1) {
        TTS_ABORT("Key 'phonemizer.type' must be specified in gguf file for all models using a phonemizer.");
    }
    uint32_t ph_type = gguf_get_val_u32(meta, mode_key);

    if (static_cast<phonemizer_type>(ph_type) == ESPEAK) {
#ifdef ESPEAK_INSTALL
    	espeak_wrapper::get_instance()->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, ESPEAK_DATA_PATH, 0);

    	update_voice(espeak_voice_code);

		ph = new phonemizer(nullptr, nullptr);
		ph->mode = ESPEAK;
#else
		TTS_ABORT("%s attempted to load an espeak phonemizer without espeak installed. \n", __func__);
#endif
		int phoneme_type_key = gguf_find_key(meta, "phonemizer.phoneme_type");
		if (phoneme_type_key != -1) {
			// Read the value stored under 'phonemizer.phoneme_type'. Using mode_key here would
			// re-read 'phonemizer.type' and misinterpret it as the phoneme type.
			uint32_t phoneme_typing = gguf_get_val_u32(meta, phoneme_type_key);
			if (static_cast<phoneme_type>(phoneme_typing) == ESPEAK_PHONEMES) {
				ph->phoneme_mode = ESPEAK_PHONEMES;
			}
		}
		return ph;
    }
    struct word_phonemizer * phonetic_ph = word_phonemizer_from_gguf(meta);
    struct phoneme_dictionary * dict =  phoneme_dictionary_from_gguf(meta);
    ph = new phonemizer(dict, phonetic_ph);
    return ph;
}

/**
 * Creates an espeak backed phonemizer without reading a gguf file.
 *
 * Initializes the shared espeak wrapper, applies the requested voice and returns a
 * phonemizer in ESPEAK mode. Aborts when built without ESPEAK_INSTALL.
 *
 * @param use_espeak_phonemes when true the phonemizer's phoneme_mode is set to ESPEAK_PHONEMES
 * @param espeak_voice_code   voice passed to update_voice
 * @return a newly allocated phonemizer
 */
struct phonemizer * espeak_phonemizer(bool use_espeak_phonemes, std::string espeak_voice_code) {
#ifndef ESPEAK_INSTALL
	TTS_ABORT("%s attempted to load an espeak phonemizer without espeak installed. \n", __func__);
#else
	espeak_wrapper * wrapper = espeak_wrapper::get_instance();
	wrapper->initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, ESPEAK_DATA_PATH, 0);

	update_voice(espeak_voice_code);

	phonemizer * espeak_ph = new phonemizer(nullptr, nullptr);
	espeak_ph->mode = ESPEAK;
	if (use_espeak_phonemes) {
		espeak_ph->phoneme_mode = ESPEAK_PHONEMES;
	}
	return espeak_ph;
#endif
}

struct phonemizer * phonemizer_from_file(const std::string fname, const std::string espeak_voice_code) {
	ggml_context * weight_ctx = NULL;
    struct gguf_init_params params = {
        /*.no_alloc   =*/ false,
        /*.ctx        =*/ &weight_ctx,
    };
    gguf_context * meta_ctx = gguf_init_from_file(fname.c_str(), params);
    if (!meta_ctx) {
        TTS_ABORT("%s failed for file %s\n", __func__, fname.c_str());
    }
    return phonemizer_from_gguf(meta_ctx, espeak_voice_code);
}