wip, adding IPA for kokoro

2025-09-15 11:29:43 +00:00 · 2025-08-18 00:51:12 +08:00 · 2025-08-18 00:51:12 +08:00 · 3f621be7dd
commit 3f621be7dd
parent 3138a151c2
3 changed files with 65185 additions and 0 deletions
--- a/kokoro_ipa.embd
+++ b/kokoro_ipa.embd
--- a/otherarch/ttscpp/src/kokoro_model.cpp
+++ b/otherarch/ttscpp/src/kokoro_model.cpp
@ -1387,6 +1387,24 @@ std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<st
 	return chunks;
 }

+// kcpp: small in-place "replace all" helper.
+// Rewrites `s` so that every non-overlapping occurrence of `search` (found by
+// scanning left to right) is substituted with `replace`. An empty `search`
+// leaves `s` untouched.
+static void kokoro_str_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string rebuilt;
+    rebuilt.reserve(s.length());
+    size_t cursor = 0;
+    for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
+        // Carry over the untouched text in front of the match, then the substitute.
+        rebuilt.append(s, cursor, hit - cursor);
+        rebuilt += replace;
+        // Resume scanning right after the matched text so matches never overlap.
+        cursor = hit + search.length();
+    }
+    // Whatever follows the last match (the whole string when nothing matched).
+    rebuilt.append(s, cursor, std::string::npos);
+    s = std::move(rebuilt);
+}
+
 int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) {
 	if (model->voices.find(voice) == model->voices.end()) {
 		fprintf(stdout,"\nFailed to find Kokoro voice '%s' aborting.\n", voice.c_str());
@ -1406,7 +1424,10 @@ int kokoro_runner::generate(std::string prompt, struct tts_response * response,
    // We preserve the other punctuation for cleaner chunking pre-tokenization
    prompt = replace_any(prompt, ",;:", "--");
    prompt = replace_any(prompt, "\n", " ");
+	kokoro_str_replace_all(prompt," - "," -- ");
+	kokoro_str_replace_all(prompt,"'s ","s ");
  	std::string phonemized_prompt = phmzr->text_to_phonemes(prompt);
+	// printf("\nRESULT: %s\n",phonemized_prompt.c_str());

  	// Kokoro uses a UTF-8 single character tokenizer so if the size of the prompt is smaller than the max context length without the
  	// beginning of sentence and end of sentence tokens then we can compute it all at once.
--- a/otherarch/ttscpp/src/phonemizer.cpp
+++ b/otherarch/ttscpp/src/phonemizer.cpp
@ -798,9 +798,53 @@ bool phonemizer::handle_phonetic(corpus* text, std::string word, std::string* ou
 	return true;
 }

+static std::unordered_map<std::string, std::string> kokoro_ipa_map;
+
+// Fills kokoro_ipa_map from the text file "<executable_path>kokoro_ipa.embd".
+//
+// executable_path is concatenated with the file name as-is, so it has to end
+// with a path separator already (or be empty to read from the working directory).
+//
+// Every non-blank line is split on ","; a line that yields exactly two parts is
+// stored as parts[0] -> parts[1], with a later duplicate key overwriting an
+// earlier one. Any other non-blank line is reported and skipped. If the file
+// cannot be opened a message is printed and the map is left unchanged.
+void populate_kokoro_ipa_map(std::string executable_path)
+{
+	const std::string filepath = executable_path + "kokoro_ipa.embd";
+	printf("\nReading Kokoro IPA from %s",filepath.c_str());
+	std::ifstream myfile(filepath);
+	if (!myfile.is_open())
+	{
+		printf("\nUnable to open Kokoro IPA file");
+		return;
+	}
+
+	std::string line;
+	// Loop on the result of getline itself: testing good() before the read would
+	// run the body once more after the final failed read and report a bogus error.
+	while (std::getline(myfile, line))
+	{
+		// Tolerate CRLF line endings so a stray '\r' never ends up inside a stored value.
+		if (!line.empty() && line.back() == '\r')
+		{
+			line.pop_back();
+		}
+		// Blank lines (e.g. a trailing newline at end of file) are not malformed entries.
+		if (line.empty())
+		{
+			continue;
+		}
+		auto parts = split(line, ",");
+		if (parts.size() == 2)
+		{
+			kokoro_ipa_map[parts[0]] = parts[1];
+		}
+		else
+		{
+			printf("\nError reading line in Kokoro IPA!");
+		}
+	}
+	// myfile is closed by its destructor when it goes out of scope.
+}
+// Exact-key lookup of `input` in kokoro_ipa_map.
+// Returns the mapped string when the key is present, otherwise an empty string.
+std::string found_word_to_ipa(std::string input)
+{
+	const auto match = kokoro_ipa_map.find(input);
+	return match == kokoro_ipa_map.end() ? std::string() : match->second;
+}
 bool phonemizer::process_word(corpus* text, std::string* output, std::string word, conditions* flags, bool has_accent) {
 	dictionary_response* response;
 	size_t unaccented_size_difference = 0;
+
+	std::string foundstr = found_word_to_ipa(word);
+	if(foundstr!="")
+	{
+		output->append(foundstr);
+		text->size_pop(word.size());
+		return true;
+	}
+
 	if (has_accent) {
 		response = dict->lookup(text, word, flags);
 		if (!response->is_successful()) {
@ -813,6 +857,8 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor
 		response = dict->lookup(text, word, flags);
 	}

+	//printf("\nSUCCESS: %d, word:%s, result:%s\n",response->is_successful(),word.c_str(),response->value.c_str());
+
 	if (response->is_successful()) {
 		if (flags->was_word && output->back() != ' ' && !flags->hyphenated) {
 			output->append(" ");