From 75c919cfd4dfb3b0af0f87a226f3ddb806d787a3 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Tue, 26 Aug 2025 17:55:03 +0800 Subject: [PATCH] can't resolve the clicking --- kokoro_ipa.embd | 21 +++---- otherarch/tts_adapter.cpp | 80 +++++++++++++++++++++++++++ otherarch/ttscpp/include/phonemizer.h | 2 +- otherarch/ttscpp/src/kokoro_model.cpp | 2 + otherarch/ttscpp/src/phonemizer.cpp | 8 ++- 5 files changed, 99 insertions(+), 14 deletions(-) diff --git a/kokoro_ipa.embd b/kokoro_ipa.embd index 134ded9f7..4e3166637 100644 --- a/kokoro_ipa.embd +++ b/kokoro_ipa.embd @@ -19951,6 +19951,7 @@ excursions,ɪkskˈɜɹʒənz excusable,ɪkskjˈuzəbᵊl excused,ɪkskjˈuzd excusing,ɪkskjˈuzɪŋ +excuses,ɪkskjˈuzᵻz exec,ɛɡzˈɛk execrable,ˈɛksəkɹəbᵊl execration,ˌɛksəkɹˈAʃən @@ -39067,16 +39068,16 @@ organelles,ˌɔɹɡənˈɛlz organically,ɔɹɡˈænəkᵊli organic,ɔɹɡˈænɪk organics,ɔɹɡˈænɪks -organisationally,ˌɔɹɡənəzˈAʃənᵊli -organisational,ˌɔɹɡənəzˈAʃənᵊl -organisation,ˌɔɹɡənəzˈAʃən -organisations,ˌɔɹɡənəzˈAʃənz -organised,ˈɔɹɡənˌIzd -organise,ˈɔɹɡənˌIz -organiser,ˈɔɹɡənˌIzəɹ -organisers,ˈɔɹɡənˌIzəɹz -organises,ˈɔɹɡənˌIzᵻz -organising,ˈɔɹɡənˌIzɪŋ +organizationally,ˌɔɹɡənəzˈAʃənᵊli +organizational,ˌɔɹɡənəzˈAʃənᵊl +organization,ˌɔɹɡənəzˈAʃən +organizations,ˌɔɹɡənəzˈAʃənz +organized,ˈɔɹɡənˌIzd +organize,ˈɔɹɡənˌIz +organizer,ˈɔɹɡənˌIzəɹ +organizers,ˈɔɹɡənˌIzəɹz +organizes,ˈɔɹɡənˌIzᵻz +organizing,ˈɔɹɡənˌIzɪŋ organism,ˈɔɹɡənˌɪzəm organisms,ˈɔɹɡənˌɪzəmz organist,ˈɔɹɡənɪst diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index acf52ca3e..374ccde72 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -64,6 +64,85 @@ struct wav_header { uint32_t data_size; }; +// #include +// #include +// #include + +// static void audio_post_clean(std::vector& data) { // detect clicks +// const float silenceThreshold = 1e-5f; +// const float noiseThreshold = 1e-3f; +// const size_t minSilence = 100; // samples +// const size_t noiseSpan = 150; // samples +// const size_t minSilence2 = 100; // samples + +// size_t len = data.size(); + +// int silencecounterA = 0; +// int noisecounterA = 0; +// int silencecounterB = 0; +// int state = 0; // 0 = finding first silence, 1 = measuring noise, 2 = finding second silence + +// size_t noiseStart = 0; + +// for (size_t i = 0; i < len; ++i) { +// float sample = std::fabs(data[i]); + +// if (state == 0) { // finding first silence +// if (sample < silenceThreshold) { +// silencecounterA++; +// } else { +// if (silencecounterA >= minSilence) { +// state = 1; +// noisecounterA = 1; +// noiseStart = i; +// } else { +// silencecounterA = 0; +// noisecounterA = 0; +// silencecounterB = 0; +// } +// } +// } +// if (state == 1) { // measuring noise span +// noisecounterA++; +// if(sample>noiseThreshold) +// { +// state = 0; +// silencecounterA = 0; +// noisecounterA = 0; +// silencecounterB = 0; +// } +// else if(noisecounterA>noiseSpan) +// { +// state = 2; +// } +// } +// if (state == 2) { // finding second silence +// if (sample < silenceThreshold) { +// silencecounterB++; +// if (silencecounterB >= minSilence2) { +// // full click detected +// size_t noiseend = noiseStart + noisecounterA - 1; +// //printf("Click detected from %zu to %zu\n", noiseStart, noiseend); +// for(size_t j=noiseStart;j &data, int sample_rate) { std::ostringstream oss; wav_header header; @@ -740,6 +819,7 @@ static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_input ttstime = timer_check(); printf("\nTTS Generated audio in %.2fs.\n",ttstime); std::vector wavdat = std::vector(response_data.data, response_data.data + response_data.n_outputs); + //audio_post_clean(wavdat); last_generated_audio = save_wav16_base64(wavdat, ttscpp_runner->sampling_rate); output.data = last_generated_audio.c_str(); output.status = 1; diff --git a/otherarch/ttscpp/include/phonemizer.h b/otherarch/ttscpp/include/phonemizer.h index 0e401de74..7461ce165 100644 --- a/otherarch/ttscpp/include/phonemizer.h +++ b/otherarch/ttscpp/include/phonemizer.h @@ -128,7 +128,7 @@ static const std::map LETTER_PHONEMES = { {'d', "dˈiː"}, {'e', "ˈiː"}, {'f', "ˈɛf"}, - {'j', "dʒˈeɪ"}, + {'g', "dʒˈi"}, {'h', "ˈeɪtʃ"}, {'i', "ˈaɪ"}, {'j', "dʒˈeɪ"}, diff --git a/otherarch/ttscpp/src/kokoro_model.cpp b/otherarch/ttscpp/src/kokoro_model.cpp index 12f77b106..a6543c7cc 100644 --- a/otherarch/ttscpp/src/kokoro_model.cpp +++ b/otherarch/ttscpp/src/kokoro_model.cpp @@ -1426,11 +1426,13 @@ int kokoro_runner::generate(std::string prompt, struct tts_response * response, prompt = replace_any(prompt, ";:", "--"); prompt = replace_any(prompt, "\n", "--"); kokoro_str_replace_all(prompt,"’","'"); + kokoro_str_replace_all(prompt,"Mr. ","Mister "); prompt = std::regex_replace(prompt, std::regex("(\\w)([.!?]) "), "$1$2, "); kokoro_str_replace_all(prompt," - "," -- "); kokoro_str_replace_all(prompt,"he's ","he is "); kokoro_str_replace_all(prompt,"'s ","s "); kokoro_str_replace_all(prompt,"n't ","nt "); + kokoro_str_replace_all(prompt,"*"," "); std::string phonemized_prompt = phmzr->text_to_phonemes(prompt); // printf("\nRESULT: %s\n",phonemized_prompt.c_str()); diff --git a/otherarch/ttscpp/src/phonemizer.cpp b/otherarch/ttscpp/src/phonemizer.cpp index aafa1416b..078dcb707 100644 --- a/otherarch/ttscpp/src/phonemizer.cpp +++ b/otherarch/ttscpp/src/phonemizer.cpp @@ -893,9 +893,11 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor text->size_pop(word.size()+unaccented_size_difference); return true; } - } else if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) { - return true; - } else if (is_acronym_like(text, word, flags)) { + } + // else if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) { + // return true; + // } + else if (is_acronym_like(text, word, flags)) { return handle_acronym(text, word, output, flags); } else if (word.find(".") < word.length()) { bool part_has_accent = false;