can't resolve the clicking

This commit is contained in:
Concedo 2025-08-26 17:55:03 +08:00
parent ff1d179e21
commit 75c919cfd4
5 changed files with 99 additions and 14 deletions

View file

@ -19951,6 +19951,7 @@ excursions,ɪkskˈɜɹʒənz
excusable,ɪkskjˈuzəbᵊl
excused,ɪkskjˈuzd
excusing,ɪkskjˈuzɪŋ
excuses,ɪkskjˈuzᵻz
exec,ɛɡzˈɛk
execrable,ˈɛksəkɹəbᵊl
execration,ˌɛksəkɹˈAʃən
@ -39067,16 +39068,16 @@ organelles,ˌɔɹɡənˈɛlz
organically,ɔɹɡˈænəkᵊli
organic,ɔɹɡˈænɪk
organics,ɔɹɡˈænɪks
organisationally,ˌɔɹɡənəzˈAʃənᵊli
organisational,ˌɔɹɡənəzˈAʃənᵊl
organisation,ˌɔɹɡənəzˈAʃən
organisations,ˌɔɹɡənəzˈAʃənz
organised,ˈɔɹɡənˌIzd
organise,ˈɔɹɡənˌIz
organiser,ˈɔɹɡənˌIzəɹ
organisers,ˈɔɹɡənˌIzəɹz
organises,ˈɔɹɡənˌIzᵻz
organising,ˈɔɹɡənˌIzɪŋ
organizationally,ˌɔɹɡənəzˈAʃənᵊli
organizational,ˌɔɹɡənəzˈAʃənᵊl
organization,ˌɔɹɡənəzˈAʃən
organizations,ˌɔɹɡənəzˈAʃənz
organized,ˈɔɹɡənˌIzd
organize,ˈɔɹɡənˌIz
organizer,ˈɔɹɡənˌIzəɹ
organizers,ˈɔɹɡənˌIzəɹz
organizes,ˈɔɹɡənˌIzᵻz
organizing,ˈɔɹɡənˌIzɪŋ
organism,ˈɔɹɡənˌɪzəm
organisms,ˈɔɹɡənˌɪzəmz
organist,ˈɔɹɡənɪst

View file

@ -64,6 +64,85 @@ struct wav_header {
uint32_t data_size;
};
// #include <vector>
// #include <cstdio>
// #include <cmath>
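// NOTE: disabled experiment -- this function and its call in the TTS generate path are commented out.
// It is a three-state scan over |sample|:
//   state 0: wait for at least minSilence consecutive samples below silenceThreshold;
//   state 1: starting at the first louder sample, keep counting while no sample exceeds
//            noiseThreshold, until the count passes noiseSpan (a louder sample resets the scan);
//   state 2: require minSilence2 consecutive samples below silenceThreshold (anything else resets).
// When all three stages succeed, the samples from noiseStart onward for the counted span are scaled by 0.01.
// static void audio_post_clean(std::vector<float>& data) { // detect clicks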
// const float silenceThreshold = 1e-5f;
// const float noiseThreshold = 1e-3f;
// const size_t minSilence = 100; // samples
// const size_t noiseSpan = 150; // samples
// const size_t minSilence2 = 100; // samples
// size_t len = data.size();
// int silencecounterA = 0;
// int noisecounterA = 0;
// int silencecounterB = 0;
// int state = 0; // 0 = finding first silence, 1 = measuring noise, 2 = finding second silence
// size_t noiseStart = 0;
// for (size_t i = 0; i < len; ++i) {
// float sample = std::fabs(data[i]);
// if (state == 0) { // finding first silence
// if (sample < silenceThreshold) {
// silencecounterA++;
// } else {
// if (silencecounterA >= minSilence) {
// state = 1;
// noisecounterA = 1;
// noiseStart = i;
// } else {
// silencecounterA = 0;
// noisecounterA = 0;
// silencecounterB = 0;
// }
// }
// }
// if (state == 1) { // measuring noise span
// noisecounterA++;
// if(sample>noiseThreshold)
// {
// state = 0;
// silencecounterA = 0;
// noisecounterA = 0;
// silencecounterB = 0;
// }
// else if(noisecounterA>noiseSpan)
// {
// state = 2;
// }
// }
// if (state == 2) { // finding second silence
// if (sample < silenceThreshold) {
// silencecounterB++;
// if (silencecounterB >= minSilence2) {
// // full click detected
// size_t noiseend = noiseStart + noisecounterA - 1;
// //printf("Click detected from %zu to %zu\n", noiseStart, noiseend);
// for(size_t j=noiseStart;j<noiseend;++j)
// {
// data[j] *= 0.01f; //greatly suppress noise
// }
// // reset to search again
// state = 0;
// silencecounterA = 0;
// noisecounterA = 0;
// silencecounterB = 0;
// }
// } else {
// state = 0;
// silencecounterA = 0;
// noisecounterA = 0;
// silencecounterB = 0;
// }
// }
// }
// }
static std::string save_wav16_base64(const std::vector<float> &data, int sample_rate) {
std::ostringstream oss;
wav_header header;
@ -740,6 +819,7 @@ static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_input
ttstime = timer_check();
printf("\nTTS Generated audio in %.2fs.\n",ttstime);
std::vector<float> wavdat = std::vector(response_data.data, response_data.data + response_data.n_outputs);
//audio_post_clean(wavdat);
last_generated_audio = save_wav16_base64(wavdat, ttscpp_runner->sampling_rate);
output.data = last_generated_audio.c_str();
output.status = 1;

View file

@ -128,7 +128,7 @@ static const std::map<const char, std::string> LETTER_PHONEMES = {
{'d', "dˈiː"},
{'e', "ˈiː"},
{'f', "ˈɛf"},
{'j', "ˈeɪ"},
{'g', "ˈi"},
{'h', "ˈeɪ"},
{'i', "ˈaɪ"},
{'j', "ˈeɪ"},

View file

@ -1426,11 +1426,13 @@ int kokoro_runner::generate(std::string prompt, struct tts_response * response,
prompt = replace_any(prompt, ";:", "--");
prompt = replace_any(prompt, "\n", "--");
kokoro_str_replace_all(prompt,"","'");
kokoro_str_replace_all(prompt,"Mr. ","Mister ");
prompt = std::regex_replace(prompt, std::regex("(\\w)([.!?]) "), "$1$2, ");
kokoro_str_replace_all(prompt," - "," -- ");
kokoro_str_replace_all(prompt,"he's ","he is ");
kokoro_str_replace_all(prompt,"'s ","s ");
kokoro_str_replace_all(prompt,"n't ","nt ");
kokoro_str_replace_all(prompt,"*"," ");
std::string phonemized_prompt = phmzr->text_to_phonemes(prompt);
// printf("\nRESULT: %s\n",phonemized_prompt.c_str());

View file

@ -893,9 +893,11 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor
text->size_pop(word.size()+unaccented_size_difference);
return true;
}
} else if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) {
return true;
} else if (is_acronym_like(text, word, flags)) {
}
// else if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) {
// return true;
// }
else if (is_acronym_like(text, word, flags)) {
return handle_acronym(text, word, output, flags);
} else if (word.find(".") < word.length()) {
bool part_has_accent = false;