custom voice handling

This commit is contained in:
Concedo 2025-08-18 16:57:34 +08:00
parent 67ef5e6c02
commit 2abe11071b
3 changed files with 31 additions and 17 deletions

View file

@ -247,6 +247,7 @@ struct tts_generation_inputs
const char * prompt = nullptr; const char * prompt = nullptr;
const int speaker_seed = 0; const int speaker_seed = 0;
const int audio_seed = 0; const int audio_seed = 0;
const char * custom_speaker_voice = "";
const char * custom_speaker_text = ""; const char * custom_speaker_text = "";
const char * custom_speaker_data = ""; const char * custom_speaker_data = "";
}; };

View file

@ -352,6 +352,7 @@ class tts_generation_inputs(ctypes.Structure):
_fields_ = [("prompt", ctypes.c_char_p), _fields_ = [("prompt", ctypes.c_char_p),
("speaker_seed", ctypes.c_int), ("speaker_seed", ctypes.c_int),
("audio_seed", ctypes.c_int), ("audio_seed", ctypes.c_int),
("custom_speaker_voice", ctypes.c_char_p),
("custom_speaker_text", ctypes.c_char_p), ("custom_speaker_text", ctypes.c_char_p),
("custom_speaker_data", ctypes.c_char_p)] ("custom_speaker_data", ctypes.c_char_p)]
@ -1880,6 +1881,7 @@ def tts_generate(genparams):
else: else:
voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1 voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
inputs = tts_generation_inputs() inputs = tts_generation_inputs()
inputs.custom_speaker_voice = normalized_voice.encode("UTF-8")
inputs.prompt = prompt.encode("UTF-8") inputs.prompt = prompt.encode("UTF-8")
inputs.speaker_seed = voice inputs.speaker_seed = voice
aseed = -1 aseed = -1

View file

@ -502,6 +502,7 @@ static int tts_max_len = 4096;
static bool is_ttscpp_file = false; static bool is_ttscpp_file = false;
static generation_configuration * ttscpp_config = nullptr; static generation_configuration * ttscpp_config = nullptr;
static struct tts_runner * ttscpp_runner = nullptr; static struct tts_runner * ttscpp_runner = nullptr;
static std::string detectedarch = "";
int total_tts_gens = 0; int total_tts_gens = 0;
static std::string tts_executable_path = ""; static std::string tts_executable_path = "";
@ -540,7 +541,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
std::string modelfile_ttc = inputs.ttc_model_filename; std::string modelfile_ttc = inputs.ttc_model_filename;
std::string modelfile_cts = inputs.cts_model_filename; std::string modelfile_cts = inputs.cts_model_filename;
std::string detectedarch = gguf_get_model_arch(modelfile_ttc); detectedarch = gguf_get_model_arch(modelfile_ttc);
is_ttscpp_file = false; is_ttscpp_file = false;
if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) { if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
@ -663,24 +664,34 @@ static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_input
std::string prompt = inputs.prompt; std::string prompt = inputs.prompt;
double ttstime = 0; double ttstime = 0;
timer_start(); timer_start();
switch(speaker_seed)
std::vector<std::string> vmapper = {};
std::vector<std::string> vpermitted = {};
if(detectedarch=="kokoro")
{ {
case 1: vmapper = {"am_echo","af_heart","af_alloy","bm_daniel","bf_isabella"};
voiceused = "am_echo"; vpermitted = {"af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica", "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky", "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael", "am_onyx", "am_puck", "am_santa", "bf_alice", "bf_emma", "bf_isabella", "bf_lily", "bm_daniel", "bm_fable", "bm_george", "bm_lewis"};
break;
case 2:
voiceused = "af_alloy";
break;
case 3:
voiceused = "af_jessica";
break;
case 4:
voiceused = "bm_daniel";
break;
case 5:
voiceused = "bf_isabella";
break;
} }
else if(detectedarch=="dia")
{
vmapper = {"zoe", "zac", "jess", "leo", "mia"};
vpermitted = {"zoe", "zac","jess", "leo", "mia", "julia", "leah"};
}
if(speaker_seed>=1 && speaker_seed<=5 && vmapper.size()>=5)
{
voiceused = vmapper[speaker_seed-1];
}
else if(vpermitted.size()>0)
{
//if we can match the voice, use it
const std::string cspeaker = inputs.custom_speaker_voice;
if (std::find(vpermitted.begin(), vpermitted.end(), cspeaker) != vpermitted.end()) {
voiceused = cspeaker;
}
}
if(ttsdebugmode==1 && !tts_is_quiet) if(ttsdebugmode==1 && !tts_is_quiet)
{ {
printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str()); printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());