cache used voices

This commit is contained in:
Concedo 2026-02-22 00:43:57 +08:00
parent 5536fb29f2
commit 313d37a602
3 changed files with 22 additions and 18 deletions

View file

@ -202,10 +202,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
return synthesize_with_voice(text, ref_samples.data(), (int32_t)ref_samples.size(), params);
}
static std::vector<float> speaker_embedding;
static std::map<std::size_t, std::vector<float>> speaker_embeddings;
tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
const float * ref_samples, int32_t n_ref_samples,
const tts_params & params, bool regenerate) {
const tts_params & params, std::size_t reuse_hash_val) {
tts_result result;
if (!models_loaded_) {
@ -232,14 +232,27 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
}
int64_t t_encode_start = get_time_ms();
std::vector<float> speaker_embedding;
if(speaker_embedding.size()==0 || regenerate)
if(speaker_embeddings.size()>0 || reuse_hash_val>0)
{
speaker_embedding.clear();
auto it = speaker_embeddings.find(reuse_hash_val);
if (it != speaker_embeddings.end()) {
speaker_embedding = it->second;
}
}
if(speaker_embedding.size()==0)
{
printf("Creating Voice Embedding ID=%u... (Warning, lengthy sample audio will be very slow. Use short clips!)\n",reuse_hash_val);
if (!audio_encoder_.encode(ref_samples, n_ref_samples, speaker_embedding)) {
result.error_msg = "Failed to extract speaker embedding: " + audio_encoder_.get_error();
return result;
}
if(reuse_hash_val!=0)
{
speaker_embeddings[reuse_hash_val] = speaker_embedding;
}
}
result.t_encode_ms = get_time_ms() - t_encode_start;

View file

@ -109,7 +109,7 @@ public:
// params: generation parameters
tts_result synthesize_with_voice(const std::string & text,
const float * ref_samples, int32_t n_ref_samples,
const tts_params & params = tts_params(), bool regenerate=true);
const tts_params & params = tts_params(), std::size_t reuse_hash_val=0);
// Set progress callback
void set_progress_callback(tts_progress_callback_t callback);

View file

@ -549,7 +549,6 @@ static std::string detectedarch = "";
//qwen3tts specific
static bool is_qwen3tts_file = false;
static qwen3_tts::Qwen3TTS qwen3tts_runner;
static std::string last_qwen3tts_reference_audio_str = "";
int total_tts_gens = 0;
static std::string tts_executable_path = "";
@ -1282,19 +1281,11 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
if (custom_reference_audio_pcmf32.empty()) {
result = qwen3tts_runner.synthesize(prompt, qwen3tts_params);
} else {
bool regenerate = (last_qwen3tts_reference_audio_str=="" || last_qwen3tts_reference_audio_str!=custom_reference_audio_str);
std::string msg = "\nUsing reference voice...";
if(regenerate)
{
msg += "Regenerating... (Warning, lengthy sample audio will be very slow. Use short clips!)";
}
msg += "\n";
std::size_t reuse_hash_value = std::hash<std::string>{}(custom_reference_audio_str);
std::string msg = "\nUsing reference voice...\n";
printf("%s",msg.c_str());
result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params, regenerate);
if(regenerate)
{
last_qwen3tts_reference_audio_str = custom_reference_audio_str;
}
result = qwen3tts_runner.synthesize_with_voice(prompt, custom_reference_audio_pcmf32.data(),custom_reference_audio_pcmf32.size(), qwen3tts_params, reuse_hash_value);
}
if (!result.success) {