diff --git a/otherarch/qwen3tts/q3ttsmain.cpp b/otherarch/qwen3tts/q3ttsmain.cpp index dd6f72838..db46ec529 100644 --- a/otherarch/qwen3tts/q3ttsmain.cpp +++ b/otherarch/qwen3tts/q3ttsmain.cpp @@ -139,7 +139,7 @@ int main(int argc, char ** argv) { if (reference_audio.empty()) { fprintf(stderr, "Synthesizing: \"%s\"\n", text.c_str()); - result = tts.synthesize(text,"", params); + result = tts.synthesize(text,"", -1, params); } else { fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str()); fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str()); diff --git a/otherarch/qwen3tts/qwen3_tts.cpp b/otherarch/qwen3tts/qwen3_tts.cpp index fcec1052d..c3c92a155 100644 --- a/otherarch/qwen3tts/qwen3_tts.cpp +++ b/otherarch/qwen3tts/qwen3_tts.cpp @@ -163,7 +163,26 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string return true; } -tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction, +bool Qwen3TTS::load_speaker_enc() +{ + if (!encoder_loaded_) { + if (tts_model_path_.empty()) { + fprintf(stderr, "Failed to load speaker encoder, file not found"); + return false; + } + int64_t t_encoder_load_start = get_time_ms(); + if (!audio_encoder_.load_model(tts_model_path_)) { + return false; //no encoder + } + encoder_loaded_ = true; + fprintf(stderr, " Speaker encoder lazy-loaded in %lld ms\n", + (long long)(get_time_ms() - t_encoder_load_start)); + return true; + } + return true; +} + +tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction, const int speaker_id, const tts_params & params) { tts_result result; @@ -176,7 +195,7 @@ tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & in // This will use the model's default voice characteristics std::vector zero_embedding(transformer_.get_config().hidden_size, 0.0f); - return synthesize_internal(text, instruction, zero_embedding.data(), params, result); + return synthesize_internal(text, instruction, speaker_id, zero_embedding.data(), params, result); } tts_result Qwen3TTS::synthesize_with_voice(const std::string & text, @@ -260,10 +279,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text, fprintf(stderr, "Speaker embedding extracted: %zu floats\n", speaker_embedding.size()); } - return synthesize_internal(text, "", speaker_embedding.data(), params, result); + return synthesize_internal(text, "", -1, speaker_embedding.data(), params, result); } -tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction, +tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction, const int speakerid, const float * speaker_embedding, const tts_params & params, tts_result & result) { @@ -325,7 +344,7 @@ tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::st if (!transformer_.generate(text_tokens.data(), (int32_t)text_tokens.size(), speaker_embedding, params.max_audio_tokens, speech_codes, 2050, params.repetition_penalty, - params.temperature, params.top_k, -1, instruct_tok_data, instruct_tok_count)) { + params.temperature, params.top_k, speakerid, instruct_tok_data, instruct_tok_count)) { result.error_msg = "Failed to generate speech codes: " + transformer_.get_error(); return result; } diff --git a/otherarch/qwen3tts/qwen3_tts.h b/otherarch/qwen3tts/qwen3_tts.h index 54879f234..f8b535ed0 100644 --- a/otherarch/qwen3tts/qwen3_tts.h +++ b/otherarch/qwen3tts/qwen3_tts.h @@ -91,7 +91,7 @@ public: // Generate speech from text // text: input text to synthesize // params: generation parameters - tts_result synthesize(const std::string & text, const std::string & instruction, + tts_result synthesize(const std::string & text, const std::string & instruction, const int speaker_id, const tts_params & params = tts_params()); // Generate speech with voice cloning @@ -114,6 +114,8 @@ public: // Set progress callback void set_progress_callback(tts_progress_callback_t callback); + bool load_speaker_enc(); //return false if failed to load encoder + // Get error message const std::string & get_error() const { return error_msg_; } @@ -121,7 +123,7 @@ public: bool is_loaded() const { return models_loaded_; } private: - tts_result synthesize_internal(const std::string & text, const std::string & instruction, + tts_result synthesize_internal(const std::string & text, const std::string & instruction, const int speakerid, const float * speaker_embedding, const tts_params & params, tts_result & result); diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index e61634db4..7e761adbd 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -1186,6 +1186,15 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp std::vector custom_reference_audio_pcmf32; std::string speaker_instruction = inputs.speaker_instruction; + int speakerID = inputs.speaker_seed; + int speakermap[] = {2861,3066,2873,3061,2864,2875,2878,3065,3010}; + + if (speakerID > 0 && speakerID <= 5) { + speakerID = speakermap[speakerID-1]; + } else { + speakerID = -1; + } + int audio_seed = inputs.audio_seed; if (audio_seed <= 0 || audio_seed==0xFFFFFFFF) { @@ -1194,7 +1203,7 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp if(ttsdebugmode==1 && !tts_is_quiet) { - printf("\nUsing Audio Seed: %d", audio_seed); + printf("\nUsing Audio Seed: %d, SpeakerID: %d", audio_seed, speakerID); } qwen3tts_runner.set_seed(audio_seed); @@ -1221,11 +1230,13 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp qwen3tts_params.print_progress = true; } - if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty()) { + bool has_speaker_enc = qwen3tts_runner.load_speaker_enc(); + + if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty() || !has_speaker_enc) { if (speaker_instruction != "" && !tts_is_quiet) { printf("\nApply VoiceDesign Instruction: %s", speaker_instruction.c_str()); } - result = qwen3tts_runner.synthesize(prompt, speaker_instruction, qwen3tts_params); + result = qwen3tts_runner.synthesize(prompt, speaker_instruction, speakerID, qwen3tts_params); } else { std::size_t reuse_hash_value = std::hash{}(custom_reference_audio_str);