q3tts custom voice support

2026-05-17 04:09:19 +00:00 · 2026-03-24 23:38:18 +08:00 · 2026-03-24 23:38:18 +08:00 · efdc52fe8b
commit efdc52fe8b
parent 8437c346a7
4 changed files with 43 additions and 11 deletions
--- a/otherarch/qwen3tts/q3ttsmain.cpp
+++ b/otherarch/qwen3tts/q3ttsmain.cpp
@@ -139,7 +139,7 @@ int main(int argc, char ** argv) {

    if (reference_audio.empty()) {
        fprintf(stderr, "Synthesizing: \"%s\"\n", text.c_str());
-        result = tts.synthesize(text,"", params);
+        result = tts.synthesize(text,"", -1, params);
    } else {
        fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str());
        fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str());
--- a/otherarch/qwen3tts/qwen3_tts.cpp
+++ b/otherarch/qwen3tts/qwen3_tts.cpp
@@ -163,7 +163,26 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string
    return true;
 }

-tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction,
+// Loads the speaker encoder from tts_model_path_ on first call; returns true once it is loaded.
+bool Qwen3TTS::load_speaker_enc()
+{
+    if (encoder_loaded_) {
+        return true; // nothing to do on repeat calls
+    }
+    if (tts_model_path_.empty()) {
+        fprintf(stderr, "Failed to load speaker encoder, file not found\n"); // newline keeps later stderr output on its own line
+        return false;
+    }
+    const int64_t t_encoder_load_start = get_time_ms();
+    if (!audio_encoder_.load_model(tts_model_path_)) {
+        return false; // no encoder; encoder_loaded_ stays false so a later call retries
+    }
+    encoder_loaded_ = true;
+    fprintf(stderr, "  Speaker encoder lazy-loaded in %lld ms\n", (long long)(get_time_ms() - t_encoder_load_start));
+    return true;
+}
+
+tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction, const int speaker_id,
                                 const tts_params & params) {
    tts_result result;

@@ -176,7 +195,7 @@ tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & in
    // This will use the model's default voice characteristics
    std::vector<float> zero_embedding(transformer_.get_config().hidden_size, 0.0f);

-    return synthesize_internal(text, instruction, zero_embedding.data(), params, result);
+    return synthesize_internal(text, instruction, speaker_id, zero_embedding.data(), params, result);
 }

 tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
@@ -260,10 +279,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
        fprintf(stderr, "Speaker embedding extracted: %zu floats\n", speaker_embedding.size());
    }

-    return synthesize_internal(text, "", speaker_embedding.data(), params, result);
+    return synthesize_internal(text, "", -1, speaker_embedding.data(), params, result);
 }

-tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction,
+tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction, const int speakerid,
                                          const float * speaker_embedding,
                                          const tts_params & params,
                                          tts_result & result) {
@@ -325,7 +344,7 @@ tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::st
    if (!transformer_.generate(text_tokens.data(), (int32_t)text_tokens.size(),
                               speaker_embedding, params.max_audio_tokens, speech_codes,
                               2050, params.repetition_penalty,
-                               params.temperature, params.top_k, -1, instruct_tok_data, instruct_tok_count)) {
+                               params.temperature, params.top_k, speakerid, instruct_tok_data, instruct_tok_count)) {
        result.error_msg = "Failed to generate speech codes: " + transformer_.get_error();
        return result;
    }
--- a/otherarch/qwen3tts/qwen3_tts.h
+++ b/otherarch/qwen3tts/qwen3_tts.h
@@ -91,7 +91,7 @@ public:
    // Generate speech from text
    // text: input text to synthesize
    // params: generation parameters
-    tts_result synthesize(const std::string & text, const std::string & instruction,
+    tts_result synthesize(const std::string & text, const std::string & instruction, const int speaker_id,
                          const tts_params & params = tts_params());

    // Generate speech with voice cloning
@@ -114,6 +114,8 @@ public:
    // Set progress callback
    void set_progress_callback(tts_progress_callback_t callback);

+    bool load_speaker_enc(); // loads the speaker encoder if not already loaded; returns false if it failed to load
+
    // Get error message
    const std::string & get_error() const { return error_msg_; }

@@ -121,7 +123,7 @@ public:
    bool is_loaded() const { return models_loaded_; }

 private:
-    tts_result synthesize_internal(const std::string & text, const std::string & instruction,
+    tts_result synthesize_internal(const std::string & text, const std::string & instruction, const int speakerid,
                                   const float * speaker_embedding,
                                   const tts_params & params,
                                   tts_result & result);
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -1186,6 +1186,15 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
        std::vector<float> custom_reference_audio_pcmf32;
        std::string speaker_instruction = inputs.speaker_instruction;

+        int speakerID = inputs.speaker_seed;
+        int speakermap[] = {2861,3066,2873,3061,2864,2875,2878,3065,3010};
+
+        if (speakerID > 0 && speakerID <= 5) {
+            speakerID = speakermap[speakerID-1];
+        } else {
+            speakerID = -1;
+        }
+
        int audio_seed = inputs.audio_seed;
        if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
        {
@@ -1194,7 +1203,7 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp

        if(ttsdebugmode==1 && !tts_is_quiet)
        {
-            printf("\nUsing Audio Seed: %d", audio_seed);
+            printf("\nUsing Audio Seed: %d, SpeakerID: %d", audio_seed, speakerID);
        }
        qwen3tts_runner.set_seed(audio_seed);

@@ -1221,11 +1230,13 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
            qwen3tts_params.print_progress = true;
        }

-        if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty()) {
+        bool has_speaker_enc = qwen3tts_runner.load_speaker_enc();
+
+        if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty() || !has_speaker_enc) {
            if (speaker_instruction != "" && !tts_is_quiet) {
                printf("\nApply VoiceDesign Instruction: %s", speaker_instruction.c_str());
            }
-            result = qwen3tts_runner.synthesize(prompt, speaker_instruction, qwen3tts_params);
+            result = qwen3tts_runner.synthesize(prompt, speaker_instruction, speakerID, qwen3tts_params);
        } else {
            std::size_t reuse_hash_value = std::hash<std::string>{}(custom_reference_audio_str);