mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-17 04:09:19 +00:00
q3tts custom voice support
This commit is contained in:
parent
8437c346a7
commit
efdc52fe8b
4 changed files with 43 additions and 11 deletions
|
|
@ -139,7 +139,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
if (reference_audio.empty()) {
|
||||
fprintf(stderr, "Synthesizing: \"%s\"\n", text.c_str());
|
||||
result = tts.synthesize(text,"", params);
|
||||
result = tts.synthesize(text,"", -1, params);
|
||||
} else {
|
||||
fprintf(stderr, "Synthesizing with voice cloning: \"%s\"\n", text.c_str());
|
||||
fprintf(stderr, "Reference audio: %s\n", reference_audio.c_str());
|
||||
|
|
|
|||
|
|
@ -163,7 +163,26 @@ bool Qwen3TTS::load_models(const std::string & tts_model_path, const std::string
|
|||
return true;
|
||||
}
|
||||
|
||||
tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction,
|
||||
bool Qwen3TTS::load_speaker_enc()
|
||||
{
|
||||
if (!encoder_loaded_) {
|
||||
if (tts_model_path_.empty()) {
|
||||
fprintf(stderr, "Failed to load speaker encoder, file not found");
|
||||
return false;
|
||||
}
|
||||
int64_t t_encoder_load_start = get_time_ms();
|
||||
if (!audio_encoder_.load_model(tts_model_path_)) {
|
||||
return false; //no encoder
|
||||
}
|
||||
encoder_loaded_ = true;
|
||||
fprintf(stderr, " Speaker encoder lazy-loaded in %lld ms\n",
|
||||
(long long)(get_time_ms() - t_encoder_load_start));
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & instruction, const int speaker_id,
|
||||
const tts_params & params) {
|
||||
tts_result result;
|
||||
|
||||
|
|
@ -176,7 +195,7 @@ tts_result Qwen3TTS::synthesize(const std::string & text, const std::string & in
|
|||
// This will use the model's default voice characteristics
|
||||
std::vector<float> zero_embedding(transformer_.get_config().hidden_size, 0.0f);
|
||||
|
||||
return synthesize_internal(text, instruction, zero_embedding.data(), params, result);
|
||||
return synthesize_internal(text, instruction, speaker_id, zero_embedding.data(), params, result);
|
||||
}
|
||||
|
||||
tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
|
||||
|
|
@ -260,10 +279,10 @@ tts_result Qwen3TTS::synthesize_with_voice(const std::string & text,
|
|||
fprintf(stderr, "Speaker embedding extracted: %zu floats\n", speaker_embedding.size());
|
||||
}
|
||||
|
||||
return synthesize_internal(text, "", speaker_embedding.data(), params, result);
|
||||
return synthesize_internal(text, "", -1, speaker_embedding.data(), params, result);
|
||||
}
|
||||
|
||||
tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction,
|
||||
tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::string & instruction, const int speakerid,
|
||||
const float * speaker_embedding,
|
||||
const tts_params & params,
|
||||
tts_result & result) {
|
||||
|
|
@ -325,7 +344,7 @@ tts_result Qwen3TTS::synthesize_internal(const std::string & text, const std::st
|
|||
if (!transformer_.generate(text_tokens.data(), (int32_t)text_tokens.size(),
|
||||
speaker_embedding, params.max_audio_tokens, speech_codes,
|
||||
2050, params.repetition_penalty,
|
||||
params.temperature, params.top_k, -1, instruct_tok_data, instruct_tok_count)) {
|
||||
params.temperature, params.top_k, speakerid, instruct_tok_data, instruct_tok_count)) {
|
||||
result.error_msg = "Failed to generate speech codes: " + transformer_.get_error();
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ public:
|
|||
// Generate speech from text
|
||||
// text: input text to synthesize
|
||||
// params: generation parameters
|
||||
tts_result synthesize(const std::string & text, const std::string & instruction,
|
||||
tts_result synthesize(const std::string & text, const std::string & instruction, const int speaker_id,
|
||||
const tts_params & params = tts_params());
|
||||
|
||||
// Generate speech with voice cloning
|
||||
|
|
@ -114,6 +114,8 @@ public:
|
|||
// Set progress callback
|
||||
void set_progress_callback(tts_progress_callback_t callback);
|
||||
|
||||
bool load_speaker_enc(); //return false if failed to load encoder
|
||||
|
||||
// Get error message
|
||||
// Accessor: returns the error_msg_ member by const reference.
const std::string & get_error() const
{
    return error_msg_;
}
|
||||
|
||||
|
|
@ -121,7 +123,7 @@ public:
|
|||
// Accessor: returns the current value of the models_loaded_ flag.
bool is_loaded() const
{
    return models_loaded_;
}
|
||||
|
||||
private:
|
||||
tts_result synthesize_internal(const std::string & text, const std::string & instruction,
|
||||
tts_result synthesize_internal(const std::string & text, const std::string & instruction, const int speakerid,
|
||||
const float * speaker_embedding,
|
||||
const tts_params & params,
|
||||
tts_result & result);
|
||||
|
|
|
|||
|
|
@ -1186,6 +1186,15 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
|
|||
std::vector<float> custom_reference_audio_pcmf32;
|
||||
std::string speaker_instruction = inputs.speaker_instruction;
|
||||
|
||||
int speakerID = inputs.speaker_seed;
|
||||
int speakermap[] = {2861,3066,2873,3061,2864,2875,2878,3065,3010};
|
||||
|
||||
if (speakerID > 0 && speakerID <= 5) {
|
||||
speakerID = speakermap[speakerID-1];
|
||||
} else {
|
||||
speakerID = -1;
|
||||
}
|
||||
|
||||
int audio_seed = inputs.audio_seed;
|
||||
if (audio_seed <= 0 || audio_seed==0xFFFFFFFF)
|
||||
{
|
||||
|
|
@ -1194,7 +1203,7 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
|
|||
|
||||
if(ttsdebugmode==1 && !tts_is_quiet)
|
||||
{
|
||||
printf("\nUsing Audio Seed: %d", audio_seed);
|
||||
printf("\nUsing Audio Seed: %d, SpeakerID: %d", audio_seed, speakerID);
|
||||
}
|
||||
qwen3tts_runner.set_seed(audio_seed);
|
||||
|
||||
|
|
@ -1221,11 +1230,13 @@ static tts_generation_outputs ttstype_generate_qwen3tts(const tts_generation_inp
|
|||
qwen3tts_params.print_progress = true;
|
||||
}
|
||||
|
||||
if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty()) {
|
||||
bool has_speaker_enc = qwen3tts_runner.load_speaker_enc();
|
||||
|
||||
if (speaker_instruction!="" || custom_reference_audio_pcmf32.empty() || !has_speaker_enc) {
|
||||
if (speaker_instruction != "" && !tts_is_quiet) {
|
||||
printf("\nApply VoiceDesign Instruction: %s", speaker_instruction.c_str());
|
||||
}
|
||||
result = qwen3tts_runner.synthesize(prompt, speaker_instruction, qwen3tts_params);
|
||||
result = qwen3tts_runner.synthesize(prompt, speaker_instruction, speakerID, qwen3tts_params);
|
||||
} else {
|
||||
std::size_t reuse_hash_value = std::hash<std::string>{}(custom_reference_audio_str);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue