adjust kokoro default voices

2026-07-10 01:18:32 +00:00 · 2025-08-22 23:48:29 +08:00 · 2025-08-22 23:48:29 +08:00 · 3867db34bc
commit 3867db34bc
parent 80dabbb689
3 changed files with 5 additions and 5 deletions
--- a/otherarch/tts_adapter.cpp
+++ b/otherarch/tts_adapter.cpp
@@ -670,7 +670,7 @@ static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_input

    if(detectedarch=="kokoro")
    {
-        vmapper = {"am_echo","af_heart","af_alloy","bm_daniel","bf_isabella"};
+        vmapper = {"am_echo","af_heart","af_nicole","bm_fable","bf_isabella"};
        vpermitted = {"af_alloy", "af_aoede", "af_bella", "af_heart", "af_jessica", "af_kore", "af_nicole", "af_nova", "af_river", "af_sarah", "af_sky", "am_adam", "am_echo", "am_eric", "am_fenrir", "am_liam", "am_michael", "am_onyx", "am_puck", "am_santa", "bf_alice", "bf_emma", "bf_isabella", "bf_lily", "bm_daniel", "bm_fable", "bm_george", "bm_lewis"};
    }
    else if(detectedarch=="dia")
--- a/otherarch/ttscpp/cli/cli.cpp
+++ b/otherarch/ttscpp/cli/cli.cpp
@@ -39,7 +39,7 @@ int main(int argc, const char ** argv) {
    args.add_argument(bool_arg("--no-cross-attn", "(OPTIONAL) Whether to not include cross attention", "-ca"));
    args.add_argument(string_arg("--conditional-prompt", "(OPTIONAL) A distinct conditional prompt to use for generating. If none is provided the preencoded prompt is used. '--text-encoder-path' must be set to use conditional generation.", "-cp", false));
    args.add_argument(string_arg("--text-encoder-path", "(OPTIONAL) The local path of the text encoder gguf model for conditional generaiton.", "-tep", false));
-    args.add_argument(string_arg("--voice", "(OPTIONAL) The voice to use to generate the audio. This is only used for models with voice packs.", "-v", false, "af_alloy"));
+    args.add_argument(string_arg("--voice", "(OPTIONAL) The voice to use to generate the audio. This is only used for models with voice packs.", "-v", false, "af_heart"));
    args.add_argument(bool_arg("--vad", "(OPTIONAL) whether to apply voice inactivity detection (VAD) and strip silence form the end of the output (particularly useful for Parler TTS). By default, no VAD is applied.", "-va"));
    args.add_argument(string_arg("--espeak-voice-id", "(OPTIONAL) The espeak voice id to use for phonemization. This should only be specified when the correct espeak voice cannot be inferred from the kokoro voice ( see MultiLanguage Configuration in the README for more info).", "-eid", false));
    args.add_argument(int_arg("--max-tokens", "(OPTIONAL) The max audio tokens or token batches to generate where each represents approximates 11 ms of audio. Only applied to Dia generation. If set to zero as is its default then the default max generation size. Warning values under 15 are not supported.", "-mt", false, &default_max_tokens));
--- a/otherarch/ttscpp/src/kokoro_model.h
+++ b/otherarch/ttscpp/src/kokoro_model.h
@@ -318,7 +318,7 @@ struct kokoro_duration_context : runner_context {
        ggml_backend_buffer_free(buf_len_output);
    }

-    std::string voice = "af_alloy";
+    std::string voice = "af_heart";
    struct kokoro_model * model;
    ggml_backend_buffer_t buf_len_output = nullptr;

@@ -396,7 +396,7 @@ struct kokoro_context : runner_context {
        }
    }

-    std::string voice = "af_alloy";
+    std::string voice = "af_heart";

    struct kokoro_model * model;

@@ -442,7 +442,7 @@ struct kokoro_runner : tts_runner {
    kokoro_duration_runner * drunner;
    phonemizer * phmzr;

-    std::string default_voice = "af_alloy";
+    std::string default_voice = "af_heart";

    void init_build() {
        tts_runner::init_build(&kctx->buf_compute_meta);