added multilingual support for whisper

2025-09-11 09:34:37 +00:00 · 2025-01-09 23:28:52 +08:00 · 2025-01-09 23:28:52 +08:00 · 91b6e29af3
commit 91b6e29af3
parent 0cb599546e
6 changed files with 49 additions and 8 deletions
--- a/expose.h
+++ b/expose.h
@ -192,6 +192,7 @@ struct whisper_generation_inputs
    const char * prompt = nullptr;
    const char * audio_data = nullptr;
    const bool suppress_non_speech = false;
+    const char * langcode = nullptr;
    const bool quiet = false;
 };
 struct whisper_generation_outputs
--- a/kcpp_docs.embd
+++ b/kcpp_docs.embd
@ -1344,6 +1344,8 @@
                            "application/json": {
                               "example": {
                                  "prompt": "",
+                                  "suppress_non_speech" : false,
+                                  "langcode": "en",
                                  "audio_data": "base64_wav_data",
                               },
                               "schema": {
@ -1351,6 +1353,18 @@
                                     "audio_data": {
                                        "type": "string",
                                        "description": "Base64 representation of a 16-bit 16kHz wave file to be transcribed to text."
+                                     },
+                                     "prompt": {
+                                        "type": "string",
+                                        "description": "Prompt to steer the transcription."
+                                     },
+                                     "langcode": {
+                                        "type": "string",
+                                        "description": "Two-letter language code (for example en), or use auto to autodetect."
+                                     },
+                                     "suppress_non_speech": {
+                                        "type": "boolean",
+                                        "description": "Prevent noise tokens, always generate words for speech."
                                     }
                                  },
                                  "type": "object"
--- a/klite.embd
+++ b/klite.embd
@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->

 <script>
-	const LITEVER = 201;
+	const LITEVER = 202;
 	const urlParams = new URLSearchParams(window.location.search);
 	var localflag = true;
 	const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@ -3044,6 +3044,7 @@ Current version indicated by LITEVER below.
 		narrate_only_dialog: false,
 		voice_end_delay: 300,
 		voice_suppress_nonspeech: false,
+		voice_langcode: "auto",
 		tts_speed: 1.0,
 		image_styles: "",
 		image_negprompt: "",
@ -10330,6 +10331,7 @@ initializeInstructUIFunctionality();
 		document.getElementById("tts_speed").value = localsettings.tts_speed;
 		document.getElementById("voice_end_delay").value = localsettings.voice_end_delay;
 		document.getElementById("voice_suppress_nonspeech").checked = localsettings.voice_suppress_nonspeech;
+		document.getElementById("voice_langcode").value = localsettings.voice_langcode;
 		toggle_opmode();

 		//sd models display
@ -10618,6 +10620,7 @@ initializeInstructUIFunctionality();
 		localsettings.tts_speed = document.getElementById("tts_speed").value;
 		localsettings.voice_end_delay = document.getElementById("voice_end_delay").value;
 		localsettings.voice_suppress_nonspeech = (document.getElementById("voice_suppress_nonspeech").checked?true:false);
+		localsettings.voice_langcode = document.getElementById("voice_langcode").value;
 		localsettings.auto_ctxlen = (document.getElementById("auto_ctxlen").checked ? true : false);
 		localsettings.auto_genamt = (document.getElementById("auto_genamt").checked ? true : false);

@ -15299,7 +15302,8 @@ initializeInstructUIFunctionality();
 		let payload = {
 			"audio_data": dataurl,
 			"prompt": "",
-			"suppress_non_speech": localsettings.voice_suppress_nonspeech
+			"suppress_non_speech": localsettings.voice_suppress_nonspeech,
+			"langcode": localsettings.voice_langcode,
 		};
 		fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
 			method: 'POST',
@ -15349,7 +15353,8 @@ initializeInstructUIFunctionality();
 							let payload = {
 								"audio_data": dataurl,
 								"prompt": "",
-								"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false)
+								"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false),
+								"langcode": document.getElementById("voice_langcode").value
 							};
 							fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
 								method: 'POST',
@ -20229,6 +20234,12 @@ initializeInstructUIFunctionality();
 							<div class="justifyleft" style="padding:2px"  title="Suppress non-speech (e.g. music and sounds) from transcription">Suppress Non-Speech </div>
 						  	<input title="Suppress Non-Speech" type="checkbox" id="voice_suppress_nonspeech" style="margin:0px 0px 0px auto;">
 						</div>
+						<div class="inlinelabel" style="font-size: 11px;">
+							<div class="justifyleft" style="padding:2px"  title="Language Code">Language </div>
+							<input class="settinglabel miniinput" type="text" placeholder="en" value="auto" id="voice_langcode" style="height:18px; width: 36px; padding: 2px;">
+
+						</div>
+
 						<div class="inlinelabel" style="font-size: 11px;">
 							<div class="justifyleft" style="padding:3px">Voice Delay: </div>
 							<input title="Voice Delay Milliseconds" type="text" inputmode="decimal" value="300" id="voice_end_delay" style="width:40px">
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -273,6 +273,7 @@ class whisper_generation_inputs(ctypes.Structure):
    _fields_ = [("prompt", ctypes.c_char_p),
                ("audio_data", ctypes.c_char_p),
                ("suppress_non_speech", ctypes.c_bool),
+                ("langcode", ctypes.c_char_p),
                ("quiet", ctypes.c_bool)]

 class whisper_generation_outputs(ctypes.Structure):
@ -1252,6 +1253,9 @@ def whisper_generate(genparams):
    inputs.prompt = prompt.encode("UTF-8")
    inputs.audio_data = audio_data.encode("UTF-8")
    inputs.quiet = is_quiet
+    lc = genparams.get("langcode", "auto")
+    lc = lc.strip().lower() if (lc and lc.strip().lower()!="") else "auto"
+    inputs.langcode = lc.encode("UTF-8")
    inputs.suppress_non_speech = genparams.get("suppress_non_speech", False)
    ret = handle.whisper_generate(inputs)
    outstr = ""
--- a/otherarch/whispercpp/whisper.cpp
+++ b/otherarch/whispercpp/whisper.cpp
@ -5355,13 +5355,19 @@ int whisper_full_with_state(

        const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
        if (lang_id < 0) {
-            WHISPER_LOG_ERROR("%s: failed to auto-detect language\n", __func__);
+            if(params.debug_mode)
+            {
+                printf("\n%s: failed to auto-detect language\n", __func__);
+            }
            return -3;
        }
        state->lang_id = lang_id;
        params.language = whisper_lang_str(lang_id);

-        WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
+        if(params.debug_mode)
+        {
+            printf("\n%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
+        }
        if (params.detect_language) {
            return 0;
        }
@ -5477,7 +5483,11 @@ int whisper_full_with_state(
    std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), };

    if (whisper_is_multilingual(ctx)) {
-        const int lang_id = whisper_lang_id(params.language);
+        int lang_id = whisper_lang_id(params.language);
+        if(lang_id<0)
+        {
+            lang_id = 0; //default to english
+        }
        state->lang_id = lang_id;
        prompt_init.push_back(whisper_token_lang(ctx, lang_id));
        if (params.translate) {
--- a/otherarch/whispercpp/whisper_adapter.cpp
+++ b/otherarch/whispercpp/whisper_adapter.cpp
@ -217,6 +217,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs

    const std::string b64data = std::string(inputs.audio_data);
    const std::string initprompt = std::string(inputs.prompt);
+    const std::string langcode = std::string(inputs.langcode);

    std::vector<float> pcmf32;               // mono-channel F32 PCM
    std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@ -236,7 +237,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
    wparams.print_timestamps = false;
    wparams.print_special    = false;
    wparams.translate        = false;
-    wparams.language         = "auto";
+    wparams.language         = langcode.c_str();
    wparams.detect_language  = false;
    wparams.n_threads        = 4;
    wparams.n_max_text_ctx   = wparams.n_max_text_ctx;
@ -248,7 +249,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
    wparams.split_on_word    = false;
    wparams.audio_ctx        = 0;
    wparams.speed_up         = false;
-    wparams.debug_mode       = false;
+    wparams.debug_mode       = (whisperdebugmode==1);
    wparams.tdrz_enable      = false;
    wparams.suppress_regex   = nullptr;
    wparams.suppress_non_speech_tokens = inputs.suppress_non_speech;