added multilingual support for whisper

This commit is contained in:
Concedo 2025-01-09 23:28:52 +08:00
parent 0cb599546e
commit 91b6e29af3
6 changed files with 49 additions and 8 deletions

View file

@ -192,6 +192,7 @@ struct whisper_generation_inputs
const char * prompt = nullptr; const char * prompt = nullptr;
const char * audio_data = nullptr; const char * audio_data = nullptr;
const bool suppress_non_speech = false; const bool suppress_non_speech = false;
const char * langcode = nullptr;
const bool quiet = false; const bool quiet = false;
}; };
struct whisper_generation_outputs struct whisper_generation_outputs

View file

@ -1344,6 +1344,8 @@
"application/json": { "application/json": {
"example": { "example": {
"prompt": "", "prompt": "",
"suppress_non_speech" : false,
"langcode": "en",
"audio_data": "base64_wav_data", "audio_data": "base64_wav_data",
}, },
"schema": { "schema": {
@ -1351,6 +1353,18 @@
"audio_data": { "audio_data": {
"type": "string", "type": "string",
"description": "Base64 representation of a 16-bit 16kHz wave file to be transcribed to text." "description": "Base64 representation of a 16-bit 16kHz wave file to be transcribed to text."
},
"prompt": {
"type": "string",
"description": "Prompt to steer the transcription."
},
"langcode": {
"type": "string",
"description": "Two-letter language code (e.g. en), or use auto to autodetect the language."
},
"suppress_non_speech": {
"type": "boolean",
"description": "Prevent noise tokens, always generate words for speech."
} }
}, },
"type": "object" "type": "object"

View file

@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
--> -->
<script> <script>
const LITEVER = 201; const LITEVER = 202;
const urlParams = new URLSearchParams(window.location.search); const urlParams = new URLSearchParams(window.location.search);
var localflag = true; var localflag = true;
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_"; const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@ -3044,6 +3044,7 @@ Current version indicated by LITEVER below.
narrate_only_dialog: false, narrate_only_dialog: false,
voice_end_delay: 300, voice_end_delay: 300,
voice_suppress_nonspeech: false, voice_suppress_nonspeech: false,
voice_langcode: "auto",
tts_speed: 1.0, tts_speed: 1.0,
image_styles: "", image_styles: "",
image_negprompt: "", image_negprompt: "",
@ -10330,6 +10331,7 @@ initializeInstructUIFunctionality();
document.getElementById("tts_speed").value = localsettings.tts_speed; document.getElementById("tts_speed").value = localsettings.tts_speed;
document.getElementById("voice_end_delay").value = localsettings.voice_end_delay; document.getElementById("voice_end_delay").value = localsettings.voice_end_delay;
document.getElementById("voice_suppress_nonspeech").checked = localsettings.voice_suppress_nonspeech; document.getElementById("voice_suppress_nonspeech").checked = localsettings.voice_suppress_nonspeech;
document.getElementById("voice_langcode").value = localsettings.voice_langcode;
toggle_opmode(); toggle_opmode();
//sd models display //sd models display
@ -10618,6 +10620,7 @@ initializeInstructUIFunctionality();
localsettings.tts_speed = document.getElementById("tts_speed").value; localsettings.tts_speed = document.getElementById("tts_speed").value;
localsettings.voice_end_delay = document.getElementById("voice_end_delay").value; localsettings.voice_end_delay = document.getElementById("voice_end_delay").value;
localsettings.voice_suppress_nonspeech = (document.getElementById("voice_suppress_nonspeech").checked?true:false); localsettings.voice_suppress_nonspeech = (document.getElementById("voice_suppress_nonspeech").checked?true:false);
localsettings.voice_langcode = document.getElementById("voice_langcode").value;
localsettings.auto_ctxlen = (document.getElementById("auto_ctxlen").checked ? true : false); localsettings.auto_ctxlen = (document.getElementById("auto_ctxlen").checked ? true : false);
localsettings.auto_genamt = (document.getElementById("auto_genamt").checked ? true : false); localsettings.auto_genamt = (document.getElementById("auto_genamt").checked ? true : false);
@ -15299,7 +15302,8 @@ initializeInstructUIFunctionality();
let payload = { let payload = {
"audio_data": dataurl, "audio_data": dataurl,
"prompt": "", "prompt": "",
"suppress_non_speech": localsettings.voice_suppress_nonspeech "suppress_non_speech": localsettings.voice_suppress_nonspeech,
"langcode": localsettings.voice_langcode,
}; };
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), { fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
method: 'POST', method: 'POST',
@ -15349,7 +15353,8 @@ initializeInstructUIFunctionality();
let payload = { let payload = {
"audio_data": dataurl, "audio_data": dataurl,
"prompt": "", "prompt": "",
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false) "suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false),
"langcode": document.getElementById("voice_langcode").value
}; };
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), { fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
method: 'POST', method: 'POST',
@ -20229,6 +20234,12 @@ initializeInstructUIFunctionality();
<div class="justifyleft" style="padding:2px" title="Suppress non-speech (e.g. music and sounds) from transcription">Suppress Non-Speech </div> <div class="justifyleft" style="padding:2px" title="Suppress non-speech (e.g. music and sounds) from transcription">Suppress Non-Speech </div>
<input title="Suppress Non-Speech" type="checkbox" id="voice_suppress_nonspeech" style="margin:0px 0px 0px auto;"> <input title="Suppress Non-Speech" type="checkbox" id="voice_suppress_nonspeech" style="margin:0px 0px 0px auto;">
</div> </div>
<div class="inlinelabel" style="font-size: 11px;">
<div class="justifyleft" style="padding:2px" title="Language Code">Language </div>
<input class="settinglabel miniinput" type="text" placeholder="en" value="auto" id="voice_langcode" style="height:18px; width: 36px; padding: 2px;">
</div>
<div class="inlinelabel" style="font-size: 11px;"> <div class="inlinelabel" style="font-size: 11px;">
<div class="justifyleft" style="padding:3px">Voice Delay: </div> <div class="justifyleft" style="padding:3px">Voice Delay: </div>
<input title="Voice Delay Milliseconds" type="text" inputmode="decimal" value="300" id="voice_end_delay" style="width:40px"> <input title="Voice Delay Milliseconds" type="text" inputmode="decimal" value="300" id="voice_end_delay" style="width:40px">

View file

@ -273,6 +273,7 @@ class whisper_generation_inputs(ctypes.Structure):
_fields_ = [("prompt", ctypes.c_char_p), _fields_ = [("prompt", ctypes.c_char_p),
("audio_data", ctypes.c_char_p), ("audio_data", ctypes.c_char_p),
("suppress_non_speech", ctypes.c_bool), ("suppress_non_speech", ctypes.c_bool),
("langcode", ctypes.c_char_p),
("quiet", ctypes.c_bool)] ("quiet", ctypes.c_bool)]
class whisper_generation_outputs(ctypes.Structure): class whisper_generation_outputs(ctypes.Structure):
@ -1252,6 +1253,9 @@ def whisper_generate(genparams):
inputs.prompt = prompt.encode("UTF-8") inputs.prompt = prompt.encode("UTF-8")
inputs.audio_data = audio_data.encode("UTF-8") inputs.audio_data = audio_data.encode("UTF-8")
inputs.quiet = is_quiet inputs.quiet = is_quiet
lc = genparams.get("langcode", "auto")
lc = lc.strip().lower() if (lc and lc.strip().lower()!="") else "auto"
inputs.langcode = lc.encode("UTF-8")
inputs.suppress_non_speech = genparams.get("suppress_non_speech", False) inputs.suppress_non_speech = genparams.get("suppress_non_speech", False)
ret = handle.whisper_generate(inputs) ret = handle.whisper_generate(inputs)
outstr = "" outstr = ""

View file

@ -5355,13 +5355,19 @@ int whisper_full_with_state(
const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data()); const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
if (lang_id < 0) { if (lang_id < 0) {
WHISPER_LOG_ERROR("%s: failed to auto-detect language\n", __func__); if(params.debug_mode)
{
printf("\n%s: failed to auto-detect language\n", __func__);
}
return -3; return -3;
} }
state->lang_id = lang_id; state->lang_id = lang_id;
params.language = whisper_lang_str(lang_id); params.language = whisper_lang_str(lang_id);
WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]); if(params.debug_mode)
{
printf("\n%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
}
if (params.detect_language) { if (params.detect_language) {
return 0; return 0;
} }
@ -5477,7 +5483,11 @@ int whisper_full_with_state(
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), }; std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), };
if (whisper_is_multilingual(ctx)) { if (whisper_is_multilingual(ctx)) {
const int lang_id = whisper_lang_id(params.language); int lang_id = whisper_lang_id(params.language);
if(lang_id<0)
{
lang_id = 0; //default to english
}
state->lang_id = lang_id; state->lang_id = lang_id;
prompt_init.push_back(whisper_token_lang(ctx, lang_id)); prompt_init.push_back(whisper_token_lang(ctx, lang_id));
if (params.translate) { if (params.translate) {

View file

@ -217,6 +217,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
const std::string b64data = std::string(inputs.audio_data); const std::string b64data = std::string(inputs.audio_data);
const std::string initprompt = std::string(inputs.prompt); const std::string initprompt = std::string(inputs.prompt);
const std::string langcode = std::string(inputs.langcode);
std::vector<float> pcmf32; // mono-channel F32 PCM std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@ -236,7 +237,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
wparams.print_timestamps = false; wparams.print_timestamps = false;
wparams.print_special = false; wparams.print_special = false;
wparams.translate = false; wparams.translate = false;
wparams.language = "auto"; wparams.language = langcode.c_str();
wparams.detect_language = false; wparams.detect_language = false;
wparams.n_threads = 4; wparams.n_threads = 4;
wparams.n_max_text_ctx = wparams.n_max_text_ctx; wparams.n_max_text_ctx = wparams.n_max_text_ctx;
@ -248,7 +249,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
wparams.split_on_word = false; wparams.split_on_word = false;
wparams.audio_ctx = 0; wparams.audio_ctx = 0;
wparams.speed_up = false; wparams.speed_up = false;
wparams.debug_mode = false; wparams.debug_mode = (whisperdebugmode==1);
wparams.tdrz_enable = false; wparams.tdrz_enable = false;
wparams.suppress_regex = nullptr; wparams.suppress_regex = nullptr;
wparams.suppress_non_speech_tokens = inputs.suppress_non_speech; wparams.suppress_non_speech_tokens = inputs.suppress_non_speech;