added multilingual support for whisper

This commit is contained in:
Concedo 2025-01-09 23:28:52 +08:00
parent 0cb599546e
commit 91b6e29af3
6 changed files with 49 additions and 8 deletions

View file

@ -192,6 +192,7 @@ struct whisper_generation_inputs
const char * prompt = nullptr;
const char * audio_data = nullptr;
const bool suppress_non_speech = false;
const char * langcode = nullptr;
const bool quiet = false;
};
struct whisper_generation_outputs

View file

@ -1344,6 +1344,8 @@
"application/json": {
"example": {
"prompt": "",
"suppress_non_speech" : false,
"langcode": "en",
"audio_data": "base64_wav_data",
},
"schema": {
@ -1351,6 +1353,18 @@
"audio_data": {
"type": "string",
"description": "Base64 representation of a 16-bit 16kHz wave file to be transcribed to text."
},
"prompt": {
"type": "string",
"description": "Prompt to steer the transcription."
},
"langcode": {
"type": "string",
"description": "Two-letter language code, or use auto to autodetect."
},
"suppress_non_speech": {
"type": "boolean",
"description": "Prevent noise tokens, always generate words for speech."
}
},
"type": "object"

View file

@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
-->
<script>
const LITEVER = 201;
const LITEVER = 202;
const urlParams = new URLSearchParams(window.location.search);
var localflag = true;
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@ -3044,6 +3044,7 @@ Current version indicated by LITEVER below.
narrate_only_dialog: false,
voice_end_delay: 300,
voice_suppress_nonspeech: false,
voice_langcode: "auto",
tts_speed: 1.0,
image_styles: "",
image_negprompt: "",
@ -10330,6 +10331,7 @@ initializeInstructUIFunctionality();
document.getElementById("tts_speed").value = localsettings.tts_speed;
document.getElementById("voice_end_delay").value = localsettings.voice_end_delay;
document.getElementById("voice_suppress_nonspeech").checked = localsettings.voice_suppress_nonspeech;
document.getElementById("voice_langcode").value = localsettings.voice_langcode;
toggle_opmode();
//sd models display
@ -10618,6 +10620,7 @@ initializeInstructUIFunctionality();
localsettings.tts_speed = document.getElementById("tts_speed").value;
localsettings.voice_end_delay = document.getElementById("voice_end_delay").value;
localsettings.voice_suppress_nonspeech = (document.getElementById("voice_suppress_nonspeech").checked?true:false);
localsettings.voice_langcode = document.getElementById("voice_langcode").value;
localsettings.auto_ctxlen = (document.getElementById("auto_ctxlen").checked ? true : false);
localsettings.auto_genamt = (document.getElementById("auto_genamt").checked ? true : false);
@ -15299,7 +15302,8 @@ initializeInstructUIFunctionality();
let payload = {
"audio_data": dataurl,
"prompt": "",
"suppress_non_speech": localsettings.voice_suppress_nonspeech
"suppress_non_speech": localsettings.voice_suppress_nonspeech,
"langcode": localsettings.voice_langcode,
};
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
method: 'POST',
@ -15349,7 +15353,8 @@ initializeInstructUIFunctionality();
let payload = {
"audio_data": dataurl,
"prompt": "",
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false)
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false),
"langcode": document.getElementById("voice_langcode").value
};
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
method: 'POST',
@ -20229,6 +20234,12 @@ initializeInstructUIFunctionality();
<div class="justifyleft" style="padding:2px" title="Suppress non-speech (e.g. music and sounds) from transcription">Suppress Non-Speech </div>
<input title="Suppress Non-Speech" type="checkbox" id="voice_suppress_nonspeech" style="margin:0px 0px 0px auto;">
</div>
<div class="inlinelabel" style="font-size: 11px;">
<div class="justifyleft" style="padding:2px" title="Language Code">Language </div>
<input class="settinglabel miniinput" type="text" placeholder="en" value="auto" id="voice_langcode" style="height:18px; width: 36px; padding: 2px;">
</div>
<div class="inlinelabel" style="font-size: 11px;">
<div class="justifyleft" style="padding:3px">Voice Delay: </div>
<input title="Voice Delay Milliseconds" type="text" inputmode="decimal" value="300" id="voice_end_delay" style="width:40px">

View file

@ -273,6 +273,7 @@ class whisper_generation_inputs(ctypes.Structure):
_fields_ = [("prompt", ctypes.c_char_p),
("audio_data", ctypes.c_char_p),
("suppress_non_speech", ctypes.c_bool),
("langcode", ctypes.c_char_p),
("quiet", ctypes.c_bool)]
class whisper_generation_outputs(ctypes.Structure):
@ -1252,6 +1253,9 @@ def whisper_generate(genparams):
inputs.prompt = prompt.encode("UTF-8")
inputs.audio_data = audio_data.encode("UTF-8")
inputs.quiet = is_quiet
lc = genparams.get("langcode", "auto")
lc = lc.strip().lower() if (lc and lc.strip().lower()!="") else "auto"
inputs.langcode = lc.encode("UTF-8")
inputs.suppress_non_speech = genparams.get("suppress_non_speech", False)
ret = handle.whisper_generate(inputs)
outstr = ""

View file

@ -5355,13 +5355,19 @@ int whisper_full_with_state(
const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
if (lang_id < 0) {
WHISPER_LOG_ERROR("%s: failed to auto-detect language\n", __func__);
if(params.debug_mode)
{
printf("\n%s: failed to auto-detect language\n", __func__);
}
return -3;
}
state->lang_id = lang_id;
params.language = whisper_lang_str(lang_id);
WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
if(params.debug_mode)
{
printf("\n%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
}
if (params.detect_language) {
return 0;
}
@ -5477,7 +5483,11 @@ int whisper_full_with_state(
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), };
if (whisper_is_multilingual(ctx)) {
const int lang_id = whisper_lang_id(params.language);
int lang_id = whisper_lang_id(params.language);
if(lang_id<0)
{
lang_id = 0; //default to english
}
state->lang_id = lang_id;
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
if (params.translate) {

View file

@ -217,6 +217,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
const std::string b64data = std::string(inputs.audio_data);
const std::string initprompt = std::string(inputs.prompt);
const std::string langcode = std::string(inputs.langcode);
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@ -236,7 +237,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
wparams.print_timestamps = false;
wparams.print_special = false;
wparams.translate = false;
wparams.language = "auto";
wparams.language = langcode.c_str();
wparams.detect_language = false;
wparams.n_threads = 4;
wparams.n_max_text_ctx = wparams.n_max_text_ctx;
@ -248,7 +249,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
wparams.split_on_word = false;
wparams.audio_ctx = 0;
wparams.speed_up = false;
wparams.debug_mode = false;
wparams.debug_mode = (whisperdebugmode==1);
wparams.tdrz_enable = false;
wparams.suppress_regex = nullptr;
wparams.suppress_non_speech_tokens = inputs.suppress_non_speech;