mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
added multilingual support for whisper
This commit is contained in:
parent
0cb599546e
commit
91b6e29af3
6 changed files with 49 additions and 8 deletions
1
expose.h
1
expose.h
|
@ -192,6 +192,7 @@ struct whisper_generation_inputs
|
||||||
const char * prompt = nullptr;
|
const char * prompt = nullptr;
|
||||||
const char * audio_data = nullptr;
|
const char * audio_data = nullptr;
|
||||||
const bool suppress_non_speech = false;
|
const bool suppress_non_speech = false;
|
||||||
|
const char * langcode = nullptr;
|
||||||
const bool quiet = false;
|
const bool quiet = false;
|
||||||
};
|
};
|
||||||
struct whisper_generation_outputs
|
struct whisper_generation_outputs
|
||||||
|
|
|
@ -1344,6 +1344,8 @@
|
||||||
"application/json": {
|
"application/json": {
|
||||||
"example": {
|
"example": {
|
||||||
"prompt": "",
|
"prompt": "",
|
||||||
|
"suppress_non_speech" : false,
|
||||||
|
"langcode": "en",
|
||||||
"audio_data": "base64_wav_data",
|
"audio_data": "base64_wav_data",
|
||||||
},
|
},
|
||||||
"schema": {
|
"schema": {
|
||||||
|
@ -1351,6 +1353,18 @@
|
||||||
"audio_data": {
|
"audio_data": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "Base64 representation of a 16-bit 16kHz wave file to be transcribed to text."
|
"description": "Base64 representation of a 16-bit 16kHz wave file to be transcribed to text."
|
||||||
|
},
|
||||||
|
"prompt": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Prompt to steer the transcription."
|
||||||
|
},
|
||||||
|
"langcode": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Two letter language code, or use auto to autodetect."
|
||||||
|
},
|
||||||
|
"suppress_non_speech": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Prevent noise tokens, always generate words for speech."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"type": "object"
|
"type": "object"
|
||||||
|
|
17
klite.embd
17
klite.embd
|
@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
const LITEVER = 201;
|
const LITEVER = 202;
|
||||||
const urlParams = new URLSearchParams(window.location.search);
|
const urlParams = new URLSearchParams(window.location.search);
|
||||||
var localflag = true;
|
var localflag = true;
|
||||||
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
|
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
|
||||||
|
@ -3044,6 +3044,7 @@ Current version indicated by LITEVER below.
|
||||||
narrate_only_dialog: false,
|
narrate_only_dialog: false,
|
||||||
voice_end_delay: 300,
|
voice_end_delay: 300,
|
||||||
voice_suppress_nonspeech: false,
|
voice_suppress_nonspeech: false,
|
||||||
|
voice_langcode: "auto",
|
||||||
tts_speed: 1.0,
|
tts_speed: 1.0,
|
||||||
image_styles: "",
|
image_styles: "",
|
||||||
image_negprompt: "",
|
image_negprompt: "",
|
||||||
|
@ -10330,6 +10331,7 @@ initializeInstructUIFunctionality();
|
||||||
document.getElementById("tts_speed").value = localsettings.tts_speed;
|
document.getElementById("tts_speed").value = localsettings.tts_speed;
|
||||||
document.getElementById("voice_end_delay").value = localsettings.voice_end_delay;
|
document.getElementById("voice_end_delay").value = localsettings.voice_end_delay;
|
||||||
document.getElementById("voice_suppress_nonspeech").checked = localsettings.voice_suppress_nonspeech;
|
document.getElementById("voice_suppress_nonspeech").checked = localsettings.voice_suppress_nonspeech;
|
||||||
|
document.getElementById("voice_langcode").value = localsettings.voice_langcode;
|
||||||
toggle_opmode();
|
toggle_opmode();
|
||||||
|
|
||||||
//sd models display
|
//sd models display
|
||||||
|
@ -10618,6 +10620,7 @@ initializeInstructUIFunctionality();
|
||||||
localsettings.tts_speed = document.getElementById("tts_speed").value;
|
localsettings.tts_speed = document.getElementById("tts_speed").value;
|
||||||
localsettings.voice_end_delay = document.getElementById("voice_end_delay").value;
|
localsettings.voice_end_delay = document.getElementById("voice_end_delay").value;
|
||||||
localsettings.voice_suppress_nonspeech = (document.getElementById("voice_suppress_nonspeech").checked?true:false);
|
localsettings.voice_suppress_nonspeech = (document.getElementById("voice_suppress_nonspeech").checked?true:false);
|
||||||
|
localsettings.voice_langcode = document.getElementById("voice_langcode").value;
|
||||||
localsettings.auto_ctxlen = (document.getElementById("auto_ctxlen").checked ? true : false);
|
localsettings.auto_ctxlen = (document.getElementById("auto_ctxlen").checked ? true : false);
|
||||||
localsettings.auto_genamt = (document.getElementById("auto_genamt").checked ? true : false);
|
localsettings.auto_genamt = (document.getElementById("auto_genamt").checked ? true : false);
|
||||||
|
|
||||||
|
@ -15299,7 +15302,8 @@ initializeInstructUIFunctionality();
|
||||||
let payload = {
|
let payload = {
|
||||||
"audio_data": dataurl,
|
"audio_data": dataurl,
|
||||||
"prompt": "",
|
"prompt": "",
|
||||||
"suppress_non_speech": localsettings.voice_suppress_nonspeech
|
"suppress_non_speech": localsettings.voice_suppress_nonspeech,
|
||||||
|
"langcode": localsettings.voice_langcode,
|
||||||
};
|
};
|
||||||
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
|
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
|
@ -15349,7 +15353,8 @@ initializeInstructUIFunctionality();
|
||||||
let payload = {
|
let payload = {
|
||||||
"audio_data": dataurl,
|
"audio_data": dataurl,
|
||||||
"prompt": "",
|
"prompt": "",
|
||||||
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false)
|
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false),
|
||||||
|
"langcode": document.getElementById("voice_langcode").value
|
||||||
};
|
};
|
||||||
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
|
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
|
@ -20229,6 +20234,12 @@ initializeInstructUIFunctionality();
|
||||||
<div class="justifyleft" style="padding:2px" title="Suppress non-speech (e.g. music and sounds) from transcription">Suppress Non-Speech </div>
|
<div class="justifyleft" style="padding:2px" title="Suppress non-speech (e.g. music and sounds) from transcription">Suppress Non-Speech </div>
|
||||||
<input title="Suppress Non-Speech" type="checkbox" id="voice_suppress_nonspeech" style="margin:0px 0px 0px auto;">
|
<input title="Suppress Non-Speech" type="checkbox" id="voice_suppress_nonspeech" style="margin:0px 0px 0px auto;">
|
||||||
</div>
|
</div>
|
||||||
|
<div class="inlinelabel" style="font-size: 11px;">
|
||||||
|
<div class="justifyleft" style="padding:2px" title="Language Code">Language </div>
|
||||||
|
<input class="settinglabel miniinput" type="text" placeholder="en" value="auto" id="voice_langcode" style="height:18px; width: 36px; padding: 2px;">
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="inlinelabel" style="font-size: 11px;">
|
<div class="inlinelabel" style="font-size: 11px;">
|
||||||
<div class="justifyleft" style="padding:3px">Voice Delay: </div>
|
<div class="justifyleft" style="padding:3px">Voice Delay: </div>
|
||||||
<input title="Voice Delay Milliseconds" type="text" inputmode="decimal" value="300" id="voice_end_delay" style="width:40px">
|
<input title="Voice Delay Milliseconds" type="text" inputmode="decimal" value="300" id="voice_end_delay" style="width:40px">
|
||||||
|
|
|
@ -273,6 +273,7 @@ class whisper_generation_inputs(ctypes.Structure):
|
||||||
_fields_ = [("prompt", ctypes.c_char_p),
|
_fields_ = [("prompt", ctypes.c_char_p),
|
||||||
("audio_data", ctypes.c_char_p),
|
("audio_data", ctypes.c_char_p),
|
||||||
("suppress_non_speech", ctypes.c_bool),
|
("suppress_non_speech", ctypes.c_bool),
|
||||||
|
("langcode", ctypes.c_char_p),
|
||||||
("quiet", ctypes.c_bool)]
|
("quiet", ctypes.c_bool)]
|
||||||
|
|
||||||
class whisper_generation_outputs(ctypes.Structure):
|
class whisper_generation_outputs(ctypes.Structure):
|
||||||
|
@ -1252,6 +1253,9 @@ def whisper_generate(genparams):
|
||||||
inputs.prompt = prompt.encode("UTF-8")
|
inputs.prompt = prompt.encode("UTF-8")
|
||||||
inputs.audio_data = audio_data.encode("UTF-8")
|
inputs.audio_data = audio_data.encode("UTF-8")
|
||||||
inputs.quiet = is_quiet
|
inputs.quiet = is_quiet
|
||||||
|
lc = genparams.get("langcode", "auto")
|
||||||
|
lc = lc.strip().lower() if (lc and lc.strip().lower()!="") else "auto"
|
||||||
|
inputs.langcode = lc.encode("UTF-8")
|
||||||
inputs.suppress_non_speech = genparams.get("suppress_non_speech", False)
|
inputs.suppress_non_speech = genparams.get("suppress_non_speech", False)
|
||||||
ret = handle.whisper_generate(inputs)
|
ret = handle.whisper_generate(inputs)
|
||||||
outstr = ""
|
outstr = ""
|
||||||
|
|
|
@ -5355,13 +5355,19 @@ int whisper_full_with_state(
|
||||||
|
|
||||||
const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
|
const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
|
||||||
if (lang_id < 0) {
|
if (lang_id < 0) {
|
||||||
WHISPER_LOG_ERROR("%s: failed to auto-detect language\n", __func__);
|
if(params.debug_mode)
|
||||||
|
{
|
||||||
|
printf("\n%s: failed to auto-detect language\n", __func__);
|
||||||
|
}
|
||||||
return -3;
|
return -3;
|
||||||
}
|
}
|
||||||
state->lang_id = lang_id;
|
state->lang_id = lang_id;
|
||||||
params.language = whisper_lang_str(lang_id);
|
params.language = whisper_lang_str(lang_id);
|
||||||
|
|
||||||
WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
|
if(params.debug_mode)
|
||||||
|
{
|
||||||
|
printf("\n%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
|
||||||
|
}
|
||||||
if (params.detect_language) {
|
if (params.detect_language) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -5477,7 +5483,11 @@ int whisper_full_with_state(
|
||||||
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), };
|
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), };
|
||||||
|
|
||||||
if (whisper_is_multilingual(ctx)) {
|
if (whisper_is_multilingual(ctx)) {
|
||||||
const int lang_id = whisper_lang_id(params.language);
|
int lang_id = whisper_lang_id(params.language);
|
||||||
|
if(lang_id<0)
|
||||||
|
{
|
||||||
|
lang_id = 0; //default to english
|
||||||
|
}
|
||||||
state->lang_id = lang_id;
|
state->lang_id = lang_id;
|
||||||
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
|
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
|
||||||
if (params.translate) {
|
if (params.translate) {
|
||||||
|
|
|
@ -217,6 +217,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
|
||||||
|
|
||||||
const std::string b64data = std::string(inputs.audio_data);
|
const std::string b64data = std::string(inputs.audio_data);
|
||||||
const std::string initprompt = std::string(inputs.prompt);
|
const std::string initprompt = std::string(inputs.prompt);
|
||||||
|
const std::string langcode = std::string(inputs.langcode);
|
||||||
|
|
||||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||||
|
@ -236,7 +237,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
|
||||||
wparams.print_timestamps = false;
|
wparams.print_timestamps = false;
|
||||||
wparams.print_special = false;
|
wparams.print_special = false;
|
||||||
wparams.translate = false;
|
wparams.translate = false;
|
||||||
wparams.language = "auto";
|
wparams.language = langcode.c_str();
|
||||||
wparams.detect_language = false;
|
wparams.detect_language = false;
|
||||||
wparams.n_threads = 4;
|
wparams.n_threads = 4;
|
||||||
wparams.n_max_text_ctx = wparams.n_max_text_ctx;
|
wparams.n_max_text_ctx = wparams.n_max_text_ctx;
|
||||||
|
@ -248,7 +249,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
|
||||||
wparams.split_on_word = false;
|
wparams.split_on_word = false;
|
||||||
wparams.audio_ctx = 0;
|
wparams.audio_ctx = 0;
|
||||||
wparams.speed_up = false;
|
wparams.speed_up = false;
|
||||||
wparams.debug_mode = false;
|
wparams.debug_mode = (whisperdebugmode==1);
|
||||||
wparams.tdrz_enable = false;
|
wparams.tdrz_enable = false;
|
||||||
wparams.suppress_regex = nullptr;
|
wparams.suppress_regex = nullptr;
|
||||||
wparams.suppress_non_speech_tokens = inputs.suppress_non_speech;
|
wparams.suppress_non_speech_tokens = inputs.suppress_non_speech;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue