mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-08 01:41:37 +00:00
switch to miniaudio, support mp3 for whisper
This commit is contained in:
parent
e7eb6d3200
commit
66755c8fe9
2 changed files with 31 additions and 78 deletions
23
klite.embd
23
klite.embd
|
|
@ -3408,6 +3408,7 @@ Current version indicated by LITEVER below.
|
|||
|
||||
//section migrated from story itself
|
||||
extrastopseq: "",
|
||||
includedefaultstops: true,
|
||||
tokenbans: "",
|
||||
logitbiasdict: {},
|
||||
regexreplace_data: [],
|
||||
|
|
@ -7489,6 +7490,7 @@ Current version indicated by LITEVER below.
|
|||
new_save_storyobj.wisearchdepth = wi_searchdepth;
|
||||
new_save_storyobj.wiinsertlocation = wi_insertlocation;
|
||||
new_save_storyobj.personal_notes = personal_notes;
|
||||
new_save_storyobj.newlineaftermemory = newlineaftermemory;
|
||||
new_save_storyobj.documentdb_provider = documentdb_provider;
|
||||
new_save_storyobj.documentdb_searchhistory = documentdb_searchhistory;
|
||||
new_save_storyobj.documentdb_numresults = documentdb_numresults;
|
||||
|
|
@ -7709,6 +7711,7 @@ Current version indicated by LITEVER below.
|
|||
let old_current_memory = current_memory;
|
||||
let old_current_wi = current_wi;
|
||||
let old_notes = personal_notes;
|
||||
let old_newlineaftermemory = newlineaftermemory;
|
||||
|
||||
let old_extrastopseq = localsettings.extrastopseq;
|
||||
let old_regexreplace_data = localsettings.regexreplace_data;
|
||||
|
|
@ -7784,6 +7787,9 @@ Current version indicated by LITEVER below.
|
|||
if (storyobj.personal_notes) {
|
||||
personal_notes = storyobj.personal_notes;
|
||||
}
|
||||
if (storyobj.newlineaftermemory===true || storyobj.newlineaftermemory===false) {
|
||||
newlineaftermemory = storyobj.newlineaftermemory;
|
||||
}
|
||||
if(storyobj.documentdb_provider)
|
||||
{
|
||||
documentdb_provider = storyobj.documentdb_provider;
|
||||
|
|
@ -7910,6 +7916,7 @@ Current version indicated by LITEVER below.
|
|||
current_anotetemplate = old_current_anotetemplate;
|
||||
current_memory = old_current_memory;
|
||||
personal_notes = old_notes;
|
||||
newlineaftermemory = old_newlineaftermemory;
|
||||
}
|
||||
if(!loadworldinfo)
|
||||
{
|
||||
|
|
@ -12801,6 +12808,7 @@ Current version indicated by LITEVER below.
|
|||
document.getElementById("websearch_enabled").checked = localsettings.websearch_enabled;
|
||||
document.getElementById("websearch_multipass").checked = localsettings.websearch_multipass;
|
||||
document.getElementById("websearch_retain").checked = localsettings.websearch_retain;
|
||||
document.getElementById("includedefaultstops").checked = localsettings.includedefaultstops;
|
||||
document.getElementById("websearch_template").value = (localsettings.websearch_template==""?default_websearch_template:localsettings.websearch_template);
|
||||
if(is_using_kcpp_with_websearch())
|
||||
{
|
||||
|
|
@ -13281,6 +13289,7 @@ Current version indicated by LITEVER below.
|
|||
localsettings.websearch_enabled = document.getElementById("websearch_enabled").checked?true:false;
|
||||
localsettings.websearch_multipass = document.getElementById("websearch_multipass").checked?true:false;
|
||||
localsettings.websearch_retain = document.getElementById("websearch_retain").checked?true:false;
|
||||
localsettings.includedefaultstops = document.getElementById("includedefaultstops").checked?true:false;
|
||||
localsettings.websearch_template = (document.getElementById("websearch_template").value==default_websearch_template?"":document.getElementById("websearch_template").value);
|
||||
if(document.getElementById("thinking_pattern").value !="" && validate_regex(document.getElementById("thinking_pattern").value))
|
||||
{
|
||||
|
|
@ -14258,6 +14267,7 @@ Current version indicated by LITEVER below.
|
|||
documentdb_searchrange = 300;
|
||||
documentdb_chunksize = 800;
|
||||
documentdb_data = "";
|
||||
newlineaftermemory = true;
|
||||
}
|
||||
if(localsettings.inject_randomness_seed>0)
|
||||
{
|
||||
|
|
@ -16606,6 +16616,10 @@ Current version indicated by LITEVER below.
|
|||
}
|
||||
}
|
||||
}
|
||||
if(!localsettings.includedefaultstops)
|
||||
{
|
||||
seqs = [];
|
||||
}
|
||||
if (localsettings.extrastopseq != "") {
|
||||
let rep = replaceAll(localsettings.extrastopseq, "\\n", "\n");
|
||||
let srep = rep.split("||$||");
|
||||
|
|
@ -25032,7 +25046,14 @@ Current version indicated by LITEVER below.
|
|||
<div id="settingsmenutokens" class="settingsmenu hidden" onchange="setting_tweaked()">
|
||||
<div class="settingitem wide" style="font-size:12px">
|
||||
<div class="justifyleft settinglabel">Extra Stopping Sequences <span class="helpicon">?<span
|
||||
class="helptext">Triggers the text generator to stop generating early if this sequence appears, in addition to default stop sequences. If you want multiple sequences, separate them with the following delimiter: ||$||</span></span></div>
|
||||
class="helptext">Triggers the text generator to stop generating early if this sequence appears, in addition to default stop sequences. If you want multiple sequences, separate them with the following delimiter: ||$||</span></span>
|
||||
<span class="justifyright flex-push-right" >
|
||||
<div class="settinglabel" style="padding-top: 4px;">
|
||||
<div class="justifyleft settingsmall" title="Include default stop sequences. Leave enabled if unsure.">Include Default Stops </div>
|
||||
<input type="checkbox" title="Include default stop sequences." id="includedefaultstops" style="margin:0px 0 0;" checked>
|
||||
</div>
|
||||
</span>
|
||||
</div>
|
||||
<div class="color_red hidden" id="noextrastopseq">Stop Sequences may be unavailable.</div>
|
||||
<div style="display: flex; column-gap: 4px; margin-bottom: 4px;">
|
||||
<input title="Extra Stopping Sequences" class="form-control menuinput_inline" type="text" placeholder="None" value="" id="extrastopseq">
|
||||
|
|
|
|||
|
|
@ -3,9 +3,6 @@
|
|||
|
||||
#include "whisper.cpp"
|
||||
|
||||
#define DR_WAV_IMPLEMENTATION
|
||||
#include "dr_wav.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include <cstdio>
|
||||
|
|
@ -43,89 +40,25 @@ static bool is_wav_buffer(const std::string buf) {
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo)
|
||||
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32)
|
||||
{
|
||||
drwav wav;
|
||||
std::vector<uint8_t> wav_data = kcpp_base64_decode(b64data);
|
||||
std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(b64data);
|
||||
|
||||
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
|
||||
printf("error: failed to open WAV file from stdin\n");
|
||||
bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), COMMON_SAMPLE_RATE, pcmf32);
|
||||
if (!ok) {
|
||||
printf("\nError: Cannot read input audio file.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (wav.channels != 1 && wav.channels != 2) {
|
||||
printf("WAV file must be mono or stereo\n");
|
||||
drwav_uninit(&wav);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (wav.bitsPerSample != 8 && wav.bitsPerSample != 16 && wav.bitsPerSample != 32) {
|
||||
printf("WAV file must be 8-bit, 16-bit or 32-bit. Detected: %d\n",wav.bitsPerSample);
|
||||
drwav_uninit(&wav);
|
||||
return false;
|
||||
}
|
||||
|
||||
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
|
||||
|
||||
std::vector<int16_t> pcm16;
|
||||
pcm16.resize(n*wav.channels);
|
||||
|
||||
if (wav.bitsPerSample == 8) {
|
||||
// Handle 8-bit PCM and convert to 16-bit
|
||||
std::vector<uint8_t> pcm8(n * wav.channels);
|
||||
drwav_read_pcm_frames(&wav, n, pcm8.data());
|
||||
drwav_u8_to_s16(pcm16.data(), pcm8.data(), n * wav.channels);
|
||||
} else if (wav.bitsPerSample == 16) {
|
||||
// Handle 16-bit PCM directly
|
||||
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
|
||||
} else if (wav.bitsPerSample == 32) {
|
||||
// Handle 32-bit PCM and convert to 16-bit
|
||||
std::vector<int32_t> pcm32(n * wav.channels);
|
||||
drwav_read_pcm_frames_s32(&wav, n, pcm32.data());
|
||||
for (uint64_t i = 0; i < n * wav.channels; ++i) {
|
||||
pcm16[i] = static_cast<int16_t>(pcm32[i] >> 16); // Scale down by shifting
|
||||
}
|
||||
}
|
||||
drwav_uninit(&wav);
|
||||
|
||||
std::vector<float> raw_pcm;
|
||||
raw_pcm.resize(n);
|
||||
|
||||
if(whisperdebugmode==1 && !whisper_is_quiet)
|
||||
{
|
||||
printf("\nwav_data_size: %d, n:%d",wav_data.size(),n);
|
||||
}
|
||||
|
||||
// convert to mono, float
|
||||
if (wav.channels == 1) {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
raw_pcm[i] = float(pcm16[i])/32768.0f;
|
||||
}
|
||||
} else {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
raw_pcm[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
||||
}
|
||||
}
|
||||
|
||||
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
|
||||
if(whisperdebugmode==1 && !whisper_is_quiet)
|
||||
{
|
||||
printf("\nResample wav from %" PRIu32 " to %" PRIu32 " (in size: %zu)",
|
||||
wav.sampleRate, COMMON_SAMPLE_RATE, raw_pcm.size());
|
||||
}
|
||||
raw_pcm = resample_wav(raw_pcm, wav.sampleRate, COMMON_SAMPLE_RATE);
|
||||
}
|
||||
|
||||
uint64_t finalsize = raw_pcm.size();
|
||||
pcmf32.resize(finalsize);
|
||||
for (uint64_t i = 0; i < finalsize; i++) {
|
||||
pcmf32[i] = raw_pcm[i];
|
||||
printf("\nwav_data_size: %d",pcmf32.size());
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::string output_txt(struct whisper_context * ctx, std::vector<std::vector<float>> pcmf32s) {
|
||||
static std::string output_txt(struct whisper_context * ctx) {
|
||||
|
||||
std::string outtxt = "";
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
|
|
@ -216,9 +149,8 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
|
|||
const std::string langcode = std::string(inputs.langcode);
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
|
||||
if (!::read_wav(b64data, pcmf32, pcmf32s, false)) {
|
||||
if (!::read_wav(b64data, pcmf32)) {
|
||||
printf("\nWhisper: Failed to read input wav data!\n");
|
||||
output.text = "";
|
||||
output.status = 0;
|
||||
|
|
@ -270,7 +202,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
|
|||
}
|
||||
|
||||
// output text transcription
|
||||
whisper_output_text = output_txt(whisper_ctx, pcmf32s);
|
||||
whisper_output_text = output_txt(whisper_ctx);
|
||||
std::string ts = get_timestamp_str();
|
||||
if(!whisper_is_quiet)
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue