switch to miniaudio, support mp3 for whisper

This commit is contained in:
Concedo 2025-07-13 23:24:07 +08:00
parent e7eb6d3200
commit 66755c8fe9
2 changed files with 31 additions and 78 deletions

View file

@ -3408,6 +3408,7 @@ Current version indicated by LITEVER below.
//section migrated from story itself
extrastopseq: "",
includedefaultstops: true,
tokenbans: "",
logitbiasdict: {},
regexreplace_data: [],
@ -7489,6 +7490,7 @@ Current version indicated by LITEVER below.
new_save_storyobj.wisearchdepth = wi_searchdepth;
new_save_storyobj.wiinsertlocation = wi_insertlocation;
new_save_storyobj.personal_notes = personal_notes;
new_save_storyobj.newlineaftermemory = newlineaftermemory;
new_save_storyobj.documentdb_provider = documentdb_provider;
new_save_storyobj.documentdb_searchhistory = documentdb_searchhistory;
new_save_storyobj.documentdb_numresults = documentdb_numresults;
@ -7709,6 +7711,7 @@ Current version indicated by LITEVER below.
let old_current_memory = current_memory;
let old_current_wi = current_wi;
let old_notes = personal_notes;
let old_newlineaftermemory = newlineaftermemory;
let old_extrastopseq = localsettings.extrastopseq;
let old_regexreplace_data = localsettings.regexreplace_data;
@ -7784,6 +7787,9 @@ Current version indicated by LITEVER below.
if (storyobj.personal_notes) {
personal_notes = storyobj.personal_notes;
}
if (storyobj.newlineaftermemory===true || storyobj.newlineaftermemory===false) {
newlineaftermemory = storyobj.newlineaftermemory;
}
if(storyobj.documentdb_provider)
{
documentdb_provider = storyobj.documentdb_provider;
@ -7910,6 +7916,7 @@ Current version indicated by LITEVER below.
current_anotetemplate = old_current_anotetemplate;
current_memory = old_current_memory;
personal_notes = old_notes;
newlineaftermemory = old_newlineaftermemory;
}
if(!loadworldinfo)
{
@ -12801,6 +12808,7 @@ Current version indicated by LITEVER below.
document.getElementById("websearch_enabled").checked = localsettings.websearch_enabled;
document.getElementById("websearch_multipass").checked = localsettings.websearch_multipass;
document.getElementById("websearch_retain").checked = localsettings.websearch_retain;
document.getElementById("includedefaultstops").checked = localsettings.includedefaultstops;
document.getElementById("websearch_template").value = (localsettings.websearch_template==""?default_websearch_template:localsettings.websearch_template);
if(is_using_kcpp_with_websearch())
{
@ -13281,6 +13289,7 @@ Current version indicated by LITEVER below.
localsettings.websearch_enabled = document.getElementById("websearch_enabled").checked?true:false;
localsettings.websearch_multipass = document.getElementById("websearch_multipass").checked?true:false;
localsettings.websearch_retain = document.getElementById("websearch_retain").checked?true:false;
localsettings.includedefaultstops = document.getElementById("includedefaultstops").checked?true:false;
localsettings.websearch_template = (document.getElementById("websearch_template").value==default_websearch_template?"":document.getElementById("websearch_template").value);
if(document.getElementById("thinking_pattern").value !="" && validate_regex(document.getElementById("thinking_pattern").value))
{
@ -14258,6 +14267,7 @@ Current version indicated by LITEVER below.
documentdb_searchrange = 300;
documentdb_chunksize = 800;
documentdb_data = "";
newlineaftermemory = true;
}
if(localsettings.inject_randomness_seed>0)
{
@ -16606,6 +16616,10 @@ Current version indicated by LITEVER below.
}
}
}
if(!localsettings.includedefaultstops)
{
seqs = [];
}
if (localsettings.extrastopseq != "") {
let rep = replaceAll(localsettings.extrastopseq, "\\n", "\n");
let srep = rep.split("||$||");
@ -25032,7 +25046,14 @@ Current version indicated by LITEVER below.
<div id="settingsmenutokens" class="settingsmenu hidden" onchange="setting_tweaked()">
<div class="settingitem wide" style="font-size:12px">
<div class="justifyleft settinglabel">Extra Stopping Sequences <span class="helpicon">?<span
class="helptext">Triggers the text generator to stop generating early if this sequence appears, in addition to default stop sequences. If you want multiple sequences, separate them with the following delimiter: ||$||</span></span></div>
class="helptext">Triggers the text generator to stop generating early if this sequence appears, in addition to default stop sequences. If you want multiple sequences, separate them with the following delimiter: ||$||</span></span>
<span class="justifyright flex-push-right" >
<div class="settinglabel" style="padding-top: 4px;">
<div class="justifyleft settingsmall" title="Include default stop sequences. Leave enabled if unsure.">Include Default Stops </div>
<input type="checkbox" title="Include default stop sequences." id="includedefaultstops" style="margin:0px 0 0;" checked>
</div>
</span>
</div>
<div class="color_red hidden" id="noextrastopseq">Stop Sequences may be unavailable.</div>
<div style="display: flex; column-gap: 4px; margin-bottom: 4px;">
<input title="Extra Stopping Sequences" class="form-control menuinput_inline" type="text" placeholder="None" value="" id="extrastopseq">

View file

@ -3,9 +3,6 @@
#include "whisper.cpp"
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#include <cmath>
#include <fstream>
#include <cstdio>
@ -43,89 +40,25 @@ static bool is_wav_buffer(const std::string buf) {
return true;
}
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo)
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32)
{
drwav wav;
std::vector<uint8_t> wav_data = kcpp_base64_decode(b64data);
std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(b64data);
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
printf("error: failed to open WAV file from stdin\n");
bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), COMMON_SAMPLE_RATE, pcmf32);
if (!ok) {
printf("\nError: Cannot read input audio file.");
return false;
}
if (wav.channels != 1 && wav.channels != 2) {
printf("WAV file must be mono or stereo\n");
drwav_uninit(&wav);
return false;
}
if (wav.bitsPerSample != 8 && wav.bitsPerSample != 16 && wav.bitsPerSample != 32) {
printf("WAV file must be 8-bit, 16-bit or 32-bit. Detected: %d\n",wav.bitsPerSample);
drwav_uninit(&wav);
return false;
}
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
if (wav.bitsPerSample == 8) {
// Handle 8-bit PCM and convert to 16-bit
std::vector<uint8_t> pcm8(n * wav.channels);
drwav_read_pcm_frames(&wav, n, pcm8.data());
drwav_u8_to_s16(pcm16.data(), pcm8.data(), n * wav.channels);
} else if (wav.bitsPerSample == 16) {
// Handle 16-bit PCM directly
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
} else if (wav.bitsPerSample == 32) {
// Handle 32-bit PCM and convert to 16-bit
std::vector<int32_t> pcm32(n * wav.channels);
drwav_read_pcm_frames_s32(&wav, n, pcm32.data());
for (uint64_t i = 0; i < n * wav.channels; ++i) {
pcm16[i] = static_cast<int16_t>(pcm32[i] >> 16); // Scale down by shifting
}
}
drwav_uninit(&wav);
std::vector<float> raw_pcm;
raw_pcm.resize(n);
if(whisperdebugmode==1 && !whisper_is_quiet)
{
printf("\nwav_data_size: %d, n:%d",wav_data.size(),n);
}
// convert to mono, float
if (wav.channels == 1) {
for (uint64_t i = 0; i < n; i++) {
raw_pcm[i] = float(pcm16[i])/32768.0f;
}
} else {
for (uint64_t i = 0; i < n; i++) {
raw_pcm[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
if(whisperdebugmode==1 && !whisper_is_quiet)
{
printf("\nResample wav from %" PRIu32 " to %" PRIu32 " (in size: %zu)",
wav.sampleRate, COMMON_SAMPLE_RATE, raw_pcm.size());
}
raw_pcm = resample_wav(raw_pcm, wav.sampleRate, COMMON_SAMPLE_RATE);
}
uint64_t finalsize = raw_pcm.size();
pcmf32.resize(finalsize);
for (uint64_t i = 0; i < finalsize; i++) {
pcmf32[i] = raw_pcm[i];
// Debug trace of the decoded sample count. size() yields std::size_t, so it must be printed with %zu; %d is a format/argument mismatch (undefined behaviour).
printf("\nwav_data_size: %zu",pcmf32.size());
}
return true;
}
static std::string output_txt(struct whisper_context * ctx, std::vector<std::vector<float>> pcmf32s) {
static std::string output_txt(struct whisper_context * ctx) {
std::string outtxt = "";
const int n_segments = whisper_full_n_segments(ctx);
@ -216,9 +149,8 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
const std::string langcode = std::string(inputs.langcode);
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
if (!::read_wav(b64data, pcmf32, pcmf32s, false)) {
if (!::read_wav(b64data, pcmf32)) {
printf("\nWhisper: Failed to read input wav data!\n");
output.text = "";
output.status = 0;
@ -270,7 +202,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
}
// output text transcription
whisper_output_text = output_txt(whisper_ctx, pcmf32s);
whisper_output_text = output_txt(whisper_ctx);
std::string ts = get_timestamp_str();
if(!whisper_is_quiet)
{