switch to miniaudio, support mp3 for whisper

2026-05-30 12:03:38 +00:00 · 2025-07-13 23:24:07 +08:00 · 2025-07-13 23:24:07 +08:00 · 66755c8fe9
commit 66755c8fe9
parent e7eb6d3200
2 changed files with 31 additions and 78 deletions
--- a/klite.embd
+++ b/klite.embd
@ -3408,6 +3408,7 @@ Current version indicated by LITEVER below.

 		//section migrated from story itself
 		extrastopseq: "",
+		includedefaultstops: true,
 		tokenbans: "",
 		logitbiasdict: {},
 		regexreplace_data: [],
@ -7489,6 +7490,7 @@ Current version indicated by LITEVER below.
 		new_save_storyobj.wisearchdepth = wi_searchdepth;
 		new_save_storyobj.wiinsertlocation = wi_insertlocation;
 		new_save_storyobj.personal_notes = personal_notes;
+		new_save_storyobj.newlineaftermemory = newlineaftermemory;
 		new_save_storyobj.documentdb_provider = documentdb_provider;
 		new_save_storyobj.documentdb_searchhistory = documentdb_searchhistory;
 		new_save_storyobj.documentdb_numresults = documentdb_numresults;
@ -7709,6 +7711,7 @@ Current version indicated by LITEVER below.
 			let old_current_memory = current_memory;
 			let old_current_wi = current_wi;
 			let old_notes = personal_notes;
+			let old_newlineaftermemory = newlineaftermemory;

 			let old_extrastopseq = localsettings.extrastopseq;
 			let old_regexreplace_data = localsettings.regexreplace_data;
@ -7784,6 +7787,9 @@ Current version indicated by LITEVER below.
 				if (storyobj.personal_notes) {
 					personal_notes = storyobj.personal_notes;
 				}
+				if (storyobj.newlineaftermemory===true || storyobj.newlineaftermemory===false) {
+					newlineaftermemory = storyobj.newlineaftermemory;
+				}
 				if(storyobj.documentdb_provider)
 				{
 					documentdb_provider = storyobj.documentdb_provider;
@ -7910,6 +7916,7 @@ Current version indicated by LITEVER below.
 					current_anotetemplate = old_current_anotetemplate;
 					current_memory = old_current_memory;
 					personal_notes = old_notes;
+					newlineaftermemory = old_newlineaftermemory;
 				}
 				if(!loadworldinfo)
 				{
@ -12801,6 +12808,7 @@ Current version indicated by LITEVER below.
 		document.getElementById("websearch_enabled").checked = localsettings.websearch_enabled;
 		document.getElementById("websearch_multipass").checked = localsettings.websearch_multipass;
 		document.getElementById("websearch_retain").checked = localsettings.websearch_retain;
+		document.getElementById("includedefaultstops").checked = localsettings.includedefaultstops;
 		document.getElementById("websearch_template").value = (localsettings.websearch_template==""?default_websearch_template:localsettings.websearch_template);
 		if(is_using_kcpp_with_websearch())
 		{
@ -13281,6 +13289,7 @@ Current version indicated by LITEVER below.
 		localsettings.websearch_enabled = document.getElementById("websearch_enabled").checked?true:false;
 		localsettings.websearch_multipass = document.getElementById("websearch_multipass").checked?true:false;
 		localsettings.websearch_retain = document.getElementById("websearch_retain").checked?true:false;
+		localsettings.includedefaultstops = document.getElementById("includedefaultstops").checked?true:false;
 		localsettings.websearch_template = (document.getElementById("websearch_template").value==default_websearch_template?"":document.getElementById("websearch_template").value);
 		if(document.getElementById("thinking_pattern").value !="" && validate_regex(document.getElementById("thinking_pattern").value))
 		{
@ -14258,6 +14267,7 @@ Current version indicated by LITEVER below.
 			documentdb_searchrange = 300;
 			documentdb_chunksize = 800;
 			documentdb_data = "";
+			newlineaftermemory = true;
 		}
 		if(localsettings.inject_randomness_seed>0)
 		{
@ -16606,6 +16616,10 @@ Current version indicated by LITEVER below.
 				}
 			}
 		}
+		if(!localsettings.includedefaultstops)
+		{
+			seqs = [];
+		}
 		if (localsettings.extrastopseq != "") {
 			let rep = replaceAll(localsettings.extrastopseq, "\\n", "\n");
 			let srep = rep.split("||$||");
@ -25032,7 +25046,14 @@ Current version indicated by LITEVER below.
 				<div id="settingsmenutokens" class="settingsmenu hidden" onchange="setting_tweaked()">
 					<div class="settingitem wide" style="font-size:12px">
 					<div class="justifyleft settinglabel">Extra Stopping Sequences <span class="helpicon">?<span
-						class="helptext">Triggers the text generator to stop generating early if this sequence appears, in addition to default stop sequences. If you want multiple sequences, separate them with the following delimiter: ||$||</span></span></div>
+						class="helptext">Triggers the text generator to stop generating early if this sequence appears, in addition to default stop sequences. If you want multiple sequences, separate them with the following delimiter: ||$||</span></span>
+						<span class="justifyright flex-push-right" >
+							<div class="settinglabel" style="padding-top: 4px;">
+								<div class="justifyleft settingsmall" title="Include default stop sequences. Leave enabled if unsure.">Include Default Stops </div>
+							<input type="checkbox" title="Include default stop sequences." id="includedefaultstops" style="margin:0px 0 0;" checked>
+							</div>
+						</span>
+					</div>
 						<div class="color_red hidden" id="noextrastopseq">Stop Sequences may be unavailable.</div>
 						<div style="display: flex; column-gap: 4px; margin-bottom: 4px;">
 						<input title="Extra Stopping Sequences" class="form-control menuinput_inline" type="text" placeholder="None" value="" id="extrastopseq">
--- a/otherarch/whispercpp/whisper_adapter.cpp
+++ b/otherarch/whispercpp/whisper_adapter.cpp
@ -3,9 +3,6 @@

 #include "whisper.cpp"

-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-
 #include <cmath>
 #include <fstream>
 #include <cstdio>
@ -43,89 +40,25 @@ static bool is_wav_buffer(const std::string buf) {
    return true;
 }

-static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo)
+static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32) // base64-encoded audio in b64data -> decoded samples written to pcmf32; returns false if decoding fails
 {
-    drwav wav;
-    std::vector<uint8_t> wav_data = kcpp_base64_decode(b64data);
+    std::vector<uint8_t> media_data_buffer = kcpp_base64_decode(b64data); // raw, still container-encoded audio bytes

-    if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-        printf("error: failed to open WAV file from stdin\n");
+    bool ok = kcpp_decode_audio_from_buf(media_data_buffer.data(), media_data_buffer.size(), COMMON_SAMPLE_RATE, pcmf32); // NOTE(review): presumably decodes + resamples to COMMON_SAMPLE_RATE mono floats - confirm against the helper
+    if (!ok) {
+        printf("\nError: Cannot read input audio file.");
         return false;
     }

-    if (wav.channels != 1 && wav.channels != 2) {
-        printf("WAV file must be mono or stereo\n");
-        drwav_uninit(&wav);
-        return false;
-    }
-
-    if (wav.bitsPerSample != 8 && wav.bitsPerSample != 16 && wav.bitsPerSample != 32) {
-        printf("WAV file must be 8-bit, 16-bit or 32-bit. Detected: %d\n",wav.bitsPerSample);
-        drwav_uninit(&wav);
-        return false;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-
-     if (wav.bitsPerSample == 8) {
-        // Handle 8-bit PCM and convert to 16-bit
-        std::vector<uint8_t> pcm8(n * wav.channels);
-        drwav_read_pcm_frames(&wav, n, pcm8.data());
-        drwav_u8_to_s16(pcm16.data(), pcm8.data(), n * wav.channels);
-    } else if (wav.bitsPerSample == 16) {
-        // Handle 16-bit PCM directly
-        drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    } else if (wav.bitsPerSample == 32) {
-        // Handle 32-bit PCM and convert to 16-bit
-        std::vector<int32_t> pcm32(n * wav.channels);
-        drwav_read_pcm_frames_s32(&wav, n, pcm32.data());
-        for (uint64_t i = 0; i < n * wav.channels; ++i) {
-            pcm16[i] = static_cast<int16_t>(pcm32[i] >> 16); // Scale down by shifting
-        }
-    }
-    drwav_uninit(&wav);
-
-    std::vector<float> raw_pcm;
-    raw_pcm.resize(n);
-
     if(whisperdebugmode==1 && !whisper_is_quiet)
     {
-        printf("\nwav_data_size: %d, n:%d",wav_data.size(),n);
-    }
-
-    // convert to mono, float
-    if (wav.channels == 1) {
-        for (uint64_t i = 0; i < n; i++) {
-            raw_pcm[i] = float(pcm16[i])/32768.0f;
-        }
-    } else {
-        for (uint64_t i = 0; i < n; i++) {
-            raw_pcm[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-        }
-    }
-
-    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-        if(whisperdebugmode==1 && !whisper_is_quiet)
-        {
-            printf("\nResample wav from %" PRIu32 " to %" PRIu32 " (in size: %zu)",
-            wav.sampleRate, COMMON_SAMPLE_RATE, raw_pcm.size());
-        }
-        raw_pcm = resample_wav(raw_pcm, wav.sampleRate, COMMON_SAMPLE_RATE);
-    }
-
-    uint64_t finalsize = raw_pcm.size();
-    pcmf32.resize(finalsize);
-    for (uint64_t i = 0; i < finalsize; i++) {
-        pcmf32[i] = raw_pcm[i];
+        printf("\nwav_data_size: %zu",pcmf32.size()); // size() returns size_t, so %zu is the matching specifier (%d is undefined behaviour here); value is the decoded sample count
     }

     return true;
 }

-static std::string output_txt(struct whisper_context * ctx, std::vector<std::vector<float>> pcmf32s) {
+static std::string output_txt(struct whisper_context * ctx) {

    std::string outtxt = "";
    const int n_segments = whisper_full_n_segments(ctx);
@ -216,9 +149,8 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
    const std::string langcode = std::string(inputs.langcode);

    std::vector<float> pcmf32;               // mono-channel F32 PCM
-    std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

-    if (!::read_wav(b64data, pcmf32, pcmf32s, false)) {
+    if (!::read_wav(b64data, pcmf32)) {
        printf("\nWhisper: Failed to read input wav data!\n");
        output.text = "";
        output.status = 0;
@ -270,7 +202,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
    }

    // output text transcription
-    whisper_output_text = output_txt(whisper_ctx, pcmf32s);
+    whisper_output_text = output_txt(whisper_ctx);
    std::string ts = get_timestamp_str();
    if(!whisper_is_quiet)
    {