mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
wav file resampling
This commit is contained in:
parent
62ab344b1e
commit
961c789c91
2 changed files with 46 additions and 10 deletions
|
@ -649,6 +649,8 @@ def whisper_generate(genparams):
|
||||||
is_quiet = True if args.quiet else False
|
is_quiet = True if args.quiet else False
|
||||||
prompt = genparams.get("prompt", "")
|
prompt = genparams.get("prompt", "")
|
||||||
audio_data = genparams.get("audio_data", "")
|
audio_data = genparams.get("audio_data", "")
|
||||||
|
if audio_data.startswith("data:audio"):
|
||||||
|
audio_data = audio_data.split(",", 1)[1]
|
||||||
inputs = whisper_generation_inputs()
|
inputs = whisper_generation_inputs()
|
||||||
inputs.prompt = prompt.encode("UTF-8")
|
inputs.prompt = prompt.encode("UTF-8")
|
||||||
inputs.audio_data = audio_data.encode("UTF-8")
|
inputs.audio_data = audio_data.encode("UTF-8")
|
||||||
|
@ -660,7 +662,7 @@ def whisper_generate(genparams):
|
||||||
return outstr
|
return outstr
|
||||||
|
|
||||||
def utfprint(str):
|
def utfprint(str):
|
||||||
maxlen = 30000
|
maxlen = 25000
|
||||||
strlength = len(str)
|
strlength = len(str)
|
||||||
if strlength > maxlen: #limit max output len
|
if strlength > maxlen: #limit max output len
|
||||||
str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"
|
str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"
|
||||||
|
|
|
@ -40,6 +40,34 @@ static bool is_wav_buffer(const std::string buf) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate) {
|
||||||
|
|
||||||
|
size_t input_size = input.size();
|
||||||
|
|
||||||
|
double ratio = static_cast<double>(output_rate) / input_rate;
|
||||||
|
size_t newLength = static_cast<size_t>(input.size() * ratio);
|
||||||
|
std::vector<float> output(newLength);
|
||||||
|
|
||||||
|
if(whisperdebugmode==1)
|
||||||
|
{
|
||||||
|
printf("\nResample wav from %d to %d (in size: %d, out size: %d)", input_rate,output_rate,input_size,output.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perform simple linear interpolation resampling
|
||||||
|
for (size_t i = 0; i < newLength; ++i) {
|
||||||
|
double srcIndex = i / ratio;
|
||||||
|
size_t srcIndexInt = static_cast<size_t>(srcIndex);
|
||||||
|
double frac = srcIndex - srcIndexInt;
|
||||||
|
if (srcIndexInt + 1 < input_size) {
|
||||||
|
output[i] = static_cast<float>(input[srcIndexInt] * (1 - frac) + input[srcIndexInt + 1] * frac);
|
||||||
|
} else {
|
||||||
|
output[i] = input[srcIndexInt];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo)
|
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo)
|
||||||
{
|
{
|
||||||
drwav wav;
|
drwav wav;
|
||||||
|
@ -56,12 +84,6 @@ static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, st
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
|
|
||||||
printf("WAV file must be %i kHz\n", COMMON_SAMPLE_RATE/1000);
|
|
||||||
drwav_uninit(&wav);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (wav.bitsPerSample != 16) {
|
if (wav.bitsPerSample != 16) {
|
||||||
printf("WAV file must be 16-bit\n");
|
printf("WAV file must be 16-bit\n");
|
||||||
drwav_uninit(&wav);
|
drwav_uninit(&wav);
|
||||||
|
@ -75,18 +97,30 @@ static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, st
|
||||||
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
|
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
|
||||||
drwav_uninit(&wav);
|
drwav_uninit(&wav);
|
||||||
|
|
||||||
|
std::vector<float> raw_pcm;
|
||||||
|
raw_pcm.resize(n);
|
||||||
|
|
||||||
// convert to mono, float
|
// convert to mono, float
|
||||||
pcmf32.resize(n);
|
|
||||||
if (wav.channels == 1) {
|
if (wav.channels == 1) {
|
||||||
for (uint64_t i = 0; i < n; i++) {
|
for (uint64_t i = 0; i < n; i++) {
|
||||||
pcmf32[i] = float(pcm16[i])/32768.0f;
|
raw_pcm[i] = float(pcm16[i])/32768.0f;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (uint64_t i = 0; i < n; i++) {
|
for (uint64_t i = 0; i < n; i++) {
|
||||||
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
raw_pcm[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
|
||||||
|
raw_pcm = resample_wav(raw_pcm, wav.sampleRate, COMMON_SAMPLE_RATE);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t finalsize = raw_pcm.size();
|
||||||
|
pcmf32.resize(finalsize);
|
||||||
|
for (uint64_t i = 0; i < finalsize; i++) {
|
||||||
|
pcmf32[i] = raw_pcm[i];
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue