wav file resampling

This commit is contained in:
Concedo 2024-05-30 13:41:58 +08:00
parent 62ab344b1e
commit 961c789c91
2 changed files with 46 additions and 10 deletions

View file

@ -649,6 +649,8 @@ def whisper_generate(genparams):
is_quiet = True if args.quiet else False
prompt = genparams.get("prompt", "")
audio_data = genparams.get("audio_data", "")
if audio_data.startswith("data:audio"):
audio_data = audio_data.split(",", 1)[1]
inputs = whisper_generation_inputs()
inputs.prompt = prompt.encode("UTF-8")
inputs.audio_data = audio_data.encode("UTF-8")
@ -660,7 +662,7 @@ def whisper_generate(genparams):
return outstr
def utfprint(str):
maxlen = 30000
maxlen = 25000
strlength = len(str)
if strlength > maxlen: #limit max output len
str = str[:maxlen] + f"... (+{strlength-maxlen} chars)"

View file

@ -40,6 +40,34 @@ static bool is_wav_buffer(const std::string buf) {
return true;
}
// Resamples a mono float PCM buffer from input_rate to output_rate using
// simple linear interpolation between neighbouring input samples.
//
// input:       source samples (not modified).
// input_rate:  sample rate of `input`, in Hz.
// output_rate: desired sample rate of the result, in Hz.
//
// Returns a new buffer of floor(input.size() * output_rate / input_rate)
// samples. An empty buffer is returned when the input is empty, when
// output_rate is 0, or when input_rate is 0 (an invalid rate for which no
// meaningful ratio exists).
static std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate) {
    // A zero input rate would make the ratio infinite/NaN, and converting that
    // to size_t below is undefined behaviour, so bail out early.
    if (input_rate == 0) {
        return std::vector<float>();
    }

    const size_t input_size = input.size();
    const double ratio = static_cast<double>(output_rate) / input_rate;
    const size_t newLength = static_cast<size_t>(input_size * ratio);
    std::vector<float> output(newLength);

    if(whisperdebugmode==1)
    {
        // %u / %zu match the unsigned rate and size_t arguments; passing a
        // size_t to %d is undefined behaviour on 64-bit targets.
        printf("\nResample wav from %u to %u (in size: %zu, out size: %zu)",
               static_cast<unsigned int>(input_rate),
               static_cast<unsigned int>(output_rate),
               input_size,
               output.size());
    }

    // Perform simple linear interpolation resampling
    for (size_t i = 0; i < newLength; ++i) {
        // Position of output sample i expressed in input-sample coordinates.
        const double srcIndex = i / ratio;
        const size_t srcIndexInt = static_cast<size_t>(srcIndex);
        const double frac = srcIndex - srcIndexInt;

        if (srcIndexInt + 1 < input_size) {
            // Blend the two input samples that surround srcIndex.
            output[i] = static_cast<float>(input[srcIndexInt] * (1 - frac) + input[srcIndexInt + 1] * frac);
        } else {
            // No right-hand neighbour: repeat the last sample. The index is
            // clamped defensively so floating-point rounding in srcIndex can
            // never read past the end of the input. newLength > 0 implies
            // input_size > 0, so input_size - 1 cannot wrap here.
            const size_t lastIndex = (srcIndexInt < input_size) ? srcIndexInt : input_size - 1;
            output[i] = input[lastIndex];
        }
    }
    return output;
}
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo)
{
drwav wav;
@ -56,12 +84,6 @@ static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, st
return false;
}
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
printf("WAV file must be %i kHz\n", COMMON_SAMPLE_RATE/1000);
drwav_uninit(&wav);
return false;
}
if (wav.bitsPerSample != 16) {
printf("WAV file must be 16-bit\n");
drwav_uninit(&wav);
@ -75,18 +97,30 @@ static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, st
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
drwav_uninit(&wav);
std::vector<float> raw_pcm;
raw_pcm.resize(n);
// convert to mono, float
pcmf32.resize(n);
if (wav.channels == 1) {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[i])/32768.0f;
raw_pcm[i] = float(pcm16[i])/32768.0f;
}
} else {
for (uint64_t i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
raw_pcm[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
raw_pcm = resample_wav(raw_pcm, wav.sampleRate, COMMON_SAMPLE_RATE);
}
uint64_t finalsize = raw_pcm.size();
pcmf32.resize(finalsize);
for (uint64_t i = 0; i < finalsize; i++) {
pcmf32[i] = raw_pcm[i];
}
return true;
}