diff --git a/embd_res/kcpp_musicui.embd b/embd_res/kcpp_musicui.embd index 0e8ee1182..77230ec8c 100644 --- a/embd_res/kcpp_musicui.embd +++ b/embd_res/kcpp_musicui.embd @@ -151,6 +151,14 @@ audio{width:100%;margin-top:6px;} max-width:300px; font-size:13px; } + +input[type="checkbox"] { + height: 16px; + accent-color: var(--accent); + cursor: pointer; +} + + @@ -190,6 +198,13 @@ audio{width:100%;margin-top:6px;}
+
+
+
+
+ +
+
@@ -287,13 +302,21 @@ function toggleAdvanced(){ function getFormData(){ const ids=["caption","lyrics","bpm","duration","keyscale","timesignature", - "vocal_language","seed","lm_temperature","lm_cfg_scale","lm_top_p","inference_steps"]; + "vocal_language","seed","lm_temperature","lm_cfg_scale","lm_top_p","inference_steps", + "guidance_scale","shift","stereo","gen_codes","audio_codes"]; const data={}; ids.forEach(id=>{ const el=document.getElementById(id); if(!el) return; const v=el.value; - if(v!=="") data[id]=isNaN(v)?v:Number(v); + if(v=="on") + { + data[id]=true; + }else if(v=="off") + { + data[id]=false; + } + else if(v!=="") {data[id]=isNaN(v)?v:Number(v);} }); return data; } @@ -385,7 +408,7 @@ async function generateSong(){ } } -function downloadTrackJSON(id){ +function loadTrackJSON(id){ const tx = db.transaction(STORE, "readonly"); const store = tx.objectStore(STORE); const req = store.get(id); @@ -397,17 +420,9 @@ function downloadTrackJSON(id){ return; } - const blob = new Blob( - [JSON.stringify(item.params, null, 2)], - { type: "application/json" } - ); + const data=(item.params); + updateForm(data); - const url = URL.createObjectURL(blob); - const a = document.createElement("a"); - a.href = url; - a.download = `${item.title}.json`; - a.click(); - URL.revokeObjectURL(url); }; req.onerror = function(){ @@ -444,8 +459,8 @@ function loadLibrary(){ - diff --git a/expose.h b/expose.h index 1ae4fad06..939a13ba7 100644 --- a/expose.h +++ b/expose.h @@ -342,13 +342,15 @@ struct music_load_model_inputs }; struct music_generation_inputs { - const bool is_codes = false; //if true, generate codes, else, generate diffusion music + const bool is_planner_mode = false; //if true, generate codes, else, generate diffusion music + const bool stereo = false; + const bool gen_codes = false; const char * input_json = nullptr; }; struct music_generation_outputs { int status = -1; - const char * codes_json = ""; + const char * music_output_json = ""; const char * data = ""; }; diff --git a/koboldcpp.py b/koboldcpp.py index 8b31b09e5..cce3f1fbd 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -453,12 +453,14 @@ class music_load_model_inputs(ctypes.Structure): ("debugmode", ctypes.c_int)] class music_generation_inputs(ctypes.Structure): - _fields_ = [("is_codes", ctypes.c_bool), + _fields_ = [("is_planner_mode", ctypes.c_bool), + ("stereo", ctypes.c_bool), + ("gen_codes", ctypes.c_bool), ("input_json", ctypes.c_char_p)] class music_generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), - ("codes_json", ctypes.c_char_p), + ("music_output_json", ctypes.c_char_p), ("data", ctypes.c_char_p)] class StdoutRedirector: @@ -2383,12 +2385,14 @@ def music_generate_codes(genparams): global args input_json = json.dumps(genparams) inputs = music_generation_inputs() - inputs.is_codes = True + inputs.is_planner_mode = True + inputs.stereo = genparams.get('stereo', False) + inputs.gen_codes = genparams.get('gen_codes', False) inputs.input_json = input_json.encode("UTF-8") ret = handle.music_generate(inputs) outstr = "" if ret.status==1: - outstr = ret.codes_json.decode("UTF-8","ignore") + outstr = ret.music_output_json.decode("UTF-8","ignore") outstr = json.dumps(json.loads(outstr)) return outstr @@ -2396,7 +2400,9 @@ def music_generate_audio(genparams): global args input_json = json.dumps(genparams) inputs = music_generation_inputs() - inputs.is_codes = False + inputs.is_planner_mode = False + inputs.stereo = genparams.get('stereo', False) + inputs.gen_codes = genparams.get('gen_codes', False) inputs.input_json = input_json.encode("UTF-8") ret = handle.music_generate(inputs) outstr = "" diff --git a/otherarch/acestep/dit-vae.cpp b/otherarch/acestep/dit-vae.cpp index d4d02d573..b67256228 100644 --- a/otherarch/acestep/dit-vae.cpp +++ b/otherarch/acestep/dit-vae.cpp @@ -868,8 +868,8 @@ std::string acestep_generate_audio(const music_generation_inputs inputs) // Context building // Silence latent for this T - std::vector silence(Oc * T); - memcpy(silence.data(), silence_full.data(), (size_t)(Oc * T) * sizeof(float)); + // std::vector silence(Oc * T); + // memcpy(silence.data(), silence_full.data(), (size_t)(Oc * T) * sizeof(float)); // Decode audio codes if provided int decoded_T = 0; @@ -895,7 +895,7 @@ std::string acestep_generate_audio(const music_generation_inputs inputs) for (int t = 0; t < T; t++) { const float * src = (t < decoded_T) ? decoded_latents.data() + t * Oc - : silence.data() + t * Oc; + : silence_full.data() + (t - decoded_T) * Oc; for (int c = 0; c < Oc; c++) context_single[t * ctx_ch + c] = src[c]; for (int c = 0; c < Oc; c++) @@ -984,9 +984,15 @@ std::string acestep_generate_audio(const music_generation_inputs inputs) // output wav float muslen = (float)T_audio / 48000.0f; - std::vector mono = mix_planar_stereo_to_mono(audio.data(), T_audio); - std::vector resampled_buf = resample_wav(mono,48000,32000); - std::string finalb64 = save_wav16_base64(resampled_buf, 32000); + std::string finalb64; + if(inputs.stereo) + { + finalb64 = save_stereo_wav16_base64(audio,T_audio,48000); + } else { + std::vector mono = mix_planar_stereo_to_mono(audio.data(), T_audio); + std::vector resampled_buf = resample_wav(mono,48000,32000); + finalb64 = save_wav16_base64(resampled_buf, 32000); + } if(acestep_dit_lowvram) { diff --git a/otherarch/acestep/music_adapter.cpp b/otherarch/acestep/music_adapter.cpp index ec4b8c27a..fab0bcb3b 100644 --- a/otherarch/acestep/music_adapter.cpp +++ b/otherarch/acestep/music_adapter.cpp @@ -25,7 +25,7 @@ static bool music_is_quiet = false; static bool musicgen_loaded = false; static std::string musicvulkandeviceenv; -static std::string codes_json_str = ""; +static std::string music_output_json_str = ""; static std::string b64_music_output = ""; bool musictype_load_model(const music_load_model_inputs inputs) @@ -96,29 +96,29 @@ music_generation_outputs musictype_generate(const music_generation_inputs inputs { printf("\nWarning: KCPP music gen not initialized!\n"); output.status = 0; - output.codes_json = ""; + output.music_output_json = ""; output.data = ""; return output; } - if (inputs.is_codes) { + if (inputs.is_planner_mode) { if (!music_is_quiet) { printf("\nMusic Gen Generating Codes..."); } - codes_json_str = acestep_prepare_request(inputs); - if(codes_json_str=="") + music_output_json_str = acestep_prepare_request(inputs); + if(music_output_json_str=="") { printf("\nMusic codes generation failed!\n"); output.status = 0; - output.codes_json = ""; + output.music_output_json = ""; output.data = ""; return output; } output.status = 1; output.data = ""; - output.codes_json = codes_json_str.c_str(); + output.music_output_json = music_output_json_str.c_str(); if (!music_is_quiet) { - printf("\nMusic Gen Codes Done:\n%s\n",codes_json_str.c_str()); + printf("\nMusic Gen Codes Done:\n%s\n",music_output_json_str.c_str()); } } else { if (!music_is_quiet) { @@ -129,13 +129,13 @@ music_generation_outputs musictype_generate(const music_generation_inputs inputs { printf("\nMusic audio generation failed!\n"); output.status = 0; - output.codes_json = ""; + output.music_output_json = ""; output.data = ""; return output; } output.status = 1; output.data = b64_music_output.c_str(); - output.codes_json = ""; + output.music_output_json = ""; if (!music_is_quiet) { printf("\nMusic Gen Audio Done\n"); } diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 3ada28deb..1834a631d 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -487,6 +487,49 @@ std::string save_wav16_base64(const std::vector &data, int sample_rate) { return kcpp_base64_encode(wav_data); //return as base64 string } +//assumes planar stereo input from acestep +std::string save_stereo_wav16_base64(const std::vector & raw_audio, int T_audio, int sample_rate) { + std::ostringstream oss(std::ios::binary); + const int n_channels = 2; + const int bits = 16; + const int byte_rate = sample_rate * n_channels * (bits / 8); + const int block_align = n_channels * (bits / 8); + const int data_size = T_audio * n_channels * (bits / 8); + const int file_size = 36 + data_size; + oss.write("RIFF", 4); + oss.write(reinterpret_cast(&file_size), 4); + oss.write("WAVE", 4); + oss.write("fmt ", 4); + int32_t fmt_size = 16; + oss.write(reinterpret_cast(&fmt_size), 4); + int16_t audio_fmt = 1; // PCM + oss.write(reinterpret_cast(&audio_fmt), 2); + int16_t nc = n_channels; + oss.write(reinterpret_cast(&nc), 2); + oss.write(reinterpret_cast(&sample_rate), 4); + oss.write(reinterpret_cast(&byte_rate), 4); + int16_t ba = block_align; + oss.write(reinterpret_cast(&ba), 2); + int16_t bp = bits; + oss.write(reinterpret_cast(&bp), 2); + oss.write("data", 4); + oss.write(reinterpret_cast(&data_size), 4); + + // EXPECTS PLANAR INPUT: + // raw_audio[0 ... T_audio-1] = Left + // raw_audio[T_audio ... 2*T_audio-1] = Right + for (int t = 0; t < T_audio; ++t) { + for (int c = 0; c < 2; ++c) { + float s = raw_audio[c * T_audio + t]; + s = std::max(-1.0f, std::min(1.0f, s)); // clamp to [-1, 1] + int16_t v = static_cast(s * 32767.0f); + oss.write(reinterpret_cast(&v), 2); + } + } + std::string wav_data = oss.str(); + return kcpp_base64_encode(wav_data); +} + //a very rudimentary all in one sampling function which has no dependencies int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng) { diff --git a/otherarch/utils.h b/otherarch/utils.h index f993aed5c..b5137c08e 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -156,4 +156,5 @@ struct wav_ulaw_header { #pragma pack(pop) std::string save_ulaw_wav8_base64(const std::vector &data, int sample_rate); -std::string save_wav16_base64(const std::vector &data, int sample_rate); \ No newline at end of file +std::string save_wav16_base64(const std::vector &data, int sample_rate); +std::string save_stereo_wav16_base64(const std::vector & raw_audio, int T_audio, int sample_rate); \ No newline at end of file