mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
stereo is working
This commit is contained in:
parent
5a57ed8ca4
commit
ba42f22fc8
7 changed files with 112 additions and 39 deletions
|
|
@ -151,6 +151,14 @@ audio{width:100%;margin-top:6px;}
|
|||
max-width:300px;
|
||||
font-size:13px;
|
||||
}
|
||||
|
||||
input[type="checkbox"] {
|
||||
height: 16px;
|
||||
accent-color: var(--accent);
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
|
@ -190,6 +198,13 @@ audio{width:100%;margin-top:6px;}
|
|||
<div><label>CFG</label><input id="lm_cfg_scale" type="number" step="0.1"></div>
|
||||
<div><label>Top P</label><input id="lm_top_p" type="number" step="0.01"></div>
|
||||
<div><label>Steps</label><input id="inference_steps" type="number"></div>
|
||||
<div><label>Guidance</label><input id="guidance_scale" type="number"></div>
|
||||
<div><label>Shift</label><input id="shift" type="number"></div>
|
||||
<div><label>HD Stereo</label><input id="stereo" type="checkbox"></div>
|
||||
<div><label>Gen Codes</label><input id="gen_codes" type="checkbox"></div>
|
||||
</div>
|
||||
<div>
|
||||
<div><label>AudioCodes</label><textarea id="audio_codes"></textarea></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
@ -287,13 +302,21 @@ function toggleAdvanced(){
|
|||
|
||||
/**
 * Collects the current generation parameters from the form.
 *
 * Looks up each known field id in the document and builds a plain object:
 *  - ids with no matching element are skipped;
 *  - checkboxes contribute a boolean taken from their `checked` state;
 *  - other fields are skipped when empty, stored as a Number when the text
 *    is numeric, and stored as the raw string otherwise.
 *
 * @returns {Object} map of field id to boolean, number or string
 */
function getFormData(){
    const ids=["caption","lyrics","bpm","duration","keyscale","timesignature",
        "vocal_language","seed","lm_temperature","lm_cfg_scale","lm_top_p","inference_steps",
        "guidance_scale","shift","stereo","gen_codes","audio_codes"];
    const data={};
    ids.forEach(id=>{
        const el=document.getElementById(id);
        if(!el) return;
        if(el.type==="checkbox")
        {
            // A checkbox's .value is the constant "on" whether or not it is
            // ticked, so the state has to come from .checked instead.
            data[id]=!!el.checked;
            return;
        }
        const v=el.value;
        if(v!=="") {data[id]=isNaN(v)?v:Number(v);}
    });
    return data;
}
|
||||
|
|
@ -385,7 +408,7 @@ async function generateSong(){
|
|||
}
|
||||
}
|
||||
|
||||
function downloadTrackJSON(id){
|
||||
function loadTrackJSON(id){
|
||||
const tx = db.transaction(STORE, "readonly");
|
||||
const store = tx.objectStore(STORE);
|
||||
const req = store.get(id);
|
||||
|
|
@ -397,17 +420,9 @@ function downloadTrackJSON(id){
|
|||
return;
|
||||
}
|
||||
|
||||
const blob = new Blob(
|
||||
[JSON.stringify(item.params, null, 2)],
|
||||
{ type: "application/json" }
|
||||
);
|
||||
const data=(item.params);
|
||||
updateForm(data);
|
||||
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement("a");
|
||||
a.href = url;
|
||||
a.download = `${item.title}.json`;
|
||||
a.click();
|
||||
URL.revokeObjectURL(url);
|
||||
};
|
||||
|
||||
req.onerror = function(){
|
||||
|
|
@ -444,8 +459,8 @@ function loadLibrary(){
|
|||
<a href="${url}" download="${item.title}.wav">
|
||||
<button class="secondary">Download</button>
|
||||
</a>
|
||||
<button class="secondary" onclick="downloadTrackJSON(${item.id})">
|
||||
Get JSON
|
||||
<button class="secondary" onclick="loadTrackJSON(${item.id})">
|
||||
Load Params
|
||||
</button>
|
||||
|
||||
<button class="danger" onclick="deleteTrack(${item.id})">Delete</button>
|
||||
|
|
|
|||
6
expose.h
6
expose.h
|
|
@ -342,13 +342,15 @@ struct music_load_model_inputs
|
|||
};
|
||||
struct music_generation_inputs
|
||||
{
|
||||
const bool is_codes = false; //if true, generate codes, else, generate diffusion music
|
||||
const bool is_planner_mode = false; //if true, generate codes, else, generate diffusion music
|
||||
const bool stereo = false;
|
||||
const bool gen_codes = false;
|
||||
const char * input_json = nullptr;
|
||||
};
|
||||
struct music_generation_outputs
|
||||
{
|
||||
int status = -1;
|
||||
const char * codes_json = "";
|
||||
const char * music_output_json = "";
|
||||
const char * data = "";
|
||||
};
|
||||
|
||||
|
|
|
|||
16
koboldcpp.py
16
koboldcpp.py
|
|
@ -453,12 +453,14 @@ class music_load_model_inputs(ctypes.Structure):
|
|||
("debugmode", ctypes.c_int)]
|
||||
|
||||
class music_generation_inputs(ctypes.Structure):
|
||||
_fields_ = [("is_codes", ctypes.c_bool),
|
||||
_fields_ = [("is_planner_mode", ctypes.c_bool),
|
||||
("stereo", ctypes.c_bool),
|
||||
("gen_codes", ctypes.c_bool),
|
||||
("input_json", ctypes.c_char_p)]
|
||||
|
||||
class music_generation_outputs(ctypes.Structure):
|
||||
_fields_ = [("status", ctypes.c_int),
|
||||
("codes_json", ctypes.c_char_p),
|
||||
("music_output_json", ctypes.c_char_p),
|
||||
("data", ctypes.c_char_p)]
|
||||
|
||||
class StdoutRedirector:
|
||||
|
|
@ -2383,12 +2385,14 @@ def music_generate_codes(genparams):
|
|||
global args
|
||||
input_json = json.dumps(genparams)
|
||||
inputs = music_generation_inputs()
|
||||
inputs.is_codes = True
|
||||
inputs.is_planner_mode = True
|
||||
inputs.stereo = genparams.get('stereo', False)
|
||||
inputs.gen_codes = genparams.get('gen_codes', False)
|
||||
inputs.input_json = input_json.encode("UTF-8")
|
||||
ret = handle.music_generate(inputs)
|
||||
outstr = ""
|
||||
if ret.status==1:
|
||||
outstr = ret.codes_json.decode("UTF-8","ignore")
|
||||
outstr = ret.music_output_json.decode("UTF-8","ignore")
|
||||
outstr = json.dumps(json.loads(outstr))
|
||||
return outstr
|
||||
|
||||
|
|
@ -2396,7 +2400,9 @@ def music_generate_audio(genparams):
|
|||
global args
|
||||
input_json = json.dumps(genparams)
|
||||
inputs = music_generation_inputs()
|
||||
inputs.is_codes = False
|
||||
inputs.is_planner_mode = False
|
||||
inputs.stereo = genparams.get('stereo', False)
|
||||
inputs.gen_codes = genparams.get('gen_codes', False)
|
||||
inputs.input_json = input_json.encode("UTF-8")
|
||||
ret = handle.music_generate(inputs)
|
||||
outstr = ""
|
||||
|
|
|
|||
|
|
@ -868,8 +868,8 @@ std::string acestep_generate_audio(const music_generation_inputs inputs)
|
|||
|
||||
// Context building
|
||||
// Silence latent for this T
|
||||
std::vector<float> silence(Oc * T);
|
||||
memcpy(silence.data(), silence_full.data(), (size_t)(Oc * T) * sizeof(float));
|
||||
// std::vector<float> silence(Oc * T);
|
||||
// memcpy(silence.data(), silence_full.data(), (size_t)(Oc * T) * sizeof(float));
|
||||
|
||||
// Decode audio codes if provided
|
||||
int decoded_T = 0;
|
||||
|
|
@ -895,7 +895,7 @@ std::string acestep_generate_audio(const music_generation_inputs inputs)
|
|||
for (int t = 0; t < T; t++) {
|
||||
const float * src = (t < decoded_T)
|
||||
? decoded_latents.data() + t * Oc
|
||||
: silence.data() + t * Oc;
|
||||
: silence_full.data() + (t - decoded_T) * Oc;
|
||||
for (int c = 0; c < Oc; c++)
|
||||
context_single[t * ctx_ch + c] = src[c];
|
||||
for (int c = 0; c < Oc; c++)
|
||||
|
|
@ -984,9 +984,15 @@ std::string acestep_generate_audio(const music_generation_inputs inputs)
|
|||
|
||||
// output wav
|
||||
float muslen = (float)T_audio / 48000.0f;
|
||||
std::vector<float> mono = mix_planar_stereo_to_mono(audio.data(), T_audio);
|
||||
std::vector<float> resampled_buf = resample_wav(mono,48000,32000);
|
||||
std::string finalb64 = save_wav16_base64(resampled_buf, 32000);
|
||||
std::string finalb64;
|
||||
if(inputs.stereo)
|
||||
{
|
||||
finalb64 = save_stereo_wav16_base64(audio,T_audio,48000);
|
||||
} else {
|
||||
std::vector<float> mono = mix_planar_stereo_to_mono(audio.data(), T_audio);
|
||||
std::vector<float> resampled_buf = resample_wav(mono,48000,32000);
|
||||
finalb64 = save_wav16_base64(resampled_buf, 32000);
|
||||
}
|
||||
|
||||
if(acestep_dit_lowvram)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ static bool music_is_quiet = false;
|
|||
static bool musicgen_loaded = false;
|
||||
static std::string musicvulkandeviceenv;
|
||||
|
||||
static std::string codes_json_str = "";
|
||||
static std::string music_output_json_str = "";
|
||||
static std::string b64_music_output = "";
|
||||
|
||||
bool musictype_load_model(const music_load_model_inputs inputs)
|
||||
|
|
@ -96,29 +96,29 @@ music_generation_outputs musictype_generate(const music_generation_inputs inputs
|
|||
{
|
||||
printf("\nWarning: KCPP music gen not initialized!\n");
|
||||
output.status = 0;
|
||||
output.codes_json = "";
|
||||
output.music_output_json = "";
|
||||
output.data = "";
|
||||
return output;
|
||||
}
|
||||
|
||||
if (inputs.is_codes) {
|
||||
if (inputs.is_planner_mode) {
|
||||
if (!music_is_quiet) {
|
||||
printf("\nMusic Gen Generating Codes...");
|
||||
}
|
||||
codes_json_str = acestep_prepare_request(inputs);
|
||||
if(codes_json_str=="")
|
||||
music_output_json_str = acestep_prepare_request(inputs);
|
||||
if(music_output_json_str=="")
|
||||
{
|
||||
printf("\nMusic codes generation failed!\n");
|
||||
output.status = 0;
|
||||
output.codes_json = "";
|
||||
output.music_output_json = "";
|
||||
output.data = "";
|
||||
return output;
|
||||
}
|
||||
output.status = 1;
|
||||
output.data = "";
|
||||
output.codes_json = codes_json_str.c_str();
|
||||
output.music_output_json = music_output_json_str.c_str();
|
||||
if (!music_is_quiet) {
|
||||
printf("\nMusic Gen Codes Done:\n%s\n",codes_json_str.c_str());
|
||||
printf("\nMusic Gen Codes Done:\n%s\n",music_output_json_str.c_str());
|
||||
}
|
||||
} else {
|
||||
if (!music_is_quiet) {
|
||||
|
|
@ -129,13 +129,13 @@ music_generation_outputs musictype_generate(const music_generation_inputs inputs
|
|||
{
|
||||
printf("\nMusic audio generation failed!\n");
|
||||
output.status = 0;
|
||||
output.codes_json = "";
|
||||
output.music_output_json = "";
|
||||
output.data = "";
|
||||
return output;
|
||||
}
|
||||
output.status = 1;
|
||||
output.data = b64_music_output.c_str();
|
||||
output.codes_json = "";
|
||||
output.music_output_json = "";
|
||||
if (!music_is_quiet) {
|
||||
printf("\nMusic Gen Audio Done\n");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -487,6 +487,49 @@ std::string save_wav16_base64(const std::vector<float> &data, int sample_rate) {
|
|||
return kcpp_base64_encode(wav_data); //return as base64 string
|
||||
}
|
||||
|
||||
//assumes planar stereo input from acestep
|
||||
std::string save_stereo_wav16_base64(const std::vector<float> & raw_audio, int T_audio, int sample_rate) {
|
||||
std::ostringstream oss(std::ios::binary);
|
||||
const int n_channels = 2;
|
||||
const int bits = 16;
|
||||
const int byte_rate = sample_rate * n_channels * (bits / 8);
|
||||
const int block_align = n_channels * (bits / 8);
|
||||
const int data_size = T_audio * n_channels * (bits / 8);
|
||||
const int file_size = 36 + data_size;
|
||||
oss.write("RIFF", 4);
|
||||
oss.write(reinterpret_cast<const char*>(&file_size), 4);
|
||||
oss.write("WAVE", 4);
|
||||
oss.write("fmt ", 4);
|
||||
int32_t fmt_size = 16;
|
||||
oss.write(reinterpret_cast<const char*>(&fmt_size), 4);
|
||||
int16_t audio_fmt = 1; // PCM
|
||||
oss.write(reinterpret_cast<const char*>(&audio_fmt), 2);
|
||||
int16_t nc = n_channels;
|
||||
oss.write(reinterpret_cast<const char*>(&nc), 2);
|
||||
oss.write(reinterpret_cast<const char*>(&sample_rate), 4);
|
||||
oss.write(reinterpret_cast<const char*>(&byte_rate), 4);
|
||||
int16_t ba = block_align;
|
||||
oss.write(reinterpret_cast<const char*>(&ba), 2);
|
||||
int16_t bp = bits;
|
||||
oss.write(reinterpret_cast<const char*>(&bp), 2);
|
||||
oss.write("data", 4);
|
||||
oss.write(reinterpret_cast<const char*>(&data_size), 4);
|
||||
|
||||
// EXPECTS PLANAR INPUT:
|
||||
// raw_audio[0 ... T_audio-1] = Left
|
||||
// raw_audio[T_audio ... 2*T_audio-1] = Right
|
||||
for (int t = 0; t < T_audio; ++t) {
|
||||
for (int c = 0; c < 2; ++c) {
|
||||
float s = raw_audio[c * T_audio + t];
|
||||
s = std::max(-1.0f, std::min(1.0f, s)); // clamp to [-1, 1]
|
||||
int16_t v = static_cast<int16_t>(s * 32767.0f);
|
||||
oss.write(reinterpret_cast<const char*>(&v), 2);
|
||||
}
|
||||
}
|
||||
std::string wav_data = oss.str();
|
||||
return kcpp_base64_encode(wav_data);
|
||||
}
|
||||
|
||||
//a very rudimentary all in one sampling function which has no dependencies
|
||||
int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int32_t> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -156,4 +156,5 @@ struct wav_ulaw_header {
|
|||
#pragma pack(pop)
|
||||
|
||||
std::string save_ulaw_wav8_base64(const std::vector<float> &data, int sample_rate);
|
||||
std::string save_wav16_base64(const std::vector<float> &data, int sample_rate);
|
||||
std::string save_wav16_base64(const std::vector<float> &data, int sample_rate);
|
||||
std::string save_stereo_wav16_base64(const std::vector<float> & raw_audio, int T_audio, int sample_rate);
|
||||
Loading…
Add table
Add a link
Reference in a new issue