All working, but VRAM usage still needs to be optimized

This commit is contained in:
Concedo 2026-02-24 21:55:57 +08:00
parent 488c431331
commit aa58d1ed3b
5 changed files with 40 additions and 6 deletions

View file

@ -1484,6 +1484,10 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
ace.timesignature = req.timesignature;
ace.vocal_language = req.vocal_language;
//kcpp: codes suck don't use them
req.thinking = false;
req.audio_codes = "";
bool user_has_codes = !req.audio_codes.empty();
bool need_lm_codes = req.thinking && !user_has_codes;
@ -1578,6 +1582,12 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
if (!batch_codes[0].empty()) rr.audio_codes = batch_codes[0];
rr.seed = seed;
std::string prefix_erase = "# Lyric";
// Check if the string is long enough and starts with the prefix
if (rr.lyrics.size() >= prefix_erase.size() && rr.lyrics.compare(0, prefix_erase.size(), prefix_erase) == 0) {
rr.lyrics = rr.lyrics.substr(prefix_erase.size()); // Returns a new string starting after the prefix
}
//now convert to string
std::ostringstream oss;
oss << "{\n";

View file

@ -692,6 +692,7 @@ std::string acestep_generate_audio(const music_generation_inputs inputs)
if (req.caption.empty()) {
req.caption = "An interesting song";
}
req.thinking = false;
const int FRAMES_PER_SECOND = 25;
int Oc = music_dit_cfg.out_channels; // 64
@ -917,11 +918,21 @@ std::string acestep_generate_audio(const music_generation_inputs inputs)
}
}
// output wav
std::vector<float> resampled_buf = resample_wav(audio,48000,24000);
std::string finalb64 = save_ulaw_wav8_base64(audio, 24000);
// std::string opath = "egghenlo.wav";
// if (write_wav(opath.c_str(), audio.data(), T_audio, 48000)) {
// fprintf(stderr, "[VAE Batch%d] Wrote %s: %d samples (%.2fs @ 48kHz stereo)\n",
// b, opath.c_str(), T_audio, (float)T_audio / 48000.0f);
// } else {
// fprintf(stderr, "[VAE Batch%d] FATAL: failed to write %s\n", b, opath.c_str());
// }
fprintf(stderr, "[Request Done]\n");
// output wav
float muslen = (float)T_audio / 48000.0f;
std::vector<float> mono = mix_planar_stereo_to_mono(audio.data(), T_audio);
std::vector<float> resampled_buf = resample_wav(mono,48000,32000);
std::string finalb64 = save_wav16_base64(resampled_buf, 32000);
fprintf(stderr, "[Request Done: Music Length %.2fs]\n",muslen);
return finalb64;
}

View file

@ -22,12 +22,12 @@ void request_init(AceRequest * r) {
r->vocal_language = "unknown";
r->task_type = "text2music";
r->seed = -1;
r->thinking = true;
r->thinking = false;
r->lm_temperature = 0.85f;
r->lm_cfg_scale = 2.0f;
r->lm_top_p = 0.9f;
r->lm_top_k = 0;
r->lm_negative_prompt = "NO USER INPUT";
r->lm_negative_prompt = "";
r->audio_codes = "";
r->inference_steps = 8;
r->guidance_scale = 1.0f;

View file

@ -389,6 +389,18 @@ std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_
return output;
}
// Downmixes planar (non-interleaved) two-channel audio to a single channel.
//
// `audio` is read as two back-to-back planes of `T_audio` samples each:
// the first plane occupies audio[0 .. T_audio) and the second plane occupies
// audio[T_audio .. 2*T_audio). Each output sample is the arithmetic mean of
// the two corresponding input samples.
//
// Returns a vector holding `T_audio` mono samples. Returns an empty vector
// when `audio` is null or `T_audio` is not positive.
std::vector<float> mix_planar_stereo_to_mono(const float* audio, int T_audio)
{
    // A negative count would be converted to an enormous size_t by the vector
    // constructor, and a null buffer cannot be read; treat both as "no audio".
    if (audio == nullptr || T_audio <= 0)
    {
        return {};
    }

    const size_t frames = static_cast<size_t>(T_audio);
    const float* left = audio;
    const float* right = audio + frames;

    std::vector<float> mono(frames);
    for (size_t t = 0; t < frames; ++t)
    {
        mono[t] = 0.5f * (left[t] + right[t]);
    }
    return mono;
}
static uint8_t linear_to_mulaw(int16_t sample)
{
const int16_t BIAS = 0x84; // 132

View file

@ -63,6 +63,7 @@ std::string kcpp_base64_encode(const std::string &data);
std::string get_timestamp_str();
std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size);
std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate);
// Averages two planar channels, audio[0..T_audio) and audio[T_audio..2*T_audio), into T_audio mono samples.
std::vector<float> mix_planar_stereo_to_mono(const float* audio, int T_audio);
int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int32_t> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng);