From aa58d1ed3b562c0278ce60ea94625aca25c8ae6e Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Tue, 24 Feb 2026 21:55:57 +0800
Subject: [PATCH] all working, but needs to optimize vram

---
 otherarch/acestep/ace-qwen3.cpp | 10 ++++++++++
 otherarch/acestep/dit-vae.cpp   | 19 +++++++++++++++----
 otherarch/acestep/request.cpp   |  4 ++--
 otherarch/utils.cpp             | 12 ++++++++++++
 otherarch/utils.h               |  1 +
 5 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/otherarch/acestep/ace-qwen3.cpp b/otherarch/acestep/ace-qwen3.cpp
index dc01d7c2d..c49943d34 100644
--- a/otherarch/acestep/ace-qwen3.cpp
+++ b/otherarch/acestep/ace-qwen3.cpp
@@ -1484,6 +1484,10 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
     ace.timesignature  = req.timesignature;
     ace.vocal_language = req.vocal_language;
 
+    //kcpp: audio codes give poor results, so force thinking off and discard any supplied codes
+    req.thinking = false;
+    req.audio_codes = "";
+
     bool user_has_codes = !req.audio_codes.empty();
     bool need_lm_codes  = req.thinking && !user_has_codes;
 
@@ -1578,6 +1582,12 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
     if (!batch_codes[0].empty()) rr.audio_codes = batch_codes[0];
     rr.seed = seed;
 
+    std::string prefix_erase = "# Lyric";
+    // Check if the string is long enough and starts with the prefix
+    if (rr.lyrics.size() >= prefix_erase.size() && rr.lyrics.compare(0, prefix_erase.size(), prefix_erase) == 0) {
+        rr.lyrics = rr.lyrics.substr(prefix_erase.size()); // Returns a new string starting after the prefix
+    }
+
     //now convert to string
     std::ostringstream oss;
     oss << "{\n";
diff --git a/otherarch/acestep/dit-vae.cpp b/otherarch/acestep/dit-vae.cpp
index 721093cbf..35627e6f5 100644
--- a/otherarch/acestep/dit-vae.cpp
+++ b/otherarch/acestep/dit-vae.cpp
@@ -692,6 +692,7 @@ std::string acestep_generate_audio(const music_generation_inputs inputs)
     if (req.caption.empty()) {
         req.caption = "An interesting song";
     }
+    req.thinking = false;
 
     const int FRAMES_PER_SECOND = 25;
     int Oc = music_dit_cfg.out_channels;          // 64
@@ -917,11 +918,21 @@ std::string acestep_generate_audio(const music_generation_inputs inputs)
         }
     }
 
-    // output wav
-    std::vector<float> resampled_buf = resample_wav(audio,48000,24000);
-    std::string finalb64 = save_ulaw_wav8_base64(audio, 24000);
+    // std::string opath = "egghenlo.wav";
+    // if (write_wav(opath.c_str(), audio.data(), T_audio, 48000)) {
+    //     fprintf(stderr, "[VAE Batch%d] Wrote %s: %d samples (%.2fs @ 48kHz stereo)\n",
+    //             b, opath.c_str(), T_audio, (float)T_audio / 48000.0f);
+    // } else {
+    //     fprintf(stderr, "[VAE Batch%d] FATAL: failed to write %s\n", b, opath.c_str());
+    // }
 
-    fprintf(stderr, "[Request Done]\n");
+    // output wav
+    float muslen = (float)T_audio / 48000.0f;
+    std::vector<float> mono = mix_planar_stereo_to_mono(audio.data(), T_audio);
+    std::vector<float> resampled_buf = resample_wav(mono,48000,32000);
+    std::string finalb64 = save_wav16_base64(resampled_buf, 32000);
+
+    fprintf(stderr, "[Request Done: Music Length %.2fs]\n",muslen);
     return finalb64;
 }
 
diff --git a/otherarch/acestep/request.cpp b/otherarch/acestep/request.cpp
index 7a57ca0e8..0d39c30cb 100644
--- a/otherarch/acestep/request.cpp
+++ b/otherarch/acestep/request.cpp
@@ -22,12 +22,12 @@ void request_init(AceRequest * r) {
     r->vocal_language     = "unknown";
     r->task_type          = "text2music";
     r->seed               = -1;
-    r->thinking           = true;
+    r->thinking           = false;
     r->lm_temperature     = 0.85f;
     r->lm_cfg_scale       = 2.0f;
     r->lm_top_p           = 0.9f;
     r->lm_top_k           = 0;
-    r->lm_negative_prompt = "NO USER INPUT";
+    r->lm_negative_prompt = "";
     r->audio_codes        = "";
     r->inference_steps    = 8;
     r->guidance_scale     = 1.0f;
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index e311c7e90..77f5d7c11 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -389,6 +389,18 @@ std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_
     return output;
 }
 
+std::vector<float> mix_planar_stereo_to_mono(const float* audio, int T_audio) // Downmixes planar stereo to mono: returns T_audio samples, each the mean of the matching left and right sample.
+{
+    std::vector<float> mono(T_audio); // NOTE(review): a negative T_audio would convert to a huge unsigned size here - confirm callers always pass >= 0
+    const float* left  = audio;           // first T_audio floats are read as the left channel
+    const float* right = audio + T_audio; // next T_audio floats are read as the right channel, so 2*T_audio floats are read in total
+    for (int t = 0; t < T_audio; ++t)
+    {
+        mono[t] = 0.5f * (left[t] + right[t]); // equal-weight average of the two channels
+    }
+    return mono;
+}
+
 static uint8_t linear_to_mulaw(int16_t sample)
 {
     const int16_t BIAS = 0x84;        // 132
diff --git a/otherarch/utils.h b/otherarch/utils.h
index 4fccd78cb..f993aed5c 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -63,6 +63,7 @@ std::string kcpp_base64_encode(const std::string &data);
 std::string get_timestamp_str();
 std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size);
 std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate);
+std::vector<float> mix_planar_stereo_to_mono(const float* audio, int T_audio); // averages two consecutive T_audio-sample channel planes in `audio` into one T_audio-sample mono buffer
 
 int32_t kcpp_quick_sample(float * logits, const int n_logits, const std::vector<int32_t> & last_n_tokens, float rep_pen, float top_p, int top_k, float temp, std::mt19937 & rng);