From edbc4fe592117b1024c6671990cd93f19bb27b7c Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 26 Feb 2026 14:00:58 +0800
Subject: [PATCH] music lm finally working
---
embd_res/kcpp_musicui.embd | 9 +-
otherarch/acestep/ace-qwen3.cpp | 139 +++++++++++++++-------------
otherarch/acestep/dit-vae.cpp | 7 +-
otherarch/acestep/music_adapter.cpp | 2 +-
4 files changed, 89 insertions(+), 68 deletions(-)
diff --git a/embd_res/kcpp_musicui.embd b/embd_res/kcpp_musicui.embd
index 9f75eba38..0e8ee1182 100644
--- a/embd_res/kcpp_musicui.embd
+++ b/embd_res/kcpp_musicui.embd
@@ -156,7 +156,9 @@ audio{width:100%;margin-top:6px;}
-🎵 KoboldCpp Music Generation
+
+
+KoboldCpp Music Generation UI
@@ -297,10 +299,15 @@ function getFormData(){
}
function updateForm(data){
+ //let origseed = document.getElementById("seed").value;
Object.keys(data).forEach(k=>{
if(document.getElementById(k))
document.getElementById(k).value=data[k]??"";
});
+ //if(origseed=="-1" || origseed=="")
+ //{
+ // document.getElementById("seed").value = "-1";
+ //}
}
function deriveTitle(caption){
diff --git a/otherarch/acestep/ace-qwen3.cpp b/otherarch/acestep/ace-qwen3.cpp
index 1148b375b..57c8865ec 100644
--- a/otherarch/acestep/ace-qwen3.cpp
+++ b/otherarch/acestep/ace-qwen3.cpp
@@ -169,6 +169,9 @@ struct AcePrompt {
std::string vocal_language;
};
+static std::mt19937 acestep_lm_rng;
+static bool acestep_lm_dbg = false;
+
//
// CoT parsing (extract metadata + lyrics from LLM Phase1 output)
//
@@ -758,13 +761,16 @@ static void parse_phase1_into_aces(
}
}
+//hack for kcpp: forcing the correct tokens after end of thinking
const std::vector<int> think_chain = {271,2,15953,2216,198}; // queued as forced output once TOKEN_THINK_END is sampled; token ids for "\n# Lyric\n"
+
// Batched Phase 1: N text generations with shared prompt, different seeds.
// No CFG. Each element gets its own FSM state and RNG.
// Returns N generated text strings.
static std::vector generate_phase1_batch(
Qwen3LM * m, BPETokenizer * bpe,
const std::vector & prompt_tokens,
- int max_new_tokens, float temperature, float top_p, int top_k,
+ int max_new_tokens, float temperature, float top_p,
long long base_seed, int N,
MetadataFSM * fsm_template,
bool lyrics_mode,
@@ -798,6 +804,12 @@ static std::vector generate_phase1_batch(
fprintf(stderr, "[Phase1] Prefill %.0fms, %zu tokens, N=%d, CFG=%.2f\n",
t_prefill.ms(), prompt_tokens.size(), N, cfg_scale);
+ if(acestep_lm_dbg)
+ {
+ std::string tks = bpe_decode(*bpe,prompt_tokens);
+ printf("\nN:%d Prompt: %s",prompt_tokens.size(),tks.c_str());
+ }
+
// Per-element state
struct P1Seq {
std::mt19937 rng;
@@ -824,7 +836,7 @@ static std::vector generate_phase1_batch(
if (fsm_template && fsm_template->enabled)
seqs[i].fsm.apply_mask(lg.data());
- int tok = sample_top_k_p(lg.data(), V, temperature, top_p, top_k, seqs[i].rng);
+ int tok = kcpp_quick_sample(lg.data(),V,std::vector(),1.03f,top_p,25,temperature,acestep_lm_rng);
if (tok == TOKEN_IM_END) {
seqs[i].done = true;
@@ -853,37 +865,20 @@ static std::vector generate_phase1_batch(
std::vector logits_uncond(V * N);
std::vector tokens(N);
- // CFG: single forward with 2*N (cond + uncond)
- int N2 = use_cfg ? 2 * N : N;
- std::vector tokens_2n(N2), sets_2n(N2);
- std::vector logits_2n((size_t)V * N2);
- if (use_cfg) {
- for (int i = 0; i < N; i++) {
- sets_2n[i] = cond_sets[i];
- sets_2n[N + i] = uncond_sets[i];
- }
- }
-
int n_active = N;
for (int i = 0; i < N; i++)
if (seqs[i].done) n_active--;
+ std::vector quicklastntoks;
+ std::vector forced_tokens;
+
for (int step = 0; step < max_new_tokens && n_active > 0; step++) {
for (int i = 0; i < N; i++)
tokens[i] = seqs[i].last_token;
- if (use_cfg) {
- // Single batched forward: cond[0..N-1] + uncond[N..2N-1]
- for (int i = 0; i < N; i++) {
- tokens_2n[i] = tokens[i];
- tokens_2n[N + i] = tokens[i];
- }
- qw3lm_forward_batch(m, tokens_2n.data(), sets_2n.data(), N2, logits_2n.data());
- memcpy(logits_cond.data(), logits_2n.data(), (size_t)V * N * sizeof(float));
- memcpy(logits_uncond.data(), logits_2n.data() + (size_t)V * N, (size_t)V * N * sizeof(float));
- } else {
- qw3lm_forward_batch(m, tokens.data(), cond_sets.data(), N, logits_cond.data());
- }
+ qw3lm_forward_batch(m, tokens.data(), cond_sets.data(), N, logits_cond.data());
+ if (use_cfg)
+ qw3lm_forward_batch(m, tokens.data(), uncond_sets.data(), N, logits_uncond.data());
for (int i = 0; i < N; i++) {
if (seqs[i].done) continue;
@@ -902,12 +897,38 @@ static std::vector generate_phase1_batch(
seqs[i].fsm.apply_mask(lc);
// After </think>: audio code constraint unless lyrics_mode
- if (seqs[i].codes_phase && !lyrics_mode) {
- for (int v = 0; v < AUDIO_CODE_BASE; v++)
- if (v != TOKEN_IM_END) lc[v] = -1e9f;
+ // if (seqs[i].codes_phase && !lyrics_mode) {
+ // for (int v = 0; v < AUDIO_CODE_BASE; v++)
+ // if (v != TOKEN_IM_END) lc[v] = -1e9f;
+ // }
+
+ // kcpp: prevent outputting audio codes
+ for (int v = AUDIO_CODE_BASE; v < AUDIO_CODE_COUNT+AUDIO_CODE_BASE; v++)
+ if (v != TOKEN_IM_END) lc[v] = -1e9f;
+
+ int tok = kcpp_quick_sample(lc,V,quicklastntoks,1.03f,top_p,25,temperature,acestep_lm_rng);
+ quicklastntoks.push_back(tok);
+ if (quicklastntoks.size()>32) {
+ quicklastntoks.erase(quicklastntoks.begin());
}
- int tok = sample_top_k_p(lc, V, temperature, top_p, top_k, seqs[i].rng);
+ //kcpp: force lyrics tokens right after think
+ if(forced_tokens.size()>0)
+ {
+ tok = forced_tokens[0];
+ forced_tokens.erase(forced_tokens.begin());
+ }
+ if (tok == TOKEN_THINK_END)
+ {
+ forced_tokens.clear();
+ forced_tokens = think_chain;
+ }
+
+ if(acestep_lm_dbg)
+ {
+ std::string tks = bpe_decode(*bpe,std::vector({tok}));
+ printf("\nDebug temp: %f, top_p:%f, tok:%d = %s (%d)",temperature,top_p,tok,tks.c_str(),forced_tokens.size());
+ }
if (tok == TOKEN_IM_END) {
seqs[i].done = true;
@@ -955,7 +976,7 @@ static std::vector generate_phase1_batch(
// Returns N code strings. Seeds = base_seed + 0, 1, ..., N-1.
static std::vector run_phase2_batch(
Qwen3LM * m, BPETokenizer & bpe, const std::vector & aces,
- float temperature, float top_p, int top_k, long long base_seed, int N,
+ float temperature, float top_p, long long base_seed, int N,
float cfg_scale, const char * negative_prompt) {
int V = m->cfg.vocab_size;
@@ -1042,7 +1063,7 @@ static std::vector run_phase2_batch(
for (int v = 0; v < AUDIO_CODE_BASE; v++)
if (v != TOKEN_IM_END) lg[v] = -1e9f;
- int tok = sample_top_k_p(lg.data(), V, temperature, top_p, top_k, seqs[i].rng);
+ int tok = kcpp_quick_sample(lg.data(),V,std::vector(),1.00f,top_p,25,temperature,acestep_lm_rng);
seqs[i].last_token = tok;
if (tok == TOKEN_IM_END) {
@@ -1065,17 +1086,6 @@ static std::vector run_phase2_batch(
std::vector logits_uncond(V * N);
std::vector tokens(N);
- // CFG: single forward with 2*N (cond + uncond)
- int N2 = use_cfg ? 2 * N : N;
- std::vector tokens_2n(N2), sets_2n(N2);
- std::vector logits_2n((size_t)V * N2);
- if (use_cfg) {
- for (int i = 0; i < N; i++) {
- sets_2n[i] = cond_sets[i];
- sets_2n[N + i] = uncond_sets[i];
- }
- }
-
int n_active = N;
for (int i = 0; i < N; i++)
if (seqs[i].done) n_active--;
@@ -1085,18 +1095,12 @@ static std::vector run_phase2_batch(
for (int i = 0; i < N; i++)
tokens[i] = seqs[i].last_token;
- if (use_cfg) {
- // Single batched forward: cond[0..N-1] + uncond[N..2N-1]
- for (int i = 0; i < N; i++) {
- tokens_2n[i] = tokens[i];
- tokens_2n[N + i] = tokens[i];
- }
- qw3lm_forward_batch(m, tokens_2n.data(), sets_2n.data(), N2, logits_2n.data());
- memcpy(logits_cond.data(), logits_2n.data(), (size_t)V * N * sizeof(float));
- memcpy(logits_uncond.data(), logits_2n.data() + (size_t)V * N, (size_t)V * N * sizeof(float));
- } else {
- qw3lm_forward_batch(m, tokens.data(), cond_sets.data(), N, logits_cond.data());
- }
+ // Batched forward: cond
+ qw3lm_forward_batch(m, tokens.data(), cond_sets.data(), N, logits_cond.data());
+
+ // Batched forward: uncond
+ if (use_cfg)
+ qw3lm_forward_batch(m, tokens.data(), uncond_sets.data(), N, logits_uncond.data());
// Per-sequence: CFG combine + sample
for (int i = 0; i < N; i++) {
@@ -1113,7 +1117,7 @@ static std::vector run_phase2_batch(
for (int v = 0; v < AUDIO_CODE_BASE; v++)
if (v != TOKEN_IM_END) lc[v] = -1e9f;
- int tok = sample_top_k_p(lc, V, temperature, top_p, top_k, seqs[i].rng);
+ int tok = kcpp_quick_sample(lc,V,std::vector(),1.00f,top_p,25,temperature,acestep_lm_rng);
seqs[i].last_token = tok;
if (tok == TOKEN_IM_END) {
@@ -1436,8 +1440,9 @@ void unload_acestep_lm()
}
}
-bool load_acestep_lm(std::string model_path, bool lowvram)
+bool load_acestep_lm(std::string model_path, bool lowvram, bool musicdebugmode)
{
+ acestep_lm_dbg = musicdebugmode;
if(acestep_lm_loaded)
{
unload_acestep_lm();
@@ -1465,7 +1470,7 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
if(!acestep_lm_loaded && acestep_lm_path!="")
{
printf("\nRuntime reload Music LM model...\n");
- bool ok = load_acestep_lm(acestep_lm_path, acestep_lm_lowvram);
+ bool ok = load_acestep_lm(acestep_lm_path, acestep_lm_lowvram, acestep_lm_dbg);
if(!ok)
{
printf("\nERROR: Acestep LM load fail\n");
@@ -1495,6 +1500,11 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
seed = (((uint32_t)time(NULL)) % 1000000u);
}
req.seed = seed;
+ acestep_lm_rng = std::mt19937(seed);
+
+ if (req.caption.empty()) {
+ req.caption = "An interesting song";
+ }
// Generation params from request
float temperature = req.lm_temperature;
@@ -1518,7 +1528,7 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
req.audio_codes = "";
bool user_has_codes = !req.audio_codes.empty();
- bool need_lm_codes = req.thinking && !user_has_codes;
+ bool need_lm_codes = false;//req.thinking && !user_has_codes;
bool is_simple = ace.lyrics.empty();
@@ -1547,7 +1557,7 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
prompt.size(), batch_size, seed, seed + batch_size - 1);
auto phase1_texts = generate_phase1_batch(
- &acestep_llm, &acestep_bpe, prompt, 2048, temperature, 0.95f, 40,
+ &acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p,
seed, batch_size, use_fsm ? &fsm : nullptr, true);
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);
@@ -1572,7 +1582,7 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
fsm.reset();
auto phase1_texts = generate_phase1_batch(
- &acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p, top_k,
+ &acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p,
seed, batch_size, use_fsm ? &fsm : nullptr, false,
cfg_scale, uncond.empty() ? nullptr : &uncond, true);
@@ -1590,7 +1600,7 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
std::vector batch_codes(batch_size);
if (need_lm_codes) {
batch_codes = run_phase2_batch(&acestep_llm, acestep_bpe, aces,
- temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
+ temperature, top_p, seed, batch_size, cfg_scale, neg_prompt);
} else {
fprintf(stderr, "[Skip] %s, no code generation\n",
user_has_codes ? "user codes present" : "thinking=false");
@@ -1607,7 +1617,6 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
rr.timesignature = a.timesignature;
rr.vocal_language = a.vocal_language;
if (!batch_codes[0].empty()) rr.audio_codes = batch_codes[0];
- rr.seed = seed;
std::string prefix_erase = "# Lyric";
// Check if the string is long enough and starts with the prefix
@@ -1615,6 +1624,12 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
rr.lyrics = rr.lyrics.substr(prefix_erase.size()); // Returns a new string starting after the prefix
}
+ prefix_erase = "keyscale:";
+ // Check if the string is long enough and starts with the prefix
+ if (rr.keyscale.size() >= prefix_erase.size() && rr.keyscale.compare(0, prefix_erase.size(), prefix_erase) == 0) {
+ rr.keyscale = rr.keyscale.substr(prefix_erase.size()); // Returns a new string starting after the prefix
+ }
+
//now convert to string
std::ostringstream oss;
oss << "{\n";
diff --git a/otherarch/acestep/dit-vae.cpp b/otherarch/acestep/dit-vae.cpp
index af4ccfd1a..65481ddad 100644
--- a/otherarch/acestep/dit-vae.cpp
+++ b/otherarch/acestep/dit-vae.cpp
@@ -774,10 +774,9 @@ std::string acestep_generate_audio(const music_generation_inputs inputs)
guidance_scale = 1.0f;
}
- if (seed < 0) {
- std::random_device rd;
- seed = (long long)rd() << 32 | rd();
- if (seed < 0) seed = -seed;
+ if (seed <= 0 || seed==0xFFFFFFFF)
+ {
+ seed = (((uint32_t)time(NULL)) % 1000000u);
}
fprintf(stderr, "[Pipeline] seed=%lld, steps=%d, guidance=%.1f, shift=%.1f, duration=%.1fs\n",
seed, num_steps, guidance_scale, shift, duration);
diff --git a/otherarch/acestep/music_adapter.cpp b/otherarch/acestep/music_adapter.cpp
index f89982b69..15fc15b95 100644
--- a/otherarch/acestep/music_adapter.cpp
+++ b/otherarch/acestep/music_adapter.cpp
@@ -57,7 +57,7 @@ bool musictype_load_model(const music_load_model_inputs inputs)
musicllm_filename.c_str(),musicembedding_filename.c_str(),musicdiffusion_filename.c_str(),musicvae_filename.c_str());
musicdebugmode = inputs.debugmode;
- bool ok = load_acestep_lm(musicllm_filename,lowvram);
+ bool ok = load_acestep_lm(musicllm_filename,lowvram,musicdebugmode);
if (!ok) {
printf("\nFailed to load Music Gen LM Model!\n");
return false;