From a1305ffff91c0821bb7f1de2e0b543736d3f84ee Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Thu, 26 Feb 2026 10:48:21 +0800 Subject: [PATCH] still not working --- otherarch/acestep/ace-qwen3.cpp | 120 +++++++++++++++----------------- 1 file changed, 55 insertions(+), 65 deletions(-) diff --git a/otherarch/acestep/ace-qwen3.cpp b/otherarch/acestep/ace-qwen3.cpp index 1148b375b..f51eb1e15 100644 --- a/otherarch/acestep/ace-qwen3.cpp +++ b/otherarch/acestep/ace-qwen3.cpp @@ -303,8 +303,7 @@ static std::vector build_lm_prompt_uncond(BPETokenizer & bpe, const AceProm ids.push_back(TOKEN_IM_END); append("\n"); ids.push_back(TOKEN_IM_START); - bool has_neg = negative_prompt && strlen(negative_prompt) > 0 - && strcmp(negative_prompt, "NO USER INPUT") != 0; + bool has_neg = negative_prompt && strlen(negative_prompt) > 0; if (has_neg) append("user\n# Caption\n" + std::string(negative_prompt) + "\n\n# Lyric\n" + prompt.lyrics + "\n"); else @@ -397,8 +396,7 @@ static std::vector build_lm_prompt_uncond_with_cot(BPETokenizer & bpe, cons ids.push_back(TOKEN_IM_END); append("\n"); ids.push_back(TOKEN_IM_START); - bool has_neg = negative_prompt && strlen(negative_prompt) > 0 - && strcmp(negative_prompt, "NO USER INPUT") != 0; + bool has_neg = negative_prompt && strlen(negative_prompt) > 0; std::string cap = has_neg ? std::string(negative_prompt) : prompt.caption; append("user\n# Caption\n" + cap + "\n\n# Lyric\n" + prompt.lyrics + "\n"); ids.push_back(TOKEN_IM_END); @@ -746,12 +744,13 @@ static void parse_phase1_into_aces( if (!parse_cot_and_lyrics(texts[i], &parsed)) fprintf(stderr, "WARNING: batch %d CoT parse incomplete\n", i); aces[i] = base; - if (parsed.bpm > 0) aces[i].bpm = parsed.bpm; - if (parsed.duration > 0) aces[i].duration = parsed.duration; - if (!parsed.keyscale.empty()) aces[i].keyscale = parsed.keyscale; - if (!parsed.timesignature.empty()) aces[i].timesignature = parsed.timesignature; - if (!parsed.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language; - if (!parsed.caption.empty()) aces[i].caption = parsed.caption; + // gap fill: only write fields the user left empty + if (parsed.bpm > 0 && base.bpm <= 0) aces[i].bpm = parsed.bpm; + if (parsed.duration > 0 && base.duration <= 0) aces[i].duration = parsed.duration; + if (!parsed.keyscale.empty() && base.keyscale.empty()) aces[i].keyscale = parsed.keyscale; + if (!parsed.timesignature.empty() && base.timesignature.empty()) aces[i].timesignature = parsed.timesignature; + if (!parsed.vocal_language.empty() && base.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language; + // lyrics: only generated when user had none if (merge_lyrics && !parsed.lyrics.empty()) aces[i].lyrics = parsed.lyrics; if (aces[i].duration <= 0) aces[i].duration = 120.0f; if (aces[i].duration > 600) aces[i].duration = 600.0f; @@ -1518,82 +1517,73 @@ std::string acestep_prepare_request(const music_generation_inputs inputs) req.audio_codes = ""; bool user_has_codes = !req.audio_codes.empty(); - bool need_lm_codes = req.thinking && !user_has_codes; - - bool is_simple = ace.lyrics.empty(); + bool need_lyrics = ace.lyrics.empty(); + bool has_all_metas = (ace.bpm > 0 && ace.duration > 0 && + !ace.keyscale.empty() && !ace.timesignature.empty()); + bool need_fill = need_lyrics || !has_all_metas; std::vector prompt; std::vector aces; // populated by Phase 1 (simple or partial) - // Preprocessor: simple mode generates lyrics + metas from caption - if (is_simple) { - fprintf(stderr, "[Simple] Inspiration\n"); - - const char * sys = - "# Instruction\n" - "Expand the user's input into a more detailed" - " and specific musical description:\n"; - std::string user_msg = ace.caption + "\n\ninstrumental: " - + std::string(req.instrumental ? "true" : "false"); - prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str()); - - // FSM: reset then optionally force language (shared for both paths) + // ONE path: fill what's missing, then generate codes. + // JSON is the instruction. Empty field = "fill it". Filled = "don't touch". + if (user_has_codes) { + fprintf(stderr, "[Pass] audio_codes present, skip LM\n"); + } else if (need_fill) { + if (need_lyrics) { + const char * sys = + "# Instruction\n" + "Expand the user's input into a more detailed" + " and specific musical description:\n"; + std::string user_msg = ace.caption + "\n\ninstrumental: " + + std::string(req.instrumental ? "true" : "false"); + prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str()); + } else { + prompt = build_lm_prompt(acestep_bpe, ace); + } + std::vector uncond; + float fill_cfg = cfg_scale; + float fill_top_p = top_p; + int fill_top_k = top_k; + if (need_lyrics) { + // lyrics generation: free sampling, no CFG (matches original behavior) + fill_cfg = 1.0f; + fill_top_p = 1.0f; + fill_top_k = 0; + } else if (fill_cfg > 1.0f) { + uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt); + } fsm.reset(); - if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty()) + if (need_lyrics && use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty()) fsm.force_language(acestep_bpe, ace.vocal_language); - // Phase 1: N lyrics + metadata generations (always batched, N=batch_size) - fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n", - prompt.size(), batch_size, seed, seed + batch_size - 1); + fprintf(stderr, "[Fill] lyrics=%s metas=%s | %zu tokens, CFG: %.2f, N=%d\n", + need_lyrics ? "generate" : "keep", + has_all_metas ? "complete" : "fill gaps", + prompt.size(), fill_cfg, batch_size); auto phase1_texts = generate_phase1_batch( - &acestep_llm, &acestep_bpe, prompt, 2048, temperature, 0.95f, 40, - seed, batch_size, use_fsm ? &fsm : nullptr, true); + &acestep_llm, &acestep_bpe, prompt, 2048, temperature, fill_top_p, fill_top_k, + seed, batch_size, use_fsm ? &fsm : nullptr, need_lyrics, + fill_cfg, uncond.empty() ? nullptr : &uncond, !need_lyrics); - parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true); + parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Fill", need_lyrics); - for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&acestep_llm, i); + int n_kv_reset = (fill_cfg > 1.0f) ? 2 * batch_size : batch_size; + for (int i = 0; i < n_kv_reset; i++) qw3lm_reset_kv(&acestep_llm, i); } - // Re-evaluate after possible simple enrichment - const AcePrompt & ace_ref = aces.empty() ? ace : aces[0]; - bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 && - !ace_ref.keyscale.empty() && !ace_ref.timesignature.empty()); - - if (!has_all_metas) { - // Partial-metas: Phase 1 with CFG to fill missing fields - prompt = build_lm_prompt(acestep_bpe, ace); - std::vector uncond; - if (cfg_scale > 1.0f) - uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt); - - fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n", - prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1); - - fsm.reset(); - auto phase1_texts = generate_phase1_batch( - &acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p, top_k, - seed, batch_size, use_fsm ? &fsm : nullptr, false, - cfg_scale, uncond.empty() ? nullptr : &uncond, true); - - parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false); - - for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&acestep_llm, i); - } - - // Guarantee aces is populated (all-metas: single shared ace for prefill optimization) if (aces.empty()) { aces = { ace }; } - // Phase 2: generate audio codes (always batched, N=batch_size) + // Phase 2: generate audio codes std::vector batch_codes(batch_size); - if (need_lm_codes) { + if (!user_has_codes) { batch_codes = run_phase2_batch(&acestep_llm, acestep_bpe, aces, temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt); } else { - fprintf(stderr, "[Skip] %s, no code generation\n", - user_has_codes ? "user codes present" : "thinking=false"); + fprintf(stderr, "[Skip] user audio_codes present, no code generation\n"); } // only batch size 1 is allowed