diff --git a/otherarch/acestep/ace-qwen3.cpp b/otherarch/acestep/ace-qwen3.cpp index f51eb1e15..1148b375b 100644 --- a/otherarch/acestep/ace-qwen3.cpp +++ b/otherarch/acestep/ace-qwen3.cpp @@ -303,7 +303,8 @@ static std::vector build_lm_prompt_uncond(BPETokenizer & bpe, const AceProm ids.push_back(TOKEN_IM_END); append("\n"); ids.push_back(TOKEN_IM_START); - bool has_neg = negative_prompt && strlen(negative_prompt) > 0; + bool has_neg = negative_prompt && strlen(negative_prompt) > 0 + && strcmp(negative_prompt, "NO USER INPUT") != 0; if (has_neg) append("user\n# Caption\n" + std::string(negative_prompt) + "\n\n# Lyric\n" + prompt.lyrics + "\n"); else @@ -396,7 +397,8 @@ static std::vector build_lm_prompt_uncond_with_cot(BPETokenizer & bpe, cons ids.push_back(TOKEN_IM_END); append("\n"); ids.push_back(TOKEN_IM_START); - bool has_neg = negative_prompt && strlen(negative_prompt) > 0; + bool has_neg = negative_prompt && strlen(negative_prompt) > 0 + && strcmp(negative_prompt, "NO USER INPUT") != 0; std::string cap = has_neg ? std::string(negative_prompt) : prompt.caption; append("user\n# Caption\n" + cap + "\n\n# Lyric\n" + prompt.lyrics + "\n"); ids.push_back(TOKEN_IM_END); @@ -744,13 +746,12 @@ static void parse_phase1_into_aces( if (!parse_cot_and_lyrics(texts[i], &parsed)) fprintf(stderr, "WARNING: batch %d CoT parse incomplete\n", i); aces[i] = base; - // gap fill: only write fields the user left empty - if (parsed.bpm > 0 && base.bpm <= 0) aces[i].bpm = parsed.bpm; - if (parsed.duration > 0 && base.duration <= 0) aces[i].duration = parsed.duration; - if (!parsed.keyscale.empty() && base.keyscale.empty()) aces[i].keyscale = parsed.keyscale; - if (!parsed.timesignature.empty() && base.timesignature.empty()) aces[i].timesignature = parsed.timesignature; - if (!parsed.vocal_language.empty() && base.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language; - // lyrics: only generated when user had none + if (parsed.bpm > 0) aces[i].bpm = parsed.bpm; + if (parsed.duration > 0) aces[i].duration = parsed.duration; + if (!parsed.keyscale.empty()) aces[i].keyscale = parsed.keyscale; + if (!parsed.timesignature.empty()) aces[i].timesignature = parsed.timesignature; + if (!parsed.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language; + if (!parsed.caption.empty()) aces[i].caption = parsed.caption; if (merge_lyrics && !parsed.lyrics.empty()) aces[i].lyrics = parsed.lyrics; if (aces[i].duration <= 0) aces[i].duration = 120.0f; if (aces[i].duration > 600) aces[i].duration = 600.0f; @@ -1517,73 +1518,82 @@ std::string acestep_prepare_request(const music_generation_inputs inputs) req.audio_codes = ""; bool user_has_codes = !req.audio_codes.empty(); - bool need_lyrics = ace.lyrics.empty(); - bool has_all_metas = (ace.bpm > 0 && ace.duration > 0 && - !ace.keyscale.empty() && !ace.timesignature.empty()); - bool need_fill = need_lyrics || !has_all_metas; + bool need_lm_codes = req.thinking && !user_has_codes; + + bool is_simple = ace.lyrics.empty(); std::vector prompt; std::vector aces; // populated by Phase 1 (simple or partial) - // ONE path: fill what's missing, then generate codes. - // JSON is the instruction. Empty field = "fill it". Filled = "don't touch". - if (user_has_codes) { - fprintf(stderr, "[Pass] audio_codes present, skip LM\n"); - } else if (need_fill) { - if (need_lyrics) { - const char * sys = - "# Instruction\n" - "Expand the user's input into a more detailed" - " and specific musical description:\n"; - std::string user_msg = ace.caption + "\n\ninstrumental: " - + std::string(req.instrumental ? "true" : "false"); - prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str()); - } else { - prompt = build_lm_prompt(acestep_bpe, ace); - } - std::vector uncond; - float fill_cfg = cfg_scale; - float fill_top_p = top_p; - int fill_top_k = top_k; - if (need_lyrics) { - // lyrics generation: free sampling, no CFG (matches original behavior) - fill_cfg = 1.0f; - fill_top_p = 1.0f; - fill_top_k = 0; - } else if (fill_cfg > 1.0f) { - uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt); - } + // Preprocessor: simple mode generates lyrics + metas from caption + if (is_simple) { + fprintf(stderr, "[Simple] Inspiration\n"); + + const char * sys = + "# Instruction\n" + "Expand the user's input into a more detailed" + " and specific musical description:\n"; + std::string user_msg = ace.caption + "\n\ninstrumental: " + + std::string(req.instrumental ? "true" : "false"); + prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str()); + + // FSM: reset then optionally force language (shared for both paths) fsm.reset(); - if (need_lyrics && use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty()) + if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty()) fsm.force_language(acestep_bpe, ace.vocal_language); - fprintf(stderr, "[Fill] lyrics=%s metas=%s | %zu tokens, CFG: %.2f, N=%d\n", - need_lyrics ? "generate" : "keep", - has_all_metas ? "complete" : "fill gaps", - prompt.size(), fill_cfg, batch_size); + // Phase 1: N lyrics + metadata generations (always batched, N=batch_size) + fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n", + prompt.size(), batch_size, seed, seed + batch_size - 1); auto phase1_texts = generate_phase1_batch( - &acestep_llm, &acestep_bpe, prompt, 2048, temperature, fill_top_p, fill_top_k, - seed, batch_size, use_fsm ? &fsm : nullptr, need_lyrics, - fill_cfg, uncond.empty() ? nullptr : &uncond, !need_lyrics); + &acestep_llm, &acestep_bpe, prompt, 2048, temperature, 0.95f, 40, + seed, batch_size, use_fsm ? &fsm : nullptr, true); - parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Fill", need_lyrics); + parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true); - int n_kv_reset = (fill_cfg > 1.0f) ? 2 * batch_size : batch_size; - for (int i = 0; i < n_kv_reset; i++) qw3lm_reset_kv(&acestep_llm, i); + for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&acestep_llm, i); } + // Re-evaluate after possible simple enrichment + const AcePrompt & ace_ref = aces.empty() ? ace : aces[0]; + bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 && + !ace_ref.keyscale.empty() && !ace_ref.timesignature.empty()); + + if (!has_all_metas) { + // Partial-metas: Phase 1 with CFG to fill missing fields + prompt = build_lm_prompt(acestep_bpe, ace); + std::vector uncond; + if (cfg_scale > 1.0f) + uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt); + + fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n", + prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1); + + fsm.reset(); + auto phase1_texts = generate_phase1_batch( + &acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p, top_k, + seed, batch_size, use_fsm ? &fsm : nullptr, false, + cfg_scale, uncond.empty() ? nullptr : &uncond, true); + + parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false); + + for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&acestep_llm, i); + } + + // Guarantee aces is populated (all-metas: single shared ace for prefill optimization) if (aces.empty()) { aces = { ace }; } - // Phase 2: generate audio codes + // Phase 2: generate audio codes (always batched, N=batch_size) std::vector batch_codes(batch_size); - if (!user_has_codes) { + if (need_lm_codes) { batch_codes = run_phase2_batch(&acestep_llm, acestep_bpe, aces, temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt); } else { - fprintf(stderr, "[Skip] user audio_codes present, no code generation\n"); + fprintf(stderr, "[Skip] %s, no code generation\n", + user_has_codes ? "user codes present" : "thinking=false"); } // only batch size 1 is allowed