Revert "still not working"

This reverts commit a1305ffff9.
2026-05-08 09:59:50 +00:00 · 2026-02-26 10:55:55 +08:00 · 2026-02-26 10:55:55 +08:00 · cf042af701
commit cf042af701
parent a1305ffff9
1 changed file with 65 additions and 55 deletions
--- a/otherarch/acestep/ace-qwen3.cpp
+++ b/otherarch/acestep/ace-qwen3.cpp
@@ -303,7 +303,8 @@ static std::vector<int> build_lm_prompt_uncond(BPETokenizer & bpe, const AceProm
    ids.push_back(TOKEN_IM_END);
    append("\n");
    ids.push_back(TOKEN_IM_START);
-    bool has_neg = negative_prompt && strlen(negative_prompt) > 0;
+    bool has_neg = negative_prompt && strlen(negative_prompt) > 0
+                   && strcmp(negative_prompt, "NO USER INPUT") != 0;
    if (has_neg)
        append("user\n# Caption\n" + std::string(negative_prompt) + "\n\n# Lyric\n" + prompt.lyrics + "\n");
    else
@@ -396,7 +397,8 @@ static std::vector<int> build_lm_prompt_uncond_with_cot(BPETokenizer & bpe, cons
    ids.push_back(TOKEN_IM_END);
    append("\n");
    ids.push_back(TOKEN_IM_START);
-    bool has_neg = negative_prompt && strlen(negative_prompt) > 0;
+    bool has_neg = negative_prompt && strlen(negative_prompt) > 0
+                   && strcmp(negative_prompt, "NO USER INPUT") != 0;
    std::string cap = has_neg ? std::string(negative_prompt) : prompt.caption;
    append("user\n# Caption\n" + cap + "\n\n# Lyric\n" + prompt.lyrics + "\n");
    ids.push_back(TOKEN_IM_END);
@@ -744,13 +746,12 @@ static void parse_phase1_into_aces(
        if (!parse_cot_and_lyrics(texts[i], &parsed))
            fprintf(stderr, "WARNING: batch %d CoT parse incomplete\n", i);
        aces[i] = base;
-        // gap fill: only write fields the user left empty
-        if (parsed.bpm > 0 && base.bpm <= 0) aces[i].bpm = parsed.bpm;
-        if (parsed.duration > 0 && base.duration <= 0) aces[i].duration = parsed.duration;
-        if (!parsed.keyscale.empty() && base.keyscale.empty()) aces[i].keyscale = parsed.keyscale;
-        if (!parsed.timesignature.empty() && base.timesignature.empty()) aces[i].timesignature = parsed.timesignature;
-        if (!parsed.vocal_language.empty() && base.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language;
-        // lyrics: only generated when user had none
+        if (parsed.bpm > 0) aces[i].bpm = parsed.bpm;
+        if (parsed.duration > 0) aces[i].duration = parsed.duration;
+        if (!parsed.keyscale.empty()) aces[i].keyscale = parsed.keyscale;
+        if (!parsed.timesignature.empty()) aces[i].timesignature = parsed.timesignature;
+        if (!parsed.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language;
+        if (!parsed.caption.empty()) aces[i].caption = parsed.caption;
        if (merge_lyrics && !parsed.lyrics.empty()) aces[i].lyrics = parsed.lyrics;
        if (aces[i].duration <= 0) aces[i].duration = 120.0f;
        if (aces[i].duration > 600) aces[i].duration = 600.0f;
@@ -1517,73 +1518,82 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
    req.audio_codes = "";

    bool user_has_codes = !req.audio_codes.empty();
-    bool need_lyrics    = ace.lyrics.empty();
-    bool has_all_metas  = (ace.bpm > 0 && ace.duration > 0 &&
-                           !ace.keyscale.empty() && !ace.timesignature.empty());
-    bool need_fill      = need_lyrics || !has_all_metas;
+    bool need_lm_codes  = req.thinking && !user_has_codes;
+
+    bool is_simple = ace.lyrics.empty();

    std::vector<int> prompt;
    std::vector<AcePrompt> aces;  // populated by Phase 1 (simple or partial)

-    // ONE path: fill what's missing, then generate codes.
-    // JSON is the instruction. Empty field = "fill it". Filled = "don't touch".
-    if (user_has_codes) {
-        fprintf(stderr, "[Pass] audio_codes present, skip LM\n");
-    } else if (need_fill) {
-        if (need_lyrics) {
-            const char * sys =
-                "# Instruction\n"
-                "Expand the user's input into a more detailed"
-                " and specific musical description:\n";
-            std::string user_msg = ace.caption + "\n\ninstrumental: "
-                + std::string(req.instrumental ? "true" : "false");
-            prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str());
-        } else {
-            prompt = build_lm_prompt(acestep_bpe, ace);
-        }
-        std::vector<int> uncond;
-        float fill_cfg = cfg_scale;
-        float fill_top_p = top_p;
-        int   fill_top_k = top_k;
-        if (need_lyrics) {
-            // lyrics generation: free sampling, no CFG (matches original behavior)
-            fill_cfg = 1.0f;
-            fill_top_p = 1.0f;
-            fill_top_k = 0;
-        } else if (fill_cfg > 1.0f) {
-            uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt);
-        }
+    // Preprocessor: simple mode generates lyrics + metas from caption
+    if (is_simple) {
+        fprintf(stderr, "[Simple] Inspiration\n");
+
+        const char * sys =
+            "# Instruction\n"
+            "Expand the user's input into a more detailed"
+            " and specific musical description:\n";
+        std::string user_msg = ace.caption + "\n\ninstrumental: "
+            + std::string(req.instrumental ? "true" : "false");
+        prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str());
+
+        // FSM: reset then optionally force language (shared for both paths)
        fsm.reset();
-        if (need_lyrics && use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
+        if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
            fsm.force_language(acestep_bpe, ace.vocal_language);

-        fprintf(stderr, "[Fill] lyrics=%s metas=%s | %zu tokens, CFG: %.2f, N=%d\n",
-                need_lyrics ? "generate" : "keep",
-                has_all_metas ? "complete" : "fill gaps",
-                prompt.size(), fill_cfg, batch_size);
+        // Phase 1: N lyrics + metadata generations (always batched, N=batch_size)
+        fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n",
+                prompt.size(), batch_size, seed, seed + batch_size - 1);

        auto phase1_texts = generate_phase1_batch(
-            &acestep_llm, &acestep_bpe, prompt, 2048, temperature, fill_top_p, fill_top_k,
-            seed, batch_size, use_fsm ? &fsm : nullptr, need_lyrics,
-            fill_cfg, uncond.empty() ? nullptr : &uncond, !need_lyrics);
+            &acestep_llm, &acestep_bpe, prompt, 2048, temperature, 0.95f, 40,
+            seed, batch_size, use_fsm ? &fsm : nullptr, true);

-        parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Fill", need_lyrics);
+        parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);

-        int n_kv_reset = (fill_cfg > 1.0f) ? 2 * batch_size : batch_size;
-        for (int i = 0; i < n_kv_reset; i++) qw3lm_reset_kv(&acestep_llm, i);
+        for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&acestep_llm, i);
    }

+    // Re-evaluate after possible simple enrichment
+    const AcePrompt & ace_ref = aces.empty() ? ace : aces[0];
+    bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 &&
+                          !ace_ref.keyscale.empty() && !ace_ref.timesignature.empty());
+
+    if (!has_all_metas) {
+        // Partial-metas: Phase 1 with CFG to fill missing fields
+        prompt = build_lm_prompt(acestep_bpe, ace);
+        std::vector<int> uncond;
+        if (cfg_scale > 1.0f)
+            uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt);
+
+        fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n",
+                prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1);
+
+        fsm.reset();
+        auto phase1_texts = generate_phase1_batch(
+            &acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p, top_k,
+            seed, batch_size, use_fsm ? &fsm : nullptr, false,
+            cfg_scale, uncond.empty() ? nullptr : &uncond, true);
+
+        parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false);
+
+        for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&acestep_llm, i);
+    }
+
+    // Guarantee aces is populated (all-metas: single shared ace for prefill optimization)
    if (aces.empty()) {
        aces = { ace };
    }

-    // Phase 2: generate audio codes
+    // Phase 2: generate audio codes (always batched, N=batch_size)
    std::vector<std::string> batch_codes(batch_size);
-    if (!user_has_codes) {
+    if (need_lm_codes) {
        batch_codes = run_phase2_batch(&acestep_llm, acestep_bpe, aces,
            temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
    } else {
-        fprintf(stderr, "[Skip] user audio_codes present, no code generation\n");
+        fprintf(stderr, "[Skip] %s, no code generation\n",
+                user_has_codes ? "user codes present" : "thinking=false");
    }

    // only batch size 1 is allowed