still not working

2026-06-02 07:19:23 +00:00 · 2026-02-26 10:48:21 +08:00 · 2026-02-26 10:48:21 +08:00 · a1305ffff9
commit a1305ffff9
parent 5c5fe55f7d
1 changed files with 55 additions and 65 deletions
--- a/otherarch/acestep/ace-qwen3.cpp
+++ b/otherarch/acestep/ace-qwen3.cpp
@ -303,8 +303,7 @@ static std::vector<int> build_lm_prompt_uncond(BPETokenizer & bpe, const AceProm
    ids.push_back(TOKEN_IM_END);
    append("\n");
    ids.push_back(TOKEN_IM_START);
-    bool has_neg = negative_prompt && strlen(negative_prompt) > 0
-                   && strcmp(negative_prompt, "NO USER INPUT") != 0;
+    bool has_neg = negative_prompt && strlen(negative_prompt) > 0;
    if (has_neg)
        append("user\n# Caption\n" + std::string(negative_prompt) + "\n\n# Lyric\n" + prompt.lyrics + "\n");
    else
@ -397,8 +396,7 @@ static std::vector<int> build_lm_prompt_uncond_with_cot(BPETokenizer & bpe, cons
    ids.push_back(TOKEN_IM_END);
    append("\n");
    ids.push_back(TOKEN_IM_START);
-    bool has_neg = negative_prompt && strlen(negative_prompt) > 0
-                   && strcmp(negative_prompt, "NO USER INPUT") != 0;
+    bool has_neg = negative_prompt && strlen(negative_prompt) > 0;
    std::string cap = has_neg ? std::string(negative_prompt) : prompt.caption;
    append("user\n# Caption\n" + cap + "\n\n# Lyric\n" + prompt.lyrics + "\n");
    ids.push_back(TOKEN_IM_END);
@ -746,12 +744,13 @@ static void parse_phase1_into_aces(
        if (!parse_cot_and_lyrics(texts[i], &parsed))
            fprintf(stderr, "WARNING: batch %d CoT parse incomplete\n", i);
        aces[i] = base;
-        if (parsed.bpm > 0) aces[i].bpm = parsed.bpm;
-        if (parsed.duration > 0) aces[i].duration = parsed.duration;
-        if (!parsed.keyscale.empty()) aces[i].keyscale = parsed.keyscale;
-        if (!parsed.timesignature.empty()) aces[i].timesignature = parsed.timesignature;
-        if (!parsed.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language;
-        if (!parsed.caption.empty()) aces[i].caption = parsed.caption;
+        // gap fill: only write fields the user left empty
+        if (parsed.bpm > 0 && base.bpm <= 0) aces[i].bpm = parsed.bpm;
+        if (parsed.duration > 0 && base.duration <= 0) aces[i].duration = parsed.duration;
+        if (!parsed.keyscale.empty() && base.keyscale.empty()) aces[i].keyscale = parsed.keyscale;
+        if (!parsed.timesignature.empty() && base.timesignature.empty()) aces[i].timesignature = parsed.timesignature;
+        if (!parsed.vocal_language.empty() && base.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language;
+        // lyrics: only generated when user had none
        if (merge_lyrics && !parsed.lyrics.empty()) aces[i].lyrics = parsed.lyrics;
        if (aces[i].duration <= 0) aces[i].duration = 120.0f;
        if (aces[i].duration > 600) aces[i].duration = 600.0f;
@ -1518,82 +1517,73 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
    req.audio_codes = "";

    bool user_has_codes = !req.audio_codes.empty();
-    bool need_lm_codes  = req.thinking && !user_has_codes;
-
-    bool is_simple = ace.lyrics.empty();
+    bool need_lyrics    = ace.lyrics.empty();
+    bool has_all_metas  = (ace.bpm > 0 && ace.duration > 0 &&
+                           !ace.keyscale.empty() && !ace.timesignature.empty());
+    bool need_fill      = need_lyrics || !has_all_metas;

    std::vector<int> prompt;
    std::vector<AcePrompt> aces;  // populated by Phase 1 (simple or partial)

-    // Preprocessor: simple mode generates lyrics + metas from caption
-    if (is_simple) {
-        fprintf(stderr, "[Simple] Inspiration\n");
-
-        const char * sys =
-            "# Instruction\n"
-            "Expand the user's input into a more detailed"
-            " and specific musical description:\n";
-        std::string user_msg = ace.caption + "\n\ninstrumental: "
-            + std::string(req.instrumental ? "true" : "false");
-        prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str());
-
-        // FSM: reset then optionally force language (shared for both paths)
+    // ONE path: fill what's missing, then generate codes.
+    // JSON is the instruction. Empty field = "fill it". Filled = "don't touch".
+    if (user_has_codes) {
+        fprintf(stderr, "[Pass] audio_codes present, skip LM\n");
+    } else if (need_fill) {
+        if (need_lyrics) {
+            const char * sys =
+                "# Instruction\n"
+                "Expand the user's input into a more detailed"
+                " and specific musical description:\n";
+            std::string user_msg = ace.caption + "\n\ninstrumental: "
+                + std::string(req.instrumental ? "true" : "false");
+            prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str());
+        } else {
+            prompt = build_lm_prompt(acestep_bpe, ace);
+        }
+        std::vector<int> uncond;
+        float fill_cfg = cfg_scale;
+        float fill_top_p = top_p;
+        int   fill_top_k = top_k;
+        if (need_lyrics) {
+            // lyrics generation: free sampling, no CFG (matches original behavior)
+            fill_cfg = 1.0f;
+            fill_top_p = 1.0f;
+            fill_top_k = 0;
+        } else if (fill_cfg > 1.0f) {
+            uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt);
+        }
        fsm.reset();
-        if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
+        if (need_lyrics && use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
            fsm.force_language(acestep_bpe, ace.vocal_language);

-        // Phase 1: N lyrics + metadata generations (always batched, N=batch_size)
-        fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n",
-                prompt.size(), batch_size, seed, seed + batch_size - 1);
+        fprintf(stderr, "[Fill] lyrics=%s metas=%s | %zu tokens, CFG: %.2f, N=%d\n",
+                need_lyrics ? "generate" : "keep",
+                has_all_metas ? "complete" : "fill gaps",
+                prompt.size(), fill_cfg, batch_size);

        auto phase1_texts = generate_phase1_batch(
-            &acestep_llm, &acestep_bpe, prompt, 2048, temperature, 0.95f, 40,
-            seed, batch_size, use_fsm ? &fsm : nullptr, true);
+            &acestep_llm, &acestep_bpe, prompt, 2048, temperature, fill_top_p, fill_top_k,
+            seed, batch_size, use_fsm ? &fsm : nullptr, need_lyrics,
+            fill_cfg, uncond.empty() ? nullptr : &uncond, !need_lyrics);

-        parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);
+        parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Fill", need_lyrics);

-        for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&acestep_llm, i);
+        int n_kv_reset = (fill_cfg > 1.0f) ? 2 * batch_size : batch_size;
+        for (int i = 0; i < n_kv_reset; i++) qw3lm_reset_kv(&acestep_llm, i);
    }

-    // Re-evaluate after possible simple enrichment
-    const AcePrompt & ace_ref = aces.empty() ? ace : aces[0];
-    bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 &&
-                          !ace_ref.keyscale.empty() && !ace_ref.timesignature.empty());
-
-    if (!has_all_metas) {
-        // Partial-metas: Phase 1 with CFG to fill missing fields
-        prompt = build_lm_prompt(acestep_bpe, ace);
-        std::vector<int> uncond;
-        if (cfg_scale > 1.0f)
-            uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt);
-
-        fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n",
-                prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1);
-
-        fsm.reset();
-        auto phase1_texts = generate_phase1_batch(
-            &acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p, top_k,
-            seed, batch_size, use_fsm ? &fsm : nullptr, false,
-            cfg_scale, uncond.empty() ? nullptr : &uncond, true);
-
-        parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false);
-
-        for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&acestep_llm, i);
-    }
-
-    // Guarantee aces is populated (all-metas: single shared ace for prefill optimization)
    if (aces.empty()) {
        aces = { ace };
    }

-    // Phase 2: generate audio codes (always batched, N=batch_size)
+    // Phase 2: generate audio codes
    std::vector<std::string> batch_codes(batch_size);
-    if (need_lm_codes) {
+    if (!user_has_codes) {
        batch_codes = run_phase2_batch(&acestep_llm, acestep_bpe, aces,
            temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
    } else {
-        fprintf(stderr, "[Skip] %s, no code generation\n",
-                user_has_codes ? "user codes present" : "thinking=false");
+        fprintf(stderr, "[Skip] user audio_codes present, no code generation\n");
    }

    // only batch size 1 is allowed