Revert "still not working"

This reverts commit a1305ffff9.
This commit is contained in:
Concedo 2026-02-26 10:55:55 +08:00
parent a1305ffff9
commit cf042af701

View file

@ -303,7 +303,8 @@ static std::vector<int> build_lm_prompt_uncond(BPETokenizer & bpe, const AceProm
ids.push_back(TOKEN_IM_END);
append("\n");
ids.push_back(TOKEN_IM_START);
bool has_neg = negative_prompt && strlen(negative_prompt) > 0;
bool has_neg = negative_prompt && strlen(negative_prompt) > 0
&& strcmp(negative_prompt, "NO USER INPUT") != 0;
if (has_neg)
append("user\n# Caption\n" + std::string(negative_prompt) + "\n\n# Lyric\n" + prompt.lyrics + "\n");
else
@ -396,7 +397,8 @@ static std::vector<int> build_lm_prompt_uncond_with_cot(BPETokenizer & bpe, cons
ids.push_back(TOKEN_IM_END);
append("\n");
ids.push_back(TOKEN_IM_START);
bool has_neg = negative_prompt && strlen(negative_prompt) > 0;
bool has_neg = negative_prompt && strlen(negative_prompt) > 0
&& strcmp(negative_prompt, "NO USER INPUT") != 0;
std::string cap = has_neg ? std::string(negative_prompt) : prompt.caption;
append("user\n# Caption\n" + cap + "\n\n# Lyric\n" + prompt.lyrics + "\n");
ids.push_back(TOKEN_IM_END);
@ -744,13 +746,12 @@ static void parse_phase1_into_aces(
if (!parse_cot_and_lyrics(texts[i], &parsed))
fprintf(stderr, "WARNING: batch %d CoT parse incomplete\n", i);
aces[i] = base;
// gap fill: only write fields the user left empty
if (parsed.bpm > 0 && base.bpm <= 0) aces[i].bpm = parsed.bpm;
if (parsed.duration > 0 && base.duration <= 0) aces[i].duration = parsed.duration;
if (!parsed.keyscale.empty() && base.keyscale.empty()) aces[i].keyscale = parsed.keyscale;
if (!parsed.timesignature.empty() && base.timesignature.empty()) aces[i].timesignature = parsed.timesignature;
if (!parsed.vocal_language.empty() && base.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language;
// lyrics: only generated when user had none
if (parsed.bpm > 0) aces[i].bpm = parsed.bpm;
if (parsed.duration > 0) aces[i].duration = parsed.duration;
if (!parsed.keyscale.empty()) aces[i].keyscale = parsed.keyscale;
if (!parsed.timesignature.empty()) aces[i].timesignature = parsed.timesignature;
if (!parsed.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language;
if (!parsed.caption.empty()) aces[i].caption = parsed.caption;
if (merge_lyrics && !parsed.lyrics.empty()) aces[i].lyrics = parsed.lyrics;
if (aces[i].duration <= 0) aces[i].duration = 120.0f;
if (aces[i].duration > 600) aces[i].duration = 600.0f;
@ -1517,73 +1518,82 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
req.audio_codes = "";
bool user_has_codes = !req.audio_codes.empty();
bool need_lyrics = ace.lyrics.empty();
bool has_all_metas = (ace.bpm > 0 && ace.duration > 0 &&
!ace.keyscale.empty() && !ace.timesignature.empty());
bool need_fill = need_lyrics || !has_all_metas;
bool need_lm_codes = req.thinking && !user_has_codes;
bool is_simple = ace.lyrics.empty();
std::vector<int> prompt;
std::vector<AcePrompt> aces; // populated by Phase 1 (simple or partial)
// ONE path: fill what's missing, then generate codes.
// JSON is the instruction. Empty field = "fill it". Filled = "don't touch".
if (user_has_codes) {
fprintf(stderr, "[Pass] audio_codes present, skip LM\n");
} else if (need_fill) {
if (need_lyrics) {
const char * sys =
"# Instruction\n"
"Expand the user's input into a more detailed"
" and specific musical description:\n";
std::string user_msg = ace.caption + "\n\ninstrumental: "
+ std::string(req.instrumental ? "true" : "false");
prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str());
} else {
prompt = build_lm_prompt(acestep_bpe, ace);
}
std::vector<int> uncond;
float fill_cfg = cfg_scale;
float fill_top_p = top_p;
int fill_top_k = top_k;
if (need_lyrics) {
// lyrics generation: free sampling, no CFG (matches original behavior)
fill_cfg = 1.0f;
fill_top_p = 1.0f;
fill_top_k = 0;
} else if (fill_cfg > 1.0f) {
uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt);
}
// Preprocessor: simple mode generates lyrics + metas from caption
if (is_simple) {
fprintf(stderr, "[Simple] Inspiration\n");
const char * sys =
"# Instruction\n"
"Expand the user's input into a more detailed"
" and specific musical description:\n";
std::string user_msg = ace.caption + "\n\ninstrumental: "
+ std::string(req.instrumental ? "true" : "false");
prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str());
// FSM: reset then optionally force language (shared for both paths)
fsm.reset();
if (need_lyrics && use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
fsm.force_language(acestep_bpe, ace.vocal_language);
fprintf(stderr, "[Fill] lyrics=%s metas=%s | %zu tokens, CFG: %.2f, N=%d\n",
need_lyrics ? "generate" : "keep",
has_all_metas ? "complete" : "fill gaps",
prompt.size(), fill_cfg, batch_size);
// Phase 1: N lyrics + metadata generations (always batched, N=batch_size)
fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n",
prompt.size(), batch_size, seed, seed + batch_size - 1);
auto phase1_texts = generate_phase1_batch(
&acestep_llm, &acestep_bpe, prompt, 2048, temperature, fill_top_p, fill_top_k,
seed, batch_size, use_fsm ? &fsm : nullptr, need_lyrics,
fill_cfg, uncond.empty() ? nullptr : &uncond, !need_lyrics);
&acestep_llm, &acestep_bpe, prompt, 2048, temperature, 0.95f, 40,
seed, batch_size, use_fsm ? &fsm : nullptr, true);
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Fill", need_lyrics);
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);
int n_kv_reset = (fill_cfg > 1.0f) ? 2 * batch_size : batch_size;
for (int i = 0; i < n_kv_reset; i++) qw3lm_reset_kv(&acestep_llm, i);
for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&acestep_llm, i);
}
// Re-evaluate after possible simple enrichment
const AcePrompt & ace_ref = aces.empty() ? ace : aces[0];
bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 &&
!ace_ref.keyscale.empty() && !ace_ref.timesignature.empty());
if (!has_all_metas) {
// Partial-metas: Phase 1 with CFG to fill missing fields
prompt = build_lm_prompt(acestep_bpe, ace);
std::vector<int> uncond;
if (cfg_scale > 1.0f)
uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt);
fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n",
prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1);
fsm.reset();
auto phase1_texts = generate_phase1_batch(
&acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p, top_k,
seed, batch_size, use_fsm ? &fsm : nullptr, false,
cfg_scale, uncond.empty() ? nullptr : &uncond, true);
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false);
for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&acestep_llm, i);
}
// Guarantee aces is populated (all-metas: single shared ace for prefill optimization)
if (aces.empty()) {
aces = { ace };
}
// Phase 2: generate audio codes
// Phase 2: generate audio codes (always batched, N=batch_size)
std::vector<std::string> batch_codes(batch_size);
if (!user_has_codes) {
if (need_lm_codes) {
batch_codes = run_phase2_batch(&acestep_llm, acestep_bpe, aces,
temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
} else {
fprintf(stderr, "[Skip] user audio_codes present, no code generation\n");
fprintf(stderr, "[Skip] %s, no code generation\n",
user_has_codes ? "user codes present" : "thinking=false");
}
// only batch size 1 is allowed