mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-08 09:59:50 +00:00
still not working
This commit is contained in:
parent
5c5fe55f7d
commit
a1305ffff9
1 changed files with 55 additions and 65 deletions
|
|
@ -303,8 +303,7 @@ static std::vector<int> build_lm_prompt_uncond(BPETokenizer & bpe, const AceProm
|
|||
ids.push_back(TOKEN_IM_END);
|
||||
append("\n");
|
||||
ids.push_back(TOKEN_IM_START);
|
||||
bool has_neg = negative_prompt && strlen(negative_prompt) > 0
|
||||
&& strcmp(negative_prompt, "NO USER INPUT") != 0;
|
||||
bool has_neg = negative_prompt && strlen(negative_prompt) > 0;
|
||||
if (has_neg)
|
||||
append("user\n# Caption\n" + std::string(negative_prompt) + "\n\n# Lyric\n" + prompt.lyrics + "\n");
|
||||
else
|
||||
|
|
@ -397,8 +396,7 @@ static std::vector<int> build_lm_prompt_uncond_with_cot(BPETokenizer & bpe, cons
|
|||
ids.push_back(TOKEN_IM_END);
|
||||
append("\n");
|
||||
ids.push_back(TOKEN_IM_START);
|
||||
bool has_neg = negative_prompt && strlen(negative_prompt) > 0
|
||||
&& strcmp(negative_prompt, "NO USER INPUT") != 0;
|
||||
bool has_neg = negative_prompt && strlen(negative_prompt) > 0;
|
||||
std::string cap = has_neg ? std::string(negative_prompt) : prompt.caption;
|
||||
append("user\n# Caption\n" + cap + "\n\n# Lyric\n" + prompt.lyrics + "\n");
|
||||
ids.push_back(TOKEN_IM_END);
|
||||
|
|
@ -746,12 +744,13 @@ static void parse_phase1_into_aces(
|
|||
if (!parse_cot_and_lyrics(texts[i], &parsed))
|
||||
fprintf(stderr, "WARNING: batch %d CoT parse incomplete\n", i);
|
||||
aces[i] = base;
|
||||
if (parsed.bpm > 0) aces[i].bpm = parsed.bpm;
|
||||
if (parsed.duration > 0) aces[i].duration = parsed.duration;
|
||||
if (!parsed.keyscale.empty()) aces[i].keyscale = parsed.keyscale;
|
||||
if (!parsed.timesignature.empty()) aces[i].timesignature = parsed.timesignature;
|
||||
if (!parsed.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language;
|
||||
if (!parsed.caption.empty()) aces[i].caption = parsed.caption;
|
||||
// gap fill: only write fields the user left empty
|
||||
if (parsed.bpm > 0 && base.bpm <= 0) aces[i].bpm = parsed.bpm;
|
||||
if (parsed.duration > 0 && base.duration <= 0) aces[i].duration = parsed.duration;
|
||||
if (!parsed.keyscale.empty() && base.keyscale.empty()) aces[i].keyscale = parsed.keyscale;
|
||||
if (!parsed.timesignature.empty() && base.timesignature.empty()) aces[i].timesignature = parsed.timesignature;
|
||||
if (!parsed.vocal_language.empty() && base.vocal_language.empty()) aces[i].vocal_language = parsed.vocal_language;
|
||||
// lyrics: only generated when user had none
|
||||
if (merge_lyrics && !parsed.lyrics.empty()) aces[i].lyrics = parsed.lyrics;
|
||||
if (aces[i].duration <= 0) aces[i].duration = 120.0f;
|
||||
if (aces[i].duration > 600) aces[i].duration = 600.0f;
|
||||
|
|
@ -1518,82 +1517,73 @@ std::string acestep_prepare_request(const music_generation_inputs inputs)
|
|||
req.audio_codes = "";
|
||||
|
||||
bool user_has_codes = !req.audio_codes.empty();
|
||||
bool need_lm_codes = req.thinking && !user_has_codes;
|
||||
|
||||
bool is_simple = ace.lyrics.empty();
|
||||
bool need_lyrics = ace.lyrics.empty();
|
||||
bool has_all_metas = (ace.bpm > 0 && ace.duration > 0 &&
|
||||
!ace.keyscale.empty() && !ace.timesignature.empty());
|
||||
bool need_fill = need_lyrics || !has_all_metas;
|
||||
|
||||
std::vector<int> prompt;
|
||||
std::vector<AcePrompt> aces; // populated by Phase 1 (simple or partial)
|
||||
|
||||
// Preprocessor: simple mode generates lyrics + metas from caption
|
||||
if (is_simple) {
|
||||
fprintf(stderr, "[Simple] Inspiration\n");
|
||||
|
||||
const char * sys =
|
||||
"# Instruction\n"
|
||||
"Expand the user's input into a more detailed"
|
||||
" and specific musical description:\n";
|
||||
std::string user_msg = ace.caption + "\n\ninstrumental: "
|
||||
+ std::string(req.instrumental ? "true" : "false");
|
||||
prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str());
|
||||
|
||||
// FSM: reset then optionally force language (shared for both paths)
|
||||
// ONE path: fill what's missing, then generate codes.
|
||||
// JSON is the instruction. Empty field = "fill it". Filled = "don't touch".
|
||||
if (user_has_codes) {
|
||||
fprintf(stderr, "[Pass] audio_codes present, skip LM\n");
|
||||
} else if (need_fill) {
|
||||
if (need_lyrics) {
|
||||
const char * sys =
|
||||
"# Instruction\n"
|
||||
"Expand the user's input into a more detailed"
|
||||
" and specific musical description:\n";
|
||||
std::string user_msg = ace.caption + "\n\ninstrumental: "
|
||||
+ std::string(req.instrumental ? "true" : "false");
|
||||
prompt = build_custom_prompt(acestep_bpe, sys, user_msg.c_str());
|
||||
} else {
|
||||
prompt = build_lm_prompt(acestep_bpe, ace);
|
||||
}
|
||||
std::vector<int> uncond;
|
||||
float fill_cfg = cfg_scale;
|
||||
float fill_top_p = top_p;
|
||||
int fill_top_k = top_k;
|
||||
if (need_lyrics) {
|
||||
// lyrics generation: free sampling, no CFG (matches original behavior)
|
||||
fill_cfg = 1.0f;
|
||||
fill_top_p = 1.0f;
|
||||
fill_top_k = 0;
|
||||
} else if (fill_cfg > 1.0f) {
|
||||
uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt);
|
||||
}
|
||||
fsm.reset();
|
||||
if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
|
||||
if (need_lyrics && use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
|
||||
fsm.force_language(acestep_bpe, ace.vocal_language);
|
||||
|
||||
// Phase 1: N lyrics + metadata generations (always batched, N=batch_size)
|
||||
fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n",
|
||||
prompt.size(), batch_size, seed, seed + batch_size - 1);
|
||||
fprintf(stderr, "[Fill] lyrics=%s metas=%s | %zu tokens, CFG: %.2f, N=%d\n",
|
||||
need_lyrics ? "generate" : "keep",
|
||||
has_all_metas ? "complete" : "fill gaps",
|
||||
prompt.size(), fill_cfg, batch_size);
|
||||
|
||||
auto phase1_texts = generate_phase1_batch(
|
||||
&acestep_llm, &acestep_bpe, prompt, 2048, temperature, 0.95f, 40,
|
||||
seed, batch_size, use_fsm ? &fsm : nullptr, true);
|
||||
&acestep_llm, &acestep_bpe, prompt, 2048, temperature, fill_top_p, fill_top_k,
|
||||
seed, batch_size, use_fsm ? &fsm : nullptr, need_lyrics,
|
||||
fill_cfg, uncond.empty() ? nullptr : &uncond, !need_lyrics);
|
||||
|
||||
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);
|
||||
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Fill", need_lyrics);
|
||||
|
||||
for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&acestep_llm, i);
|
||||
int n_kv_reset = (fill_cfg > 1.0f) ? 2 * batch_size : batch_size;
|
||||
for (int i = 0; i < n_kv_reset; i++) qw3lm_reset_kv(&acestep_llm, i);
|
||||
}
|
||||
|
||||
// Re-evaluate after possible simple enrichment
|
||||
const AcePrompt & ace_ref = aces.empty() ? ace : aces[0];
|
||||
bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 &&
|
||||
!ace_ref.keyscale.empty() && !ace_ref.timesignature.empty());
|
||||
|
||||
if (!has_all_metas) {
|
||||
// Partial-metas: Phase 1 with CFG to fill missing fields
|
||||
prompt = build_lm_prompt(acestep_bpe, ace);
|
||||
std::vector<int> uncond;
|
||||
if (cfg_scale > 1.0f)
|
||||
uncond = build_lm_prompt_uncond(acestep_bpe, ace, neg_prompt);
|
||||
|
||||
fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n",
|
||||
prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1);
|
||||
|
||||
fsm.reset();
|
||||
auto phase1_texts = generate_phase1_batch(
|
||||
&acestep_llm, &acestep_bpe, prompt, 2048, temperature, top_p, top_k,
|
||||
seed, batch_size, use_fsm ? &fsm : nullptr, false,
|
||||
cfg_scale, uncond.empty() ? nullptr : &uncond, true);
|
||||
|
||||
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false);
|
||||
|
||||
for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&acestep_llm, i);
|
||||
}
|
||||
|
||||
// Guarantee aces is populated (all-metas: single shared ace for prefill optimization)
|
||||
if (aces.empty()) {
|
||||
aces = { ace };
|
||||
}
|
||||
|
||||
// Phase 2: generate audio codes (always batched, N=batch_size)
|
||||
// Phase 2: generate audio codes
|
||||
std::vector<std::string> batch_codes(batch_size);
|
||||
if (need_lm_codes) {
|
||||
if (!user_has_codes) {
|
||||
batch_codes = run_phase2_batch(&acestep_llm, acestep_bpe, aces,
|
||||
temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
|
||||
} else {
|
||||
fprintf(stderr, "[Skip] %s, no code generation\n",
|
||||
user_has_codes ? "user codes present" : "thinking=false");
|
||||
fprintf(stderr, "[Skip] user audio_codes present, no code generation\n");
|
||||
}
|
||||
|
||||
// only batch size 1 is allowed
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue