mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-04-28 03:30:20 +00:00
ace step diffusion loading
This commit is contained in:
parent
749536f464
commit
0fd7d2c0e5
4 changed files with 592 additions and 226 deletions
2
Makefile
2
Makefile
|
|
@ -745,8 +745,6 @@ ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp other
|
|||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
qwen3tts: otherarch/qwen3tts/q3ttsmain.cpp otherarch/qwen3tts/qwen3_tts.cpp otherarch/qwen3tts/text_tokenizer.cpp otherarch/qwen3tts/gguf_loader.cpp otherarch/qwen3tts/tts_transformer.cpp otherarch/qwen3tts/audio_tokenizer_decoder.cpp otherarch/qwen3tts/audio_tokenizer_encoder.cpp otherarch/qwen3tts/coreml_code_predictor_stub.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
acestep-b: otherarch/acestep/dit-vae.cpp otherarch/acestep/request.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
ggml/src/ggml-vulkan-shaders.cpp:
|
||||
ifdef VULKAN_BUILD
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ struct Timer {
|
|||
return std::chrono::duration<double, std::milli>(
|
||||
std::chrono::steady_clock::now() - t).count();
|
||||
}
|
||||
void reset() { t = std::chrono::steady_clock::now(); }
|
||||
};
|
||||
|
||||
// Special token IDs (Qwen3 extended vocab)
|
||||
|
|
@ -1171,260 +1172,262 @@ static void usage(const char * prog) {
|
|||
, prog);
|
||||
}
|
||||
|
||||
// int main(int argc, char ** argv) {
|
||||
// const char * model_path = nullptr;
|
||||
// const char * request_path = nullptr;
|
||||
// int max_seq = 8192;
|
||||
// int batch_size = 1;
|
||||
// bool use_fsm = true;
|
||||
// const char * dump_logits = nullptr;
|
||||
// const char * dump_tokens = nullptr;
|
||||
/*
|
||||
int main(int argc, char ** argv) {
|
||||
const char * model_path = nullptr;
|
||||
const char * request_path = nullptr;
|
||||
int max_seq = 8192;
|
||||
int batch_size = 1;
|
||||
bool use_fsm = true;
|
||||
const char * dump_logits = nullptr;
|
||||
const char * dump_tokens = nullptr;
|
||||
|
||||
// if (argc < 2) {
|
||||
// usage(argv[0]);
|
||||
// return 1;
|
||||
// }
|
||||
if (argc < 2) {
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// for (int i = 1; i < argc; i++) {
|
||||
// if (!strcmp(argv[i], "--model") && i + 1 < argc)
|
||||
// model_path = argv[++i];
|
||||
// else if (!strcmp(argv[i], "--request") && i + 1 < argc)
|
||||
// request_path = argv[++i];
|
||||
// else if (!strcmp(argv[i], "--max-seq") && i + 1 < argc)
|
||||
// max_seq = atoi(argv[++i]);
|
||||
// else if (!strcmp(argv[i], "--batch") && i + 1 < argc)
|
||||
// batch_size = atoi(argv[++i]);
|
||||
// else if (!strcmp(argv[i], "--no-fsm"))
|
||||
// use_fsm = false;
|
||||
// else if (!strcmp(argv[i], "--dump-logits") && i + 1 < argc)
|
||||
// dump_logits = argv[++i];
|
||||
// else if (!strcmp(argv[i], "--dump-tokens") && i + 1 < argc)
|
||||
// dump_tokens = argv[++i];
|
||||
// else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) {
|
||||
// usage(argv[0]);
|
||||
// return 0;
|
||||
// }
|
||||
// else {
|
||||
// fprintf(stderr, "Unknown option: %s\n", argv[i]);
|
||||
// usage(argv[0]);
|
||||
// return 1;
|
||||
// }
|
||||
// }
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (!strcmp(argv[i], "--model") && i + 1 < argc)
|
||||
model_path = argv[++i];
|
||||
else if (!strcmp(argv[i], "--request") && i + 1 < argc)
|
||||
request_path = argv[++i];
|
||||
else if (!strcmp(argv[i], "--max-seq") && i + 1 < argc)
|
||||
max_seq = atoi(argv[++i]);
|
||||
else if (!strcmp(argv[i], "--batch") && i + 1 < argc)
|
||||
batch_size = atoi(argv[++i]);
|
||||
else if (!strcmp(argv[i], "--no-fsm"))
|
||||
use_fsm = false;
|
||||
else if (!strcmp(argv[i], "--dump-logits") && i + 1 < argc)
|
||||
dump_logits = argv[++i];
|
||||
else if (!strcmp(argv[i], "--dump-tokens") && i + 1 < argc)
|
||||
dump_tokens = argv[++i];
|
||||
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) {
|
||||
usage(argv[0]);
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Unknown option: %s\n", argv[i]);
|
||||
usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// if (!model_path) {
|
||||
// fprintf(stderr, "ERROR: --model required\n");
|
||||
// usage(argv[0]); return 1;
|
||||
// }
|
||||
// if (!request_path) {
|
||||
// fprintf(stderr, "ERROR: --request required\n");
|
||||
// usage(argv[0]); return 1;
|
||||
// }
|
||||
if (!model_path) {
|
||||
fprintf(stderr, "ERROR: --model required\n");
|
||||
usage(argv[0]); return 1;
|
||||
}
|
||||
if (!request_path) {
|
||||
fprintf(stderr, "ERROR: --request required\n");
|
||||
usage(argv[0]); return 1;
|
||||
}
|
||||
|
||||
// // Read request JSON
|
||||
// AceRequest req;
|
||||
// if (!request_parse(&req, request_path)) return 1;
|
||||
// request_dump(&req, stderr);
|
||||
// Read request JSON
|
||||
AceRequest req;
|
||||
if (!request_parse(&req, request_path)) return 1;
|
||||
request_dump(&req, stderr);
|
||||
|
||||
// if (req.caption.empty()) {
|
||||
// fprintf(stderr, "ERROR: caption is empty in %s\n", request_path);
|
||||
// return 1;
|
||||
// }
|
||||
if (req.caption.empty()) {
|
||||
fprintf(stderr, "ERROR: caption is empty in %s\n", request_path);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// // Resolve seed
|
||||
// long long seed = req.seed;
|
||||
// if (seed < 0) {
|
||||
// std::random_device rd;
|
||||
// seed = (int64_t)rd() << 32 | rd();
|
||||
// if (seed < 0) seed = -seed; // keep positive
|
||||
// }
|
||||
// req.seed = seed;
|
||||
// Resolve seed
|
||||
long long seed = req.seed;
|
||||
if (seed < 0) {
|
||||
std::random_device rd;
|
||||
seed = (int64_t)rd() << 32 | rd();
|
||||
if (seed < 0) seed = -seed; // keep positive
|
||||
}
|
||||
req.seed = seed;
|
||||
|
||||
// // Generation params from request
|
||||
// float temperature = req.lm_temperature;
|
||||
// float top_p = req.lm_top_p;
|
||||
// int top_k = req.lm_top_k;
|
||||
// float cfg_scale = req.lm_cfg_scale;
|
||||
// const char * neg_prompt = req.lm_negative_prompt.c_str();
|
||||
// Generation params from request
|
||||
float temperature = req.lm_temperature;
|
||||
float top_p = req.lm_top_p;
|
||||
int top_k = req.lm_top_k;
|
||||
float cfg_scale = req.lm_cfg_scale;
|
||||
const char * neg_prompt = req.lm_negative_prompt.c_str();
|
||||
|
||||
// Timer t_total;
|
||||
Timer t_total;
|
||||
|
||||
// // Load BPE tokenizer from model GGUF
|
||||
// BPETokenizer bpe;
|
||||
// if (!load_bpe_from_gguf(&bpe, model_path)) return 1;
|
||||
// Load BPE tokenizer from model GGUF
|
||||
BPETokenizer bpe;
|
||||
if (!load_bpe_from_gguf(&bpe, model_path)) return 1;
|
||||
|
||||
// // Load model
|
||||
// int n_kv_sets = (cfg_scale > 1.0f) ? 2 * batch_size : batch_size;
|
||||
// Timer t_load;
|
||||
// Qwen3LM model;
|
||||
// if (!qw3lm_load(&model, model_path, max_seq, n_kv_sets)) return 1;
|
||||
// double load_ms = t_load.ms();
|
||||
// Load model
|
||||
int n_kv_sets = (cfg_scale > 1.0f) ? 2 * batch_size : batch_size;
|
||||
Timer t_load;
|
||||
Qwen3LM model;
|
||||
if (!qw3lm_load(&model, model_path, max_seq, n_kv_sets)) return 1;
|
||||
double load_ms = t_load.ms();
|
||||
|
||||
// // FSM
|
||||
// MetadataFSM fsm;
|
||||
// if (use_fsm) fsm.init(bpe, model.cfg.vocab_size);
|
||||
// FSM
|
||||
MetadataFSM fsm;
|
||||
if (use_fsm) fsm.init(bpe, model.cfg.vocab_size);
|
||||
|
||||
// // Copy request -> AcePrompt (internal LLM struct)
|
||||
// AcePrompt ace = {};
|
||||
// ace.caption = req.caption;
|
||||
// ace.lyrics = req.lyrics;
|
||||
// ace.duration = req.duration;
|
||||
// ace.bpm = req.bpm;
|
||||
// ace.keyscale = req.keyscale;
|
||||
// ace.timesignature = req.timesignature;
|
||||
// ace.vocal_language = req.vocal_language;
|
||||
// Copy request -> AcePrompt (internal LLM struct)
|
||||
AcePrompt ace = {};
|
||||
ace.caption = req.caption;
|
||||
ace.lyrics = req.lyrics;
|
||||
ace.duration = req.duration;
|
||||
ace.bpm = req.bpm;
|
||||
ace.keyscale = req.keyscale;
|
||||
ace.timesignature = req.timesignature;
|
||||
ace.vocal_language = req.vocal_language;
|
||||
|
||||
// bool user_has_codes = !req.audio_codes.empty();
|
||||
// bool need_lm_codes = req.thinking && !user_has_codes;
|
||||
bool user_has_codes = !req.audio_codes.empty();
|
||||
bool need_lm_codes = req.thinking && !user_has_codes;
|
||||
|
||||
// bool is_simple = ace.lyrics.empty() &&
|
||||
// ace.bpm <= 0 && ace.duration <= 0 &&
|
||||
// ace.keyscale.empty() && ace.timesignature.empty();
|
||||
bool is_simple = ace.lyrics.empty() &&
|
||||
ace.bpm <= 0 && ace.duration <= 0 &&
|
||||
ace.keyscale.empty() && ace.timesignature.empty();
|
||||
|
||||
// std::vector<int> prompt;
|
||||
// std::vector<AcePrompt> aces; // populated by Phase 1 (simple or partial)
|
||||
std::vector<int> prompt;
|
||||
std::vector<AcePrompt> aces; // populated by Phase 1 (simple or partial)
|
||||
|
||||
// // Preprocessor: simple mode generates lyrics + metas from caption
|
||||
// if (is_simple) {
|
||||
// fprintf(stderr, "[Simple] Inspiration\n");
|
||||
// Preprocessor: simple mode generates lyrics + metas from caption
|
||||
if (is_simple) {
|
||||
fprintf(stderr, "[Simple] Inspiration\n");
|
||||
|
||||
// const char * sys =
|
||||
// "# Instruction\n"
|
||||
// "Expand the user's input into a more detailed"
|
||||
// " and specific musical description:\n";
|
||||
// std::string user_msg = ace.caption + "\n\ninstrumental: "
|
||||
// + std::string(req.instrumental ? "true" : "false");
|
||||
// prompt = build_custom_prompt(bpe, sys, user_msg.c_str());
|
||||
const char * sys =
|
||||
"# Instruction\n"
|
||||
"Expand the user's input into a more detailed"
|
||||
" and specific musical description:\n";
|
||||
std::string user_msg = ace.caption + "\n\ninstrumental: "
|
||||
+ std::string(req.instrumental ? "true" : "false");
|
||||
prompt = build_custom_prompt(bpe, sys, user_msg.c_str());
|
||||
|
||||
// // FSM: reset then optionally force language (shared for both paths)
|
||||
// fsm.reset();
|
||||
// if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
|
||||
// fsm.force_language(bpe, ace.vocal_language);
|
||||
// FSM: reset then optionally force language (shared for both paths)
|
||||
fsm.reset();
|
||||
if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
|
||||
fsm.force_language(bpe, ace.vocal_language);
|
||||
|
||||
// // Phase 1: N lyrics + metadata generations (always batched, N=batch_size)
|
||||
// fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n",
|
||||
// prompt.size(), batch_size, seed, seed + batch_size - 1);
|
||||
// Phase 1: N lyrics + metadata generations (always batched, N=batch_size)
|
||||
fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n",
|
||||
prompt.size(), batch_size, seed, seed + batch_size - 1);
|
||||
|
||||
// auto phase1_texts = generate_phase1_batch(
|
||||
// &model, &bpe, prompt, 2048, temperature, 1.0f, 0,
|
||||
// seed, batch_size, use_fsm ? &fsm : nullptr, true);
|
||||
auto phase1_texts = generate_phase1_batch(
|
||||
&model, &bpe, prompt, 2048, temperature, 1.0f, 0,
|
||||
seed, batch_size, use_fsm ? &fsm : nullptr, true);
|
||||
|
||||
// parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);
|
||||
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);
|
||||
|
||||
// for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&model, i);
|
||||
// }
|
||||
for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&model, i);
|
||||
}
|
||||
|
||||
// // Re-evaluate after possible simple enrichment
|
||||
// const AcePrompt & ace_ref = aces.empty() ? ace : aces[0];
|
||||
// bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 &&
|
||||
// !ace_ref.keyscale.empty() && !ace_ref.timesignature.empty());
|
||||
// Re-evaluate after possible simple enrichment
|
||||
const AcePrompt & ace_ref = aces.empty() ? ace : aces[0];
|
||||
bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 &&
|
||||
!ace_ref.keyscale.empty() && !ace_ref.timesignature.empty());
|
||||
|
||||
// if (!has_all_metas) {
|
||||
// // Partial-metas: Phase 1 with CFG to fill missing fields
|
||||
// prompt = build_lm_prompt(bpe, ace);
|
||||
// std::vector<int> uncond;
|
||||
// if (cfg_scale > 1.0f)
|
||||
// uncond = build_lm_prompt_uncond(bpe, ace, neg_prompt);
|
||||
if (!has_all_metas) {
|
||||
// Partial-metas: Phase 1 with CFG to fill missing fields
|
||||
prompt = build_lm_prompt(bpe, ace);
|
||||
std::vector<int> uncond;
|
||||
if (cfg_scale > 1.0f)
|
||||
uncond = build_lm_prompt_uncond(bpe, ace, neg_prompt);
|
||||
|
||||
// fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n",
|
||||
// prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1);
|
||||
fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n",
|
||||
prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1);
|
||||
|
||||
// fsm.reset();
|
||||
// auto phase1_texts = generate_phase1_batch(
|
||||
// &model, &bpe, prompt, 2048, temperature, top_p, top_k,
|
||||
// seed, batch_size, use_fsm ? &fsm : nullptr, false,
|
||||
// cfg_scale, uncond.empty() ? nullptr : &uncond, true);
|
||||
fsm.reset();
|
||||
auto phase1_texts = generate_phase1_batch(
|
||||
&model, &bpe, prompt, 2048, temperature, top_p, top_k,
|
||||
seed, batch_size, use_fsm ? &fsm : nullptr, false,
|
||||
cfg_scale, uncond.empty() ? nullptr : &uncond, true);
|
||||
|
||||
// parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false);
|
||||
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false);
|
||||
|
||||
// for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&model, i);
|
||||
// }
|
||||
for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&model, i);
|
||||
}
|
||||
|
||||
// // Guarantee aces is populated (all-metas: single shared ace for prefill optimization)
|
||||
// if (aces.empty()) aces = {ace};
|
||||
// Guarantee aces is populated (all-metas: single shared ace for prefill optimization)
|
||||
if (aces.empty()) aces = {ace};
|
||||
|
||||
// // Debug: dump tokens/logits
|
||||
// if (need_lm_codes && (dump_logits || dump_tokens)) {
|
||||
// std::string cot = build_cot_yaml(aces[0]);
|
||||
// auto dbg_prompt = build_lm_prompt_with_cot(bpe, aces[0], cot);
|
||||
// Debug: dump tokens/logits
|
||||
if (need_lm_codes && (dump_logits || dump_tokens)) {
|
||||
std::string cot = build_cot_yaml(aces[0]);
|
||||
auto dbg_prompt = build_lm_prompt_with_cot(bpe, aces[0], cot);
|
||||
|
||||
// if (dump_tokens) {
|
||||
// FILE * f = fopen(dump_tokens, "w");
|
||||
// if (f) {
|
||||
// for (size_t j = 0; j < dbg_prompt.size(); j++)
|
||||
// fprintf(f, "%s%d", j ? "," : "", dbg_prompt[j]);
|
||||
// fprintf(f, "\n");
|
||||
// fclose(f);
|
||||
// fprintf(stderr, "[Debug] Tokens -> %s (%zu)\n",
|
||||
// dump_tokens, dbg_prompt.size());
|
||||
// }
|
||||
// }
|
||||
// if (dump_logits) {
|
||||
// std::vector<float> dbg_logits(model.cfg.vocab_size);
|
||||
// qw3lm_forward(&model, dbg_prompt.data(), (int)dbg_prompt.size(), 0, dbg_logits.data());
|
||||
// FILE * f = fopen(dump_logits, "wb");
|
||||
// if (f) {
|
||||
// fwrite(dbg_logits.data(), sizeof(float), model.cfg.vocab_size, f);
|
||||
// fclose(f);
|
||||
// fprintf(stderr, "[Debug] Logits -> %s (%d floats, argmax=%d)\n",
|
||||
// dump_logits, model.cfg.vocab_size,
|
||||
// (int)(std::max_element(dbg_logits.begin(), dbg_logits.end()) - dbg_logits.begin()));
|
||||
// }
|
||||
// qw3lm_reset_kv(&model, 0);
|
||||
// }
|
||||
// }
|
||||
if (dump_tokens) {
|
||||
FILE * f = fopen(dump_tokens, "w");
|
||||
if (f) {
|
||||
for (size_t j = 0; j < dbg_prompt.size(); j++)
|
||||
fprintf(f, "%s%d", j ? "," : "", dbg_prompt[j]);
|
||||
fprintf(f, "\n");
|
||||
fclose(f);
|
||||
fprintf(stderr, "[Debug] Tokens -> %s (%zu)\n",
|
||||
dump_tokens, dbg_prompt.size());
|
||||
}
|
||||
}
|
||||
if (dump_logits) {
|
||||
std::vector<float> dbg_logits(model.cfg.vocab_size);
|
||||
qw3lm_forward(&model, dbg_prompt.data(), (int)dbg_prompt.size(), 0, dbg_logits.data());
|
||||
FILE * f = fopen(dump_logits, "wb");
|
||||
if (f) {
|
||||
fwrite(dbg_logits.data(), sizeof(float), model.cfg.vocab_size, f);
|
||||
fclose(f);
|
||||
fprintf(stderr, "[Debug] Logits -> %s (%d floats, argmax=%d)\n",
|
||||
dump_logits, model.cfg.vocab_size,
|
||||
(int)(std::max_element(dbg_logits.begin(), dbg_logits.end()) - dbg_logits.begin()));
|
||||
}
|
||||
qw3lm_reset_kv(&model, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// // Phase 2: generate audio codes (always batched, N=batch_size)
|
||||
// std::vector<std::string> batch_codes(batch_size);
|
||||
// if (need_lm_codes) {
|
||||
// batch_codes = run_phase2_batch(&model, bpe, aces,
|
||||
// temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
|
||||
// } else {
|
||||
// fprintf(stderr, "[Skip] %s, no code generation\n",
|
||||
// user_has_codes ? "user codes present" : "thinking=false");
|
||||
// }
|
||||
// Phase 2: generate audio codes (always batched, N=batch_size)
|
||||
std::vector<std::string> batch_codes(batch_size);
|
||||
if (need_lm_codes) {
|
||||
batch_codes = run_phase2_batch(&model, bpe, aces,
|
||||
temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
|
||||
} else {
|
||||
fprintf(stderr, "[Skip] %s, no code generation\n",
|
||||
user_has_codes ? "user codes present" : "thinking=false");
|
||||
}
|
||||
|
||||
// // Write N output files: request0.json, request1.json, ...
|
||||
// {
|
||||
// std::string base(request_path);
|
||||
// std::string ext = ".json";
|
||||
// size_t dot = base.rfind('.');
|
||||
// if (dot != std::string::npos) { ext = base.substr(dot); base = base.substr(0, dot); }
|
||||
// for (int b = 0; b < batch_size; b++) {
|
||||
// AceRequest rr = req;
|
||||
// const AcePrompt & a = aces[b < (int)aces.size() ? b : 0];
|
||||
// rr.caption = a.caption;
|
||||
// rr.lyrics = a.lyrics;
|
||||
// rr.bpm = a.bpm;
|
||||
// rr.duration = a.duration;
|
||||
// rr.keyscale = a.keyscale;
|
||||
// rr.timesignature = a.timesignature;
|
||||
// rr.vocal_language = a.vocal_language;
|
||||
// if (!batch_codes[b].empty()) rr.audio_codes = batch_codes[b];
|
||||
// rr.seed = seed + b;
|
||||
// char path[512];
|
||||
// snprintf(path, sizeof(path), "%s%d%s", base.c_str(), b, ext.c_str());
|
||||
// request_write(&rr, path);
|
||||
// fprintf(stderr, "[Output] Wrote %s\n", path);
|
||||
// }
|
||||
// }
|
||||
// Write N output files: request0.json, request1.json, ...
|
||||
{
|
||||
std::string base(request_path);
|
||||
std::string ext = ".json";
|
||||
size_t dot = base.rfind('.');
|
||||
if (dot != std::string::npos) { ext = base.substr(dot); base = base.substr(0, dot); }
|
||||
for (int b = 0; b < batch_size; b++) {
|
||||
AceRequest rr = req;
|
||||
const AcePrompt & a = aces[b < (int)aces.size() ? b : 0];
|
||||
rr.caption = a.caption;
|
||||
rr.lyrics = a.lyrics;
|
||||
rr.bpm = a.bpm;
|
||||
rr.duration = a.duration;
|
||||
rr.keyscale = a.keyscale;
|
||||
rr.timesignature = a.timesignature;
|
||||
rr.vocal_language = a.vocal_language;
|
||||
if (!batch_codes[b].empty()) rr.audio_codes = batch_codes[b];
|
||||
rr.seed = seed + b;
|
||||
char path[512];
|
||||
snprintf(path, sizeof(path), "%s%d%s", base.c_str(), b, ext.c_str());
|
||||
request_write(&rr, path);
|
||||
fprintf(stderr, "[Output] Wrote %s\n", path);
|
||||
}
|
||||
}
|
||||
|
||||
// fprintf(stderr, "[Ace-Qwen3] Load %.0f | Total %.0fms | seed=%lld\n",
|
||||
// load_ms, t_total.ms(), seed);
|
||||
fprintf(stderr, "[Ace-Qwen3] Load %.0f | Total %.0fms | seed=%lld\n",
|
||||
load_ms, t_total.ms(), seed);
|
||||
|
||||
// qw3lm_free(&model);
|
||||
// return 0;
|
||||
// }
|
||||
qw3lm_free(&model);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
//kcpp stuff
|
||||
|
||||
static Qwen3LM acestep_llm;
|
||||
static BPETokenizer acestep_bpe;
|
||||
static bool acestep_loaded = false;
|
||||
static bool acestep_lm_loaded = false;
|
||||
|
||||
bool load_acestep(std::string model_path)
|
||||
bool load_acestep_lm(std::string model_path)
|
||||
{
|
||||
acestep_loaded = false;
|
||||
acestep_lm_loaded = false;
|
||||
int max_seq = 8192;
|
||||
const int batch_size = 1; //only bs 1 is allowed
|
||||
if (!load_bpe_from_gguf(&acestep_bpe, model_path.c_str())) {
|
||||
|
|
@ -1435,7 +1438,7 @@ bool load_acestep(std::string model_path)
|
|||
if (!qw3lm_load(&acestep_llm, model_path.c_str(), max_seq, n_kv_sets)) {
|
||||
return false;
|
||||
}
|
||||
acestep_loaded = true;
|
||||
acestep_lm_loaded = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -22,16 +22,6 @@
|
|||
#include "./debug.h"
|
||||
#include "./request.h"
|
||||
|
||||
// Small stopwatch built on std::chrono::steady_clock.
// Starts counting at construction; reset() restarts it and ms() reports
// the time elapsed since the last (re)start.
struct Timer {
    // Moment the stopwatch was last (re)started.
    std::chrono::steady_clock::time_point t;

    // Begin timing immediately.
    Timer() : t(std::chrono::steady_clock::now()) {}

    // Elapsed time since the last (re)start, in fractional milliseconds.
    double ms() const {
        const auto elapsed = std::chrono::steady_clock::now() - t;
        using millis_f64 = std::chrono::duration<double, std::milli>;
        return std::chrono::duration_cast<millis_f64>(elapsed).count();
    }

    // Restart the stopwatch from the current instant.
    void reset() {
        t = std::chrono::steady_clock::now();
    }
};
|
||||
|
||||
// Minimal WAV writer (16-bit PCM stereo)
|
||||
static bool write_wav(const char * path, const float * audio, int T_audio, int sr) {
|
||||
FILE * f = fopen(path, "wb");
|
||||
|
|
@ -100,6 +90,7 @@ static std::vector<int> parse_codes_string(const std::string & s) {
|
|||
return codes;
|
||||
}
|
||||
|
||||
/*
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 2) { print_usage(argv[0]); return 1; }
|
||||
|
||||
|
|
@ -579,3 +570,370 @@ int main(int argc, char ** argv) {
|
|||
fprintf(stderr, "[Pipeline] All done\n");
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
//kcpp stuff
|
||||
// File-scope state shared by load_acestep_dit() and acestep_generate_audio().
// DiT model state; its backend is initialised and its weights loaded in load_acestep_dit().
static DiTGGML acestep_dit = {};
|
||||
// Set to true only at the very end of a successful load_acestep_dit().
static bool acestep_dit_loaded = false;
|
||||
// DiT configuration passed to dit_ggml_load(); generation reads out_channels, in_channels and patch_size from it.
static DiTGGMLConfig music_dit_cfg;
|
||||
// Shared stopwatch, reset before each timed load/encode step and read for the log lines.
static Timer music_dit_timer;
|
||||
// Value of the "acestep.is_turbo" GGUF key; when true, generation forces guidance_scale to 1.0.
static bool is_turbo = false;
|
||||
// VAE state loaded with vae_ggml_load().
static VAEGGML vae = {};
|
||||
// BPE tokenizer loaded from the text-encoder GGUF; used to tokenize caption and lyric prompts.
static BPETokenizer music_tok;
|
||||
// Text encoder used by qwen3_forward() on the caption prompt.
static Qwen3GGML music_text_enc = {};
|
||||
// Text-encoder GGUF opened in load_acestep_dit() and not closed there.
// NOTE(review): presumably it must stay open because musice_te_embed_data points into it - confirm.
static GGUFModel gf_te = {};
|
||||
// Result of gf_get_data(gf_te, "embed_tokens.weight"); passed to qwen3_cpu_embed_lookup() for lyric embedding.
static const void * musice_te_embed_data = nullptr;
|
||||
// Condition encoder loaded from the DiT GGUF; used by cond_ggml_forward().
static CondGGML music_cond = {};
|
||||
// Copy of the "silence_latent" tensor read from the DiT GGUF.
static std::vector<float> silence_full; // [15000, 64] f32
|
||||
// Detokenizer loaded from the DiT GGUF; used by detok_ggml_decode() on user-supplied audio codes.
static DetokGGML detok = {};
|
||||
|
||||
bool load_acestep_dit(std::string music_embd_path, std::string music_dit_path, std::string music_vae_path)
|
||||
{
|
||||
const char * text_enc_gguf = music_embd_path.c_str();
|
||||
const char * dit_gguf = music_dit_path.c_str();
|
||||
const char * vae_gguf = music_vae_path.c_str();
|
||||
|
||||
// Load DiT model (once for all requests)
|
||||
dit_ggml_init_backend(&acestep_dit);
|
||||
fprintf(stderr, "[Load] Backend init: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
music_dit_timer.reset();
|
||||
if (!dit_ggml_load(&acestep_dit, dit_gguf, music_dit_cfg)) {
|
||||
fprintf(stderr, "FATAL: failed to load DiT model\n");
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
// Read DiT GGUF metadata + silence_latent tensor (once)
|
||||
is_turbo = false;
|
||||
{
|
||||
GGUFModel gf = {};
|
||||
if (gf_load(&gf, dit_gguf)) {
|
||||
is_turbo = gf_get_bool(gf, "acestep.is_turbo");
|
||||
const void * sl_data = gf_get_data(gf, "silence_latent");
|
||||
if (sl_data) {
|
||||
silence_full.resize(15000 * 64);
|
||||
memcpy(silence_full.data(), sl_data, 15000 * 64 * sizeof(float));
|
||||
fprintf(stderr, "[Load] silence_latent: [15000, 64] from GGUF\n");
|
||||
} else {
|
||||
fprintf(stderr, "FATAL: silence_latent tensor not found in %s\n", dit_gguf);
|
||||
return false;
|
||||
}
|
||||
gf_close(&gf);
|
||||
} else {
|
||||
fprintf(stderr, "FATAL: cannot reopen %s for metadata\n", dit_gguf);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Load VAE model (once for all requests)
|
||||
music_dit_timer.reset();
|
||||
vae_ggml_load(&vae, vae_gguf);
|
||||
fprintf(stderr, "[Load] VAE weights: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
|
||||
music_dit_timer.reset();
|
||||
if (!load_bpe_from_gguf(&music_tok, text_enc_gguf)) {
|
||||
fprintf(stderr, "FATAL: failed to load music tokenizer from %s\n", text_enc_gguf);
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "[Load] BPE tokenizer: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
// Text encoder forward (caption only)
|
||||
music_dit_timer.reset();
|
||||
qwen3_init_backend(&music_text_enc);
|
||||
if (!qwen3_load_text_encoder(&music_text_enc, text_enc_gguf)) {
|
||||
fprintf(stderr, "FATAL: failed to load text encoder\n");
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "[Load] TextEncoder: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
if (!gf_load(&gf_te, text_enc_gguf)) {
|
||||
fprintf(stderr, "FATAL: cannot reopen text encoder GGUF for lyric embed\n");
|
||||
return false;
|
||||
}
|
||||
musice_te_embed_data = gf_get_data(gf_te, "embed_tokens.weight");
|
||||
if (!musice_te_embed_data) {
|
||||
fprintf(stderr, "FATAL: embed_tokens.weight not found\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Condition encoder forward
|
||||
music_dit_timer.reset();
|
||||
cond_ggml_init_backend(&music_cond);
|
||||
if (!cond_ggml_load(&music_cond, dit_gguf)) {
|
||||
fprintf(stderr, "FATAL: failed to load condition encoder\n");
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "[Load] ConditionEncoder: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
music_dit_timer.reset();
|
||||
|
||||
if (!detok_ggml_load(&detok, dit_gguf, acestep_dit.backend, acestep_dit.cpu_backend)) {
|
||||
fprintf(stderr, "FATAL: failed to load detokenizer\n");
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "[Load] Detokenizer: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
acestep_dit_loaded = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string acestep_generate_audio(const music_generation_inputs inputs)
|
||||
{
|
||||
// Parse request JSON
|
||||
AceRequest req;
|
||||
std::string injson = inputs.input_json;
|
||||
request_init(&req);
|
||||
if (!request_parse_from_str(&req, injson)) {
|
||||
fprintf(stderr, "ERROR: failed to parse music gen request\n");
|
||||
return "";
|
||||
}
|
||||
if (req.caption.empty()) {
|
||||
fprintf(stderr, "ERROR: music gen caption is empty!\n");
|
||||
return "";
|
||||
}
|
||||
|
||||
const int FRAMES_PER_SECOND = 25;
|
||||
int Oc = music_dit_cfg.out_channels; // 64
|
||||
int ctx_ch = music_dit_cfg.in_channels - Oc; // 128
|
||||
int batch_n = 1;
|
||||
int vae_chunk = 256;
|
||||
int vae_overlap = 64;
|
||||
|
||||
// Extract params
|
||||
const char * caption = req.caption.c_str();
|
||||
const char * lyrics = req.lyrics.empty() ? "[Instrumental]" : req.lyrics.c_str();
|
||||
char bpm_str[16] = "N/A";
|
||||
if (req.bpm > 0) snprintf(bpm_str, sizeof(bpm_str), "%d", req.bpm);
|
||||
const char * bpm = bpm_str;
|
||||
const char * keyscale = req.keyscale.empty() ? "N/A" : req.keyscale.c_str();
|
||||
const char * timesig = req.timesignature.empty() ? "N/A" : req.timesignature.c_str();
|
||||
const char * language = req.vocal_language.empty() ? "en" : req.vocal_language.c_str();
|
||||
float duration = req.duration > 0 ? req.duration : 120.0f;
|
||||
long long seed = req.seed;
|
||||
int num_steps = req.inference_steps > 0 ? req.inference_steps : 8;
|
||||
float guidance_scale = req.guidance_scale > 0 ? req.guidance_scale : 7.0f;
|
||||
float shift = req.shift > 0 ? req.shift : 1.0f;
|
||||
|
||||
if (is_turbo && guidance_scale > 1.0f) {
|
||||
fprintf(stderr, "[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was %.1f)\n",
|
||||
guidance_scale);
|
||||
guidance_scale = 1.0f;
|
||||
}
|
||||
|
||||
if (seed < 0) {
|
||||
std::random_device rd;
|
||||
seed = (long long)rd() << 32 | rd();
|
||||
if (seed < 0) seed = -seed;
|
||||
}
|
||||
fprintf(stderr, "[Pipeline] seed=%lld, steps=%d, guidance=%.1f, shift=%.1f, duration=%.1fs\n",
|
||||
seed, num_steps, guidance_scale, shift, duration);
|
||||
|
||||
// Parse audio codes from request
|
||||
std::vector<int> codes_vec = parse_codes_string(req.audio_codes);
|
||||
if (!codes_vec.empty())
|
||||
fprintf(stderr, "[Pipeline] %zu audio codes (%.1fs @ 5Hz)\n",
|
||||
codes_vec.size(), (float)codes_vec.size() / 5.0f);
|
||||
|
||||
// Build schedule: t_i = shift * t / (1 + (shift-1)*t) where t = 1 - i/steps
|
||||
std::vector<float> schedule(num_steps);
|
||||
for (int i = 0; i < num_steps; i++) {
|
||||
float t = 1.0f - (float)i / (float)num_steps;
|
||||
schedule[i] = shift * t / (1.0f + (shift - 1.0f) * t);
|
||||
}
|
||||
|
||||
// T = number of 25Hz latent frames for DiT
|
||||
// When audio codes are present, T is determined by the codes.
|
||||
// Otherwise, T is derived from the requested duration.
|
||||
int T = codes_vec.empty()
|
||||
? (int)(duration * FRAMES_PER_SECOND)
|
||||
: (int)codes_vec.size() * 5;
|
||||
T = ((T + music_dit_cfg.patch_size - 1) / music_dit_cfg.patch_size) * music_dit_cfg.patch_size;
|
||||
int S = T / music_dit_cfg.patch_size;
|
||||
int enc_S = 0;
|
||||
|
||||
fprintf(stderr, "[Pipeline] T=%d, S=%d\n", T, S);
|
||||
|
||||
if (T > 15000) {
|
||||
fprintf(stderr, "ERROR: T=%d exceeds silence_latent max 15000, skipping\n", T);
|
||||
return "";
|
||||
}
|
||||
|
||||
// Text encoding
|
||||
// 1. Load BPE tokenizer
|
||||
music_dit_timer.reset();
|
||||
|
||||
// 2. Build formatted prompts
|
||||
const char * instruction = "Generate audio semantic tokens based on the given conditions:";
|
||||
char metas[512];
|
||||
snprintf(metas, sizeof(metas),
|
||||
"- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n",
|
||||
bpm, timesig, keyscale, (int)duration);
|
||||
std::string text_str = std::string("# Instruction\n")
|
||||
+ instruction + "\n\n"
|
||||
+ "# Caption\n" + caption + "\n\n"
|
||||
+ "# Metas\n" + metas + "<|endoftext|>\n";
|
||||
|
||||
bool instrumental = (strcmp(lyrics, "[Instrumental]") == 0 || strcmp(lyrics, "[instrumental]") == 0);
|
||||
std::string lyric_str = std::string("# Languages\n") + language + "\n\n# Lyric\n"
|
||||
+ (instrumental ? "[Instrumental]" : lyrics) + "<|endoftext|>";
|
||||
|
||||
// 3. Tokenize
|
||||
auto text_ids = bpe_encode(&music_tok, text_str.c_str(), true);
|
||||
auto lyric_ids = bpe_encode(&music_tok, lyric_str.c_str(), true);
|
||||
int S_text = (int)text_ids.size();
|
||||
int S_lyric = (int)lyric_ids.size();
|
||||
fprintf(stderr, "[Pipeline] caption: %d tokens, lyrics: %d tokens\n", S_text, S_lyric);
|
||||
|
||||
int H_text = music_text_enc.cfg.hidden_size; // 1024
|
||||
std::vector<float> text_hidden(H_text * S_text);
|
||||
|
||||
music_dit_timer.reset();
|
||||
qwen3_forward(&music_text_enc, text_ids.data(), S_text, text_hidden.data());
|
||||
fprintf(stderr, "[Encode] TextEncoder (%d tokens): %.1f ms\n", S_text, music_dit_timer.ms());
|
||||
|
||||
// 5. Lyric embedding (CPU vocab lookup from text encoder embed table)
|
||||
music_dit_timer.reset();
|
||||
std::vector<float> lyric_embed(H_text * S_lyric);
|
||||
qwen3_cpu_embed_lookup(musice_te_embed_data, H_text,
|
||||
lyric_ids.data(), S_lyric,
|
||||
lyric_embed.data());
|
||||
|
||||
fprintf(stderr, "[Encode] Lyric vocab lookup (%d tokens): %.1f ms\n", S_lyric, music_dit_timer.ms());
|
||||
|
||||
// Silence feats for timbre input: first 750 frames (30s @ 25Hz)
|
||||
const int S_ref = 750;
|
||||
std::vector<float> silence_feats(S_ref * 64);
|
||||
memcpy(silence_feats.data(), silence_full.data(), S_ref * 64 * sizeof(float));
|
||||
|
||||
music_dit_timer.reset();
|
||||
std::vector<float> enc_hidden;
|
||||
cond_ggml_forward(&music_cond, text_hidden.data(), S_text,
|
||||
lyric_embed.data(), S_lyric,
|
||||
silence_feats.data(), S_ref,
|
||||
enc_hidden, &enc_S);
|
||||
fprintf(stderr, "[Encode] ConditionEncoder: %.1f ms, enc_S=%d\n", music_dit_timer.ms(), enc_S);
|
||||
|
||||
// Context building
|
||||
// Silence latent for this T
|
||||
std::vector<float> silence(Oc * T);
|
||||
memcpy(silence.data(), silence_full.data(), (size_t)(Oc * T) * sizeof(float));
|
||||
|
||||
// Decode audio codes if provided
|
||||
int decoded_T = 0;
|
||||
std::vector<float> decoded_latents;
|
||||
if (!codes_vec.empty()) {
|
||||
int T_5Hz = (int)codes_vec.size();
|
||||
int T_25Hz_codes = T_5Hz * 5;
|
||||
decoded_latents.resize(T_25Hz_codes * Oc);
|
||||
|
||||
music_dit_timer.reset();
|
||||
int ret = detok_ggml_decode(&detok, codes_vec.data(), T_5Hz, decoded_latents.data());
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "FATAL: music detokenizer decode failed\n");
|
||||
return "";
|
||||
}
|
||||
fprintf(stderr, "[Context] Detokenizer: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
decoded_T = T_25Hz_codes < T ? T_25Hz_codes : T;
|
||||
}
|
||||
|
||||
// Build single context: [T, ctx_ch] = src_latents[64] + mask_ones[64]
|
||||
std::vector<float> context_single(T * ctx_ch);
|
||||
for (int t = 0; t < T; t++) {
|
||||
const float * src = (t < decoded_T)
|
||||
? decoded_latents.data() + t * Oc
|
||||
: silence.data() + t * Oc;
|
||||
for (int c = 0; c < Oc; c++)
|
||||
context_single[t * ctx_ch + c] = src[c];
|
||||
for (int c = 0; c < Oc; c++)
|
||||
context_single[t * ctx_ch + Oc + c] = 1.0f;
|
||||
}
|
||||
|
||||
// Replicate context for N batch samples (all identical)
|
||||
std::vector<float> context(batch_n * T * ctx_ch);
|
||||
for (int b = 0; b < batch_n; b++)
|
||||
{
|
||||
memcpy(context.data() + b * T * ctx_ch, context_single.data(), T * ctx_ch * sizeof(float));
|
||||
}
|
||||
|
||||
// Generate N noise samples
|
||||
std::vector<float> noise(batch_n * Oc * T);
|
||||
|
||||
{
|
||||
// Generate N noise samples with seeds: seed, seed+1, ..., seed+N-1
|
||||
for (int b = 0; b < batch_n; b++) {
|
||||
std::mt19937 rng((uint32_t)(seed + b));
|
||||
std::normal_distribution<float> normal(0.0f, 1.0f);
|
||||
float * dst = noise.data() + b * Oc * T;
|
||||
for (int i = 0; i < Oc * T; i++)
|
||||
dst[i] = normal(rng);
|
||||
fprintf(stderr, "[Context Batch%d] noise seed=%lld\n", b, seed + b);
|
||||
}
|
||||
}
|
||||
|
||||
// DiT Generate
|
||||
std::vector<float> output(batch_n * Oc * T);
|
||||
|
||||
fprintf(stderr, "[DiT] Starting: T=%d, S=%d, enc_S=%d, steps=%d, batch=%d\n",
|
||||
T, S, enc_S, num_steps, batch_n);
|
||||
|
||||
music_dit_timer.reset();
|
||||
dit_ggml_generate(&acestep_dit, noise.data(), context.data(), enc_hidden.data(),
|
||||
enc_S, T, batch_n, num_steps, schedule.data(), output.data(),
|
||||
guidance_scale);
|
||||
fprintf(stderr, "[DiT] Total generation: %.1f ms (%.1f ms/sample)\n",
|
||||
music_dit_timer.ms(), music_dit_timer.ms() / batch_n);
|
||||
|
||||
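// VAE decode of the first batch sample only (b = 0); the result is returned as a base64 string below, no WAV files are written here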
|
||||
int T_latent = T;
|
||||
int T_audio_max = T_latent * 1920;
|
||||
std::vector<float> audio(2 * T_audio_max);
|
||||
|
||||
int b = 0;
|
||||
float * dit_out = output.data() + b * Oc * T;
|
||||
|
||||
music_dit_timer.reset();
|
||||
int T_audio = vae_ggml_decode_tiled(&vae, dit_out, T_latent, audio.data(), T_audio_max, vae_chunk, vae_overlap);
|
||||
if (T_audio < 0) {
|
||||
fprintf(stderr, "[VAE] ERROR: decode failed\n");
|
||||
return "";
|
||||
}
|
||||
fprintf(stderr, "[VAE] Decode: %.1f ms\n", music_dit_timer.ms());
|
||||
|
||||
// Peak normalization to -1.0 dB
|
||||
{
|
||||
float peak = 0.0f;
|
||||
int n_samples = 2 * T_audio;
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
float a = audio[i] < 0 ? -audio[i] : audio[i];
|
||||
if (a > peak) peak = a;
|
||||
}
|
||||
if (peak > 1e-6f) {
|
||||
const float target_amp = powf(10.0f, -1.0f / 20.0f);
|
||||
float gain = target_amp / peak;
|
||||
for (int i = 0; i < n_samples; i++)
|
||||
audio[i] *= gain;
|
||||
}
|
||||
}
|
||||
|
||||
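// Encode output as base64 WAV. NOTE(review): resampled_buf is computed but never used; `audio` is what gets passed to save_ulaw_wav8_base64 with rate 24000 - confirm whether resampled_buf was intended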
|
||||
std::vector<float> resampled_buf = resample_wav(audio,48000,24000);
|
||||
std::string finalb64 = save_ulaw_wav8_base64(audio, 24000);
|
||||
|
||||
fprintf(stderr, "[Request Done]\n");
|
||||
return finalb64;
|
||||
}
|
||||
|
||||
// void music_dit_free()
|
||||
// {
|
||||
// if (have_vae) vae_ggml_free(&vae);
|
||||
// dit_ggml_free(&acestep_dit);
|
||||
|
||||
// gf_close(&gf_te);
|
||||
|
||||
// qwen3_free(&music_text_enc);
|
||||
// cond_ggml_free(&music_cond);
|
||||
// detok_ggml_free(&detok);
|
||||
// }
|
||||
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "./request.cpp"
|
||||
#include "./ace-qwen3.cpp"
|
||||
#include "./dit-vae.cpp"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
|
|
@ -54,9 +55,15 @@ bool musictype_load_model(const music_load_model_inputs inputs)
|
|||
musicllm_filename.c_str(),musicembedding_filename.c_str(),musicdiffusion_filename.c_str(),musicvae_filename.c_str());
|
||||
musicdebugmode = inputs.debugmode;
|
||||
|
||||
bool ok = load_acestep(musicllm_filename);
|
||||
bool ok = load_acestep_lm(musicllm_filename);
|
||||
if (!ok) {
|
||||
printf("\nFailed to load Music Gen Model!\n");
|
||||
printf("\nFailed to load Music Gen LM Model!\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
ok = load_acestep_dit(musicembedding_filename,musicdiffusion_filename,musicvae_filename);
|
||||
if (!ok) {
|
||||
printf("\nFailed to load Music Gen Diffusion, Embed or VAE Model!\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue