ace step diffusion loading

This commit is contained in:
Concedo 2026-02-24 15:24:15 +08:00
parent 749536f464
commit 0fd7d2c0e5
4 changed files with 592 additions and 226 deletions

View file

@ -745,8 +745,6 @@ ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp other
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
qwen3tts: otherarch/qwen3tts/q3ttsmain.cpp otherarch/qwen3tts/qwen3_tts.cpp otherarch/qwen3tts/text_tokenizer.cpp otherarch/qwen3tts/gguf_loader.cpp otherarch/qwen3tts/tts_transformer.cpp otherarch/qwen3tts/audio_tokenizer_decoder.cpp otherarch/qwen3tts/audio_tokenizer_encoder.cpp otherarch/qwen3tts/coreml_code_predictor_stub.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
acestep-b: otherarch/acestep/dit-vae.cpp otherarch/acestep/request.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ggml/src/ggml-vulkan-shaders.cpp:
ifdef VULKAN_BUILD

View file

@ -27,6 +27,7 @@ struct Timer {
return std::chrono::duration<double, std::milli>(
std::chrono::steady_clock::now() - t).count();
}
void reset() { t = std::chrono::steady_clock::now(); }
};
// Special token IDs (Qwen3 extended vocab)
@ -1171,260 +1172,262 @@ static void usage(const char * prog) {
, prog);
}
// int main(int argc, char ** argv) {
// const char * model_path = nullptr;
// const char * request_path = nullptr;
// int max_seq = 8192;
// int batch_size = 1;
// bool use_fsm = true;
// const char * dump_logits = nullptr;
// const char * dump_tokens = nullptr;
/*
int main(int argc, char ** argv) {
const char * model_path = nullptr;
const char * request_path = nullptr;
int max_seq = 8192;
int batch_size = 1;
bool use_fsm = true;
const char * dump_logits = nullptr;
const char * dump_tokens = nullptr;
// if (argc < 2) {
// usage(argv[0]);
// return 1;
// }
if (argc < 2) {
usage(argv[0]);
return 1;
}
// for (int i = 1; i < argc; i++) {
// if (!strcmp(argv[i], "--model") && i + 1 < argc)
// model_path = argv[++i];
// else if (!strcmp(argv[i], "--request") && i + 1 < argc)
// request_path = argv[++i];
// else if (!strcmp(argv[i], "--max-seq") && i + 1 < argc)
// max_seq = atoi(argv[++i]);
// else if (!strcmp(argv[i], "--batch") && i + 1 < argc)
// batch_size = atoi(argv[++i]);
// else if (!strcmp(argv[i], "--no-fsm"))
// use_fsm = false;
// else if (!strcmp(argv[i], "--dump-logits") && i + 1 < argc)
// dump_logits = argv[++i];
// else if (!strcmp(argv[i], "--dump-tokens") && i + 1 < argc)
// dump_tokens = argv[++i];
// else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) {
// usage(argv[0]);
// return 0;
// }
// else {
// fprintf(stderr, "Unknown option: %s\n", argv[i]);
// usage(argv[0]);
// return 1;
// }
// }
for (int i = 1; i < argc; i++) {
if (!strcmp(argv[i], "--model") && i + 1 < argc)
model_path = argv[++i];
else if (!strcmp(argv[i], "--request") && i + 1 < argc)
request_path = argv[++i];
else if (!strcmp(argv[i], "--max-seq") && i + 1 < argc)
max_seq = atoi(argv[++i]);
else if (!strcmp(argv[i], "--batch") && i + 1 < argc)
batch_size = atoi(argv[++i]);
else if (!strcmp(argv[i], "--no-fsm"))
use_fsm = false;
else if (!strcmp(argv[i], "--dump-logits") && i + 1 < argc)
dump_logits = argv[++i];
else if (!strcmp(argv[i], "--dump-tokens") && i + 1 < argc)
dump_tokens = argv[++i];
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) {
usage(argv[0]);
return 0;
}
else {
fprintf(stderr, "Unknown option: %s\n", argv[i]);
usage(argv[0]);
return 1;
}
}
// if (!model_path) {
// fprintf(stderr, "ERROR: --model required\n");
// usage(argv[0]); return 1;
// }
// if (!request_path) {
// fprintf(stderr, "ERROR: --request required\n");
// usage(argv[0]); return 1;
// }
if (!model_path) {
fprintf(stderr, "ERROR: --model required\n");
usage(argv[0]); return 1;
}
if (!request_path) {
fprintf(stderr, "ERROR: --request required\n");
usage(argv[0]); return 1;
}
// // Read request JSON
// AceRequest req;
// if (!request_parse(&req, request_path)) return 1;
// request_dump(&req, stderr);
// Read request JSON
AceRequest req;
if (!request_parse(&req, request_path)) return 1;
request_dump(&req, stderr);
// if (req.caption.empty()) {
// fprintf(stderr, "ERROR: caption is empty in %s\n", request_path);
// return 1;
// }
if (req.caption.empty()) {
fprintf(stderr, "ERROR: caption is empty in %s\n", request_path);
return 1;
}
// // Resolve seed
// long long seed = req.seed;
// if (seed < 0) {
// std::random_device rd;
// seed = (int64_t)rd() << 32 | rd();
// if (seed < 0) seed = -seed; // keep positive
// }
// req.seed = seed;
// Resolve seed
long long seed = req.seed;
if (seed < 0) {
std::random_device rd;
seed = (int64_t)rd() << 32 | rd();
if (seed < 0) seed = -seed; // keep positive
}
req.seed = seed;
// // Generation params from request
// float temperature = req.lm_temperature;
// float top_p = req.lm_top_p;
// int top_k = req.lm_top_k;
// float cfg_scale = req.lm_cfg_scale;
// const char * neg_prompt = req.lm_negative_prompt.c_str();
// Generation params from request
float temperature = req.lm_temperature;
float top_p = req.lm_top_p;
int top_k = req.lm_top_k;
float cfg_scale = req.lm_cfg_scale;
const char * neg_prompt = req.lm_negative_prompt.c_str();
// Timer t_total;
Timer t_total;
// // Load BPE tokenizer from model GGUF
// BPETokenizer bpe;
// if (!load_bpe_from_gguf(&bpe, model_path)) return 1;
// Load BPE tokenizer from model GGUF
BPETokenizer bpe;
if (!load_bpe_from_gguf(&bpe, model_path)) return 1;
// // Load model
// int n_kv_sets = (cfg_scale > 1.0f) ? 2 * batch_size : batch_size;
// Timer t_load;
// Qwen3LM model;
// if (!qw3lm_load(&model, model_path, max_seq, n_kv_sets)) return 1;
// double load_ms = t_load.ms();
// Load model
int n_kv_sets = (cfg_scale > 1.0f) ? 2 * batch_size : batch_size;
Timer t_load;
Qwen3LM model;
if (!qw3lm_load(&model, model_path, max_seq, n_kv_sets)) return 1;
double load_ms = t_load.ms();
// // FSM
// MetadataFSM fsm;
// if (use_fsm) fsm.init(bpe, model.cfg.vocab_size);
// FSM
MetadataFSM fsm;
if (use_fsm) fsm.init(bpe, model.cfg.vocab_size);
// // Copy request -> AcePrompt (internal LLM struct)
// AcePrompt ace = {};
// ace.caption = req.caption;
// ace.lyrics = req.lyrics;
// ace.duration = req.duration;
// ace.bpm = req.bpm;
// ace.keyscale = req.keyscale;
// ace.timesignature = req.timesignature;
// ace.vocal_language = req.vocal_language;
// Copy request -> AcePrompt (internal LLM struct)
AcePrompt ace = {};
ace.caption = req.caption;
ace.lyrics = req.lyrics;
ace.duration = req.duration;
ace.bpm = req.bpm;
ace.keyscale = req.keyscale;
ace.timesignature = req.timesignature;
ace.vocal_language = req.vocal_language;
// bool user_has_codes = !req.audio_codes.empty();
// bool need_lm_codes = req.thinking && !user_has_codes;
bool user_has_codes = !req.audio_codes.empty();
bool need_lm_codes = req.thinking && !user_has_codes;
// bool is_simple = ace.lyrics.empty() &&
// ace.bpm <= 0 && ace.duration <= 0 &&
// ace.keyscale.empty() && ace.timesignature.empty();
bool is_simple = ace.lyrics.empty() &&
ace.bpm <= 0 && ace.duration <= 0 &&
ace.keyscale.empty() && ace.timesignature.empty();
// std::vector<int> prompt;
// std::vector<AcePrompt> aces; // populated by Phase 1 (simple or partial)
std::vector<int> prompt;
std::vector<AcePrompt> aces; // populated by Phase 1 (simple or partial)
// // Preprocessor: simple mode generates lyrics + metas from caption
// if (is_simple) {
// fprintf(stderr, "[Simple] Inspiration\n");
// Preprocessor: simple mode generates lyrics + metas from caption
if (is_simple) {
fprintf(stderr, "[Simple] Inspiration\n");
// const char * sys =
// "# Instruction\n"
// "Expand the user's input into a more detailed"
// " and specific musical description:\n";
// std::string user_msg = ace.caption + "\n\ninstrumental: "
// + std::string(req.instrumental ? "true" : "false");
// prompt = build_custom_prompt(bpe, sys, user_msg.c_str());
const char * sys =
"# Instruction\n"
"Expand the user's input into a more detailed"
" and specific musical description:\n";
std::string user_msg = ace.caption + "\n\ninstrumental: "
+ std::string(req.instrumental ? "true" : "false");
prompt = build_custom_prompt(bpe, sys, user_msg.c_str());
// // FSM: reset then optionally force language (shared for both paths)
// fsm.reset();
// if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
// fsm.force_language(bpe, ace.vocal_language);
// FSM: reset then optionally force language (shared for both paths)
fsm.reset();
if (use_fsm && ace.vocal_language != "unknown" && !ace.vocal_language.empty())
fsm.force_language(bpe, ace.vocal_language);
// // Phase 1: N lyrics + metadata generations (always batched, N=batch_size)
// fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n",
// prompt.size(), batch_size, seed, seed + batch_size - 1);
// Phase 1: N lyrics + metadata generations (always batched, N=batch_size)
fprintf(stderr, "[Simple] %zu tokens, N=%d, seeds: %lld..%lld\n",
prompt.size(), batch_size, seed, seed + batch_size - 1);
// auto phase1_texts = generate_phase1_batch(
// &model, &bpe, prompt, 2048, temperature, 1.0f, 0,
// seed, batch_size, use_fsm ? &fsm : nullptr, true);
auto phase1_texts = generate_phase1_batch(
&model, &bpe, prompt, 2048, temperature, 1.0f, 0,
seed, batch_size, use_fsm ? &fsm : nullptr, true);
// parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Simple", true);
// for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&model, i);
// }
for (int i = 0; i < batch_size; i++) qw3lm_reset_kv(&model, i);
}
// // Re-evaluate after possible simple enrichment
// const AcePrompt & ace_ref = aces.empty() ? ace : aces[0];
// bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 &&
// !ace_ref.keyscale.empty() && !ace_ref.timesignature.empty());
// Re-evaluate after possible simple enrichment
const AcePrompt & ace_ref = aces.empty() ? ace : aces[0];
bool has_all_metas = (ace_ref.bpm > 0 && ace_ref.duration > 0 &&
!ace_ref.keyscale.empty() && !ace_ref.timesignature.empty());
// if (!has_all_metas) {
// // Partial-metas: Phase 1 with CFG to fill missing fields
// prompt = build_lm_prompt(bpe, ace);
// std::vector<int> uncond;
// if (cfg_scale > 1.0f)
// uncond = build_lm_prompt_uncond(bpe, ace, neg_prompt);
if (!has_all_metas) {
// Partial-metas: Phase 1 with CFG to fill missing fields
prompt = build_lm_prompt(bpe, ace);
std::vector<int> uncond;
if (cfg_scale > 1.0f)
uncond = build_lm_prompt_uncond(bpe, ace, neg_prompt);
// fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n",
// prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1);
fprintf(stderr, "[Partial] %zu tokens, CFG: %.2f, N=%d, seeds: %lld..%lld\n",
prompt.size(), cfg_scale, batch_size, seed, seed + batch_size - 1);
// fsm.reset();
// auto phase1_texts = generate_phase1_batch(
// &model, &bpe, prompt, 2048, temperature, top_p, top_k,
// seed, batch_size, use_fsm ? &fsm : nullptr, false,
// cfg_scale, uncond.empty() ? nullptr : &uncond, true);
fsm.reset();
auto phase1_texts = generate_phase1_batch(
&model, &bpe, prompt, 2048, temperature, top_p, top_k,
seed, batch_size, use_fsm ? &fsm : nullptr, false,
cfg_scale, uncond.empty() ? nullptr : &uncond, true);
// parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false);
parse_phase1_into_aces(phase1_texts, ace, aces, seed, "Partial", false);
// for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&model, i);
// }
for (int i = 0; i < 2 * batch_size; i++) qw3lm_reset_kv(&model, i);
}
// // Guarantee aces is populated (all-metas: single shared ace for prefill optimization)
// if (aces.empty()) aces = {ace};
// Guarantee aces is populated (all-metas: single shared ace for prefill optimization)
if (aces.empty()) aces = {ace};
// // Debug: dump tokens/logits
// if (need_lm_codes && (dump_logits || dump_tokens)) {
// std::string cot = build_cot_yaml(aces[0]);
// auto dbg_prompt = build_lm_prompt_with_cot(bpe, aces[0], cot);
// Debug: dump tokens/logits
if (need_lm_codes && (dump_logits || dump_tokens)) {
std::string cot = build_cot_yaml(aces[0]);
auto dbg_prompt = build_lm_prompt_with_cot(bpe, aces[0], cot);
// if (dump_tokens) {
// FILE * f = fopen(dump_tokens, "w");
// if (f) {
// for (size_t j = 0; j < dbg_prompt.size(); j++)
// fprintf(f, "%s%d", j ? "," : "", dbg_prompt[j]);
// fprintf(f, "\n");
// fclose(f);
// fprintf(stderr, "[Debug] Tokens -> %s (%zu)\n",
// dump_tokens, dbg_prompt.size());
// }
// }
// if (dump_logits) {
// std::vector<float> dbg_logits(model.cfg.vocab_size);
// qw3lm_forward(&model, dbg_prompt.data(), (int)dbg_prompt.size(), 0, dbg_logits.data());
// FILE * f = fopen(dump_logits, "wb");
// if (f) {
// fwrite(dbg_logits.data(), sizeof(float), model.cfg.vocab_size, f);
// fclose(f);
// fprintf(stderr, "[Debug] Logits -> %s (%d floats, argmax=%d)\n",
// dump_logits, model.cfg.vocab_size,
// (int)(std::max_element(dbg_logits.begin(), dbg_logits.end()) - dbg_logits.begin()));
// }
// qw3lm_reset_kv(&model, 0);
// }
// }
if (dump_tokens) {
FILE * f = fopen(dump_tokens, "w");
if (f) {
for (size_t j = 0; j < dbg_prompt.size(); j++)
fprintf(f, "%s%d", j ? "," : "", dbg_prompt[j]);
fprintf(f, "\n");
fclose(f);
fprintf(stderr, "[Debug] Tokens -> %s (%zu)\n",
dump_tokens, dbg_prompt.size());
}
}
if (dump_logits) {
std::vector<float> dbg_logits(model.cfg.vocab_size);
qw3lm_forward(&model, dbg_prompt.data(), (int)dbg_prompt.size(), 0, dbg_logits.data());
FILE * f = fopen(dump_logits, "wb");
if (f) {
fwrite(dbg_logits.data(), sizeof(float), model.cfg.vocab_size, f);
fclose(f);
fprintf(stderr, "[Debug] Logits -> %s (%d floats, argmax=%d)\n",
dump_logits, model.cfg.vocab_size,
(int)(std::max_element(dbg_logits.begin(), dbg_logits.end()) - dbg_logits.begin()));
}
qw3lm_reset_kv(&model, 0);
}
}
// // Phase 2: generate audio codes (always batched, N=batch_size)
// std::vector<std::string> batch_codes(batch_size);
// if (need_lm_codes) {
// batch_codes = run_phase2_batch(&model, bpe, aces,
// temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
// } else {
// fprintf(stderr, "[Skip] %s, no code generation\n",
// user_has_codes ? "user codes present" : "thinking=false");
// }
// Phase 2: generate audio codes (always batched, N=batch_size)
std::vector<std::string> batch_codes(batch_size);
if (need_lm_codes) {
batch_codes = run_phase2_batch(&model, bpe, aces,
temperature, top_p, top_k, seed, batch_size, cfg_scale, neg_prompt);
} else {
fprintf(stderr, "[Skip] %s, no code generation\n",
user_has_codes ? "user codes present" : "thinking=false");
}
// // Write N output files: request0.json, request1.json, ...
// {
// std::string base(request_path);
// std::string ext = ".json";
// size_t dot = base.rfind('.');
// if (dot != std::string::npos) { ext = base.substr(dot); base = base.substr(0, dot); }
// for (int b = 0; b < batch_size; b++) {
// AceRequest rr = req;
// const AcePrompt & a = aces[b < (int)aces.size() ? b : 0];
// rr.caption = a.caption;
// rr.lyrics = a.lyrics;
// rr.bpm = a.bpm;
// rr.duration = a.duration;
// rr.keyscale = a.keyscale;
// rr.timesignature = a.timesignature;
// rr.vocal_language = a.vocal_language;
// if (!batch_codes[b].empty()) rr.audio_codes = batch_codes[b];
// rr.seed = seed + b;
// char path[512];
// snprintf(path, sizeof(path), "%s%d%s", base.c_str(), b, ext.c_str());
// request_write(&rr, path);
// fprintf(stderr, "[Output] Wrote %s\n", path);
// }
// }
// Write N output files: request0.json, request1.json, ...
{
std::string base(request_path);
std::string ext = ".json";
size_t dot = base.rfind('.');
if (dot != std::string::npos) { ext = base.substr(dot); base = base.substr(0, dot); }
for (int b = 0; b < batch_size; b++) {
AceRequest rr = req;
const AcePrompt & a = aces[b < (int)aces.size() ? b : 0];
rr.caption = a.caption;
rr.lyrics = a.lyrics;
rr.bpm = a.bpm;
rr.duration = a.duration;
rr.keyscale = a.keyscale;
rr.timesignature = a.timesignature;
rr.vocal_language = a.vocal_language;
if (!batch_codes[b].empty()) rr.audio_codes = batch_codes[b];
rr.seed = seed + b;
char path[512];
snprintf(path, sizeof(path), "%s%d%s", base.c_str(), b, ext.c_str());
request_write(&rr, path);
fprintf(stderr, "[Output] Wrote %s\n", path);
}
}
// fprintf(stderr, "[Ace-Qwen3] Load %.0f | Total %.0fms | seed=%lld\n",
// load_ms, t_total.ms(), seed);
fprintf(stderr, "[Ace-Qwen3] Load %.0f | Total %.0fms | seed=%lld\n",
load_ms, t_total.ms(), seed);
// qw3lm_free(&model);
// return 0;
// }
qw3lm_free(&model);
return 0;
}
*/
//kcpp stuff
static Qwen3LM acestep_llm;
static BPETokenizer acestep_bpe;
static bool acestep_loaded = false;
static bool acestep_lm_loaded = false;
bool load_acestep(std::string model_path)
bool load_acestep_lm(std::string model_path)
{
acestep_loaded = false;
acestep_lm_loaded = false;
int max_seq = 8192;
const int batch_size = 1; //only bs 1 is allowed
if (!load_bpe_from_gguf(&acestep_bpe, model_path.c_str())) {
@ -1435,7 +1438,7 @@ bool load_acestep(std::string model_path)
if (!qw3lm_load(&acestep_llm, model_path.c_str(), max_seq, n_kv_sets)) {
return false;
}
acestep_loaded = true;
acestep_lm_loaded = true;
return true;
}

View file

@ -22,16 +22,6 @@
#include "./debug.h"
#include "./request.h"
// Lightweight wall-clock stopwatch built on std::chrono::steady_clock.
struct Timer {
    std::chrono::steady_clock::time_point t; // start of the current measurement window

    // Timing begins at construction.
    Timer() : t(std::chrono::steady_clock::now()) {}

    // Milliseconds elapsed since construction or the last reset().
    double ms() const {
        const auto now = std::chrono::steady_clock::now();
        const std::chrono::duration<double, std::milli> elapsed = now - t;
        return elapsed.count();
    }

    // Restarts the measurement window at "now".
    void reset() { t = std::chrono::steady_clock::now(); }
};
// Minimal WAV writer (16-bit PCM stereo)
static bool write_wav(const char * path, const float * audio, int T_audio, int sr) {
FILE * f = fopen(path, "wb");
@ -100,6 +90,7 @@ static std::vector<int> parse_codes_string(const std::string & s) {
return codes;
}
/*
int main(int argc, char ** argv) {
if (argc < 2) { print_usage(argv[0]); return 1; }
@ -579,3 +570,370 @@ int main(int argc, char ** argv) {
fprintf(stderr, "[Pipeline] All done\n");
return 0;
}
*/
//kcpp stuff
// File-scope state for the Ace-Step diffusion pipeline. Everything below is
// populated once by load_acestep_dit() and reused for every generation request.
static DiTGGML acestep_dit = {};              // diffusion transformer weights/backend
static bool acestep_dit_loaded = false;       // true once load_acestep_dit() succeeds
static DiTGGMLConfig music_dit_cfg;           // DiT config read from the GGUF (channels, patch size, ...)
static Timer music_dit_timer;                 // shared stopwatch for load/encode/decode timing logs
static bool is_turbo = false;                 // "acestep.is_turbo" GGUF flag; turbo models force guidance=1.0
static VAEGGML vae = {};                      // latent -> audio decoder
static BPETokenizer music_tok;                // BPE tokenizer loaded from the text-encoder GGUF
static Qwen3GGML music_text_enc = {};         // Qwen3 text encoder for the caption prompt
static GGUFModel gf_te = {};                  // kept open so embed_tokens.weight stays mapped
static const void * musice_te_embed_data = nullptr; // raw embed_tokens.weight table for lyric vocab lookup
static CondGGML music_cond = {};              // condition encoder (caption + lyric + timbre fusion)
static std::vector<float> silence_full; // [15000, 64] f32
static DetokGGML detok = {};                  // 5 Hz audio-code -> 25 Hz latent detokenizer
bool load_acestep_dit(std::string music_embd_path, std::string music_dit_path, std::string music_vae_path)
{
const char * text_enc_gguf = music_embd_path.c_str();
const char * dit_gguf = music_dit_path.c_str();
const char * vae_gguf = music_vae_path.c_str();
// Load DiT model (once for all requests)
dit_ggml_init_backend(&acestep_dit);
fprintf(stderr, "[Load] Backend init: %.1f ms\n", music_dit_timer.ms());
music_dit_timer.reset();
if (!dit_ggml_load(&acestep_dit, dit_gguf, music_dit_cfg)) {
fprintf(stderr, "FATAL: failed to load DiT model\n");
return false;
}
fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", music_dit_timer.ms());
// Read DiT GGUF metadata + silence_latent tensor (once)
is_turbo = false;
{
GGUFModel gf = {};
if (gf_load(&gf, dit_gguf)) {
is_turbo = gf_get_bool(gf, "acestep.is_turbo");
const void * sl_data = gf_get_data(gf, "silence_latent");
if (sl_data) {
silence_full.resize(15000 * 64);
memcpy(silence_full.data(), sl_data, 15000 * 64 * sizeof(float));
fprintf(stderr, "[Load] silence_latent: [15000, 64] from GGUF\n");
} else {
fprintf(stderr, "FATAL: silence_latent tensor not found in %s\n", dit_gguf);
return false;
}
gf_close(&gf);
} else {
fprintf(stderr, "FATAL: cannot reopen %s for metadata\n", dit_gguf);
return false;
}
}
// Load VAE model (once for all requests)
music_dit_timer.reset();
vae_ggml_load(&vae, vae_gguf);
fprintf(stderr, "[Load] VAE weights: %.1f ms\n", music_dit_timer.ms());
music_dit_timer.reset();
if (!load_bpe_from_gguf(&music_tok, text_enc_gguf)) {
fprintf(stderr, "FATAL: failed to load music tokenizer from %s\n", text_enc_gguf);
return false;
}
fprintf(stderr, "[Load] BPE tokenizer: %.1f ms\n", music_dit_timer.ms());
// Text encoder forward (caption only)
music_dit_timer.reset();
qwen3_init_backend(&music_text_enc);
if (!qwen3_load_text_encoder(&music_text_enc, text_enc_gguf)) {
fprintf(stderr, "FATAL: failed to load text encoder\n");
return false;
}
fprintf(stderr, "[Load] TextEncoder: %.1f ms\n", music_dit_timer.ms());
if (!gf_load(&gf_te, text_enc_gguf)) {
fprintf(stderr, "FATAL: cannot reopen text encoder GGUF for lyric embed\n");
return false;
}
musice_te_embed_data = gf_get_data(gf_te, "embed_tokens.weight");
if (!musice_te_embed_data) {
fprintf(stderr, "FATAL: embed_tokens.weight not found\n");
return false;
}
// Condition encoder forward
music_dit_timer.reset();
cond_ggml_init_backend(&music_cond);
if (!cond_ggml_load(&music_cond, dit_gguf)) {
fprintf(stderr, "FATAL: failed to load condition encoder\n");
return false;
}
fprintf(stderr, "[Load] ConditionEncoder: %.1f ms\n", music_dit_timer.ms());
music_dit_timer.reset();
if (!detok_ggml_load(&detok, dit_gguf, acestep_dit.backend, acestep_dit.cpu_backend)) {
fprintf(stderr, "FATAL: failed to load detokenizer\n");
return false;
}
fprintf(stderr, "[Load] Detokenizer: %.1f ms\n", music_dit_timer.ms());
acestep_dit_loaded = true;
return true;
}
// Generates one audio clip from a JSON music request (inputs.input_json) using
// the stack loaded by load_acestep_dit(): text/condition encoding -> DiT
// flow-matching denoise -> tiled VAE decode -> peak normalize -> resample ->
// base64 u-law WAV. Returns "" on any failure.
std::string acestep_generate_audio(const music_generation_inputs inputs)
{
    // Parse request JSON
    AceRequest req;
    std::string injson = inputs.input_json;
    request_init(&req);
    if (!request_parse_from_str(&req, injson)) {
        fprintf(stderr, "ERROR: failed to parse music gen request\n");
        return "";
    }
    if (req.caption.empty()) {
        fprintf(stderr, "ERROR: music gen caption is empty!\n");
        return "";
    }

    const int FRAMES_PER_SECOND = 25;            // DiT latent frame rate (25 Hz)
    int Oc = music_dit_cfg.out_channels;         // 64
    int ctx_ch = music_dit_cfg.in_channels - Oc; // 128
    int batch_n = 1;                             // single sample per request
    int vae_chunk = 256;                         // tiled VAE decode window (latent frames)
    int vae_overlap = 64;                        // overlap between VAE tiles

    // Extract params, falling back to the pipeline defaults
    const char * caption = req.caption.c_str();
    const char * lyrics = req.lyrics.empty() ? "[Instrumental]" : req.lyrics.c_str();
    char bpm_str[16] = "N/A";
    if (req.bpm > 0) snprintf(bpm_str, sizeof(bpm_str), "%d", req.bpm);
    const char * bpm = bpm_str;
    const char * keyscale = req.keyscale.empty() ? "N/A" : req.keyscale.c_str();
    const char * timesig = req.timesignature.empty() ? "N/A" : req.timesignature.c_str();
    const char * language = req.vocal_language.empty() ? "en" : req.vocal_language.c_str();
    float duration = req.duration > 0 ? req.duration : 120.0f;
    long long seed = req.seed;
    int num_steps = req.inference_steps > 0 ? req.inference_steps : 8;
    float guidance_scale = req.guidance_scale > 0 ? req.guidance_scale : 7.0f;
    float shift = req.shift > 0 ? req.shift : 1.0f;

    // Turbo checkpoints are distilled for CFG-free sampling
    if (is_turbo && guidance_scale > 1.0f) {
        fprintf(stderr, "[Pipeline] WARNING: turbo model, forcing guidance_scale=1.0 (was %.1f)\n",
                guidance_scale);
        guidance_scale = 1.0f;
    }

    // Negative seed means "pick one at random" (kept positive for logging)
    if (seed < 0) {
        std::random_device rd;
        seed = (long long)rd() << 32 | rd();
        if (seed < 0) seed = -seed;
    }
    fprintf(stderr, "[Pipeline] seed=%lld, steps=%d, guidance=%.1f, shift=%.1f, duration=%.1fs\n",
            seed, num_steps, guidance_scale, shift, duration);

    // Parse audio codes from request (5 Hz semantic tokens)
    std::vector<int> codes_vec = parse_codes_string(req.audio_codes);
    if (!codes_vec.empty())
        fprintf(stderr, "[Pipeline] %zu audio codes (%.1fs @ 5Hz)\n",
                codes_vec.size(), (float)codes_vec.size() / 5.0f);

    // Build schedule: t_i = shift * t / (1 + (shift-1)*t) where t = 1 - i/steps
    std::vector<float> schedule(num_steps);
    for (int i = 0; i < num_steps; i++) {
        float t = 1.0f - (float)i / (float)num_steps;
        schedule[i] = shift * t / (1.0f + (shift - 1.0f) * t);
    }

    // T = number of 25Hz latent frames for DiT.
    // When audio codes are present, T is determined by the codes.
    // Otherwise, T is derived from the requested duration.
    int T = codes_vec.empty()
        ? (int)(duration * FRAMES_PER_SECOND)
        : (int)codes_vec.size() * 5;
    // Round up to a whole number of DiT patches
    T = ((T + music_dit_cfg.patch_size - 1) / music_dit_cfg.patch_size) * music_dit_cfg.patch_size;
    int S = T / music_dit_cfg.patch_size; // DiT sequence length in patches
    int enc_S = 0;                        // condition-encoder output length, set below
    fprintf(stderr, "[Pipeline] T=%d, S=%d\n", T, S);
    if (T > 15000) {
        fprintf(stderr, "ERROR: T=%d exceeds silence_latent max 15000, skipping\n", T);
        return "";
    }

    // ---- Text encoding ----
    music_dit_timer.reset();

    // Build formatted prompts: instruction + caption + metas, and languages + lyric
    const char * instruction = "Generate audio semantic tokens based on the given conditions:";
    char metas[512];
    snprintf(metas, sizeof(metas),
             "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n",
             bpm, timesig, keyscale, (int)duration);
    std::string text_str = std::string("# Instruction\n")
        + instruction + "\n\n"
        + "# Caption\n" + caption + "\n\n"
        + "# Metas\n" + metas + "<|endoftext|>\n";
    bool instrumental = (strcmp(lyrics, "[Instrumental]") == 0 || strcmp(lyrics, "[instrumental]") == 0);
    std::string lyric_str = std::string("# Languages\n") + language + "\n\n# Lyric\n"
        + (instrumental ? "[Instrumental]" : lyrics) + "<|endoftext|>";

    // Tokenize both prompts
    auto text_ids = bpe_encode(&music_tok, text_str.c_str(), true);
    auto lyric_ids = bpe_encode(&music_tok, lyric_str.c_str(), true);
    int S_text = (int)text_ids.size();
    int S_lyric = (int)lyric_ids.size();
    fprintf(stderr, "[Pipeline] caption: %d tokens, lyrics: %d tokens\n", S_text, S_lyric);

    // Caption goes through the full Qwen3 text encoder
    int H_text = music_text_enc.cfg.hidden_size; // 1024
    std::vector<float> text_hidden(H_text * S_text);
    music_dit_timer.reset();
    qwen3_forward(&music_text_enc, text_ids.data(), S_text, text_hidden.data());
    fprintf(stderr, "[Encode] TextEncoder (%d tokens): %.1f ms\n", S_text, music_dit_timer.ms());

    // Lyric embedding (CPU vocab lookup from text encoder embed table)
    music_dit_timer.reset();
    std::vector<float> lyric_embed(H_text * S_lyric);
    qwen3_cpu_embed_lookup(musice_te_embed_data, H_text,
                           lyric_ids.data(), S_lyric,
                           lyric_embed.data());
    fprintf(stderr, "[Encode] Lyric vocab lookup (%d tokens): %.1f ms\n", S_lyric, music_dit_timer.ms());

    // Silence feats for timbre input: first 750 frames (30s @ 25Hz)
    const int S_ref = 750;
    std::vector<float> silence_feats(S_ref * 64);
    memcpy(silence_feats.data(), silence_full.data(), S_ref * 64 * sizeof(float));

    // Condition encoder fuses caption hidden states, lyric embeddings and timbre reference
    music_dit_timer.reset();
    std::vector<float> enc_hidden;
    cond_ggml_forward(&music_cond, text_hidden.data(), S_text,
                      lyric_embed.data(), S_lyric,
                      silence_feats.data(), S_ref,
                      enc_hidden, &enc_S);
    fprintf(stderr, "[Encode] ConditionEncoder: %.1f ms, enc_S=%d\n", music_dit_timer.ms(), enc_S);

    // ---- Context building ----
    // Silence latent for this T.
    // NOTE(review): assumes silence_full is laid out [frame][channel] with exactly
    // Oc == 64 channels — confirm if out_channels ever differs from 64.
    std::vector<float> silence(Oc * T);
    memcpy(silence.data(), silence_full.data(), (size_t)(Oc * T) * sizeof(float));

    // Decode audio codes if provided (5 Hz codes -> 25 Hz latents)
    int decoded_T = 0;
    std::vector<float> decoded_latents;
    if (!codes_vec.empty()) {
        int T_5Hz = (int)codes_vec.size();
        int T_25Hz_codes = T_5Hz * 5;
        decoded_latents.resize(T_25Hz_codes * Oc);
        music_dit_timer.reset();
        int ret = detok_ggml_decode(&detok, codes_vec.data(), T_5Hz, decoded_latents.data());
        if (ret < 0) {
            fprintf(stderr, "FATAL: music detokenizer decode failed\n");
            return "";
        }
        fprintf(stderr, "[Context] Detokenizer: %.1f ms\n", music_dit_timer.ms());
        decoded_T = T_25Hz_codes < T ? T_25Hz_codes : T;
    }

    // Build single context: [T, ctx_ch] = src_latents[64] + mask_ones[64]
    std::vector<float> context_single(T * ctx_ch);
    for (int t = 0; t < T; t++) {
        const float * src = (t < decoded_T)
            ? decoded_latents.data() + t * Oc
            : silence.data() + t * Oc;
        for (int c = 0; c < Oc; c++)
            context_single[t * ctx_ch + c] = src[c];
        for (int c = 0; c < Oc; c++)
            context_single[t * ctx_ch + Oc + c] = 1.0f;
    }

    // Replicate context for N batch samples (all identical)
    std::vector<float> context(batch_n * T * ctx_ch);
    for (int b = 0; b < batch_n; b++)
    {
        memcpy(context.data() + b * T * ctx_ch, context_single.data(), T * ctx_ch * sizeof(float));
    }

    // Generate N noise samples with seeds: seed, seed+1, ..., seed+N-1
    std::vector<float> noise(batch_n * Oc * T);
    {
        for (int b = 0; b < batch_n; b++) {
            std::mt19937 rng((uint32_t)(seed + b));
            std::normal_distribution<float> normal(0.0f, 1.0f);
            float * dst = noise.data() + b * Oc * T;
            for (int i = 0; i < Oc * T; i++)
                dst[i] = normal(rng);
            fprintf(stderr, "[Context Batch%d] noise seed=%lld\n", b, seed + b);
        }
    }

    // ---- DiT generate ----
    std::vector<float> output(batch_n * Oc * T);
    fprintf(stderr, "[DiT] Starting: T=%d, S=%d, enc_S=%d, steps=%d, batch=%d\n",
            T, S, enc_S, num_steps, batch_n);
    music_dit_timer.reset();
    dit_ggml_generate(&acestep_dit, noise.data(), context.data(), enc_hidden.data(),
                      enc_S, T, batch_n, num_steps, schedule.data(), output.data(),
                      guidance_scale);
    fprintf(stderr, "[DiT] Total generation: %.1f ms (%.1f ms/sample)\n",
            music_dit_timer.ms(), music_dit_timer.ms() / batch_n);

    // ---- VAE decode ----
    int T_latent = T;
    int T_audio_max = T_latent * 1920;          // upper bound on stereo frames decoded
    std::vector<float> audio(2 * T_audio_max);  // interleaved stereo, zero-initialized
    int b = 0;                                  // single sample (batch_n == 1)
    float * dit_out = output.data() + b * Oc * T;
    music_dit_timer.reset();
    int T_audio = vae_ggml_decode_tiled(&vae, dit_out, T_latent, audio.data(), T_audio_max, vae_chunk, vae_overlap);
    if (T_audio < 0) {
        fprintf(stderr, "[VAE] ERROR: decode failed\n");
        return "";
    }
    fprintf(stderr, "[VAE] Decode: %.1f ms\n", music_dit_timer.ms());

    // Peak normalization to -1.0 dB
    {
        float peak = 0.0f;
        int n_samples = 2 * T_audio;
        for (int i = 0; i < n_samples; i++) {
            float a = audio[i] < 0 ? -audio[i] : audio[i];
            if (a > peak) peak = a;
        }
        if (peak > 1e-6f) {
            const float target_amp = powf(10.0f, -1.0f / 20.0f);
            float gain = target_amp / peak;
            for (int i = 0; i < n_samples; i++)
                audio[i] *= gain;
        }
    }

    // Output: trim to the samples actually decoded, resample 48 kHz -> 24 kHz,
    // and encode as a base64 u-law WAV.
    audio.resize((size_t)2 * T_audio); // don't emit the zero tail of the max-sized buffer as trailing silence
    std::vector<float> resampled_buf = resample_wav(audio,48000,24000);
    // BUGFIX: write the resampled buffer — previously the un-resampled 48 kHz
    // samples were saved with a 24 kHz rate, leaving resampled_buf dead and the
    // output playing at the wrong speed.
    std::string finalb64 = save_ulaw_wav8_base64(resampled_buf, 24000);
    fprintf(stderr, "[Request Done]\n");
    return finalb64;
}
// void music_dit_free()
// {
// if (have_vae) vae_ggml_free(&vae);
// dit_ggml_free(&acestep_dit);
// gf_close(&gf_te);
// qwen3_free(&music_text_enc);
// cond_ggml_free(&music_cond);
// detok_ggml_free(&detok);
// }

View file

@ -14,6 +14,7 @@
#include "./request.cpp"
#include "./ace-qwen3.cpp"
#include "./dit-vae.cpp"
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -54,9 +55,15 @@ bool musictype_load_model(const music_load_model_inputs inputs)
musicllm_filename.c_str(),musicembedding_filename.c_str(),musicdiffusion_filename.c_str(),musicvae_filename.c_str());
musicdebugmode = inputs.debugmode;
bool ok = load_acestep(musicllm_filename);
bool ok = load_acestep_lm(musicllm_filename);
if (!ok) {
printf("\nFailed to load Music Gen Model!\n");
printf("\nFailed to load Music Gen LM Model!\n");
return false;
}
ok = load_acestep_dit(musicembedding_filename,musicdiffusion_filename,musicvae_filename);
if (!ok) {
printf("\nFailed to load Music Gen Diffusion, Embed or VAE Model!\n");
return false;
}