KV snapshots now save and load the last logits for correctness. Added some text for the music UI; updated docs.

This commit is contained in:
Concedo 2026-03-04 21:57:28 +08:00
parent 54cf43ae64
commit 4f1b22c415
4 changed files with 178 additions and 14 deletions

View file

@ -1644,6 +1644,144 @@
},
}
},
"/api/extra/music/prepare": {
"post": {
"summary": "Creates song generation parameters such as caption, lyrics, BPM and duration",
"description": "Creates song generation parameters such as caption, lyrics, BPM and duration. This should be called to produce the generation input for /api/extra/music/generate",
"requestBody": {
"content": {
"application/json": {
"example": {
"caption": "An emotional rap song about the kobold war."
},
"schema": {
"properties": {
"caption": {
"type": "string",
"description": "A short description of the song to create"
}
},
"type": "object"
}
}
},
"required": true
},
"tags": [
"api/extra"
],
"responses": {
"200": {
"content": {
"application/json": {
"example":
{
"caption": "A melancholic and narrative-driven rap track built around a heavy beat",
"lyrics": "\n[Verse 1]\nKobold is love, yeah, kobold is life\nKobold is the only way we survive\nShe said okay, yo, keep it that way\nIn kobold we trust, just kobold today\n\n[Chorus]\nKobold, kobold, we carry on now\nKobold, kobold, each way and how\nKobold, kobold, right here we go\nKobold, kobold, all that I know.\n\n[Outro]",
"bpm": 120,
"duration": 64.0,
"keyscale": "G minor",
"timesignature": "2",
"vocal_language": "en",
"task_type": "text2music",
"seed": 622315,
"thinking": false,
"lm_temperature": 0.85,
"lm_cfg_scale": 2.0,
"lm_top_p": 0.9,
"lm_negative_prompt": "",
"inference_steps": 8,
"guidance_scale": 1.0,
"shift": 3.0,
"audio_codes": ""
},
"schema": {
"properties": {},
"type": "object"
}
}
},
"description": "Successful request"
}
}
}
},
"/api/extra/music/generate": {
"post": {
"summary": "Generates music based on provided captions, lyrics and configurations",
"description": "Generates music based on provided captions, lyrics and configurations. The config can be generated using /api/extra/music/prepare or crafted manually",
"requestBody": {
"content": {
"application/json": {
"example": {
"caption": "A melancholic and narrative-driven rap track built around a heavy beat",
"lyrics": "\n[Verse 1]\nKobold is love, yeah, kobold is life\nKobold is the only way we survive\nShe said okay, yo, keep it that way\nIn kobold we trust, just kobold today\n\n[Chorus]\nKobold, kobold, we carry on now\nKobold, kobold, each way and how\nKobold, kobold, right here we go\nKobold, kobold, all that I know.\n\n[Outro]",
"bpm": 120,
"duration": 64.0,
"keyscale": "G minor",
"timesignature": "2",
"vocal_language": "en",
"inference_steps": 8
},
"schema": {
"properties": {
"caption": {
"type": "string",
"description": "A short description of the song to create"
},
"lyrics": {
"type": "string",
"description": "The full lyrics of the song to generate"
},
"bpm": {
"type": "number",
"description": "The song Beats Per Minute"
},
"duration": {
"type": "number",
"description": "The length of the song, in seconds."
},
"keyscale": {
"type": "string",
"description": "The musical key of the song."
},
"timesignature": {
"type": "string",
"description": "The musical time signature of the song."
},
"vocal_language": {
"type": "string",
"description": "The language of the song lyrics."
},
"inference_steps": {
"type": "number",
"description": "How many diffusion steps to use."
}
},
"type": "object"
}
}
},
"required": true
},
"tags": [
"api/extra"
],
"responses": {
"200": {
"content": {
"audio/wav": {
"schema": {
"type": "string",
"format": "binary"
}
}
},
"description": "Successful request"
}
}
}
},
"/api/extra/json_to_grammar": {
"post": {
"summary": "Converts a provided JSON schema into GBNF grammar.",

View file

@ -23,7 +23,7 @@ body{
color:var(--text);
}
header{
padding:16px 20px;
padding:16px 12px;
font-size:20px;
font-weight:600;
background:rgba(0,0,0,0.3);
@ -32,8 +32,8 @@ header{
.wrapper{
display:grid;
grid-template-columns:minmax(340px,500px) 1fr;
gap:20px;
padding:20px;
gap:14px;
padding:10px;
}
@media(max-width:1100px){
.wrapper{grid-template-columns:1fr;}
@ -45,7 +45,7 @@ header{
box-shadow:0 10px 40px rgba(0,0,0,.4);
}
h2{
margin:0 0 14px 0;
margin:0 0 10px 0;
font-size:16px;
color:var(--accent2);
}
@ -112,7 +112,7 @@ button{
audio{width:100%;margin-top:6px;}
.advanced-toggle{
margin-top:8px;
font-size:12px;
font-size:14px;
cursor:pointer;
color:var(--accent2);
}
@ -174,11 +174,11 @@ input[type="checkbox"] {
<h2>Song Setup</h2>
<label>Caption</label>
<input id="caption">
<input id="caption" placeholder="Describe the song">
<div style="margin-top:10px">
<label>Lyrics</label>
<textarea id="lyrics"></textarea>
<textarea id="lyrics" placeholder="Enter song lyrics, or press 'Plan' to generate them."></textarea>
</div>
<div class="form-grid" style="margin-top:12px">
@ -217,7 +217,7 @@ input[type="checkbox"] {
<div class="actions" id="actionContainer">
<div id="normalActions" style="display:flex; gap:10px; flex-wrap:wrap;">
<button class="secondary" onclick="planSong()">Plan</button>
<button class="primary" onclick="planSong()">Plan</button>
<button class="primary" onclick="generateSong()">Generate</button>
<button class="danger" onclick="clearFields()">Clear</button>
<button onclick="exportPlan()">Export JSON</button>
@ -229,6 +229,10 @@ input[type="checkbox"] {
<input type="file" id="importFile" hidden accept="application/json" onchange="importPlan(event)">
</div>
<div>
<p style="font-size:14px">Click 'Plan' first to generate lyrics, BPM and duration. Edit as needed.
<br>When satisfied, click 'Generate' to create the music.</p>
</div>
</div>

View file

@ -127,6 +127,7 @@ static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
static bool is_quiet = false;
static std::vector<gpt_vocab::id> last_n_tokens;
static std::vector<gpt_vocab::id> current_context_tokens;
static std::vector<float> loaded_latest_logits; //do not use normally, this is only required when loading state happens and we need to override logits
static size_t mem_per_token = 0;
static std::vector<float> logits;
static std::vector<int> smartcontext;
@ -4668,12 +4669,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
}
while(logits_sampled<logits_to_sample && remaining_tokens>0 && !abort_draft && !early_abort)
{
if(!firstdecodedone && current_context_tokens.size()>0)
{
embd.clear();
embd.push_back(current_context_tokens[current_context_tokens.size()-1]);
break;
}
if(logits_sampled>0)
{
//this is not the first loop, so we need to increment some things
@ -4708,6 +4703,28 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
lowestLogit = LowestLogit(logits);
}
if(!firstdecodedone && current_context_tokens.size()>0)
{
if(loaded_latest_logits.size()>0)
{
if(debugmode==1 && !is_quiet)
{
printf("\nLoading %d saved logits...\n",loaded_latest_logits.size());
}
//first decode was not done. this can happen when reloading from a perfectly matched state.
//to prevent a catastrophic failure, we must prepare emergency logits for usage
logitsPtr = loaded_latest_logits.data();
lowestLogit = LowestLogit(logitsPtr,n_vocab);
}
else
{
printf("\nNo cached logits and we need them, emergency fallback with degraded quality...\n");
embd.clear();
embd.push_back(current_context_tokens[current_context_tokens.size()-1]);
break;
}
}
//if adaptive p sampling is used, we need to cache the original probabilities
std::vector<llama_token_data> original_candidates;
if(adaptive_target > 0.0f)
@ -5237,6 +5254,7 @@ size_t gpttype_save_state_kv(int slot)
savestates[slot].current_savestate_buffer.clear();
savestates[slot].current_draft_savestate_buffer.clear();
savestates[slot].savestate_context_tokens.clear();
savestates[slot].latest_logits.clear();
savestates[slot].current_savestate_size = 0;
savestates[slot].current_draft_savestate_size = 0;
savestates[slot].media_signature = "";
@ -5258,6 +5276,8 @@ size_t gpttype_save_state_kv(int slot)
savestates[slot].current_savestate_size = newsize;
savestates[slot].savestate_context_tokens = current_context_tokens;
savestates[slot].media_signature = media_composite_image_signature;
float * lgptr = llama_get_logits(llama_ctx_v4);
savestates[slot].latest_logits.assign(lgptr,lgptr+n_vocab);
int maxedpos = llama_memory_seq_pos_max(llama_get_memory(llama_ctx_v4),0);
//kcpp: so maxedpos appears to always be equal to ctx tokens - 2, if savestate_ctx_tokens > maxedpos + 2 then trim excess
if(maxedpos > 0 && savestates[slot].savestate_context_tokens.size() > maxedpos + 2)
@ -5316,6 +5336,7 @@ bool gpttype_load_state_kv(int slot)
if(res > 0)
{
current_context_tokens = savestates[slot].savestate_context_tokens;
loaded_latest_logits = savestates[slot].latest_logits;
printf("\nKV Load SaveState %d: Restored KV with %zu tokens.\n", slot,current_context_tokens.size());
if(draft_ctx && savestates[slot].current_draft_savestate_size>0)
{

View file

@ -537,6 +537,7 @@ struct savestate_data
size_t current_draft_savestate_size = 0;
std::vector<uint8_t> current_draft_savestate_buffer;
std::vector<gpt_vocab::id> savestate_context_tokens; //for context clones
std::vector<float> latest_logits;
int64_t last_used = 0; //unix timestamp, updated on save or load
std::string media_signature = "";
};